#!/usr/bin/perl -w
#
#NAME
#  abcbot - search the Web for ABC tunes
#
#SYNOPSIS
#  abcbot  [options] [host...]
#
#DESCRIPTION
#  This program is a web explorer robot that looks for ABC music.
#
#  This program works from a "hosts" database which is currently kept in  the
#  "hst/" subdirectory, one file per host.  Each file contains a line per URL
#  at that host, possibly followed by one or more  lines  giving  information
#  that was extracted from that URL.
#
#  Each pass of this program gets a list of URLs and/or host names.  URLs are
#  read from standard input, and all we do with them is append  them  to  the
#  host's   file.    Thus   http://foo.bar.com/abc/   results   in  the  file
#  hst/foo.bar.com having a line added giving the URL.
#
#  There are two distinct ways of running this program:  with  or  without  a
#  host  list  on  the command line.  If there are no hosts listed, we run in
#  "initialization" mode. The input stream should contain a list of URLs (and
#  possibly other information such as hosts to avoid).  For each URL, we make
#  sure that the host's file exists, and we append a  line  for  the  URL  at
#  depth 1. The next pass should then find these URLs and scan them for links
#  or tunes.
#
#  Here's how I do the initialization:
#    abcbot +CURLs >& abcbot.out &
#
#  If  called  with  hosts on the command line, we are in "update" mode.  The
#  input may contain hosts to avoid, but URLs there will be ignored. Instead,
#  we  do  a scan of each of the listed hosts and update its file in the host
#  directory.  For each host, we move its file to backup (by appending '-' to
#  its  name), and then we read this backup file and write a new file for the
#  host.  For each URL in the host's file, we fetch  the  file,  and  extract
#  hyperlinks and ABC tunes.
#
#  Hyperlinks   are   accumulated,   and  at  the  end  we  will  repeat  the
#  "initialization" and append the links to their hosts' files.  For each ABC
#  tune  found, we add a line showing the "interesting" information about the
#  tune.  What is considered interesting may change from time to time.
#
#  Because of the difficulties in preventing  infinite  loops  with  URLs  we
#  implement  two  ways  of  limiting  the URLs that may be followed: You can
#  restrict the depth of recursion with the -D option, and you  can  restrict
#  the hostname(s) with the +H option.
#
#REQUIRES
#	$GetCmd = './webcat';		# Program to write a URL to STDOUT
	require "V.pm";				# Verbose/debug package
	use HTML::Entities;			# For handling HTML "entity" encodings
	binmode(STDOUT,':utf8');	# Convert output to UTF-8 encoding
	require "DT.pm";			# Date/time routine(s)
	require "namesubs.pm";		# Name-munging routines
	use htmlsubs;				# HTML-text conversion
	my $hs = new htmlsubs;		# We're Object-Oriented!
#
#  We call webcat as a subprocess to fetch files from the  web.   You  should
#  find  it  in  the same directory.  This was done so that we could properly
#  time out zombie connections to some of the broken web sites out there.  It
#  turns out that you can only abort a connect() with sig('ALRM'), and if you
#  attempt to close the socket after an alarm, you may die a horrible  death.
#  With  that  isolated  in  a  subprocess,  we can continue to run past such
#  disasters and continue with the next URL.
#
#  This program no longer uses the LWP::Simple modules.  I've found a simpler
#  approach.   But you'll have to download these modules, and possibly change
#  push to say where you put them:
#
	$ENV{'PATH'} = ".:sh:$ENV{'PATH'}";
	push @INC, '.', 'sh', split(':',$ENV{'PATH'});
	use Backup;		# File backup routine.
	use abcCode;	# Calculates tune codes.
	use DT;			# Date/Time routine.
#	use HTTPcon;	# Makes HTTP connection to server.
#	use URLdata;	# Opens URL and returns file handle.
	use URLhref;	# Combines URL + HREF -> new URL.
	use URLtrim;	# Shrinks URLs.
	use HTMLdir;	# HTML directory listing.
#
#  They'll have to be in your @INC path;  by  default  we  add  $HOME/sh  and
#  $HOME/pl to @INC, so those are good places to put them.
#
#ENVIRONMENT
#  We read the following from the environment:
#
#  V_abcbot=<l><file>
#    If defined, this defines our "verbose" level and output file.  The level
#    <l>  is  a number (which defaults to 1 or 2, depending on shat I want at
#    the moment), the optional <file> (which defaults to STDERR) is where the
#    output is written.  Note that this variable's name consists of 'V_' plus
#    the program's name.  If you call this program by some  other  name,  you
#    should of course use 'V_' plus that name.
#
#INPUT
#  We always read from stdin, so if you don't  want  to  provide  any  input,
#  you'll  need to redirect our input to /dev/null.  The input is scanned for
#  URLs, and they are added to our starting list (at depth 1).
#
#  As a special aid in limiting searches, the  input  may  contain  lines  of
#  these forms (with or without the colons):
#    done:   http://foo.bar.com/xyz
#    ignore  http://foo.bar.com/xyz
#    avoid:  http://foo.bar.com/xyz
#  These are ways of telling abcbot to ignore certain URLs.  The  "done"  and
#  "ignore"  commands  give  specific  URLs  that  are to be avoided; this is
#  implemented by simply listing them as "already done".   With  the  "avoid"
#  command,  we  extract  the  host  name, and URLs for that host will not be
#  used.
#
#OUTPUT
#
#OPTIONS
#  Options start with '-' or '+' plus a letter,  with  possibly  a  parameter
#  (and no embedded spaces).  Some of the options take an initial '+' to mean
#  "enable" and '-' to mean "disable".  For others, the '-'  or  '+'  is  not
#  relevant.   If  '+'  is  shown in the list below,, then it is significant.
#  Capitalization of the option letters doesn't matter (but it may matter  in
#  an argument string if there is one).
#
#  -<n>
#    where <n> is an integer, means a timeout of <n> seconds.  The default is
#    currently:
#
	$ABCtmout = $ENV{'ABCtmout'} || 120;	# Was 600
#
#  -d<depth>
#    This restricts the depth of directory  searches  to  <depth>.   This  is
#    mostly to avoid infinite loops.  The default is 3.  Experience has shown
#    that each depth level produces at least a factor of 10 increase  in  run
#    time,  so  you  should be careful with this.  It's much faster to have a
#    shallow depth and a long list of starting URLs.  One recommendation: use
#    the  previous  output  as  input,  so  all  the  successes  then will be
#    re-scanned (at depth 2) in the current run.
#
#  +h<host>
#    Allow URLs for <host>.  Default: All hosts allowed.  If there is one  or
#    more +h options, then only these hosts are allowed.
#
#  -s
#  +s<n>
#    Skip over <n> URLs while searching.  This has the effect of not making a
#    lot  of requests in succession of a single server.  It is implemented by
#    moving n-1 URLs to the end of the URL list before each attempt to  fetch
#    a URL.
#
#LOCALHOST
#  The following host names are rewritten:
#
	%hostsub = (
#		'ecf-guest.mit.edu' => 'localhost',
#		'trillian.mit.edu' => 'localhost',
#		'jc.tzo.net:1742' => 'localhost',
#		'lochaber.tullochgorm.com' => 'localhost',
	);
#
#EXAMPLES
#
#SIGNALS
#  There are various ways that this  program  may  get  hung  up  because  of
#  misbehavior  (or  behavior that may be valid but I don't understand it) on
#  the part of web servers.  You can "kick" this program by sending it  these
#  signals:
#
#  CONT
#    Abandon the current URL by closing the connection.
#  INT or HUP
#    Abandon the search and write the output files.
#  QUIT
#    Abandon the current document and host, finish up, and exit.
#  USR1
#    Decrease the verbose level ($V) by 1.  Also, we write a dump of the call
#    stack to the verbose log.
#  USR2
#    Increase the verbose level ($V) by 1.  Also, we write a dump of the call
#    stack to the verbose log.
#    
#
#MISC
#  Lines in a file starting with "%%noindex" tell abcbot to ignore tunes.  If
#  a  "%%noindex"  line is inside a tune, only that one tune will be ignored.
#  If a "%%noindex" line is found ouside a tune, it means to ignore the  rest
#  of the file. [Added by JC 2007-10-08]
#
#BUGS
#  This program is highly experimental, in alpha state, and all that.  Use it
#  at your own risk.  (Not much risk, there, actually, but I thought I'd give
#  the usual friendly warnings.) Just don't write the output  back  over  the
#  input,  and check its output with a browser or two, and there shouldn't be
#  many problems.
#
#  Of course, there are constant problems with spelling variations. Musicians
#  are  atrocious spellers.  This program doesn't even attempt to tackle this
#  issue.
#
#AUTHOR:
#  John Chambers <jc@trillian.mit.edu> http://trillian.mit.edu/~jc/

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Assorted initializations:

#ENV{"V_$GetCmd"} = 1;

($P = $0) =~ s'.*/'' unless defined($P);	# This program's name
($myhost = `hostname`) =~ s/\s+$//;			# Our hostname
&Vopt($ENV{"V_$P"} || '2');					# Verbose level

$| = 1;			# Auto-flush stdout
$, = "\n";		# Is this still used?
$" = "\n\t";	# How about this?
$exitstat = 0;	# Anyone can set this to nonzero

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Some global vars for controlling actions:

$allowcgi      =   0;	# We usually don't look at cgi URLs
$articles      = '-';	# Don't include initial articles
$chkuplinks    =   1;	# Check for "parent/home/back" lines
$currhost      =  '';	# The host we're processing right now
%Disprefix     =  ();	# List of URL initial strings to reject
%RejectPfx     =  ();	# List of URL initial strings for this host to reject
$followUpLink  =   0;	# Whether to follow links that contain "/../"
#HDRkludge     =   1; 	# Try to ignore HDR files
$HTTPcontime   =   0;	# When we last tried an HTTP connect
#listabchosts  =   1;	# Collect list of hosts with abc files
$purgebad      =   1;	# Drop URIs that get 404 (Not found)
$saveunmatched =   0;	# If true, unmatched chunks will be preserved
$schedule      =   0;	# If >0, schedule a rerun after this many minutes
$SCDkludge     =   0; 	# Try to ignore SCD dance-form titles
#urlskip       =   1;	# Set > 1 to scatter URLs

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Initialize the module to calculate tune encodings:
#
$abcCode = new abcCode;

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Counter default values:
#
$doctunes   = 0;	# Number of X: lines discovered in current doc
$doclinks   = 0;	# Number of hyperlinks discovered in current doc
$doctitls   = 0;	# Number of T: lines discovered in current doc
$filemax    = 0;	# Is this used?
$ignoretune = 0;	# If true, ignore all tunes in this file
$ignorefile = 0;	# If true, ignore the current tune
$inHTTPhdrs = 0;	# If true, we expect HTTP headers
$linkcnt    = 0;	# Total links  at this host
$scancnt    = 0;	# Total scans  of this host
$tunecnt    = 0;	# Total tunes  at this host
$titlcnt    = 0;	# Total titles at this host

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# These are passed to $GetCmd.  There are a lot of problems with web  servers
# that require a specific HTTP version number.  If we get nothing from a site
# that should have ABC tunes, try setting the HTTPversion to '1.0'.

$HTTPdelay   = &env('HTTPdelay',      1);	# Was 0
$HTTPtimeout = &env('HTTPtimeout',   60);	# Was 600
$HTTPversion = &env('HTTPversion', '1.1');
print V "$P: HTTP delay $HTTPdelay.\n" if $V>1;
print V "$P: HTTP timeout $HTTPtimeout.\n" if $V>1;
print V "$P: HTTP version $HTTPversion.\n" if $V>1;
$TOopen = $TOread = -1;	# Timeout intervals

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Initialize the tune cache.

$cachetunes  = &env('ABCcache' , 1);	# Writing cache?
#cachebase   = '.';				# Where to put the cache
$cachetmp    = "cache$$.data";	# Cache file while reading
print V "$P: Caching tunes.\n" if $V>3 && $cachetunes;
$hs->setOption("+cache=$cachetmp")	# Tell htmlsubs to use our cache file
	if $cachetunes;

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Recursion control:
#
$abcdepth = 4;	# Depth limit for *.abc files
$hstdepth =		# Default depth limit for current host
$maxdepth = 3;	# Default depth limit for directories [jc 20061212]
$maxurls  = 0;	# If >0, give up after this many URLs
$urlcount = 0;	# Number of URLs processed so far
$depth    = 1;	# The current depth in directories

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
#
#maxlinedflt =
$maxlines  = 2000;	# Give up if no ABC in this many lines of text
$maxscans  =    1;	# Max number of scans before giving up on host
$showlinks =    0;	# Include links in host files

$smryfile = "Summary.txt";	# Where to write 1-line summary info

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# commands for fetching web files:

$dfltget = "webcat";	# Default command to get a file via http
print V "$P: dfltget=\"$dfltget\"\n" if $V>3;

# Before this point should be only simple assignments of initial values.
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Here are our global arrays and tables:

%BadHost  = ();	# Hosts which should not be accessed at all
%BadPath  = ();	# Paths on this host to avoid
@oldchunk = ();	# lines of old hst/* file entry
@newchunk = ();	# lines of new hst/* file entry
%Depth    = ();	# link depth for a URL
%DepthHost= ();	# link depth limit for a host
%Done     = (); # time that URL (full or short) was scanned
%h2d      = ();	# list of depths for the h2u URIs
%h2n      = ();	# number of URLs for each host
%h2u      = ();	# list of URIs for each host
%inithost = ();	# hostnames from the command line
%Opt      = ();	# Runtime options for host
%outlink  = ();	# URLs that have already been seen in this file
@tune     = ();	# lines of current ABC tune
#URLts    = ();	# timestamp when a URL was last read

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# We implement a special kludge to handle local URLs:

($thishost = `hostname`) =~ s/\s*$//;
print V "$P: $thishost is our host name.\n" if $V>2;
%local = (
	"http://localhost/~jc/"         => ($ENV{HOME} . "/public_html/"),
##	"http://$thishost/~jc/"         => ($ENV{HOME} . "/public_html/"),
#	"http://dmz.atsbank.com/~jc/"   => ($ENV{HOME} . "/public_html/"),
#	"http://ecf-guest.mit.edu/~jc/" => ($ENV{HOME} . "/public_html/"),
#	"http://trillian.mit.edu/~jc/"  => ($ENV{HOME} . "/public_html/"),
);
@local = sort(keys(%local));

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Here are the MIME types that we look for.  Since most file formats have more
# than one MIME type, we map the MIME type to a simple word.

$notearchives = 1;	# Whether to look for zip files, etc.
#%MIMEtype = (		# MIME -> type mapping
#	'application/gzip' => 'gzip',
#	'application/x-gzip' => 'gzip',
#	'application/x-zip-compressed' => 'zip',
#	'application/zip' => 'zip',
#	'multipart/gzip' => 'gzip',
#	'multipart/zip' => 'zip',
#);

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Stuff dealing with time:

$spm = 60;	# Seconds per minute
$mph = 60;	# Minutes per hour
$hpd = 24;	# Hours per day
$spd = $spm * $mph * $hpd;		# Should be 86400

$mintime = ($V>6) ? 10 :  0;	# Minimum wait time before rereading a URL
$maxdays = ($V>6) ?  0 :  0;	# Maximum wait time in days
$maxtime = $maxdays * $spd;		# Maximum wait time before purging entries

$oScanY = $oScanM = 0;	# For remembering year and month of scan

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Here's where we initialize our signal handling.

$SIG{CONT} = 'sigCONT';	# CONT causes stack dump and abandons current URL
$SIG{HUP}  = 0;			# HUP  was 'sigINT' but is now ignored
$SIG{INT}  = 'sigINT';	# INT  causes stack dump and terminate
$SIG{QUIT} = 'sigQUIT';	# QUIT causes stack dump, abandons everything and exits
$SIG{USR1} = 'sigUSR1';	# USR1 causes stack dump and decrements $V
$SIG{USR2} = 'sigUSR2';	# USR2 causes stack dump and increments $V

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Scan the command-line arguments, processing them as we go.  Input files are #
# read  and  used  to  build  tables.  Any URLs discovered are accumulated in #
# @URLs.  Options are processed as read, so they will only affect  things  to #
# their right, except for URLs, which we save for last.                       #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
for $a (@ARGV) {
	print V "$P: Arg \"$a\"\n" if $V>2;
	if (($fl,$opt) = ($a =~ m'^([-+])(.*)'i)) {
# - - - -
# +art  include articles
# -art  exclude articles
#   This option controls whether articles (the, a, an, le, la,  etc.)
#   are to be stripped from the start of titles. The default is -art,
#   which does nothing.
		if ($opt =~ m'^art'i) {
			$articles = $fl;
			print V "$P: " . ($articles eq '-' ? 'Ignoring' : 'Including') . " articles." if $V>3;
# - - - -
# -C<file>
#   Read a bot config file.  This is a config file giving starting URLS, plus
#   allowed and disallowed URLs and hostnames.
		} elsif ($opt =~ m'^C(.*)'i) {
			print V "$P: URL file '$1'\n" if $V>3;
			&cfgbot($1);
# - - - -
# -D<N>  recursion depth limit.
#   The "depth" of a URL is how many hops it is from our input URL list.  The
#   URLs  in  this  list are at depth D:1; the URLs they link to are at depth
#   D:2, and so on.  This depth is recorded in the hst/* files.  The default
#   is  -D3, which experience shows is a practical limit.  Note:  Files whose
#   names end with ".abc" are allowed to be one level deeper.
		} elsif ($opt =~ m'^D(\d*)$'i) {
			$maxdepth = $1;
			print V "$P: maxdepth='$maxdepth'\n" if $V>3;
			$abcdepth = $maxdepth + 1;
# - - - -
# +L  show links from a URL.
#   This is primarily a debugging hook. If enabled, the +L options means that
#   all  the  hyperlinks  will be listed after a URL, with a ">" flag to show
#   that they are hyperlinks.  This increases the size of  the  hst/*  files
#   significantly, so the default is -L.
		} elsif ($opt =~ m'^L$'i) {
			$showlinks  = ($fl eq '-') ? 0 : $1;
			print V "$P: showlinks='$showlinks'\n" if $V>3;
# - - - -
# -<N>
#   This  is  a  timeout  in seconds.  If we can't get a URL in this time, we
#   abandon it and go on to the next.  The default is currently 60 seconds.
		} elsif ($opt =~ s'^(\d+)$'') {
			$ABCtmout = $1;
			print V "$P: ABCtmout=$ABCtmout\n" if $V>3;
# - - - -
# +P  purge bad URLs.
		} elsif ($opt =~ m'^P$'i) {
			$purgebad  = ($fl eq '-') ? 0 : 1;
			print V "$P: purgebad='$purgebad'\n" if $V>3;
# - - - -
# -S<N>  schedule next host after <N> minutes.
#   When we finish, we will schedule another instance of this program for the
#   next  host  after  <N>  minutes.   The next host is the one in the hst/*
#   directory that is lexically next; if there is none,  we  pick  the  first
#   host.   The  default  if  <N>  is  missing is to call a routine to try to
#   determine the interval.
		} elsif ($opt =~ m'^S(\d*)$'i) {
			$schedule = $1 || &getschedule();
			print V "$P: schedule='$schedule'\n" if $V>3;
			$abcdepth = $schedule + 1;
# - - - -
# -T
# +T
# -T<N>
#   This sets the timeout for opening URLs to <N> seconds.  The default is 30
#   seconds.  Special cases: -T means -T10 and +T means +T60. If <N> is given
#   you may use '-' or '+' interchangeably.
		} elsif ($opt =~ m'^T(\d*)$'i) {
			$HTTPtimeout = $1 || (($fl eq '-') ? 10 : 60);
			print V "$P: HTTPtimeout='$HTTPtimeout'\n" if $V>3;
# - - - -
# -U<N>
#   This sets the max number of URLs that we attempt to process.  This is a
#   debug hook only.  We abandon our task, clean up, and exit when $urlcount
#   passes this number.  If $maxurls is zero, it means no limit.
		} elsif ($opt =~ m'^U(\d*)$'i) {
			$maxurls = (length($1)>0) ? int($1) : 10;
			print V "$P: maxurls=$maxurls.\n" if $V>3;
# - - - -
# -V<version>
#   This is the HTTP version number. The default is 1.0, but some web servers
#   are picky about this and require 1.1.  We try to discover this by looking
#   at the version returned in HTTP messages, but  it's  faster  if  you  can
#   specify  it on the command line.  We can also get this from the cfg/$host
#   file.
		} elsif ($opt =~ m'^V([0-9.]*)$'i) {
			$HTTPversion = $1 ? $1 : '1.1';
			print V "$P: HTTPversion='$HTTPversion' from cmdline option.\n" if $V>3;
# - - - -
# -W<N>  min wait: wait at least <N> seconds before rereading a URL.
# +W<N>  max wait: reread a URL after <N> seconds.
#   These options control when we are allowed to reread a  URL.   -W  is  the
#   minimum  time;  i.e.,  we shouldn't reread a URL until this much time has
#   passed since we last read it.  +W gives the time after  which  a  URL  is
#   considered  obsolete and should be read again.  If <N> is null, we reread
#   everything.
		} elsif ($opt =~ m'^W(\d*)$'i) {
			if ($1 eq '') {
				$mintime =  $maxtime = $maxdays = 0;
				print V "$P: Rereading everything.\n" if $V>3;
			} elsif ($fl eq '-') {
				$mintime = $1;
				print V "$P: Min timeout is $mintime sec.\n" if $V>3;
			} else {
				$maxtime = ($maxdays = $1) * $spd;
				print V "$P: Max timeout is $maxtime sec.\n" if $V>3;
			}
			print V "$P: mintime=$mintime maxtime=$maxtime (maxdays=$maxdays)\n" if $V>3;
# - - - -
# The notation '<file' is treated as input redirection.  Doing  this  ourself
# can be useful at times.
		} elsif ($a =~ m'^<(.*)') {
			unless (open(STDIN,$1)) {
				print V "$P: Can't read '$1' ($!)\n" if $V>3;
			}
# - - - -
# None of these patterns matched the option string.
		} else {
			print V "$P: Option \"$fl$opt\" not understood.\n" if $V>1;
		}
# - - - -
# Args that don't start with '-' or '+' are treated as host names.  Here,  we
# just accumulate them for later processing.
	} else {
		$a =~ s"^.*/+""i;		# Strip of any directories
		$a =~ s"-+$"";			# Trim "backup" host names
		$inithost{lc($a)} ++;	# Remember lower-case host name
	}
}

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
my %URLcode = (
	"\t" => '%09',	# HT Horizontal Tab
	"\n" => '%0A',	# LF Line Feed, newline
	"\r" => '%0D',	# CR Carriage Return
	" "  => '%20',	# SPACE
	'"'  => '%22',	# DOUBLE QUOTE
	"'"  => '%27',	# APOSTROPHE
	"%"  => '%25',	# PERCENT
	"&"  => '%26',	# AMPERSAND
	"+"  => '%2B',	# PLUS SIGN
	"<"  => '%3C',	# LESS THAN
	"="  => '%3D',	# EQUAL SIGN
	">"  => '%3E',	# GREATER THAN
#	"?"  => '%3F',	# QUESTION MARK
);

if ($V>1) {
	print V "$P:   abcdepth=$abcdepth.\n";
	print V "$P:   articles=$articles.\n";
	print V "$P:   hstdepth=$hstdepth.\n";
	print V "$P:    maxdays=$maxdays.\n";
	print V "$P:   maxdepth=$maxdepth.\n";
	print V "$P:    maxtime=$maxtime.\n";
	print V "$P:    maxurls=$maxurls.\n";
	print V "$P:    mintime=$mintime.\n";
	print V "$P:   purgebad=$purgebad.\n";
}

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Now all the hosts in our list should have their own files in hst/$host  and #
# these files should be filled with the URLs for the host.  Next, we expect a #
# list of host names on the command line.  We run through these hosts and run #
# thru each one's hst/$host file, and process each URL we find there.         #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #

if (%inithost) {
	print V "$P: HOST processing ...\n" if $V>3;
	%h2d = %h2n = %h2u = ();	# Forget about input list of URLs
hostfile:
	for $h (sort keys %inithost) {
		print V "$P: HOST \"$h\" ...\n" if $V>3;
		if ($OLDopen) {close OLD; $OLDopen = 0}
		if ($HSTopen) {&CloseHST($HostT0{$h},time)}
		last if ($closeDoc || $finishup);
		if ($BadHost{$h}) {
			print V "$P: Host \"$h\" is in BadHost list.\n" if $V>3;
			next hostfile;
		}
		$hstdepth = $maxdepth;	# Default depth limit
		&host($h);
		print V "Host \"$h\" done. " . `date -u` if $V>0;
	}
	print V "All hosts done.\n" . `date -u` if $V>0;
} elsif (%h2n) {
	print V "$P: There are no hosts to process, but we have a URL list.\n" if $V>3;
	&saveURLs;					# Save the info from the initial URLs
	%h2d = %h2n = %h2u = ();	# Forget about this set of URLs
} else {
	print V "$P: There are no hosts to process and no URL list.\n" if $V>3;
}

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
if (%h2n) {
	print V "$P: There are more hosts and URLs to remember.\n" if $V>3;
	&saveURLs;			# Save the info from the initial URLs
}

# Should we schedule a new run of this robot?

if ($schedule > 0) {
	print V "$P: $esep\n" if $V>3;
	print V "$P: Schedule another run after $schedule minutes ...\n" if $V>3;
	@hosts = grep(!/(-|\.LCK)\s*$/,glob("hst/*"));
	$hosts = int(@hosts); print V "$P: We have $hosts hosts.\n"; # if $V>3;
	$nexthost = '';
host:
	foreach $host (@hosts) {
		print V "$P: host file \"$host\"\n" if $V>6;
		$host =~ s"^hst/([-.\w]+)\s*$"$1";
		if ($x = $hostsub{$host}) {$host = $x}
		print V "$P: host \"$host\"\n" if $V>3;
		if ($host gt $currhost) {
			print V "$P: host \"$host\" > \"$currhost\"\n" if $V>5;
			$nexthost = $host;
			last host;
		}
	}
	$nexthost = $host[0] if !$nexthost;
	if ($nexthost) {
		print V "$P: Next host is \"$host\".\n" if $V>3;
		$log = "log/$host";
		$ENV{"V_abcbot"} = "$V$log";
		$atcmd = "at now + $schedule $P +S $host '<BadURLs'";
		print V "$P: atcmd=\"$atcmd\"\n" if $V>5;
		if (system $atcmd) {
			$exitstat = $!;
			print V "$P: FAILED \"$atcmd\" ($!)\n" if $V>3;
			print V "$P: Exit status was $?.\n" if $V>3;
		}
	}
}

if (-f $cachetmp) {unlink $cachetmp}
print V "\n" if $V>1;
print V "$P: Exit with status $exitstat.\n" . `date -u` if $V>0;
exit $exitstat;
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = #
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = #

##	sub abc2html {
##	# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
##	# Convert the abc escape sequences to HTML.
##	# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
##		local($s) = @_;
##		$s =~ s#\\(o)#\&${1}slash;#ig;
##		$s =~ s#\\a(a)#\&${1}ring;#ig;
##		$s =~ s#\\"(\w)#\&${1}uml;#ig;
##		$s =~ s#\\'(\w)#\&${1}acute;#ig;
##		$s =~ s#\\`(\w)#\&${1}grave;#ig;
##		$s =~ s#\\,(\w)#\&${1}cedille;#ig;
##		$s =~ s#\\~(\w)#\&${1}tilde;#ig;
##		$s;
##	}

sub BASE {
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Save the contents of a <BASE ...> tag.  We may need this when we try to use #
# a relative URL later on in the file.  We save it in the global $base var.   #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	local($atts) = @_;
	local($att,$val,$lup);
	$atts .= ' ';
	print V "BASE: atts=\"$atts\"\n" if $V>7;
	while ($atts =~ s/\s*^(\w+)="*([^"\s]+)["\s]+//) {
		$att = uc($1);
		$val = $2;
		print V "BASE: att=\"$att\" val=\"$val\"\n" if $V>6;
		print V "BASE: atts=\"$atts\"\n" if $V>7;
		if (uc($att) eq 'HREF') {
			$base = $val;
			print V "BASE: base=\"$base\"\n" if $V>3;
			return $base;	# This is all we want
		}
		if (++$lup > 5) {
			print V "BASE: Looped $lup times; giving up.\n" if $V>3;
			return undef;
		}
	}
	print V "BASE: Quit with atts=\"$atts\"\n" if $atts && $V>3;
	return undef;
}

sub CheckEnd {
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Terminate the checking.  We first write the log message $m if  the  verbose #
# level is greater than $v. We then kill the check process and close the pipe #
# from its stdout.  The value of $r is the caller's  intended  return  value, #
# which we may print (or not).                                                #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	local($r,$v,$m) = @_;
	print V "$m\n" if $V>$v;
	kill 9, $chkpid;
	close CHK;
	return $r;
}

sub CheckHost {my $F='CheckHost';
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Do miscellaneous validations on a host.  The return value is the number  of #
# problems  found.   0  means  there are no objections to the host; a nonzero #
# return means there is some problem and we should skip this host.            #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	local($h) = @_;
	local($chk,$chktim,$n,$t);
	local($httplvl,$errcode,$errmsg,$path);
	@robotstxt = ();	# Contents of robots.txt file, if any
	$hstmsg = '';
	if ($n = $BadHost{$h}) {
		$hstmsg = "Host \"$h\" in blacklist";
		print V "$P: $hstmsg.\n" if $V>0;
		return 1;
	}
#	$GetCmd = $Getcmd{"C:$h"} || "$dfltget -V$HTTPversion";
	$GetCmd = $dfltget;
	$chk = "$GetCmd -T$ABCtmout 'http://$h/robots.txt'";
	print V "$F: \"$chk\"\n" if $V>3;
	$w3timedout = 0;			# Global set true to flag timeout
	$chktim = time;				# When we started this check
	if ($HTTPtimeout > 0) {		# Timeout in effect?
	#	$savsig = $SIG{ALRM};	# Save old alarm routine
		$SIG{ALRM} = 'W3tmout';	# Establish alarm routine
		alarm $HTTPtimeout;		# Set alarm
		&dt();
		print V "$F: Set alarm after $HTTPtimeout sec at $now.\n" if $V>5;
		$CHKopen = 1;			# Triggers close on timeout
	}
	$HTTPcontime = time;		# Note time we last tried to connect
	unless ($chkpid = open(CHK,"$chk |")) {
		$hstmsg = "\"$chk\" failed ($!)";
		print V "$F: $hstmsg\n" if $V>0;
		$CHKopen = 0;
		return 2;
	}
	print V "$F: Process $chkpid \"$chk\" running.\n" if $V>6;
	print V "$F: Read 'http://$h/robots.txt'\n" if $V>2;
#	$inHTTPhdrs = 1;			# The +H option produces headers
	$CHKopen = 1;			# Triggers close on timeout
line:
	while ($line = <CHK>) {
		if ($w3timedout) {
			&dt();
			$hstmsg = "#### TIMEOUT in for loop at $now ####";
			print V "$F: $hstmsg\n" if $V>0;
			$t = ($now = time) - $chktim;
			print V "$F: chktim=$chktim now=$now t=$t\n" if $V>3;
			return &CheckEnd(3,1,"$F: Can't connect to \"$h\" in $t sec. (return 3)");
		}
		$line =~ s/[\r\s]+$//;;
		print V "$F: +++ \"$line\"\n" if $V>7;
		if ($inHTTPhdrs) {
			print V "$F: HDR \"$line\"\n" if $V>3;
			$t = ($now = time) - $chktim;
			if (!$line) {
				print V "$F: End of headers.\n" if $V>3;
				$inHTTPhdrs = 0;
			}
			if ($line =~ m"^<!--.*Can't connect to .*-->") {
				print V "$F: chktim=$chktim now=$now t=$t\n" if $V>3;
				return &CheckEnd(4,1,"$F: Can't connect to \"$h\" in $t sec. (return 4)");
			}
			if ($line =~ m"^<!--.*No route to .*--> ") {
				return &CheckEnd(4,1,"$F: Can't connect to \"$h\" in $t sec. (return 4)");
			}
			if ($line =~ m"^<!--.*ALARM after (\d+) sec .*-->") {
				return &CheckEnd(4,1,"$F: Can't connect to \"$h\" in $1 sec. (return 4)");
			}
			if ((($httplvl,$errcode,$errmsg) = ($line =~ m"^HTTP/([0-9.]+)\s+ERR\s+(\d+)\s+(.*)$"i))
			||	(($httplvl,$errcode,$errmsg) = ($line =~ m"^HTTP/([0-9.]+)\s+(\d+)\s+(.*)$"i))
			) {
				print V "$F: httplvl=$httplvl errcode=$errcode errmsg=\"$errmsg\"\n" if $V>3;
				if ($errcode >= 400) {
					$hstmsg = "No robots.txt file found";
					print V "$F: $hstmsg\n" if $V>3;
					return &CheckEnd(0,1,"$F: HTTP/$1 ERR $2 ($3)");
				}
				print V "$F: robots.txt file found.\n" if $V>0;
				print V "$F: HTTP/$1 ERR $2 ($3) -- accepted.\n" if $V>3;
				next line;
			}
			print V "$F: Not an ERR line.\n" if $V>7;
			if ($line =~ m"^HTTP/([0-9.]+)\s") {
#				unless ($Getcmd{"C:$h"}) {
#					$HTTPversion = $1;
#					print V "$F: HTTP version $HTTPversion (from HTTP header)\n" if $V>3;
#				}
			#	$getcmd = $Getcmd{"C:$h"} || "$dfltget -V$HTTPversion";
				$getcmd = $dfltget;
			} elsif ($line =~ m"^Server:\s*(.*)") {
				$Server{$h} = $1;
				print V "$F: Server for \"$h\" is \"$Server{$h}\"\n" if $V>6;
			}
		} else {
			print V "$F: TXT \"$line\"\n" if $V>1;
			push @robotstxt, $line;
			if ($line =~ /^User-agent:\s*(.*)$/i) {
				$agentpat = $1;
				$agentmatch = 0;
				if (($agentpat eq '*') || ($agentpat =~ /abcbot/i)) {
					print V "$F: User-agent \"$agentpat\" matches us.\n" if $V>0;
					$agentmatch = 1;
				}
			} elsif ($line =~ /^Disallow:*\s*(.*)$/) {
				($path = $1) =~ s/[\r\s\n\t]+$//;
				if ($path) {
					$Disprefix{$path}++;
				#	$hstmsg = "Disallow \"$path\"";
					print V "$F: DISALLOW \"$path\"\n" if $V>0;
				}
			}
		}
	}
	print V "$F: Done with \"http://$h/robots.txt\"\n" if $V>3;
	if ($w3timedout) {
		$t = time - $chktim;
		$hstmsg = "#### TIMEOUT (at for loop after $t sec) ####";
		print V "$F: $hstmsg\n" if $V>0;
		return &CheckEnd(6,1,"$F: Can't connect to \"$h\" in $t sec. (return 6)");
	}
	return &CheckEnd(0,0,"$F: $h OK");
}

sub cfgbot {my $F='cfgbot';
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Read in a config file for this program.  It should contain a list  of  URLs #
# that need special treatment.                                                #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	local($cfgfile) = @_;
	local($path,$prot,$host,$rest);
	print V "$F: Read CFG file '$cfgfile' ...\n" if $V>3;
	if (open(CFG,$cfgfile)) {
		print V "$F: Read CFG file '$cfgfile'\n" if $V>3;
	} else {
		print V "$F: Can't read '$cfgfile' ($!)\n" if $V>3;
		return 0;
	}
	print V "$F: myhost \"$myhost\"\n" if $V>2;
	print V "$F: currhost \"$currhost\"\n" if $V>2;
	while ($line = <CFG>) {					# Input contains config directives
		$line =~ s/[\r\n\s]*$/ /;			# We want exactly one space at end of line
		print V "$F: line \"$line\"\n" if $V>7;
		if ($line =~ /^\s*#/) {				# Ignore comments
		} elsif ($line =~ /^\s*$/) {		# Ignore blank lines
		} elsif ($line =~ s"^\s*((http|ftp)://\S+)\s"$1"i) {
			print V "$F: URL: $1\n" if $V>3;
			&URL($1,1); # unless %inithost;	# Add it as a level-1 URL to be examined
		} elsif ($line =~ s"^(scan|search):*\s*(\S+)\s"$2"i) {
			print V "$F: SCAN $2\n" if $V>3;
			&URL($2,1); # unless %inithost;	# Add it as a level-1 URL to be examined
		} elsif ($line =~ s"^(dead|done|gone|ignore):*\s*(\S+)/*\s"$2"i) {
			$Depth{$2} = 1;					# Mark this one as "already done"
			$Done{$2}  = $now;				# Use current time as when we did it
			print V "$F: DONE \"$2\" at $now.\n" if $V>0;
		} elsif ($line =~ s"^avoid:*\s*(\S+)\s.*"$1"i) {	# Hosts or URLs to avoid
			print V "$F: Avoid \"$line\"\n" if $V>5;
			if (($prot,$host,$rest) = ($line =~ m"(http|ftp)://([-_:.\w]+)(.*)"i)) {
				if ($rest eq '') {				# http://host
					$BadHost{$host}++;			# It's just a host "to be avoided"
					print V "$F: Avoid host: \"$host\"\n" if $V>2;
				} elsif ($rest eq '/') {		# http://host/
					$BadHost{$host}++;			# It's just a host "to be avoided"
					print V "$F: Avoid host/ \"$host\"\n" if $V>2;
				} elsif ($host eq $myhost) {	# http://host/path
					$BadPath{"$rest"}++;	# Host + path "to be avoided"
					print V "$F: AVOID PATH: $host \"$rest\"\n" if $V>2;
				} else {
				#	$BadPath{"$host$rest"}++;	# Host + path "to be avoided"
					print V "$F: Avoid path: $host \"$rest\"\n" if $V>2;
				}
			} elsif ($line =~ /^([-_:.\w]+)\s*$/) {
				$BadHost{$1} ++;			# Mark this host as "to be avoided"
				print V "$F: AVOID HOST: \"$1\"\n" if $V>2;
			} else {
				print V "$F: Avoid \"$line\" IGNORED (can't parse).\n" if $V>0;
			}
		} elsif ($line =~ /^Disallow:*\s*(.*)\s*$/i) {
			($path = $1) =~ s/[\r\s\n\t]+$//;
			if ($path) {
				print V "$F: Disallow \"$path\"\n" if $V>2;
				$Disprefix{$path}++;
			}
		} else {							# Otherwise it's a comment
			print V "$F: \"$line\" IGNORED (can't parse).\n" if $V>0;
		}
	}
	close CFG;
	return 1;
}

sub CloseHST {
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Finish writing all files related to the current host and close  the  files. #
# The params are the start and finish timestamps for this host.               #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	local($t0,$t1) = @_;
	local($hr,$mn,$sc,$tm);
	local($ss,$mm,$hh,$DD,$MM,$CY);
	local($gcmd,$pfile,$srvr);
	++$scancnt;
	print HST "\n$now T X:$tunecnt T:$titlcnt F:$filecnt H:$currhost\n";
	print V   "\n$now T X:$tunecnt T:$titlcnt F:$filecnt H:$currhost\n" if $V>0;
	unless ($tunemax>0 && $titlmax>0) {
		print V "$P: No ABC ever found at host $currhost in pass $scancnt.\n" if $V>0;
		print HST "\n$now # No ABC found at $currhost in pass $scancnt\n";
		if ($scancnt > $maxscans) {
			$pfile = "nul/$currhost";
			print V "$P: Move \"$hfile\" to \"$pfile\"\n" if $V>3;
			&Backup($pfile) if -e $pfile;
			unless (rename($hfile,$pfile)) {
				print V "$P: Can't rename  \"$hfile\" to \"$pfile\" ($!)\n" if $V>0;
			}
		}
	}
	$tm = $t1 - $t0;
	$sc = $tm % 60;
	$mn = int($tm/60) % 60;
	$hr = int($tm/3600);
	printf HST "\n$now Scanned $currhost at $cymdhms in $tm sec (%d:%02d:%02d)\n",$hr,$mn,$sc;
	printf V     "$now Scanned $currhost at $cymdhms in $tm sec (%d:%02d:%02d)\n",$hr,$mn,$sc if $V>0;
	close HST;
#
	if (open(LOG,">>$smryfile")) {
		local($ss,$mm,$hh,$DD,$MM,$CY) = gmtime($now = time); ++$MM; $CY += 1900;
		$summary = sprintf("%04d%02d%02d %02d:%02d %8d sec (%2d:%02d:%02d) %6d files %6d tunes %6d titles at $currhost\n"
			,$CY,$MM,$DD,$hh,$mm
			,$tm,$hr,$mn,$sc
			,$filecnt,$tunecnt,$titlcnt);
		printf LOG $summary;
		close LOG;
	}
	$HSTopen = 0;
#	$gcmd = $Getcmd{"C:$currhost"};
	$gcmd = $dfltget;
	$srvr = $Server{$currhost};
	print V "$P: Command for \"$currhost\" is \"$gcmd\"\n" if $V>3;
	print V "$P: Server  for \"$currhost\" is \"$srvr\"\n" if $V>3;
	print V "$P: Depth   for \"$currhost\" is \"$hstdepth\"\n" if $hstdepth != $maxdepth && $V>3;
}

sub LoadLinks {my $F='LoadLinks';
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	my $file;
	my($l,$ts,$fl,$dpth,$urlpath);
	for $file (@_) {
		print V "$F: Load \"$file\"\n" if $V>2;
		if (open(LINKFILE,$file)) {
			while ($l = <LINKFILE>) {
				$l =~ s/[\r\s]*$/ /;	# Strip white stuff
				next if $l eq ' ';		# Ignore blank lines
				print V "$F: LINKFILE line: $l" if $V>2;
				if ($l =~ /^\s*#/) {	# Comment
					print V "$F: Drop \"$l" if $V>3;
				} elsif ((($ts,$fl,$dpth,$urlpath) = ($l =~ m'^(\d+)\s+([-#\w]) D:(\d+) *(.*) $'))
				||   (($ts,$dt,$fl,$dpth,$urlpath) = ($l =~ m'^(\d+)=(\d+) ([-#\w]) D:(\d+) *(.*) $'))) {
					print V "$F: fl='$fl' D:'$dpth' \"$urlpath\"\n" if $V>2;
					print V "$F: urlpath=\"$urlpath\"\n" if $V>2;
					&NewU($urlpath,1,$ts);	# Was $dpth, now all hosts start at depth 1
				} else {
					print V "$F: BAD \"$l\"" if $V>3;
				}
			}
			close LINKFILE;
			print V "$F: Done \"$file\"\n" if $V>2;
		} else {
			print V "$F: Can't read \"$file\" ($!)\n" if $V>0;
		}
	}
}

sub Max {
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# This is a dumb loop that finds the max numeric value in a list. Why doesn't #
# perl  have  this as a builtin?  It has nearly everything else.  We only use #
# this with non-negative integers (times), so we assume a min value of 0.     #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	local($n) = shift || 0;
	for (@_) {
		$n = $_ if defined$_ && ($_ > $n);
		shift;
	}
	return $n;
}

sub TIMEOUT {
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Here's an alarm handler for reads from DOC.  When  a  timeout  happens,  we #
# close  the  DOC  file  and  return,  which  should cause abandonment of the #
# current document.                                                           #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	$endDoc = 1;
	$w3timedout = 1;
	&main::V("TIMEOUT called with V=$V.\n") if $V>1;
}

sub NewU {my $F='NewU';
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Add a URI to our list of added/unprocessed URIs.  We keep track of the  min #
# depth,  and  at  the  end, any URIs in the @Left list will be scanned.  The #
# timestamp is currently included, but not actually used,  since  we  fake  a #
# timestamp of zero to force a scan of all new URIs.                          #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	local($urlpath,$dpth,$ts) = @_;
	local($d);
	print V "$F: Add D:$dpth U:$urlpath\n" if $V>5;
	if ($Done{$urlpath}) {
		print V "$F: \"$urlpath\" already marked done at depth $Depth{$urlpath}.\n" if $V>3;
		return;
	}
	if (defined($d = $Depth{$urlpath})) {
		print V "$F: Dup Depth{$urlpath}='$Depth{$urlpath}' is '$d'\n" if $V>7;
		if ($Depth{$urlpath} > $dpth) {
			$Depth{$urlpath} = $dpth;	# Note minimum depth for URI
			print V "$F: Depth{$urlpath}=$dpth\n" if $V>5;
		}
	} else {	# New URI; set its depth
		$Depth{$urlpath} = $dpth;	# Note its depth
		print V "newU: Depth{$urlpath}=$dpth\n" if $V>3;
		push @Left, $urlpath;		# List of unprocessed URIs
		print V "$F: New Depth{$urlpath}='$Depth{$urlpath}'\n" if $V>5;
	}
}

sub URL {my $F=':URL';
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Register a new URL for later scanning.  We can do some weeding out here  if #
# we  so desire.  We return 0 if we reject the URL; 1 if we accept it, though #
# callers don't yet use this info. We implement a special ABC kludge here: If #
# the URL ends with .abc, we accept it even if it's beyond the maximum depth. #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	local($urlfull,$depth) = @_;
	local($done,$h,$hh,$hst,$lh,$lu,$lx,$n,$p,$u,$urlpath,$x);
	print V "$F: urlfull={" . &esc($urlfull) . "}\n" if $V>2;
	print V "$F: \"$urlfull\" d=$depth.\n" if $V>3;
	if ($Done{$urlfull}) {
		print V "$F: \"$urlfull\" already marked done at depth $Depth{$urlfull}.\n" if $V>2;
		return 0;
	}
	if (!$followUpLink && ($urlfull =~ /\/\.\.\//)) {
		print V "$F: \"$urlfull\" ignored (/../)\n" if $V>2;
		return 0;
	}
	if ($depth > $hstdepth) {
		if ($urlfull =~ /\.abc$/i) {		# ABC files are special
			if ($depth > $abcdepth) {	# They may be one level deeper
				print V "$F: \"$urlfull\" ignored (depth $depth > $abcdepth && .abc file)\n" if $V>2;
				return 0;
			}
			print V "$F: \"$urlfull\" accepted (depth $depth && .abc file)\n" if $V>2;
		} elsif ($depth >= $hstdepth) {	# Reject non-ABC files at depth limit
			print V "$F: \"$urlfull\" ignored (depth $depth >= $hstdepth)\n" if $V>2;
			return 0;
		}
	}
	print V ">->-> \"$urlfull\" [$depth]\n" if ($depth>$hstdepth && $V>2);
	$urlfull = &URLtrim($urlfull);				# Shorten the URL if possible
	print V "----> $depth '$urlfull'\n" if $V>2;
	if ($urlfull !~ m'^(http|file):'i) {	# Accept only these protocols
		print V "$F: \"$urlfull\" ignored (http|file rule)\n" if $V>2;
		return undef;
	}
	if ($urlfull =~ m'\.(exe)\b/'i) {		# Don't try to fetch MS executables
		print V "$F: \"$urlfull\" ignored (.exe rule)\n" if $V>1;
		return undef;
	}
	if ($urlfull =~ m'\b(bin|tmp)/'i) {		# Don't try to fetch from bin or tmp directories
		print V "$F: \"$urlfull\" ignored (bin|tmp rule)\n" if $V>2;
		return undef;
	}
	unless ($allowcgi) {
		if ($urlfull =~ m'\bcgi\b'i) {		# Ignore cgi scripts
			print V "$F: \"$urlfull\" ignored (cgi rule)\n" if $V>2;
			return undef;
		}
		if ($urlfull =~ m/[<>\?;#"]/) {		# Ignore URLs that look like HTML or CGI calls
			print V "$F: \"$urlfull\" ignored (HTML/CGI rule)\n" if $V>2;
			return undef;
		}
	}
	if ($urlfull =~ m'/\?\w=\w$') {			# Ignore apache listing URLs
		print V "$F: \"$urlfull\" ignored (/?X=Y\$ rule)\n" if $V>2;
		return undef;
	}
	if ($urlfull =~ m'jc/.*\.hdr$'i) {		# Ignore jc's HDR files
		print V "$F: \"$urlfull\" ignored (hdr rule)\n" if $V>2;
		return undef;
	}
	print V "$F: urlfull=\"$urlfull\"\n" if $V>2;
	if (($p,$h,$urlpath) = ($urlfull =~ m'^(http|ftp)://([^/]+)(.*)$'i)) {	# FTP doesn't work yet
		print V "$F: urlpath=\"$urlpath\"\n" if $V>2;
		unless ($h =~ /^[-_:.\w]*$/) {
			print V "$0: Bogus host \"$h\" ignored.\n" if $V>2;
			return undef;
		}
		$hst = lc($h);
		if ($depth <= 1) {
			$hh = "http://$hst$urlpath";
			$AllowURL{$hh}++;	# Note that this one is explicitly allowed
			print V "$F: Allow \"$hh\"\n" if $V>2;
		}
		if ($BadHost{$hst}) {
			print V "$F: \"$urlfull\" ignored (BadHost \"$hst\")\n" if $V>2;
			return undef;
		}
		if (%BadHost) {
			if ($BadHost{$h}) {
				print V "$F: \"$urlfull\" ignored (bad host \"$h\")\n" if $V>2;
				return undef;
			}
		}
		$lh = length($h);	# Length of hostname in URL
		$lu = length($urlpath);	# Length of pathname in URL
		if (%BadPath) {
			for $x (%BadPath) {		# Run thru forbidden paths
				$lx = length($x);	# Length of this path
				if (($lu <= $lx) && (substr($urlpath,0,$lx) eq $x)) {
					print V "$F: \"$urlfull\" ignored (bad path \"$x\")\n" if $V>2;
					return undef;	# URI starts with forbidden path
				}
			}
		}
		print V "$F: hst='$hst' currhost='$currhost'\n" if $V>3;
		if ($hst eq $currhost) {
			print V "$F: URI \"$urlpath\" at depth $depth is local.\n" if $V>2;
			&NewU($urlpath,$depth,($now = time));
			return 1;
		} else {
			print V "$F: URI \"$urlpath\" at depth $depth is non-local.\n" if $V>2;
		}
		if ($h =~ m'(__|\.\.)') {
			print V "$F: \"$urlfull\" ignored (host contains \"$1\")\n" if $V>2;
			return undef;
		}
		$n = $h2n{$hst} || 0;
		$h2u{$hst}->[$n] = $urlpath;	# Note URI, not the full URL
		$h2d{$hst}->[$n] = $depth;	# Note each URL's depth
		$h2n{$hst} ++;				# Count the URLs for each host
		$doclinks ++;				# Count the links from the current document
		print V "$F: host $hst URL $h2n{$hst} is \"$urlfull\" [$doclinks links]\n" if $V>2;
		if ($showlinks) {
			unless ($outlink{"$currURL:$urlfull"}) {
				&dt();
				push @newchunk, ("$now > D:$depth " . (($hst eq $currhost) ? $urlpath : $urlfull));
#				if ($hst eq $currhost) {
#					push @newchunk, "$now > D:$depth $urlpath";
#				} else {
#					push @newchunk, "$now > D:$depth $urlfull";
#				}
			}
			++ $outlink{"$currURL:$urlfull"};	# Number of times we've encountered this URL
			print V "----> $depth '$urlfull'\n" if $V>1;
		}
	}
	return 1;
}

sub URLenc {my $F='URLenc';
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# do the HTTP encoding to convert URL special chars to %XX.
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	my($c,$d,$l,$v);
	for (@_) {
		$l = $_;	# Copy the arg.
		print V "$F: +++ \"$l\"<br>\n" if $V>6;
		$l =~ s/([\t\n\r "'%&+<=>])/$URLcode{$1}/eg;
		$v .= $l;
	}
	return $v;
}

sub W3tmout {my $F='W3tmout';
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Here's an alarm handler for reads from DOC.  When  a  timeout  happens,  we #
# close  the  DOC  file  and  return,  which  should cause abandonment of the #
# current document.                                                           #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	&dt();
	$TOopen = $now - $TMopen;	# Time since we opened the current file
	$TOread = $now - $TMread;	# Time since we last read from the file
	print V "$F: Called with TOopen=$TOopen TOread=$TOread sec [HTTPtimeout=$HTTPtimeout]\n" if $V>2;
	if ($TOread < $HTTPtimeout) {
		print V "$F: Timeout ignored: only $TOread sec since last read [HTTPtimeout=$HTTPtimeout]\n" if $V>1;
	} else {
		print V "$F: TIMEOUT after $TOopen/$TOread sec [HTTPtimeout=$HTTPtimeout]\n" if $V>1;
		$endDoc = 1;
		$w3timedout = 1;
	}
}

sub done {my $F='done';
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Mark a URI/URL as done at a specific depth and time.
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	local($d,$t,$urlpath,$urlfull) = @_;
	if ($Done{$urlpath}) {
		print V "$F: \"$urlpath\" already marked done at time $Done{$urlpath} depth $Depth{$urlpath}.\n"
			if $V>0;
		return;
	}
	if ($Done{$urlfull}) {
		print V "$F: \"$urlfull\" already marked done at time $Done{$urlfull} depth $Depth{$urlfull}.\n"
			if $V>0;
		return;
	}
	$d = $depth        unless defined($d) && $d > 0;
	$t = ($now = time) unless defined($t) && $t > 0;
	$Depth{$urlfull} = $d;
	$Done{$urlfull}  = $t;
	print V "$F: URLfull \"$urlfull\" marked done at time $t depth $d.\n" if $V>3;
	$Depth{$urlpath} = $d;
	$Done{$urlpath}  = $t;
	print V "$F: URLpath \"$urlpath\" marked done at time $t depth $d.\n" if $V>3;
}

sub chunk {my $F="chunk";
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# A "chunk" starts with a line giving a timestamp and URL, plus  some  little
# fields  giving  the  URL's  depth  and what we last did with it, optionally
# followed by data about that URL. The 1-char flags that follow the time are:
#   U new URL, not processed yet.
#   > link to another URL.
#   - URL not read for some reason.
#   # Error trying to read URL.
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	local($depth,$newuline,$olduline,$links,$rest,$titls,$tunes,$urlpath,$urlfull);
	local($d,$dt,$fl,$l,$oB,$oL,$oX,$oT,$ts,$x,$y);
	local($sec,$min,$hour,$mday,$mon,$year); # gmtime() return list
	print V "CHUNK:\n\t@oldchunk\n" if $V>7;
	$olduline = shift @oldchunk;
	print V "$F: \"$olduline\"\n" if $V>6;
	$dt = $oB = $oL = $oX = $oT = '';
	$docbytes =
	$matched  = 0;
	# We now attempt to match several different "U" lines that we have used in
	# various versions of this bot.
	if ($olduline =~ m'^(\d+) ([-#\w]) D:(\d+)( B:\d*)( L:\d*)( X:\w*)( T:\d*) (.*)$') {
		$ts    = $1;	# Time of last update
		$fl    = $2;	# Line-type flag, 'U' for URLs
		$depth = $3;	# Hyperlink depth
		$oB    = $4;	# Old byte count
		$oL    = $5;	# Old tune count
		$oX    = $6;	# Old tune count
		$oT    = $7;	# Old title count
		$urlpath   = $8;	# URL minus protocol and host
		$matched = 1;
	} elsif ($olduline =~ m'^(\d+) ([-#\w]) D:(\d+)( B:\d*)( X:\w*)( T:\d*) (.*)$') {
		$ts    = $1;	# Time of last update
		$fl    = $2;	# Line-type flag, 'U' for URLs
		$depth = $3;	# Hyperlink depth
		$oB    = $4;	# Old byte count
		$oX    = $5;	# Old tune count
		$oT    = $6;	# Old title count
		$urlpath   = $7;	# URL minus protocol and host
		$matched = 1;
	} elsif ($olduline =~ m'^(\d+) ([-#\w]) D:(\d+)( B:\d+) (.*)$') {
		$ts    = $1;
		$fl    = $2;
		$depth = $3;
		$oB    = $4;	# Old byte count
		$urlpath   = $5;
		$matched = 1;
	} elsif ($olduline =~ m'^(\d+) ([-#\w]) D:(\d+) (.*)$') {
		$ts    = $1;
		$fl    = $2;
		$depth = $3;
		$urlpath   = $4;
		$matched = 1;
	} elsif ($olduline =~ m'^(\d+)=(\d+) ([-#\w]) D:(\d+) (.*)$') {
		$ts    = $1;
		$dt    = $2;
		$fl    = $3;
		$depth = $4;
		$urlpath   = $5;
		$matched = 1;
	} else {
		print V "$F: Line not matched.\n" if $V>6;
	}
	if ($matched) {
		$urlpath =~ s/[\r\s.]+$//;			# Trim trailing junk
		print V "$F: dt $fl D=$depth oB=$oB $oL=$oL oX=$oX oT=$oT urlpath=\"$urlpath\"\n" if $V>6;
	#	if ($Done{$urlpath}) {
	#		print V "$F: \"$urlpath\" already done at $Done{$urlpath} depth $Depth{$urlpath}.\n" if $V>0;
	#		print V "$F: DROP \"$urlpath\"\n" if $V>3;
	#		return;
	#	}
		if ($urlpath =~ /\.(bak|old|fmt|gif|png|ps|pdf|ppt|swf|midi*|jpe*g|zip|g*z|au|mp\d*|wav|wmv)$/i) {
			print V "----> '$urlpath' ignored ($1 suffix).\n" if $V>2;
			return;
		}
		$urlpath =~ s":80/"/";				# Drop default port
		$urlfull = "http://$currhost$urlpath";	# Construct full URL
		print V "$F: urlpath=\"$urlpath\"\n" if $V>3;
		print V "$F: urlfull=\"$urlfull\"\n" if $V>3;
		if ($Done{$urlfull}) {
			print V "$F: \"$urlfull\" already done at $Done{$urlfull} depth $Depth{$urlfull}.\n" if $V>0;
			print V "$F: DROP \"$urlfull\"\n" if $V>3;
			return;
		}
		if (defined($d = $Depth{$urlpath})) {
			print V "$F: \"$urlpath\" is at depth $d.\n" if $V>5;
			if ($d < $depth) {			# Adjust URI's depth
				print V "$F: \"$urlpath\" changed from depth $depth to $d.\n" if $V>5;
				$depth = $d;			# URI's Use min depth
			}
		}
		if ($x = defined($y = $Done{$urlfull}) && $y && defined($x)) {	# Have we seen this URL already?
			print V "$F: \"$urlfull\" already done at $x\n" if $V>3;
			@oldchunk = ();				# Suppress the chunk entirely
			return;
		} elsif ($fl eq '-') {			# Old comment lines
			$newuline = $olduline;
			@newchunk = @oldchunk;
			print V "$F: Mark \"$urlfull\" done now (-).\n" if $V>5;
			&done($d,$ts,$urlpath,$urlfull);	# Mark this URI/URL done
			print V "$F: $ts URL \"$urlfull\" marked done now (-).\n" if $V>5;
		} elsif (@oldchunk && (($x = &dt() - $ts) < $mintime)) {
			print V "$F: $ts only $x < $mintime sec.\n" if $V>5;
			$newuline = $olduline;
			@newchunk = grep(!/- (too soon|rescan|obsolete) /,@oldchunk);
			unshift @newchunk, "$now - too soon ($x < $mintime)" if $V>3;
			print V "$F: Mark \"$urlfull\" done now ($x < mintime=$mintime).\n" if $V>5;
			&done($d,$ts,$urlpath,$urlfull);	# Mark this URI/URL done
			print V "$F: $ts URL \"$urlfull\" marked done now ($x < mintime=$mintime)\n" if $V>5;
			for $l (@newchunk) {		# Look for previous link and tune counts
				if (($links,$tunes)  = ($l =~ /(\d+) links, (\d+) ABC tune/)) {
					$linkcnt += $links; $linkmax = $linkcnt if $linkmax < $linkcnt;
					$tunecnt += $tunes; $tunemax = $tunecnt if $tunemax < $tunecnt;
					$filecnt ++ if $tunecnt>0;
				} elsif (($links,$tunes,$titls) = ($l =~ /(\d+) links, (\d+) tunes, (\d+) titles/)) {
					$linkcnt += $links; $linkmax = $linkcnt if $linkmax < $linkcnt;
					$tunecnt += $tunes; $tunemax = $tunecnt if $tunemax < $tunecnt;
					$titlcnt += $titls; $titlmax = $titlcnt if $titlmax < $titlcnt;
					$filecnt ++ if $tunecnt>0 || $titlcnt>0;
				}
			}
		} elsif ($x > $maxtime) {
			print V "$F: $ts $x > maxtime=$maxtime.\n" if $V>5;
			&dt();
			$newuline = "$now U D:$depth$oB$oL$oX$oT $urlpath";
			push @newchunk, "$now - rescan ($x > $maxtime)" if $V>7;
			print V "$F: Scan \"$urlfull\" now ($x > maxtime=$maxtime).\n" if $V>5;
			&scanURL($urlfull);			# Rescan it
			print V "$F: Mark \"$urlfull\" done now ($x > maxtime=$maxtime).\n" if $V>5;
			&done($depth,$now,$urlpath,$urlfull);
			print V "$now URL \"$urlfull\" marked done now ($x > maxtime=$maxtime)\n" if $V>5;
		} else {					# It's a URL that we should examine
			print V "$F: SCAN \"$urlfull\"\n" if $V>6;
			&dt();
			$newuline = "$now U D:$depth$oB$oL$oX$oT $urlpath";
			print V "$F: Scan \"$urlfull\" now (else).\n" if $V>5;
			&scanURL($urlfull);			# Scan it
			print V "$F: Mark \"$urlfull\" done now (after scanURL).\n" if $V>5;
			&done($depth,$now,$urlpath,$urlfull);
		}
		if ($maxurls > 0 && $urlcount > $maxurls) {	# Debug hook: exit after $maxurls URLs
			print V "$F: $ts Abort after $urlcount URLs.\n" if $V>0;
			return;
		}
	} elsif ($olduline =~ / T L:(\d+) X:(\w+) T:(\d+) F:(\d+) H:(.*)/) {
		$linkmax = $1 if $linkmax < $1;
		$tunemax = $2 if $tunemax < $2;
		$titlmax = $3 if $titlmax < $3;
		++$scancnt;					# Count the number of times we've done this host
		$newuline = $olduline;
		@newchunk = @oldchunk;
	} elsif (($ts,$fl,$rest) = ($olduline =~ m'^(\d+) ([-+T]) (.*)$'i)) {
		print V "$F: ts=$ts $fl $rest\n" if $V>6;
		($sec,$min,$hour,$mday,$mon,$year) = gmtime($1);
		if ($oScanY == $year && $oScanM == $mon) {	# [jc] 20030331
			# We've seen a timestamp for this month; make trivial change to indicate it:
			$olduline = "";	# "$ts t $rest";	# Note 't' rather than 'T'
		}
		$oScanY = $year;	# Note year and month of scan
		$oScanM = $mon;
		$newuline = $olduline;
		@newchunk = @oldchunk;
	} elsif (($ts,$dt,$fl,$rest) = ($olduline =~ m'^(\d+)=(\d+) ([-+T]) (.*)$')) {
		print V "$F: dt $fl $rest\n" if $V>6;
		$newuline = $olduline;
		@newchunk = @oldchunk;
	} elsif (($ts,$dt) = ($olduline =~ m'^(\d+)=(\d+)$')) {
		print V "$F: dt timestamp.\n" if $V>6;
		$newuline = $olduline;
		@newchunk = @oldchunk;
	} else {
		print V "$F: Unmatched:", $olduline, @oldchunk, "\n" if $V>3;
		if ($saveunmatched) {
			$newuline =  $olduline;
			@newchunk =  @oldchunk;
			unshift @newchunk, "$now - Unmatched";
		} else {
			$newuline = '';
			@newchunk = ();
		}
	}
	if ($newuline || @newchunk) {
		print HST "\n";
		print HST "$newuline\n" if $newuline;
		print HST  @newchunk    if @newchunk;
		print HST "\n"          if @newchunk;
	}
	@oldchunk = ();
	@newchunk = ();
	print V "$F: done.\n" if $V>7;
}

sub env {
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Return an environment value, if it's defined.  If not, set it  to  the  2nd #
# arg, and return that value.  It's best if the value is a string.            #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	local($name,$dflt) = @_;
	if (defined $ENV{$name}) {
		return  $ENV{$name};
	} else {
		return($ENV{$name} = $dflt);
	}
}

sub errchunk {
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	local($erruline) = @_;
	&dt();
	push @newchunk, $erruline;
	print V "$P: erruline=\"$erruline\"\n" if $V>5;
	$newuline = $olduline;
	print V "$P: newuline=\"$newuline\"\n" if $V>5;
}

sub getschedule {
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Here we try to determine the schedule interval for repeated  runs  of  this #
# robot.  We first try to extract the interval from a file. If that fails, we #
# return a constant; this should only happen during debugging.  Note that  if #
# the  value  returned  is zero, no rescheduling is done.  This is useful for #
# stopping runaways. Note that the time interval is in minutes.               #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	local($l,$n);
	$schedfile = 'abcbot.sch' unless $schedfile;
	if (open(SCHED,$schedfile)) {
		while ($l = <SCHED>) {if ($l =~ /^(\d+)/) {$n = $1; close SCHED; last}}
	} else {
		print V "getschedule: Can't read \"$schedfile\" ($!); using 1 min.\n" if $V>0;
		$n = 1;
	}
	print V "getschedule: Return $n min.\n" if $V>3;
}

sub host {my $F='host';
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Here's the main routine to process one host name..
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	local($h,$cmd) = @_;
	local($base,$flg,$i,$init,$suff);
	$currhost = $h;
	$HostT0{$h} = $HostT0{$currhost} = time;
	print V "\n" if $V>2;
	print V "$hsep\n" if $V>2;
	print V "----> HOST \"$h\"\n" if $V>0;
	&hostunlock if $lfile;	# Unlock previous host
#	%Disprefix = ();			# Forget any disallows for previous host
	%RejectPfx = ();		# Rejected initial strings for this host
	for $init (%Disprefix) {	# Find the disallowed initial strings for this host
		if ($init =~ /\b$currhost\b/) {
			$RejectPfx{$init} = 1;
			print V "$F: RejectPfx{$init}\n" if $V>1;
		}
	}
	%AllowURI = ();			# Find the allows for this host
#	print V "$F: ### AllowURL is empty!\n" unless %AllowURL;
	print V "$F: AllowURL contains URLs\n" if %AllowURL && $V>3;
	for $allow (keys %AllowURL) {
		print V "$F: Allow \"$allow\"\n" if $V>6;
		if (($pp,$hh,$uu) = ($allow =~ m"^(\w+)://([-_:.\w]+)(/.*)")) {
			print V "$F: Allow h=\"$h\" vs hh=\"$hh\" (pp=\"$pp\"\n" if $V>6;
			if ($h eq lc($hh)) {
				$AllowURI{$uu}++;	# Allow this URI for this host
				print V "$F: Allow host \"$h\" uri \"$uu\"\n" if $V>5;
			} else {
				print V "$F: host \"$h\" uri \"$uu\" don't match\n" if $V>5;
			}
		}
	}
	$cfile = "cfg/$h";		# Config info this host
	$hfile = "hst/$h";		# Current data for this host
	$afile = "add/$h";		# Additional URLs for host
	$nfile = "new/$h";		# Newly added URLs for host
	$ofile = "old/$h";		# Backup file name
	$lfile = "lck/$h";		# Lock file name
	if (-f $lfile) {		# Someone's working on it
		print V "$F: Lock file exists lfile=\"$lfile\" ($!)\n" if $V>0;
		return;
	}
	unless (open(LCK,">$lfile")) {
		print V "$F: Can't write lfile=\"$lfile\" ($!)\n" if $V>0;
		return;
	}
	&hostlock($lfile);
	$TMopen = 		# Time since we opened the current file
	$TMread = 		# Time since we last read from the file
		&dt();		# Make these default to right now.
	if (&CheckHost($h)) {
		print V "$F: Host \"$h\" failed checks.\n" if $V>0;
		print V "$F: $hstmsg\n" if $V>0;
		return;
	}
	if (@robotstxt) {
		print V "$F: Got 'http://$h/robots.txt' file.\n" if $V>0;
	}
	if (-f $cfile) {
	#	require "cfgload.pm";
		$i = require "cfgload.pm";	# Routines to deal with cfg/* files
		print V "$F: Require returned $i.\n" if $V>3;
		&cfgload($cfile,$h);
		$hstdepth = $DepthHost{$h} || $maxdepth;
		if ($flg = $Opt{'CGI'}) {
			$allowcgi = ($flg eq '+');
			print V "$F: Set CGI option to '$allowcgi'\n" if $V>1;
		}
		if ($flg = $Opt{'tagP'}) {
			print V "$F: Set tagP option to '$flg'\n" if $V>2;
			$hs->setOption("$flg" . 'tagP');
		}
		print V "$F: Max depth for $h is $hstdepth.\n" if $V>3;
	}
	&Backup($ofile) if -e $ofile;
	if (rename($hfile,$ofile)) {
		print V "$F: Moved \"$hfile\" to \"$ofile\"\n" if $V>3;
	} else {
		print V "$F: Can't rename(\"$hfile\",\"$ofile\") ($!)\n" if $V>0;
	}
	unless (open(OLD,$ofile)) {
		print V "$F: Can't read \"$ofile\" ($!)\n" if $V>0;
	}
	$OLDopen = 1;
	print V "$F: Reading from \"$ofile\"\n" if $V>3;
	unless (open(HST,">>$hfile")) {
		print V "$F: Can't write hfile=\"$hfile\" ($!)\n" if $V>0;
		return;
	}
	$HSTopen = 1;
	select HST; $| = 1; select V;
	print V "$F: Writing to \"$hfile\"\n" if $V>3;
	$linkcnt = $tunecnt = $titlcnt = $filecnt = 0;
	$linkmax = $tunemax = $titlmax = $filemax = 0;
	print V "$F: filecnt=$filecnt for new host $h.\n" if $V>5;
	&dt();
	print HST "$now + start $h\n";
	$scancnt = 0;			# Number of times we've done this host
	@oldchunk = ();			# One URL and its info
	@newchunk = ();			# New info about this URL
#	%Depth = ();			# Min depth of URIs so far
#	%Done = ();				# List of URIs we've processed
#	@Left = ();				# List of URIs still to handle
#	print V "$F: Emptied \%Done and \@Left\n" if $V>6;
#
	&LoadLinks($afile) if (-f $afile);
	&LoadLinks($nfile) if (-f $nfile);
hostline:
	while ($l = <OLD>) {
		print V "$F: Next OLD line.\n" if $V>5;
		next if ($l =~ / \+ (start|done)\b/);	# Lines to drop
		next if ($l =~ m"jc/.*\.hdr$"i);		# Ignore jc's HDR files
		last if ($closeDoc || $finishup);
		$l =~ s"[\r\s]+$"";				# Trim away trailing white stuff
		print V "====| $l\n" if $V>7;
		if ($l) {
			push @oldchunk, $l;		# Accumulate lines of one "chunk"
		} else {
			&chunk() if @oldchunk;	# Process one "chunk" of the host's data
		}
		if ($maxurls>0 && $urlcount>$maxurls) {
			print V "$F: hostline: Abort after $urlcount URLs\n" if $V>0;
			last hostline;
		}
	}
	&chunk() if @oldchunk;
	# Run thru the added URIs here. Note that if an entry in %addURI has been
	# undef'd, its name might still be there, and only the value is undefined.
	print V "$F: " . int(@Left) . " URIs left.\n" if $V>5;
URI:
	while (@Left) {				# Local URIs discovered in hyperlinks
#		print V "\n" if $V>3;
		print V "$F: There are " . int(@Left) . " local URIs left.\n" if $V>5;
		if ($maxurls>0 && $urlcount>$maxurls) {
			print V "$F: URI: Abort after $urlcount URLs\n" if $V>0;
			last URI;
		}
		if ($endHost) {				# Abandon the current file.
			print V "$F: Abandon current URL endHost=$endHost ...\n" if $V>0;
			$hs->DOCclose();
			last URI;
		}
		if ($endHost) {
			print V "$F: Quitting because endHost=$endHost ...\n" if $V>0;
			return;
		}
		$urlpath = shift @Left;		# Get one URI
		print V "$F: URI: \"$urlpath\" (" . int(@Left) . " left)\n" if $V>5;
		next URI unless $urlpath;	# Paranoia: Ignore nulls
		if ($Done{$urlpath}) {		# Have we done it already?
			print V "$F: \"$urlpath\" already done at $Done{$urlpath} depth $Depth{$urlpath}.\n" if $V>3;
			print V "$F: DROP \"$urlpath\"\n" if $V>3;
			next URI;
		}
		$purged = 0;				# Set true to skip this URL
		if (($base,$suff) = ($urlpath =~ /^(.*)\.(gif|ps|midi*|jpe*g|zip|g*z|au|mp\d*|wav)$/)) {
			print V "----> '$urlpath' dropped (suffix).\n" if $V>3;
			if ($notearchives) {
				if ($suff eq 'zip') {
					system "echo ZIP: http://$h/$urlpath >> ZIPfiles"
				} elsif ($suff =~ /g*z/) {
					system "echo GZIP http://$h/$urlpath >> ZIPfiles"
				}
			}
			next URI;		# Skip this file
		}
		unless (defined($dpth = $Depth{$urlpath})) {
			print V "$F: ### U:$urlpath depth unknown.\n" if $V>0;
			$dpth = 1;		# Make a guess
		}
		if ($dpth < 1) {
			print V "$F: ### U:$urlpath ignored at depth $dpth.\n" if $V>3;
			next URI;
		}
		print V "$F: Add D:$dpth U:$urlpath\n" if $V>5;
		@oldchunk = ("0 U D:$dpth $urlpath");
		&chunk();
	}
	print V "$F: No more new URIs for \"$h\".\n" if $V>3;
	&CloseHST($HostT0{$h},&dt()) if $HSTopen;
	&Backup($afile) if -e $afile;
	&hostunlock if $lfile;
	if (-d "http/$h") {	# Relink the host's cached files
		$cmd = "nice relink +r 'http/$h'";
		print V "$F: cmd=\"$cmd\"\n" if $V>3;
		system "$cmd &";
	}
}

sub hostlock {
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	print LCK "$$ $P\n";
	close LCK;
	print V "$P: LOCKed \"$lfile\"\n" if $V>3;
}

sub hostunlock {
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	close LCK;
	unlink($lfile);
	print V "$P: unLOCK \"$lfile\"\n" if $V>3;
	$lfile = '';
}

sub href {my $F='href';
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Given an href, we decide here how to handle it. The caller must pass us the #
# URL  from  the href, and the string (item) between the '>' and the </a>, in #
# case we need to check what's there.  The main use we make of the item is to #
# check  for and reject "parent dir" references.  We also look at a few other #
# suffixes and decide whether we should load them and scan their contents. If #
# the  URL  is  accepted,  we  pass  it  to &URL() for later processing.  For #
# rejected URLs, we just return.                                              #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	local(
		$curr,		# Current URL
		$href,		# URL pointed to
		$item,		# HTML text associated with $href
		$incr)		# Level increment: 0 for frames, 1 for other URLs
			= @_;
	local($a,$l,$p,$s,$u);
	$incr = 1 unless defined $incr;
	print V "$F: curr=\"$curr\" href=\"$href\" item=\"$item\"\n" if $V>6 || ($href =~ /^\$/);
#	$href =~ s/\s*".*//;			# Discard anything after a double quote
	unless ($allowcgi) {
		if ($href =~ /^(\?)/i) {	# Once contained '#'
			print V "$F: Ignore href=\"$href\" (?)\n" if $V>5;
			return;
		}
		if ($href =~ /\bcgi\b/i) {
			print V "$F: Ignore href=\"$href\" (cgi)\n" if $V>1;
			return;
		}
	}
	if ($href =~ /\.(bak|log|tmp|out)\b/i) {
		print V "$F: Ignore href=\"$href\" (bak|log|tmp|out)\n" if $V>5;
		return;
	}
	if ($base) {
		print V "$F: base=\"$base\" replaces curr=\"$curr\"\n" if $V>6;
		$curr = $base;
	}
	print V "$F: href={" . &esc($href) . "}\n" if $V>2;
	if ($href =~ '/$') {	# If final '/', treat as directory
		print V "$F: Treat href=\"$href\" as directory.\n" if $V>7;
		if ($chkuplinks && ($item =~ /\b(Parent|Home|Back)\b/i)) {
			print V "$F: Ignore href=\"$href\" item=\"$item\"\n" if $V>5;
			return;
		} elsif ($href =~ m"^\w*://") {	# Full URL
			print V "$F: \"$href\" read at depth $depth.\n" if $V>6;
			&URL($href,$depth+$incr);
		} else {						# Relative URL
			$u = &URLhref($curr,$href);
			print V "$F: \"$u\" read at depth $depth.\n" if $V>6;
			&URL($u,$depth+$incr);
		}
		print V "$F: Done with directory.\n" if $V>6;
		return;
	}
	# No final '/' on HREF:
	print V "$F: Treat href=\"$href\" as non-directory.\n" if $V>6;
	if ($href =~ m'(.*)#(\w+)$') {
		print V "$F: Offset href=\"$1\" (#$2)\n" if $V>5;
		$u = &URLhref($curr,$1);
		print V "$F: \"$u\" URL at depth $depth.\n" if $V>6;
		&URL($u,$depth+$incr);
	} else {
		print V "$F: Simple href=\"$1\" (#$2)\n" if $V>6;
		$u = &URLhref($curr,$href);
		print V "$F: \"$u\" URL at depth $depth.\n" if $V>6;
		&URL($u,$depth+$incr);
	}
}

sub mkdirs {my $F='mkdirs';
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Create a directory tree.  This is recursive.  We attempt to make  the  full #
# path.   If that fails, we trim off the last field, and call ourself to make #
# the parent directory. When that returns, we once again try to make the full #
# path.  We return >1 for success, 0 for failure because the directory exists #
# already, and <1 for real failure.                                           #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	local($p) = @_;
	local($d,$n,$x);
	return( 0) if -d $p;	# Does it already exist?
	return(-1) if -f $p;	# Is it a file?
	return(1) if mkdir($p,0775);
	$n = 0;				# That failed.
	print V "$F: Make dir \"$p\"\n" if $V>3;
	if (($d,$x) = ($p =~ m"^(.+)/([^/]+)/*$")) {
		if (-d $d) {	# Does parent directory exist?
			print V "$F: Dir \"$p\" exists.\n" if $V>1;
		} elsif (($n = &mkdirs($d)) >= 0) {	# <=== Recurse
			print V "$F: Made dir \"$p\"\n" if $V>1;
		} else {
			print V "$F: ### Can't make dir \"$d\" [$!]\n" if $V>0;
		}
	}
	if (mkdir($p,0775)) {
		++$n;		# Accumulate the number of directories created
		print V "$F: Made dir \"$p\"\n" if $V>1;
		return $n;
	} else {
		print V "$F: ### Cannot make  dir \"$p\" [$!]\n" if $V>0;
		return -$n;
	}
}


sub saveURLs {
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Write the accumulates list of URLs to the appropriate hst/* files. All the
# URLs have been split into host and URI portions, and we have the arrays:
#   $h2n{$host} is the number of URIs for $host
#   $h2u{$host}->[$n]  is the nth URI for $host
#   $h2d{$host}->[$n]  is the depth of each URI (1 for initial URIs)
# For each host $h, we open the hst/$h file and append a "U" line  for  each
# URI.  These may be redundant, of course, and if so, will be dropped when we
# process that host again.
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	local($d,$h,$nfile,$hst,$urlpath,$urlfull,$urls,$x);
	unless (%h2n) {
		print V "$P: There are no URLs to save.\n" if $V>3;
		return;
	}
	print V "$P: Save URL list ...\n" if $V>3;
host:
	for $h (sort keys %h2n) {
		next unless $h;
		$hst = lc($h);
		$urls = $h2n{$hst};
		print V "$P: Host \"$hst\" has $urls new URLs.\n" if $V>5;
		next if $urls < 1;
		$nfile = "new/$hst";
		unless (open(NEW,">>$nfile")) {
			print V "### Can't write nfile=\"$nfile\" ($!) (saveURLs)\n" if $V>0;
			next host;
		}
		print NEW "# $h\n";			# Make sure the file identifies the host
		for ($n = 0; $n < $urls; $n++) {
			$urlpath = $h2u{$hst}->[$n] || '/';
			$d   = $h2d{$hst}->[$n];
			$urlfull = "http://$hst$urlpath";
			print V "#---> uri $n depth $d host \"$hst\" is \"$urlpath\"\n" if $V>3;
			if ($x = $Done{$urlfull}) {
				print V "Already done at $x (saveURLs)\n" if $V>3;
			} else {
				&dt();
				print NEW "\n0 U D:$d $urlpath\n";
			}
		}
		close NEW;
	}
	print V "$P: Initial URL list done.\n\n" if $V>3;
	close H;
	if (defined($host) && $host) {
		$h2n{$host} = ();	# Forget the initial list
		$h2u{$host} = ();
		$h2d{$host} = ();
	}
}

sub dmpline {my $F="dmpline";
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Produce a symbolic dump of one or more strings.  We label them HTTP or DATA #
# depending on whether we think they're in the HTTP header or in the document #
# data. The first arg is an ID string shown at the start of each line.        #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	local($id) = shift;
	local($line,$type);
	$type = $inHTTPhdrs ? 'HTTP' : 'DATA';
	for $line (@_) {
		print V "$id $type $doclines {" . &esc($line) . "}\n";
	}
}

sub scanHDRs {my $F="scanHDRs";
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# We assume we're in the HTTP header section of the input.  We gobble up  the #
# data,  and look for the few things that might be of interest to us.  Mostly #
# we just discard the lines.  When we hit a  blank  line,  we  turn  off  the #
# inHTTPhdrs flag and return.  We also return at end-of-file.                 #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	local($line);
	$TMread =  time;			# Note time of last DOC read
	while ($line = $hs->DOCline()) {
		unless (defined($line)) {
			print V "$F: EOF on DOC in HTTP headers.\n" if $V>0;
			closeDoc('EOF');
			$inHTTPhdrs = 0;
			return;
		}
		&dmpline('+++',$line) if $V>2;
		$line =~ s/[\r\s]+$//;	# Trim white stuff and newline
		unless ($line) {		# Blank line is end of HTTP headers
			print V "$F: End of HTTP headers.\n" if $V>2;
			$inHTTPhdrs = 0;
			return;
		}
		# Look for interesting headers:
		if ($line =~ /^content-type:\s(.*)$/i) {
			print V "$F: Original content type was \"$1\"\n" if $V>2;
		} else {
			print V "$F: Ignore \"$line\"\n" if $V>5;
		}
		$TMread =  time;		# Note time of last DOC read
	}
}

sub scan {my $F="scan";
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Grovel through a file, looking for hyperlinks or pieces of abc code.  Check #
# out each of the files listed.  Directories cause recursive traversal. Files #
# with interesting suffixes are read. This routine is complicated by the need #
# to  decode  HTML  as well as plain text.  An extra complication is that the #
# HTTP headers may include various error indications, and we may not be  able #
# to get the file at all.                                                     #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	local($URL) = @_;
	local($endDoc,$DOCopen);	# Localize the DOC file
	local($allow,$getcmd,$init,$inABC,$H,$pfx,$X,$ttl);
	local($udir,$ubas,$usuf);
	local($nhrefs,$possibletunes,$when,$xx);
	local($d,$dpth,$h,$item,$l,$line,$lurl,$n,$p,$s,$surl,$t,$u);
	local($ss,$mm,$hh,$DD,$MM,$CY);
#	print V "\n" if $V>2;
	print V "$F: SCAN \"$URL\"\n" if $V>3;
	if ($t = $Done{$URL}) {
		$d = $Depth{$URL};
		print V "$F: \"$URL\" already done at $t depth $d!!!\n" if $V>0;
		print V "$F: DROP \"$urlfull\"\n" if $V>2;
		return 0;
	}
	$DOCopen = $inABC = 0;
	$possibletunes = 0;			# Tunes found in file but maybe rejected
	$tunesinfile   = 0;			# Tunes in file accepted
	$lurl = &URLtrim($URL);		# Long URL may have final '/'
	print V "$F: lurl={" . &esc($lurl) . "}\n" if $V>2;
	unless ($allowcgi) {		# Check for CGI
		if ($lurl =~ /(\bcgi\b|\bbin\b)/) {	# Once contained '|\?'
			print V "$F: \"$lurl\" rejected due to \"$1\"\n" if $V>2;
			return;
		}
	}
	if ($lurl =~ /(#)/) {		# Ignore internal anchors
		print V "$F: \"$lurl\" rejected due to \"$1\"\n" if $V>2;
		return;
	}
	print V "\n" if $V>1;
	print V "$esep\n" if $V>2;
	($ss,$mm,$hh,$DD,$MM,$CY) = gmtime($now = time); ++$MM; $CY += 1900;
	print V "====> $depth $CY/$MM/$DD $hh:$mm:$ss \"$lurl\"\n" if $V>1;
	($surl = $lurl) =~ s"/+$"";	# Short URL lacks final '/'
# Split the URL into directory/base/suffix:
	if (($udir,$ubas,$usuf) = ($surl =~ m"(.*/)([^/]*)\.(\w+)$")) {
		$usuf = lc($usuf);		# Suffix is caseless to us 
	} elsif (($udir,$ubas) = ($surl =~ m"(.*/)([^/]*)$")) {
		$usuf = '';				# No suffix is OK, too
	} else {
		$udir = $ubas = $usuf = ''
	}
	print V "udir=\"$udir\" ubas=\"$ubas\" usuf=\"$usuf\"\n" if $V>2;
	if ($surl eq '') {			# Shouldn't happen
		local($p,$c,$l) = caller;
		print V "$F: \"$URL\" (from $p/$c/$l)\n" if $V>0;
		return 0;
	}
	if (($dpth = $Depth{$surl}) && ($dpth <= $depth)) {
		print V "$F: \"$URL\" is marked as depth $dpth.\n" if $V>3;
		if ($Done{$URL}) {
			print V "$F: \"$URL\" already scanned at depth $dpth.\n" if $V>2;
			return 1;
		}
	}
#	$Depth{$surl} = $depth;		# Note that we've done this URL
#	print V "$F: \"$lurl\" marked as depth $depth.\n" if $V>3;
	for $init (keys %RejectPfx) {
		if (substr($urlpath,0,length($init)) eq $init) {
			print V "$F: REJECT \"$urlpath\" for currhost=\"$currhost\"\n" if $V>1;
			$purged = 1;		# Tell caller to drop this URL
			$Done{$urlpath} = $now;
			$Depth{$urlpath} = $depth;
			return 0;
		}
	}
#	if (%Disprefix) {
#		print V "$F: Checking disallows ...\n" if $V>3;
#disallow:
#		for $dispfx (keys %Disprefix) {
#			print V "$F: disallow: \"$dispfx\" \n" if $V>2;
#			if (substr($urlpath,0,length($dispfx)) eq $dispfx) {
#				print V "$F: Disallow: \"$dispfx\" \n" if $V>2;
#				print V "$F: Disallows \"$urlpath\" \n" if $V>2;
#				for $allow (keys %AllowURI) {
#					print V "$F: allow: \"$allow\"\n" if $V>2;
#					if (substr($urlpath,0,length($allow)) eq $allow) {
#						print V "$F: ALLOW: \"$urlpath\"\n" if $V>1;
#						print V "$F: PATTERN: \"$allow\"\n" if $V>1;
#						$purged = 0;
#						last disallow;
#					} else {
#						print V "$F: no match: \"$urlpath\"\n" if $V>1;
#					}
#				}
#				print V "$F: DISALLOW: \"$urlpath\"\n" if $V>1;
#				print V "$F: PATTERN: \"$dispfx\"\n" if $V>1;
#				$purged = 1;	# Tell caller to drop this URL
#				$Done{$urlpath} = $now;
#				$Depth{$urlpath} = $depth;
#				return 0;
#			}
#		}
#	}
	$currURL   = $lurl;	# Global copy of current URL
	$DOCopen   = &dt();	# Time DOC opened; zero if not open
	$inABC     = 0;		# Not in ABC yet
	$docbytes  = 0;		# Count the bytes in the current document
	$doclines  = 0;		# Count the lines in the current document
	$doclinks  = 0;		# Count the links in the current document
	$doctitls  = 0;		# Count the titles in the current document
	$doctunes  = 0;		# Count the tunes in the current document
	if ($lurl =~ /\.(doc|exe|tgz)\b/) {	# Reject several binary file formats
		print V "$F: \"$urlfull\" ignored (.doc/.exe/.tgz rule)\n" if $V>1;
		$Done{$lurl} = $now;
		$Depth{$lurl} = $depth;
		$purged = 1;	# Tell caller to drop the URL
		$newuline = '';
		@newchunk = ();
		return 0;
	}
	print V "$F: Opening \"$lurl\" ...\n" if $V>2;
	$TMopen = $TMread = &dt();
	for $pfx (keys %RejectPfx) {
		print V "$F: RejectPfx check \"$pfx\"\n" if $V>2;
		if (substr($lurl,0,length($pfx)) eq $pfx) {
			print V "$F: REJECT \"$lurl\" [matches $pfx]\n" if $V>1;
			$Done{$lurl} = $now;
			$Depth{$lurl} = $depth;
			$purged = 1;	# Tell caller to drop the URL
			$newuline = '';
			@newchunk = ();
			return 0;
		}
	}
	$HTTPcontime = time;	# Note time we last tried to connect
	unless ($hs->DOCopen($docurl = $lurl)) {
		$DOCopen = 0;
		if ($V>5 || ($lurl =~ m"/$")) {
			&dt();
			push @newchunk, "$now # not accessible.";
			$currURL = '';
		} else {
			$newuline = '';
			@newchunk = ();
		}
		$TMopen = $TMread = 0;	# The time we opened the current file
		print V "$F: \"$lurl\" not accessible.\n" if $V>2;
		return 0;
	}
	$TMopen = time;		# Note time of last DOC open
	print V "$F: Reading \"$lurl\" ...\n" if $V>2;
	$fileXhdrs = $fileThdrs = $filePhdrs = $fileKhdrs = 0;
	$w3timedout  = 0;	# No timeout yet
	$ignoretune  = 0;	# If true, ignore all tunes in this file
	$ignorefile  = 0;	# If true, ignore the current tune
	$closeDoc    = 0;	# If true, close the doc file and contine with next doc
	$hs->setOption("+HDRs");	# We want to see HTTP header
	$inHTTPhdrs = 1;	# Interpret first lines as HTTP headers
	print V "$F: Look for HTTP headers ...\n" if $V>2;
	&scanHDRs();		# Read the HTTP headers
# Each time around this loop, we try to  get  one  line  from  the  document,
# append  it  to the remains of the previous line, and decide what to do with
# it.  Most of the time, we will empty out $line, but in  HTML  docs  we  may
# return  here  with  a  partial  line  unprocessed.  We distinguish the HTTP
# headers from the doc's contents, and there are a number of  things  in  the
# headers that we look for.  [No HTML here any more.]
buffer:
	while (!$endDoc && !$ignorefile && !$HTTPalrm && !$finishup) {
		print V "$F: Call scanBuf()\n" if $V>5;
		$i = &scanBuf();
		if    ($i eq 0) {return 0; print V "$F: Return $i from scanBuf()\n" if $V>3;}
		elsif ($i eq 1) {return 1; print V "$F: Return $i from scanBuf()\n" if $V>3;}
		elsif ($i eq 2) {next buffer}
		elsif ($i eq 3) {last buffer}
		else  {print V "$F: scanBuf() returned $i!!!\n" if $V>0}
	}
	if ($w3timedout) {
		print V "$F: Timed out in \"$lurl\"\n" if $V>0;
		&timedout();
	}
	if ($inABC) {
		print V "$F: EOF ends ABC tune at line $doclines.\n" if $V>1;
		&tune();
		@tune = ();
		$inABC = 0;	
		$X = $ttl = undef;
	}
	++$filecnt if ($doctunes>0 || $doctitls>0);
		# N.B.: We will only index the tune if there's a title. Since $doctunes
		# may count tunes for which there's no title, we may be looking at a tune
		# that will not be indexed.  Not that there's a problem with that.
	if ($V>7) {
		print V "$F: filecnt=$filecnt because doctunes=$doctunes doctitls=$doctitls.\n";
		$xx = ($doctunes > 0) ? " in ABC file $filecnt" : '';
		push @newchunk, "$now $doclinks links, $doctunes tunes, $doctitls titles$xx."
			if $doctunes || $doctitls || $doclinks;
	}
	$linkcnt += $doclinks;			# Total links  at this host
	$tunecnt += $doctunes;			# Total tunes  at this host
	$titlcnt += $doctitls;			# Total titles at this host
	print V "$F: doclinks=$doclinks linkcnt=$linkcnt doctunes=$doctunes tunecnt=$tunecnt doctitls=$doctitls titlcnt=$titlcnt\n" if $V>3;
	$linkmax = $linkcnt if $linkmax < $linkcnt;
	$tunemax = $tunecnt if $tunemax < $tunecnt;
	$titlmax = $titlcnt if $titlmax < $titlcnt;
	$t = time - Max($HTTPcontime,$TMread);
#	if ($t < $HTTPdelay) {			# Do we have a min delay between GETs?
#		sleep($HTTPdelay - $t);		# If so, wait out the rest of the delay
#	}
	if ($endDoc || $HTTPalrm) {	# Some disaster detected
		$reason = "timeout alarm after $t sec";
		print V "Close \"$URL\" ($reason)\n" if $V>3;
		$endDoc = $HTTPalrm = 0;
	} elsif ($finishup) {
		$reason = "Told to finishup";
		print V "$F: $reason ...\n" if $V>3;
	} else {
		$reason = 'EOF';
		print V "$F: EOF on DOC file at $cymdhms.\n" if $V>6;
	}
	&closeDoc($reason) if $DOCopen;
	if ($tunesinfile>0 && $V>1) {		# ABC line count
		($ss,$mm,$hh,$DD,$MM,$CY) = gmtime($now = time); ++$MM; $CY += 1900;
		$s = ($tunesinfile > 1) ? 'tunes' : 'tune';
		$when = "$CY/$MM/$DD $hh:$mm:$ss";
		print V "      \"$lurl\" ==== $tunesinfile abc $s ==== X:$fileXhdrs T:$fileThdrs P:$filePhdrs K:$fileKhdrs $when\n";
		print V "$now T X:$tunecnt T:$titlcnt F:$filecnt H:$currhost at $when\n" if $V>0;
	 
	}
	print V "$F: Xhdrs=$fileXhdrs Thdrs=$fileThdrs Phdrs=$filePhdrs Khdrs=$fileKhdrs\n" if $V>2;
	print V "$F: D:$depth B:$docbytes L:$doclinks X:$doctunes T:$doctitls $urlpath\n" if $V>1;
	alarm 0; $SIG{ALRM} = 0;
	print V "$F: Set alarm 0.\n" if $V>6;
	print V "$F: DONE \"$URL\"\n" if $V>2;
	return 1;
}

sub closeDoc {my $F='closeDoc';
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	local($why) = @_ || 'no reason';
	$hs->endDoc($why);
	print V "$hsep\n" if $V>2;
	print V "$F: DOC had $fileXhdrs X: lines.\n" if $V>2 && $fileXhdrs>2;
	print V "$F: DOC had $fileThdrs T: lines.\n" if $V>2 && $fileThdrs>2;
	print V "$F: DOC had $filePhdrs P: lines.\n" if $V>2 && $filePhdrs>2;
	print V "$F: DOC had $fileKhdrs K: lines.\n" if $V>2 && $fileKhdrs>2;
}

sub scanBuf {my $F='scanBuf';
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# This is the code that handles once through the buffer: loop in scan().   It #
# was  moved  here  during an attempt to profile the code.  Our return values #
# tell the caller to do these things:                                         #
#   0: return 0
#   1: return 1
#   2: next buffer
#   3: last buffer
# The job of this routine is to 
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	local($data,$line,$val);
	print V "$F: Called.\n" if $V>5;
	if ($closeDoc || $ignorefile) {
		print V "$F: Ignoring rest of document.\n" if $V>2 && $ignorefile;
		print V "$F: Closing document.\n"  if $V>2 && $closeDoc;
		return 3;			# Abandon this doc
	}
	print V "$F: Reading DOC ...\n" if $V>5;
	if ($w3timedout) {			# Should we give up on this doc?
		print V "$F: TIMEOUT with V=$V\n" if $V>1;
		unless ($doclinks || $doctunes || $doctitls) {
			print V "$F: TIMEOUT and nothing interesting found.\n" if $V>1;
			return 3;			# Abandon this doc
		}
		print V "$F: Timeout ignored because $doclinks links, $doctunes tunes, $doctitls titles.\n" if $V>3;
	}
	print V "$F: No timeout.\n" if $V>5;
	print V "$F: DOCline call ...\n" if $V>5;
	$TMread =  time;			# Note time of last DOC read
	$data = $hs->DOCline();
	if (!defined($data)) {		# Read one line from document
		print V "$F: EOF\n" if $V>3;
		return 3;
	}
	$doclines ++;				# Count the lines in the document
#	if ($V>1) {					# Count the lines with X:, T:, P: or K: headers
		++$fileXhdrs if ($data =~ /\bX:/);
		++$fileThdrs if ($data =~ /\bT:/);
		++$filePhdrs if ($data =~ /\bP:/);
		++$fileKhdrs if ($data =~ /\bK:/);
#	}
	&dmpline('===',$data) if $V>2;
	print V "BUF: \"$data\"\n" if $V>5;
	$TMread = time;				# Note last time we got data from file
	print V "$F: Set opent1=$TMread after TIMEOUT\n" if $V>1 && $w3timedout;
	$docbytes += length($data)	# Count the bytes in the document
		unless $inHTTPhdrs;
	print V "$F: Doc has $doclines lines $docbytes bytes so far.\n" if $V>5;
	print V "$F: maxlines=$maxlines usuf='$usuf'\n" if $V>5;
	if ($doclines > $maxlines && $usuf ne 'abc') {	# Should we continue?
		unless ($inABC || $doctunes) {
			# For files not of abc type, we require a recognized tune
			# with maxlines, or we reject it and drop the connection.
			# This does a lot to prevent grovelling through huge files
			# of irrelevant stuff.
			print V "$F: Abort after $doclines lines.\n" if $V>2;
			&closeDoc("No ABC in initial $maxlines lines");
			return 1;
		}
	}
	$line = $data;				# Add to leftover from last line
	$line =~ s/[\s\r\n]+$//s;	# Discard trailing white stuff
	print V "$F: line={" . &esc($line) . "}\n" if $V>3;
	print V "LINE \"$line\"\n" if $V>3;
	if (!$line && $inHTTPhdrs) {	# Null line -
		$inHTTPhdrs = 0;			# Terminates headers
		print V "$F: Null line terminates HTTP header.\n" if $V>3;
		return 2;				# Otherwise ignore it
	}
	print V "$F: \"$line\"\n" if $V>5;
# We're not in the HTTP headers.  So this must be in the data.  We try to cut
# out  the  ABC tunes and store them in @tune, calling &tune() at every blank
# line that ends a tune.
	while ($line =~ s"<URL:([^>]+?)>""s) {
		print V "$F: <URL:$1>\n" if $V>2;
		&href($lurl,$1,'',1);	# Handle this href later
		&dmpline('+++',$line) if $V>2;
	}
# Next we look for possible multiple ABC headers on one line. If we find
# them, we insert newlines and put all but the first back into the
# htmlsubs module's text buffer.
	$line =~ s/^\s+(\w:)/$1/s;			# Wipe out initial white stuff
	if ($line =~ s/\s+([A-Z]:)([^|])/\n$1$2/gis) {	# Insert newlines before internal headers
		&dmpline('---',$line) if $V>2;
		if ($line =~ /^([^\n]*)\n(.*)/s) {
			$hs->putBack("$2\n");
			$line = $1;
			&dmpline('---',$line) if $V>2;
		} else {
			print V "$F: ### Oops! Split line doesn't match properly!!!\n" if $V>0;
		}
	}
#	$len = length($line);
	if ($inABC) {
		$line =~ s/^\s+(\w:)/$1/;	# Strip out initial white stuff from ABC headers
		if (!$line) {			# Blank line ends tune
			print V "$F: Blank line ends ABC tune.\n" if $V>2;
			&tune();			# Process the tune
			print V "$F: Tune " . ($val ? 'rejected' : 'accepted') . ".\n" if $V>1;
			@tune = ();			# Reset for possible next tune
			$X = undef;
			$inABC = 0;
		} elsif ($line =~ /^X:\s*(\d+)([^\s]*)/) {	# X: line ends tune
			print V "$F: X:$1$2 line ends previous ABC tune.\n" if $V>1;
			$val = &tune();		# Process the previous tune
			print V "$F: Previous tune " . ($val ? 'rejected' : 'accepted') . ".\n" if $V>1;
			$X = "$1$2";
			print V "$F: X:$X may be new ABC tune (X:$X)\n" if $V>2;
			$inABC = 1;
			@tune = ($line);	# Reset for possible next tune
		} elsif ($line) {
			push @tune, $line;	# Add the line to the tune
		}
		$line = '';
		print V "$F: Return 2 (Line added to tune)\n" if $V>3;
		return 2;
	}
	print V "$F: Not in ABC tune.\n" if $V>5;
	# We're not inside an ABC tune.  The main complications here are ABC %% directives.
	if ($line =~ /^%%(.*)/) {
		$directive = $1;
		print V "$F: DIR %%$directive\n" if $V>2;
		if ($directive =~ /^noindex\b/) {	# %%noindex says ignore ABC tune(s)
			$reason = "%%NOINDEX inABC=$inABC";
			print V "$F: DIR $reason\n" if $V>2;	# Should be 0
			$ignorefile = 1;
			$closeDoc = 1;	# This may be redundant
			print V "$F: Return 3 ($reason)\n" if $V>3;
			return 3;		# Close down this file 
		}
	}
	$nhrefs = 0;
	$line =~ s/^\s+//;			# Strip initial white stuff
	print V "$F: line={" . &esc($line) . "}\n" if $V>3;
#	$line =~ s#^</*[bi]>##si;		# Strip out <B> and <I> tags
#	$line =~ s#^</*pre>#\n\n#si;	# Replace <PRE> tags with double newlines
	if ($line =~ m"^X:\s*(\d+)([^\s]*)") {	# X: line starts a (new) ABC tune
		$X = "$1$2";			# Note that we keep any alpha junk after the index number
		print V "$F: may be ABC tune (X:$X)\n" if $V>2;
		$inABC = 1;
		@tune = ($line);
		++$possibletunes;		# Count the (possible) ABC tunes
		$line = '';
		print V "$F: Return 2 (X line may be ABC)\n" if $V>5;
		return 2;
	}
	if (($H,$ttl) = ($line =~ /^([TPN]):\s*(.*)/)) {	# T: or P: line required
		unless (defined $X) {	# Missing X: line?
			$X = '0';			# Default tune index
			++$possibletunes;	# Count the (possible) ABC tunes
			@tune = ("X: $X");	# X: is first line
		}
		print V "$F: looks like ABC tune ($H:$ttl)\n" if $V>2;
		$line = "$H:__" unless $ttl;    # Missing title?
		$inABC = 1;
		push @tune, $line;
		$line = '';
		print V "$F: Return 2 ($H line possible ABC)\n" if $V>5;
		return 2;
	}
	print V "$F: Not an X or T or P line.\n" if $V>5;
	if ($line) {			# Not X: or T: so just add it to the tune
		push @tune, $line;
	}
	$line = '';
	print V "$F: Return 2 (at end)\n" if $V>5;
	return 2;
}

sub timedout {my $F='timedout';
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# This is called while abandoning a URL, to convert the current "chunk" of  a #
# hst/* file to show the timeout.                                             #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	$now = time;
	$TOopen = $now - $TMopen;	# Time since we opened the current file
	$TOread = $now - $TMread;	# Time since we last read from the file
	print V "$F: TIMEOUT after $TOopen/$TOread at $cymdhms in line $doclines.\n" if $V>0;
#	if ($doclines < 1) {
#		print V "$F: Timed out instantly!!!\n" if $V>0;
#		++ $V;		# Try to find out what's happening
#		print V "$F: Increased V to $V.\n" if $V>0;
#	}
	push @newchunk, "$now URL $lurl";
	push @newchunk, "$now # TIMEOUT after $TOopen/$TOread sec.";
	&closeDoc("Timed out");
	if (@newchunk) {
		print V "$F: Timed out with partial chunk built.\n" if $V>5;
	} else {
		print V "$F: Timed out with nothing, preserving old data\n" if $V>5;
		push @newchunk, @oldchunk if @oldchunk;
	}
}

sub scanURL {my $F='scanURL';
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Scan one URL for tunes.  Actually, all we do here is pass off most  of  the #
# work  to  scan(), and then examine the results to see if we want to include #
# this URL in the output.                                                     #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	my $urlfull = shift;
	print V "$F: SCAN \"$urlfull\"\n" if $V>3;
	$urlcount ++;		# Count the URLs that we process
	$purged = 0;		# Mark it as tentavely OK to process
	&scan($urlfull);
	if ($purged) {
		print V "$F: PURGED purgebad=$purgebad.\n" if $V>2;
		$newuline = "$now # D:$depth B:$docbytes L:$doclinks X:$doctunes T:$doctitls $urlpath";
	} elsif ($depth<2 || $doctunes || $doctitls || !$purgebad) {	# used to include "$doclinks ||"
		print V "$F: GOOD\n" if $V>2;
		$newuline = "$now U D:$depth B:$docbytes L:$doclinks X:$doctunes T:$doctitls $urlpath";
	} else {			# No links, tunes or titles
		print V "$F: Purge because links=tunes=titles=0 purged=$purged.\n" if $V>2;
		$newuline = '';	# This won't be output
	}
	print V "$F: newuline=\"$newuline\"\n" if $V>2;
	print V "$F: DONE \"$urlfull\"\n" if $V>3;
}

sub showcalls {
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Dump the function-call stack.  This happens in response to some interrupts, #
# depending on how things are configured at the moment. We can also call this #
# from the debugger.                                                          #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	local($l,$package, $filename, $line, $subroutine);
	return unless $V<1;
	print V "$P: Call stack:\n";
	$l = 0;
	while (($package, $filename, $line, $subroutine) = caller($l)) {
		printf V "\tLevel %3d line %5d $filename\tin $subroutine\n",$l,$line;
		++$l;
	}
}

sub sigCONT {
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# A CONT signal tells us to abandon the current URL  and  continue  with  the #
# next.   This  is useful when we are hung on a connection, though this isn't #
# much of a problem now that we run $GetCmd as a subprocess.  Just  kill  the #
# $GetCmd process, and we'll continue.                                        #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	print V "sigCONT: CONT signal.\n" if $V>0;
	print V "sigCONT: close DOC ...\n" if $V>6;
	if ($DOCopen) {
		&closeDoc('sigCONT');
		print V "sigCONT: closed DOC.\n" if $V>3;
	}
	&showcalls();
}

sub sigQUIT {
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# A QUIT signal tells us to abandon the current URL and host, clean  up,  and #
# exit.                                                                       #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	print V "sigQUIT: QUIT signal.\n" if $V>0;
	print V "sigQUIT: close DOC ...\n" if $V>6;
	&showcalls();
	if ($DOCopen) {
		&closeDoc('sigQUIT');
		print V "sigQUIT: closed DOC.\n" if $V>3;
	}
	$finishup = $endDoc = $endHost = 1;
}

sub sigINT {
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# An INT signal gets us here, where we do a stack dump and set  the  finishup #
# flag to trigger abandonment of all further URLS.                            #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	print V "sigINT: INT signal.\n" if $V>0;
	&showcalls();
	&hostunlock if $lfile;
	$finishup = $endDoc = $endHost = 1;
	exit 1;
}

sub sigUSR1 {
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# A USR1 signal just produces a stack dump, and then we continue.
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	--$V;
	print V "sigUSR1: USR1 signal V=$V.\n" if $V>0;
	&showcalls();
}

sub sigUSR2 {
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# A USR2 signal just produces a stack dump, and then we continue.
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	++$V;
	print V "sigUSR2: USR2 signal V=$V.\n" if $V>0;
	&showcalls();
}

sub cacheTune {my $F='cacheTune';
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# This is called to stuff one tune into the cache. If it's the first tune, we #
# have extra work: We must first create a directory derived from the URL.  We #
# massage the URL into a  pathname,  converting  "http://"  to  "http/",  and #
# adding a final ':' to avoid conflicts with previous naming schemes. We also #
# HTTP-encode a few characters.  If we end up creating the directory, the tune
# is written to a file in the directory with a name based on the X: index and
# the canonicalized T: title.
# ToDo: Add  the other index/title combos as links.
# We return 1 if we succeed, 0 if we fail  for some reason. 
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	local($path) = $hs->URLenc($docurl); 
	local($file, $line,$lines,$name,$size);
	$path =~ s":*//+"/"g;	# Reduce the :// and multiple slashes
	$path =~ s"/*$":";		# End with ":"
	unless (-d $path) {
		if (&mkdirs($path) < 0) {		# Create the directory path
			print V "$F: Failed to make dir \"$path\"\n" if $V>0;
		} else {
			print V "$F: Made dir \"$path\"\n" if $V>1;
		}
	}
	unless (-d $path) {
		print V "$F: Cache dir \"$path\" doesn't exist; can't cache tune.\n" if $V>0;
		return 0;
	}
	if (-f $cachetmp) {		# Was the source file cached?
		$size = -s $cachetmp;
		print V "$F: Link $cachetmp -> $path/%%src ($size bytes)\n" if $V>1;
		$file = "$path/%%src";
		print V "$F: file={" . &esc($file) . "}\n" if $V>2;
		Backup($file) if -e $file;
		if (link($cachetmp,$file)) {
			print V "$F: Linked $cachetmp -> $file\n" if $V>0;
		} else {
			print V "$F: ### Can't link $cachetmp -> $file [$!]\n" if $V>0;
		}
		if (unlink($cachetmp)) {	# Unlink the tmp cache file to avoid re-use
			print V "$F: Unlinked \"$cachetmp\"\n" if $V>1;
		} else {
			print V "$F: ### Can't unlink \"$cachetmp\" [$!]\n" if $V>1;
		}
	}
line:
	for $line (@tune) {		# Scan the tune for X: and T: line
		if ($line =~ /^X:\s*(\d+[:._\w]*)/) {
			print V "$F: X:$1\n" if $V>1;
			$name = "%$1";
			print V "$F: name={" . &esc($name) . "}\n" if $V>2;
			next line
		}
		if ($line =~ /^([TP]):\s*(.*)$/) {	# Accept T: or P: as title
			print V "$F: $1:$2\n" if $V>1;
			$name = $name . ':' . &Cname($2);
			print V "$F: name={" . &esc($name) . "}\n" if $V>2;
			last line;		# Only the first one for now
		}
	}
	$file = $path . '/' . $name;
	print V "$F: file={" . &esc($file) . "}\n" if $V>2;
	Backup($file) if -e $file;		# Back up previous version
	unless (open(CFILE,">$file")) {	# Create single-tune file
		print V "$F: ### Can't write \"$file\" [$!]\n" if $V>0;
		return 0;
	}
	$lines = 0;
	for $line (@tune) {
		print CFILE "$line\n";
		++$lines;
	}
	print V "$F: Wrote $lines lines to \"$file\"\n" if $V>0;
	close CFILE;
	return 1;
}

sub tune {my $F='tune';
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# The @tune array contains what looks like an ABC tune.  Extract the critical #
# data from it, and if it passes as a tune, write the data to the output. The #
# return value is 0 if we reject it as a tune; >0 if we accept it.            #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	local($H,$l,$K,$k,$M,$m,$lines,@N,$P,@P,$t,$T,@T,$Tsrc,$X);
	local($Achords,$Clefs,$Tcnt,$Vcnt,%Voices);
	local($comment,$directive,$lines);
	local($GBcode,$JCcode,$UDcode);
	$lines = int(@tune);
	print V "$F: Tune has $lines lines.\n" if $V>2;
#	if ($cachetmp) {
#		print V "$F: Caching due to $lines-line tune ...\n" if $V>1;
#		&cacheTune();
#	}
	if ($V>2) {			# Dump the tune to the verbose output
		print V "%%%%%%\n";
		for $l (@tune) {print V "%% $l\n"}
		print V "%%%%%%\n";
	}
	if ($ignoretune) {
		print V "$F: ### Tune dropped because $ignoretune=$ignoretune\n" if $V>1;
		$ignoretune = 0;	# Don't ignore the next one.
		return 0;
	}
	if ($ignorefile) {
		print V "$F: ### Tune dropped because $ignorefile=$ignorefile\n" if $V>1;
		return 0;
	}
	$X = $Tcnt = $Vcnt = 0;
	$T = $P = $K = $M = '';
	@N = ();
	@P = ();
	@T = ();
	$Achords = $Clefs = $Vcnt = 0;
	%Hdrs = ();
	%Voices = ();
line:
	for $l (@tune) {
		print V "l=\"$s\"\n" if $V>7;
		++$lines;
		if ($l =~ /^\s*%(.*)/) {			# Comments
			$comment = $1;
			print V "$F: COM %$comment\n" if $V>2;
			if ($comment =~ /^%(.*)/) {		# ABC directive
				$directive = $1;
				print V "$F: DIR %%$directive\n" if $V>2;
				if ($directive =~ /^noindex\b/) {	# %%noindex says ignore ABC tune(s)
					print V "$F: DIR NOINDEX inABC=$inABC\n" if $V>2;
					if ($inABC) {
						print V "$F: DIR NOINDEX inside ABC tune.\n" if $V>2;
						print V "$F: Ignore rest of tune.\n" if $V>2;
						$ignoretune = 1;
						return 0;
					} else {
						print V "$F: DIR NOINDEX outside ABC tune.\n" if $V>2;
						print V "$F: Ignore rest of file.\n" if $V>2;
						$ignorefile = 1;
						return 0;
					}
				} else {
					print V "$F: Unknown %%$directive ignored.\n" if $V>1;
					next line;
				}
			}
			print V "$F: COM %$comment ignored.\n" if $V>3;
			next line;
		}
		if ($l =~ m"^X:\s*(\d+)([^\s]*)") {	# X: index header
			$X = "$1$2";				# Keep any alpha junk after the index number
			print V "$F: X:'$X'\n" if $V>5;
		} elsif ($l =~ /^N:\s*(.*)/) {	# N: parts header
			unless ($K) {				# Use only within headers
				push @N, $1;
				print V "$F: N:'$1'\n" if $V>5;
			}
		} elsif ($l =~ /^P:\s*(.*)/) {	# P: parts header
			$P = $1 unless $P;			# Note first P: text
			unless ($K) {				# Use only within headers
				push @P, $1;
				print V "$F: P:'$1'\n" if $V>5;
			}
			if ($X && !$T) {			# Tune with P: line and no T: line?
				$T = $P;				# Use the first P: line as the title
				print V "$F: P:'$T' used as title.\n" if $V>1; 
			}
		} elsif ($l =~ /^T:\s*(.*)/) {	# T: title header
			++$Tcnt;					# Count the T: lines
			$T = $1 unless $T;			# Note first non-blank title
			$t = $1 || '__';			# Use '__' for missing title
			$t =~ s/\s*[<%].*//;		# Delete comments, HTML
			$t =~ s/\s+/ /g;			# Reduce white space
			if ($SCDkludge && $T && ($t =~ m'[\d_]+x[\d_]+[A-Z][\d_]*')) {
				print V "$F: Drop SCD title \"$t\"\n" if $V>5;
				next line;
			}
			if ($t) {
				push @T, $t;			# Accumulate titles
				print V "$F: T:'$t'\n" if $V>5;
			}
		} elsif ($l =~ /^M:\s*(.*)/) {	# M: meter
			next line unless $X || $T;	# Ignore M lines outside tune
			unless ($M) {				# Use only the first meter
				$M = $1;
				$M =~ s/\s*[<%].*//;	# Delete comments, HTML
				$M =~ s/\s+//;			# Strip out white space
				print V "$F: M:'$M'\n" if $V>5;
			}
		} elsif ($l =~ /^K:\s*([^\r\s]*)\s*(.*)/) {	# K: key ends headers
			next line unless $X || $T;	# Ignore K lines outside tune
			next line if $K;	# Is the key already defined?
			$K = $1;			# It's the first key sig
			$k = $2;			# Any excess stuff
			if (($k =~ /\b(treble|alto|tenor|bass)\b/i) || ($k =~ /\b(clef=[GCF])\b/)) {
				$K .= " $1";	# Include clef with key
				$Clefs ++;		# Count all clefs
			}
			print V "$F: K:'$K'\n" if $V>5;
		} elsif ($l =~ /^V:\s*(\w+)/) {	# V: Voice line
			$Hdrs{V}++;					# Note that voices are used
			$Voices{$1}++;				# Note the different voices
			if ($l =~ /\bclef=(treble|alto|bass|G|C|F)\b/) {
				$Clefs ++;				# Count all clefs
			}
			$Vcnt = int(keys %Voices);	# Count the distinct voices
		} elsif ($l =~ /^([A-Za-z]):/) {	# V: Other headers
			$Hdrs{$1} ++;
		} elsif ($l =~ /^\d:/) {		# Why do we see this?
			print V "drop: $l.\n" if $V>5;
		} elsif ($K) {					# Collecting music for abcCode()
			if ($l =~ /"[A-G][b#]*[m7]*"/) {	# Look for chords
				$Achords ++;
				$Hdrs{'"'} ++;			# Add as a kind of "Header"
			}
			unless ($l =~ /^\w:/) {		# Ignore things like w: words
				$m .= $l;				# Accumulate music as one string
			}
		}
	}
	print V "$F: Tune has $lines lines, with $Tcnt titles $Vcnt voices and key \"$K\"\n" if $V>2;
	if ($X && $T && $Tcnt && $K) {
		print V "$F: TUNE accepted unconditionally ($Tcnt titles X=$X T=\"$T\" K=\"$K\")\n" if $V>2;
	} elsif (($Tcnt || $P) && $K) {
		print V "$F: TUNE accepted ($Tcnt titles P=\"$P\" K=\"$K\")\n" if $V>2;
	} elsif (($Tcnt || $P) && $K) {
		print V "$F: TUNE dubious ($Tcnt titles P=\"$P\" K=\"$K\" and $Vcnt voices)\n" if $V>1;
	} elsif ($X && $K) {
		print V "$F: TUNE untitled ($Tcnt titles P=\"$P\" K=\"$K\" and $Vcnt voices)\n" if $V>1;
	} else {
		print V "$F: TUNE rejected ($Tcnt titles X=$X T=\"$T\" P=\"$P\" K=\"$K\")\n" if $V>1;
		return 0;
	}
	$Vcnt = int(keys %Voices);			# Number of distinct voices found
	print V "$F: $Vcnt voices found in X:'$X'\n" if $Vcnt>0 && $V>2;
	print V "$F: inABC=$inABC K=$K X=$X m=$m\n" if $V>5;
	unless (($inABC && $K) || ($X && $K && $m)) {
		print V "$F: No X line found.\n" if (!$X && $V>2);
		print V "$F: No K line found.\n" if (!$K && $V>2);
		print V "$F: No music found.\n"  if (!$m && $V>2);
		print V "$F: Tune X:$X T:$T K:$K failed tests.\n" if $V>2;
		return $doctunes;
	}
	++$tunesinfile;		# Count the tune as accepted
	if ($cachetmp) {
		print V "$F: Caching $lines-line tune ...\n" if $V>1;
		&cacheTune();
	}
	print V "$F: Call abcCode(\"$K\",\"$L\",\"$M\",\"$m\")\n" if $V>7;
	($GBcode,$JCcode,$UDcode) = $abcCode->abcCode($K,$L,$M,$m);
	print V "$F: GBcode=\"$GBcode\" JCcode=\"$JCcode\" UDcode=\"$UDcode\"\n" if $V>6;
	&dt();
	if (@T) {			# Did we find any T: lines?
		for $t (@T) {
			print V "$F: Title from T:$t\n" if $V>2;
		}
		$Tsrc = 'T';
	} elsif (@P) {
		push @T, $P[0];		# If no title lines, use first P: line
		$Tsrc = 'P';
		print V "$F: Title from P:$P[0]\n" if $V>2;
	} elsif (@N) {
		push @T, $N[0];		# If still no title, use first N: line
		print V "$F: Title from N:$N[0]\n" if $V>2;
		$Tsrc = 'N';
	} else {
		print V "$F: No title found in tune X:$X [urlfull=$urlfull]\n" if $V>2;
		push @T, '__';		# Dummy title
		$Tsrc = '_';		
	}
	$H = join('',sort(keys(%Hdrs)));
	$doctitls += int(@T);
	for $T (@T) {
		$l = "$now X:$X M:$M K:$K";
		if ($Clefs  > 0) {$l .= " C:$Clefs"}
		if ($Vcnt > 0) {$l .= " V:$Vcnt"}
		$l .= " H:$H" if $H;
		$l .= " C1=$JCcode C2=$UDcode T:$T";
		print V "$F: $Tsrc $l\n" if $V>1;
		push @newchunk, $l;
	}
	++$doctunes;
	print V "$F: $doctunes in this file.\n" if $V>2;
	$ignoretune = 0;	# If true, ignore all tunes in this file
	print V "$F: Return doctunes=$doctunes.\n" if $V>2;
	return $doctunes;
}
