#!/usr/bin/perl # #NAME # ABCbot - search the Web for ABC tunes # #SYNOPSIS # ABCbot [options] [host...] # #DESCRIPTION # This program is a web explorer robot that looks for ABC music. # # This program works from a "hosts" database which is currently kept in the # "hst/" subdirectory, one file per host. Each file contains a line per URL # at that host, possibly followed by one or more lines giving information # that was extracted from that URL. # # Each pass of this program gets a list of URLs and/or host names. URLs are # read from standard input, and all we do with them is append them to the # host's file. Thus http://foo.bar.com/abc/ results in the file # hst/foo.bar.com having a line added giving the URL. # # There are two distinct ways of running this program: with or without a # host list on the command line. If there are no hosts listed, we run in # "initialization" mode. The input stream should contain a list of URLs (and # possibly other information such as hosts to avoid). For each URL, we make # sure that the host's file exists, and we append a line for the URL at # depth 1. The next pass should then find these URLs and scan them for links # or tunes. # # Here's how I do the initialization: # ABCbot +CURLs >& ABCbot.out & # # If called with hosts on the command line, we are in "update" mode. The # input may contain hosts to avoid, but URLs there will be ignored. Instead, # we do a scan of each of the listed hosts and update its file in the host # directory. For each host, we move its file to backup (by appending '-' to # its name), and then we read this backup file and write a new file for the # host. For each URL in the host's file, we fetch the file, and extract # hyperlinks and ABC tunes. # # Hyperlinks are accumulated, and at the end we will repeat the # "initialization" and append the links to their hosts' files. For each ABC # tune found, we add a line showing the "interesting" information about the # tune. What is considered interesting may change from time to time. # # Because of the difficulties in preventing infinite loops with URLs we # implement two ways of limiting the URLs that may be followed: You can # restrict the depth of recursion with the -D option, and you can restrict # the hostname(s) with the +H option. # #REQUIRES $GetCmd = './webcat'; # Program to write a URL to STDOUT require "DT.pm"; # Date/time routine(s) # # We call webcat as a subprocess to fetch files from the web. You should # find it in the same directory. This was done so that we could properly # time out zombie connections to some of the broken web sites out there. It # turns out that you can only abort a connect() with sig('ALRM'), and if you # attempt to close the socket after an alarm, you may die a horrible death. # With that isolated in a subprocess, we can continue to run past such # disasters and continue with the next URL. # # This program no longer uses the LWP::Simple modules. I've found a simpler # approach. But you'll have to download these modules, and possibly change # push to say where you put them: # $ENV{'PATH'} = ".:sh:$ENV{'PATH'}"; push @INC, '.', 'sh', split(':',$ENV{'PATH'}); use Backup; # File backup routine. use abcCode; # Calculates tune codes. use DT; # Date/Time routine. use Vopt; # Verbose output. # use HTTPcon; # Makes HTTP connection to server. # use URLdata; # Opens URL and returns file handle. use URLhref; # Combines URL + HREF -> new URL. use URLtrim; # Shrinks URLs. use HTMLdir; # HTML directory listing. use cfghost; # Routine to read host config files # # They'll have to be in your @INC path; by default we add $HOME/sh and # $HOME/pl to @INC, so those are good places to put them. # #ENVIRONMENT # We read the following from the environment: # # V_ABCbot= # If defined, this defines our "verbose" level and output file. The level # is a number (which defaults to 1 or 2, depending on shat I want at # the moment), the optional (which defaults to STDERR) is where the # output is written. Note that this variable's name consists of 'V_' plus # the program's name. If you call this program by some other name, you # should of course use 'V_' plus that name. # #INPUT # We always read from stdin, so if you don't want to provide any input, # you'll need to redirect our input to /dev/null. The input is scanned for # URLs, and they are added to our starting list (at depth 1). # # As a special aid in limiting searches, the input may contain lines of # these forms (with or without the colons): # done: http://foo.bar.com/xyz # ignore http://foo.bar.com/xyz # avoid: http://foo.bar.com/xyz # These are ways of telling ABCbot to ignore certain URLs. The "done" and # "ignore" commands give specific URLs that are to be avoided; this is # implemented by simply listing them as "already done". With the "avoid" # command, we extract the host name, and URLs for that host will not be # used. # #OUTPUT # #OPTIONS # Options start with '-' or '+' plus a letter, with possibly a parameter # (and no embedded spaces). Some of the options take an initial '+' to mean # "enable" and '-' to mean "disable". For others, the '-' or '+' is not # relevant. If '+' is shown in the list below,, then it is significant. # Capitalization of the option letters doesn't matter (but it may matter in # an argument string if there is one). # # - # where is an integer, means a timeout of seconds. The default is # currently: # $ABCtmout = $ENV{'ABCtmout'} || 60; # Was 600 # # -d # This restricts the depth of directory searches to . This is # mostly to avoid infinite loops. The default is 3. Experience has shown # that each depth level produces at least a factor of 10 increase in run # time, so you should be careful with this. It's much faster to have a # shallow depth and a long list of starting URLs. One recommendation: use # the previous output as input, so all the successes then will be # re-scanned (at depth 2) in the current run. # # +h # Allow URLs for . Default: All hosts allowed. If there is one or # more +h options, then only these hosts are allowed. # # -s # +s # Skip over URLs while searching. This has the effect of not making a # lot of requests in succession of a single server. It is implemented by # moving n-1 URLs to the end of the URL list before each attempt to fetch # a URL. # #LOCALHOST # The following host names are rewritten: # %hostsub = ( # 'ecf-guest.mit.edu' => 'localhost', # 'trillian.mit.edu' => 'localhost', # 'jc.tzo.net:1742' => 'localhost', # 'lochaber.tullochgorm.com' => 'localhost', ); # #EXAMPLES # #SIGNALS # There are various ways that this program may get hung up because of # misbehavior (or behavior that may be valid but I don't understand it) on # the part of web servers. You can "kick" this program by sending it these # signals: # # CONT # Abandon the current URL by closing the connection. # INT or HUP # Abandon the search and write the output files. # USR1 # Write a dump of the call stack to the verbose log. Useful for diagnosing # hangups. If this program doesn't do anything for more than about 2 # minutes, you might send it a USR1 signal, to see what it was trying to # do. And since this causes the "interrupted system call" error, it tends # to also get things moving again. # #MISC # Lines in a file starting with "%%noindex" tell ABCbot to ignore tunes. If # a "%%noindex" line is inside a tune, only that one tune will be ignored. # If a "%%noindex" line is found ouside a tune, it means to ignore the rest # of the file. [Added by JC 2007-10-08] # #BUGS # This program is highly experimental, in alpha state, and all that. Use it # at your own risk. (Not much risk, there, actually, but I thought I'd give # the usual friendly warnings.) Just don't write the output back over the # input, and check its output with a browser or two, and there shouldn't be # many problems. # # Of course, there are constant problems with spelling variations. Musicians # are atrocious spellers. This program doesn't even attempt to tackle this # issue. # #AUTHOR: # John Chambers http://trillian.mit.edu/~jc/ # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # # Assorted initializations: require "DT.pm"; $ENV{"V_$GetCmd"} = 1; ($P = $0) =~ s'.*/'' unless defined($P); # This program's name ($myhost = `hostname`) =~ s/\s+$//; # Our hostname &Vopt($ENV{"V_$P"} || '2'); # Verbose level $| = 1; # Auto-flush stdout $, = "\n"; # Is this still used? $" = "\n\t"; # How about this? $exitstat = 0; # Anyone can set this to nonzero # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # # Some global vars for controlling actions: $allowcgi = 0; # We usually don't look at cgi URLs $articles = '-'; # Don't include initial articles $chkuplinks = 1; # Check for "parent/home/back" lines $currhost = ''; # The host we're processing right now $followUpLink = 0; # Whether to follow links that contain "/../" #HDRkludge = 1; # Try to ignore HDR files #listabchosts = 1; # Collect list of hosts with abc files $purgebad = 1; # Drop URIs that get 404 (Not found) $saveunmatched = 0; # If true, unmatched chunks will be preserved $schedule = 0; # If >0, schedule a rerun after this many minutes $SCDkludge = 0; # Try to ignore SCD dance-form titles #urlskip = 1; # Set > 1 to scatter URLs # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # # Initialize the module to calculate tune encodings: # $abcCode = new abcCode; # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # # Counter default values: # $doctunes = 0; # Number of X: lines discovered in current doc $doclinks = 0; # Number of hyperlinks discovered in current doc $doctitls = 0; # Number of T: lines discovered in current doc $filemax = 0; # Is this used? $ignoretune = 0; # If true, ignore all tunes in this file $ignorefile = 0; # If true, ignore the current tune $linkcnt = 0; # Total links at this host $scancnt = 0; # Total scans of this host $tunecnt = 0; # Total tunes at this host $titlcnt = 0; # Total titles at this host # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # # These are passed to $GetCmd. There are a lot of problems with web servers # that require a specific HTTP version number. If we get nothing from a site # that should have ABC tunes, try setting the HTTPversion to '1.0'. $HTTPdelay = &env('HTTPdelay', 1); # Was 0 $HTTPtimeout = &env('HTTPtimeout', 60); # Was 600 $HTTPversion = &env('HTTPversion', '1.1'); print V "$P: HTTP delay $HTTPdelay.\n" if $V>1; print V "$P: HTTP timeout $HTTPtimeout.\n" if $V>1; print V "$P: HTTP version $HTTPversion.\n" if $V>1; $TOopen = $TOread = -1; # Timeout intervals # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # # Initialize the tune cache. $cachehdrs = 0; # Writing HTTP headers to cache? $cachetunes = &env('ABCcache' , 1); # Writing cache? #cachebase = '.'; # Where to put the cache $cachetmp = "cache$$.txt"; # Cache file while reading print V "$P: Caching tunes.\n" if $V>2 && $cachetunes; # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # # Recursion control: # $abcdepth = 4; # Depth limit for *.abc files $hstdepth = # Default depth limit for current host $maxdepth = 3; # Default depth limit for directories [jc 20061212] $maxurls = 0; # If >0, give up after this many URLs $urlcount = 0; # Number of URLs processed so far $depth = 1; # The current depth in directories # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # # $maxlines = 500; # Give up if no ABC in this many lines of text $maxscans = 1; # Max number of scans before giving up on host $showlinks = 0; # Include links in host files $smryfile = "Summary.txt"; # Where to write 1-line summary info # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # # commands for fetching web files: #ftpget = 'ftpcat +a'; # Default command to get a file via ftp $dfltget = "$GetCmd +TH"; # Default command to get a file via http print V "$P: dfltget=\"$dfltget\"\n" if $V>2; # Before this point should be only simple assignments of initial values. # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # # Here are our global arrays and tables: %BadHost = (); # Hosts which should not be accessed at all %BadPath = (); # Paths on all hosts to avoid %badpath = (); # Paths on this host to avoid @oldchunk = (); # lines of old hst/* file entry @newchunk = (); # lines of new hst/* file entry %Depth = (); # link depth for a URL %DepthHost= (); # link depth limit for a host %Done = (); # time that URL was scanned %h2d = (); # list of depths for the h2u URIs %h2n = (); # number of URLs for each host %h2u = (); # list of URIs for each host %inithost = (); # hostnames from the command line %outlink = (); # URLs that have already been seen in this file @tune = (); # lines of current ABC tune #URLts = (); # timestamp when a URL was last read # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # # We implement a special kludge to handle local URLs: ($thishost = `hostname`) =~ s/\s*$//; print V "$P: $thishost is our host name.\n" if $V>1; %local = ( "http://localhost/~jc/" => ($ENV{HOME} . "/public_html/"), ## "http://$thishost/~jc/" => ($ENV{HOME} . "/public_html/"), # "http://dmz.atsbank.com/~jc/" => ($ENV{HOME} . "/public_html/"), # "http://ecf-guest.mit.edu/~jc/" => ($ENV{HOME} . "/public_html/"), # "http://trillian.mit.edu/~jc/" => ($ENV{HOME} . "/public_html/"), ); @local = sort(keys(%local)); # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # # Here are the MIME types that we look for. Since most file formats have more # than one MIME type, we map the MIME type to a simple word. $notearchives = 1; # Whether to look for zip files, etc. %MIMEtype = ( # MIME -> type mapping 'application/gzip' => 'gzip', 'application/x-gzip' => 'gzip', 'application/x-zip-compressed' => 'zip', 'application/zip' => 'zip', 'multipart/gzip' => 'gzip', 'multipart/zip' => 'zip', ); # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # # Stuff dealing with time: $spm = 60; # Seconds per minute $mph = 60; # Minutes per hour $hpd = 24; # Hours per day $spd = $spm * $mph * $hpd; # Should be 86400 $mintime = ($V>4) ? 10 : 0; # Minimum wait time before rereading a URL $maxdays = ($V>4) ? 0 : 0; # Maximum wait time in days $maxtime = $maxdays * $spd; # Maximum wait time before purging entries $oScanY = $oScanM = 0; # For remembering year and month of scan # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # # Here's where we initialize our signal handling. $SIG{CONT} = 'sigCONT'; # CONT causes stack dump and abandons current URL $SIG{INT} = 'sigINT'; # INT causes stack dump and terminate $SIG{HUP} = 0; # HUP was 'sigINT' but is now ignored $SIG{USR1} = 'sigUSR1'; # USR1 causes stack dump and continue # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # # Scan the command-line arguments, processing them as we go. Input files are # # read and used to build tables. Any URLs discovered are accumulated in # # @URLs. Options are processed as read, so they will only affect things to # # their right, except for URLs, which we save for last. # # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # for $a (@ARGV) { print V "$P: Arg \"$a\"\n" if $V>2; if (($fl,$opt) = ($a =~ m'^([-+])(.*)'i)) { # - - - - # +art include articles # -art exclude articles # This option controls whether articles (the, a, an, le, la, etc.) # are to be stripped from the start of titles. The default is -art, # which does nothing. if ($opt =~ m'^art'i) { $articles = $fl; print V "$P: " . ($articles eq '-' ? 'Ignoring' : 'Including') . " articles." if $V>2; # - - - - # -C # Read a bot config file. This is a config file giving starting URLS, plus # allowed and disallowed URLs and hostnames. } elsif ($opt =~ m'^C(.*)'i) { print V "$P: URL file '$1'\n" if $V>2; &cfgbot($1); # - - - - # -D recursion depth limit. # The "depth" of a URL is how many hops it is from our input URL list. The # URLs in this list are at depth D:1; the URLs they link to are at depth # D:2, and so on. This depth is recorded in the hst/* files. The default # is -D3, which experience shows is a practical limit. Note: Files whose # names end with ".abc" are allowed to be one level deeper. } elsif ($opt =~ m'^D(\d*)$'i) { $maxdepth = $1; print V "$P: maxdepth='$maxdepth'\n" if $V>2; $abcdepth = $maxdepth + 1; # - - - - # +L show links from a URL. # This is primarily a debugging hook. If enabled, the +L options means that # all the hyperlinks will be listed after a URL, with a ">" flag to show # that they are hyperlinks. This increases the size of the hst/* files # significantly, so the default is -L. } elsif ($opt =~ m'^L$'i) { $showlinks = ($fl eq '-') ? 0 : $1; print V "$P: showlinks='$showlinks'\n" if $V>2; # - - - - # - # This is a timeout in seconds. If we can't get a URL in this time, we # abandon it and go on to the next. The default is currently 60 seconds. } elsif ($opt =~ s'^(\d+)$'') { $ABCtmout = $1; print V "$P: ABCtmout=$ABCtmout\n" if $V>2; # - - - - # +P purge bad URLs. } elsif ($opt =~ m'^P$'i) { $purgebad = ($fl eq '-') ? 0 : 1; print V "$P: purgebad='$purgebad'\n" if $V>2; # - - - - # -S schedule next host after minutes. # When we finish, we will schedule another instance of this program for the # next host after minutes. The next host is the one in the hst/* # directory that is lexically next; if there is none, we pick the first # host. The default if is missing is to call a routine to try to # determine the interval. } elsif ($opt =~ m'^S(\d*)$'i) { $schedule = $1 || &getschedule(); print V "$P: schedule='$schedule'\n" if $V>2; $abcdepth = $schedule + 1; # - - - - # -T # +T # -T # This sets the timeout for opening URLs to seconds. The default is 30 # seconds. Special cases: -T means -T10 and +T means +T60. If is given # you may use '-' or '+' interchangeably. } elsif ($opt =~ m'^T(\d*)$'i) { $HTTPtimeout = $1 || (($fl eq '-') ? 10 : 60); print V "$P: HTTPtimeout='$HTTPtimeout'\n" if $V>2; # - - - - # -U # This sets the max number of URLs that we attempt to process. This is a # debug hook only. We abandon our task, clean up, and exit when $urlcount # passes this number. If $maxurls is zero, it means no limit. } elsif ($opt =~ m'^U(\d*)$'i) { $maxurls = (length($1)>0) ? int($1) : 10; print V "$P: maxurls=$maxurls.\n" if $V>2; # - - - - # -V # This is the HTTP version number. The default is 1.0, but some web servers # are picky about this and require 1.1. We try to discover this by looking # at the version returned in HTTP messages, but it's faster if you can # specify it on the command line. We can also get this from the cfg/$host # file. } elsif ($opt =~ m'^V([0-9.]*)$'i) { $HTTPversion = $1 ? $1 : '1.1'; print V "$P: HTTPversion='$HTTPversion' from cmdline option.\n" if $V>2; # - - - - # -W min wait: wait at least seconds before rereading a URL. # +W max wait: reread a URL after seconds. # These options control when we are allowed to reread a URL. -W is the # minimum time; i.e., we shouldn't reread a URL until this much time has # passed since we last read it. +W gives the time after which a URL is # considered obsolete and should be read again. If is null, we reread # everything. } elsif ($opt =~ m'^W(\d*)$'i) { if ($1 eq '') { $mintime = $maxtime = $maxdays = 0; print V "$P: Rereading everything.\n" if $V>2; } elsif ($fl eq '-') { $mintime = $1; print V "$P: Min timeout is $mintime sec.\n" if $V>2; } else { $maxtime = ($maxdays = $1) * $spd; print V "$P: Max timeout is $maxtime sec.\n" if $V>2; } print V "$P: mintime=$mintime maxtime=$maxtime (maxdays=$maxdays)\n" if $V>2; # - - - - # The notation '2; } # - - - - # None of these patters matched the option string. } else { print V "$P: Option \"$fl$opt\" not understood.\n" if $V>0; } # - - - - # Args that don't start with '-' or '+' are treated as host names. Here, we # just accumulate them for later processing. } else { $a =~ s"^.*/+""i; # Strip of any directories $a =~ s"-+$""; # Trim "backup" host names $currhost = lc($a); # Remember lower-case host name ++$inithost{$currhost}; # Note that we'll scan this host } } # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # my %URLcode = ( "\t" => '%09', # HT Horizontal Tab "\n" => '%0A', # LF Line Feed, newline "\r" => '%0D', # CR Carriage Return " " => '%20', # SPACE '"' => '%22', # DOUBLE QUOTE "'" => '%27', # APOSTROPHE "%" => '%25', # PERCENT "&" => '%26', # AMPERSAND "+" => '%2B', # PLUS SIGN "<" => '%3C', # LESS THAN "=" => '%3D', # EQUAL SIGN ">" => '%3E', # GREATER THAN # "?" => '%3F', # QUESTION MARK ); print V "$P: abcdepth=$abcdepth.\n" if $V>0; print V "$P: hstdepth=$hstdepth.\n" if $V>0; print V "$P: maxdepth=$maxdepth.\n" if $V>0; print V "$P: articles=$articles.\n" if $V>0; print V "$P: maxdepth=$maxdepth.\n" if $V>0; print V "$P: maxdays=$maxdays.\n" if $V>0; print V "$P: mintime=$mintime.\n" if $V>0; print V "$P: maxtime=$maxtime.\n" if $V>0; print V "$P: maxurls=$maxurls.\n" if $V>0; print V "$P: purgebad=$purgebad.\n" if $V>0; # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # # Now all the hosts in our list should have their own files in hst/$host and # # these files should be filled with the URLs for the host. Next, we expect a # # list of host names on the command line. We run through these hosts and run # # thru each one's hst/$host file, and process each URL we find there. # # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # if (%inithost) { print V "$P: HOST processing ...\n" if $V>2; %h2d = %h2n = %h2u = (); # Forget about input list of URLs hostfile: for $h (sort keys %inithost) { print V "$P: HOST \"$h\" ...\n" if $V>2; if ($OLDopen) {close OLD; $OLDopen = 0} if ($HSTopen) {&CloseHST($HostT0{$h},time)} last if ($closeDoc || $finishup); if ($BadHost{$h}) { print V "$P: Host \"$h\" is in BadHost list.\n" if $V>2; next hostfile; } $hstdepth = $maxdepth; # Default depth limit &host($h); print V `date` . "Host \"$h\" done.\n\n" if $V>2; } print V `date` . " All hosts done.\n\n" if $V>2; } elsif (%h2n) { print V "$P: There are no hosts to process, but we have a URL list.\n" if $V>2; &saveURLs; # Save the info from the initial URLs %h2d = %h2n = %h2u = (); # Forget about this set of URLs } else { print V "$P: There are no hosts to process and no URL list.\n" if $V>2; } # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # if (%h2n) { print V "$P: There are more hosts and URLs to remember.\n" if $V>2; &saveURLs; # Save the info from the initial URLs } # Should we schedule a new run of this robot? if ($schedule > 0) { print V "$P: $esep\n" if $V>2; print V "$P: Schedule another run after $schedule minutes ...\n" if $V>2; @hosts = grep(!/(-|\.LCK)\s*$/,glob("hst/*")); $hosts = int(@hosts); print V "$P: We have $hosts hosts.\n"; # if $V>2; $nexthost = ''; host: foreach $host (@hosts) { print V "$P: host file \"$host\"\n" if $V>4; $host =~ s"^hst/([-.\w]+)\s*$"$1"; if ($x = $hostsub{$host}) {$host = $x} print V "$P: host \"$host\"\n" if $V>2; if ($host gt $currhost) { print V "$P: host \"$host\" > \"$currhost\"\n" if $V>3; $nexthost = $host; last host; } } $nexthost = $host[0] if !$nexthost; if ($nexthost) { print V "$P: Next host is \"$host\".\n" if $V>2; $log = "log/$host"; $ENV{"V_ABCbot"} = "$V$log"; $atcmd = "at now + $schedule $P +S $host '3; if (system $atcmd) { $exitstat = $!; print V "$P: FAILED \"$atcmd\" ($!)\n" if $V>2; print V "$P: Exit status was $?.\n" if $V>2; } } } if (-f $cachetmp) {unlink $cachetmp} print V "$P: Exit with status $exitstat.", `date` if $V>0; exit $exitstat; # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = # # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = # ## sub abc2html { ## # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # ## # Convert the abc escape sequences to HTML. ## # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # ## local($s) = @_; ## $s =~ s#\\(o)#\&${1}slash;#ig; ## $s =~ s#\\a(a)#\&${1}ring;#ig; ## $s =~ s#\\"(\w)#\&${1}uml;#ig; ## $s =~ s#\\'(\w)#\&${1}acute;#ig; ## $s =~ s#\\`(\w)#\&${1}grave;#ig; ## $s =~ s#\\,(\w)#\&${1}cedille;#ig; ## $s =~ s#\\~(\w)#\&${1}tilde;#ig; ## $s; ## } sub BASE { # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # # Save the contents of a tag. We may need this when we try to use # # a relative URL later on in the file. We save it in the global $base var. # # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # local($atts) = @_; local($att,$val,$lup); $atts .= ' '; print V "BASE: atts=\"$atts\"\n" if $V>5; while ($atts =~ s/\s*^(\w+)="*([^"\s]+)["\s]+//) { $att = uc($1); $val = $2; print V "BASE: att=\"$att\" val=\"$val\"\n" if $V>4; print V "BASE: atts=\"$atts\"\n" if $V>5; if (uc($att) eq 'HREF') { $base = $val; print V "BASE: base=\"$base\"\n" if $V>2; return $base; # This is all we want } if (++$lup > 5) { print V "BASE: Looped $lup times; giving up.\n" if $V>2; return undef; } } print V "BASE: Quit with atts=\"$atts\"\n" if $atts && $V>2; return undef; } sub CheckEnd { # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # # Terminate the checking. We first write the log message $m if the verbose # # level is greater than $v. We then kill the check process and close the pipe # # from its stdout. The value of $r is the caller's intended return value, # # which we may print (or not). # # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # local($r,$v,$m) = @_; print V "$m\n" if $V>$v; kill 9, $chkpid; close CHK; return $r; } sub CheckHost {my $F='CheckHost'; # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # # Do miscellaneous validations on a host. The return value is the number of # # problems found. 0 means there are no objections to the host; a nonzero # # return means there is some problem and we should skip this host. # # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # local($h) = @_; local($chk,$chktim,$n,$t); local($httplvl,$errcode,$errmsg); @robotstxt = (); # Contents of robots.txt file, if any $hstmsg = ''; if ($n = $BadHost{$h}) { $hstmsg = "Host \"$h\" in blacklist"; print V "$P: $hstmsg.\n" if $V>0; return 1; } $GetCmd = $Getcmd{"C:$h"} || "$dfltget -V$HTTPversion"; $chk = "$GetCmd -T$ABCtmout 'http://$h/robots.txt'"; print V "$F \"$chk\"\n" if $V>2; $w3timedout = 0; # Global set true to flag timeout $chktim = time; # When we started this check if ($HTTPtimeout > 0) { # Timeout in effect? $savsig = $SIG{ALRM}; # Save old alarm routine $HTTPcontime = $chktim; # Note time we did open $SIG{ALRM} = 'W3tmout'; # Establish alarm routine alarm $HTTPtimeout; # Set alarm &dt(); print V "$F Set alarm after $HTTPtimeout sec at $now.\n" if $V>3; $CHKopen = 1; # Triggers close on timeout } unless ($chkpid = open(CHK,"$chk |")) { $hstmsg = "\"$chk\" failed ($!)"; print V "$F $hstmsg\n" if $V>0; $CHKopen = 0; return 2; } print V "$F Process $chkpid \"$chk\" running.\n" if $V>4; print V "$F Read 'http://$h/robots.txt'\n" if $V>1; $URLhdr = 1; # The +H option produces headers $CHKopen = 1; # Triggers close on timeout line: while ($line = ) { if ($w3timedout) { &dt(); $hstmsg = "#### TIMEOUT in for loop at $now ####"; print V "$F $hstmsg\n" if $V>0; $t = ($now = time) - $chktim; print V "$F chktim=$chktim now=$now t=$t\n" if $V>2; return &CheckEnd(3,1,"$F Can't connect to \"$h\" in $t sec. (return 3)"); } $line =~ s/[\r\s]+$//;; print V "$F +++ \"$line\"\n" if $V>6; if ($URLhdr) { print V "$F HDR \"$line\"\n" if $V>2; $t = ($now = time) - $chktim; if (!$line) { print V "$F End of headers.\n" if $V>2; $URLhdr = 0; } if ($line =~ m"^") { print V "$F chktim=$chktim now=$now t=$t\n" if $V>2; return &CheckEnd(4,1,"$F Can't connect to \"$h\" in $t sec. (return 4)"); } if ($line =~ m"^ ") { return &CheckEnd(4,1,"$F Can't connect to \"$h\" in $t sec. (return 4)"); } if ($line =~ m"^") { return &CheckEnd(4,1,"$F Can't connect to \"$h\" in $1 sec. (return 4)"); } if ((($httplvl,$errcode,$errmsg) = ($line =~ m"^HTTP/([0-9.]+)\s+ERR\s+(\d+)\s+(.*)$"i)) || (($httplvl,$errcode,$errmsg) = ($line =~ m"^HTTP/([0-9.]+)\s+(\d+)\s+(.*)$"i)) ) { print V "$F httplvl=$httplvl errcode=$errcode errmsg=\"$errmsg\"\n" if $V>2; if ($errcode >= 400) { $hstmsg = "No robots.txt file found"; print V "$F $hstmsg\n" if $V>2; return &CheckEnd(0,1,"$F HTTP/$1 ERR $2 ($3)"); } print V "$F robots.txt file found.\n" if $V>0; print V "$F HTTP/$1 ERR $2 ($3) -- accepted.\n" if $V>2; next line; } print V "$F Not an ERR line.\n" if $V>6; if ($line =~ m"^HTTP/([0-9.]+)\s") { # unless ($Getcmd{"C:$h"}) { # $HTTPversion = $1; # print V "$F HTTP version $HTTPversion (from HTTP header)\n" if $V>2; # } $getcmd = $Getcmd{"C:$h"} || "$dfltget -V$HTTPversion"; } elsif ($line =~ m"^Server:\s*(.*)") { $Server{$h} = $1; print V "$F Server for \"$h\" is \"$Server{$h}\"\n" if $V>4; } } else { print V "$F TXT \"$line\"\n" if $V>2; push @robotstxt, $line; if ($line =~ /^User-agent:\s*(.*)$/i) { $agentpat = $1; $agentmatch = 0; if (($agentpat eq '*') || ($agentpat =~ /ABCbot/i)) { print V "$F User-agent \"$agentpat\" matches us.\n" if $V>0; $agentmatch = 1; } } elsif ($line =~ /^(Avoid|Disallow):\s*(.*)\s*$/i) { if ($1) { $Disallow{$2}++; $hstmsg = "$1 \"$2\""; print V "$F $hstmsg\n" if $V>0; } } } } print V "$F Done with \"http://$h/robots.txt\"\n" if $V>2; if ($w3timedout) { $t = time - $chktim; $hstmsg = "#### TIMEOUT (at for loop after $t sec) ####"; print V "$F $hstmsg\n" if $V>0; return &CheckEnd(6,1,"$F Can't connect to \"$h\" in $t sec. (return 6)"); } return &CheckEnd(0,0,"$F OK"); } sub cfgbot {my $F='cfgbot'; # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # # Read in a config file for this program. It should contain a list of URLs # # that need special treatment. # # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # local($cfgfile) = @_; local($prot,$host,$rest); print V "$F Read CFG file '$cfgfile' ...\n" if $V>2; if (open(CFG,$cfgfile)) { print V "$F Read CFG file '$cfgfile'\n" if $V>2; } else { print V "$F Can't read '$cfgfile' ($!)\n" if $V>2; return 0; } print V "$F myhost \"$myhost\"\n" if $V>2; while ($line = ) { # Input contains config directives $line =~ s/[\r\n\s]*$/ /; # We want exactly one space at end of line print V "$F line \"$line\"\n" if $V>6; if ($line =~ /^\s*#/) { # Ignore comments } elsif ($line =~ /^\s*$/) { # Ignore blank lines } elsif ($line =~ s"^\s*((http|ftp)://\S+)\s"$1"i) { print V "$F URL: $1\n" if $V>2; &URL($1,1); # unless %inithost; # Add it as a level-1 URL to be examined } elsif ($line =~ s"^(scan|search):*\s*(\S+)\s"$2"i) { print V "$F SCAN $2\n" if $V>2; &URL($2,1); # unless %inithost; # Add it as a level-1 URL to be examined } elsif ($line =~ s"^(dead|done|gone|ignore):*\s*(\S+)/*\s"$2"i) { $Depth{$2} = 1; # Mark this one as "already done" $Done{$2} = $now; # Use current time as when we did it print V "$F DONE \"$2\" at $now.\n" if $V>0; } elsif ($line =~ s"^avoid:*\s*(\S+)\s*"$1"i) { # Hosts or URLs to avoid print V "$F Avoid \"$line\"\n" if $V>2; if (($prot,$host,$rest) = ($line =~ m"(HTTPS*)://([-_:.\w]+)(.*)$"i)) { print V "$F Avoid prot='$prot' host='$host' rest='$rest'\n" if $V>2; if ($rest eq '') { # http://host $BadHost{$host}++; # It's just a host "to be avoided" print V "$F Avoid host: \"$host\"\n" if $V>2; } elsif ($rest eq '/') { # http://host/ $BadHost{$host}++; # It's just a host "to be avoided" print V "$F Avoid host/ \"$host\"\n" if $V>2; } elsif ($host ne '') { # http://host/path $BadPath{"$host$rest"}++; # Host + path "to be avoided" print V "$F AVOID PATH: '$host/$rest'\n" if $V>2; } elsif ($host eq $currhost) { $Disallow{"$host$rest"}++; # Host + path "to be avoided" print V "$F Disallow: \"$host$rest\"\n" if $V>2; } } elsif ($line =~ /^([-_:.\w]+)\s*$/) { $BadHost{$1} ++; # Mark this host as "to be avoided" print V "$F AVOID HOST: \"$1\"\n" if $V>2; } else { print V "$F Avoid \"$line\" IGNORED (can't parse).\n" if $V>0; } } else { # Otherwise it's a comment print V "$F \"$line\" IGNORED (can't parse).\n" if $V>0; } } close CFG; return 1; } sub CloseHST { # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # # Finish writing all files related to the current host and close the files. # # The params are the start and finish timestamps for this host. # # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # local($t0,$t1) = @_; local($hr,$mn,$sc,$tm); local($ss,$mm,$hh,$DD,$MM,$CY); local($gcmd,$srvr); ++$scancnt; print HST "\n$now T X:$tunecnt T:$titlcnt F:$filecnt H:$currhost\n"; unless ($tunemax>0 && $titlmax>0) { print V "$P: No ABC ever found at host $currhost in pass $scancnt.\n" if $V>0; print HST "\n$now # No ABC found at $currhost in pass $scancnt\n"; if ($scancnt > $maxscans) { $pfile = "nul/$currhost"; print V "$P: Move \"$hfile\" to \"$pfile\"\n" if $V>2; &Backup($pfile) if -f $pfile; unless (rename($hfile,$pfile)) { print V "$P: Can't rename \"$hfile\" to \"$pfile\" ($!)\n" if $V>0; } } } $tm = $t1 - $t0; $sc = $tm % 60; $mn = int($tm/60) % 60; $hr = int($tm/3600); printf HST "\n$now Scanned $currhost at $cymdhms in $tm sec (%d:%02d:%02d)\n",$hr,$mn,$sc; close HST; # if (open(LOG,">>$smryfile")) { local($ss,$mm,$hh,$DD,$MM,$CY) = gmtime($now = time); ++$MM; $CY += 1900; $summary = sprintf("%04d%02d%02d %02d:%02d %8d sec (%2d:%02d:%02d) %6d files %6d tunes %6d titles at $currhost\n" ,$CY,$MM,$DD,$hh,$mm ,$tm,$hr,$mn,$sc ,$filecnt,$tunecnt,$tunecnt); printf LOG $summary; close LOG; } # $HSTopen = 0; $gcmd = $Getcmd{"C:$currhost"}; $srvr = $Server{$currhost}; print V "$P: Command for \"$currhost\" is \"$gcmd\"\n" if $V>2; print V "$P: Server for \"$currhost\" is \"$srvr\"\n" if $V>2; print V "$P: Depth for \"$currhost\" is \"$hstdepth\"\n" if $hstdepth != $maxdepth && $V>2; if (($tunecnt > 0) && ($titlcnt > 0) && ($gcmd || $srvr || $hstdepth != $maxdepth)) { print V "$P: Rewriting cfg/$currhost\n" if $V>2; if (open(CFG,">cfg/$currhost")) { &dt(); for $line (@cfg) { if ($gcmd && ($line =~ /^\d+\s|C:/)) { print CFG "$now C:$gcmd\n"; $gcmd = ''; # Rewrite get-command line } elsif ($gcmd && ($line =~ /^\d+\s|D:/)) { # print CFG "$now D:$hstdepth\n"; $hstdepth = ''; # Rewrite h-depth line # We now create this below and discard the old depth } elsif ($srvr && ($line =~ /^\d+\s|S:/)) { print CFG "$now S:$srvr\n"; $srvr = ''; # Rewrite server-id line } else { print CFG "$line\n"; } } print CFG "$now C:$gcmd\n" if $gcmd; # Old file didn't have get command print CFG "$now S:$srvr\n" if $srvr; # Old file didn't have server id print CFG "$now L:$maxlines\n" if $maxlines != 100; # Nonstandard maxlines value print CFG "$now D:$hstdepth\n" if $hstdepth != $maxdepth; # Nonstandard maxdepth value print CFG "$now O:CGI\n" if $allowcgi; # CGI enabled for this site close CFG; # Host has new config file @cfg = (); # Discard the host's config info } } } sub Line {my $F='Line'; # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # # Return one line of input from the current document. Here is where we try to # # deal with incoming HTML, by splitting it into lines on any strings of \r or # # \n, and stripping out tags. Note that our return value has the final # # newline, but other trailing whitespace is gone. # # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # local($line,@lines); if (!$DOCline) { $DOCline = ; # Next chunk of input print V "$F \"$DOCline\"\n" if $V>5; return '' if !$DOCline; $DOCline =~ s/[ \t]*(\r\n|\r)/\n/g; # DOS/Mac kludge print V "=DOC= \"$DOCline\"\n" if $V>6; $HTTPreadtime = time; # Note when the last HTTP read occurred } if ($DOCline =~ s/^(.*)\n//) { # ASCII line end $line = $1; } elsif ($DOCline =~ s/^(.*)\r//) { # MacOS line end $line = $1; } else { $line = $DOCline; # Unterminated line! $DOCline = ''; } print V "$F \"$line\"\n" if $V>4; if ($cachetune) { # Writing to cache? if ($URLhdr) { # Within the HTTP headers? if ($cachehdrs) { # Include HTTP headers in cached files? if ($line =~ /^(Last-Modified|Content-(Length|Type)):/i) { print CACHE "%%$line\n"; } } } else { print CACHE "$line\n"; } } print V "$F inHTML=$inHTML URLhdr=$URLhdr\n" if $V>4; if ($inHTML && !$URLhdr) { # Inside an HTML document $line =~ s"^\s+""s; $line =~ s"^]*>\s*""sig; # Initial
tags ignored $line =~ s"\s*]*>\s*"\n"sig; # Replace other
tags with newlines $line =~ s"\s*]*>\s*"\n\n"sig; # Replace

tags with double newlines $line =~ s"\s*\s*"\n\n"sig; # Replace

 tags with double newlines
		$line =~ s"\<"<"sg;		# Reduce HTML entities
		$line =~ s"\>">"sg;
		$line =~ s"\&"\&"sg;
		if ((@lines = split("\n",$line)) && (int(@lines) > 1)) {	# Is it now multiple lines?
			$line = shift(@lines);
			$DOCline = join("\n",@lines) . "\n" . $DOCline;
		}
		if ($line =~ /<[^>]*$/) {	# Unclosed tag at end of line?
			$line .= ' ' . &Line();	# Append the next line.
		}
	}
	print V "LINE:\"$line\"\n" if $V>2;
	$line =~ s/[\r\s]*$/\n/;		# Trailing white stuff already stripped off
	return "$line\n";
}

sub LoadLinks {my $F='LoadLinks';
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	my $file;
	my($l,$ts,$fl,$dpth,$urlpath);
	for $file (@_) {
		print V "$F	Load \"$file\"\n" if $V>4;
		if (open(LINKFILE,$file)) {
			while ($l = ) {
				$l =~ s/[\r\s]*$/ /;	# Strip white stuff
				next if $l eq ' ';		# Ignore blank lines
				print V "$F	LINKFILE line: $l" if $V>3;
				if ($l =~ /^\s*#/) {	# Comment
					print V "$F	Drop \"$l" if $V>6;
				} elsif ((($ts,$fl,$dpth,$urlpath) = ($l =~ m'^(\d+)\s+([-#\w]) D:(\d+) *(.*) $'))
				||   (($ts,$dt,$fl,$dpth,$urlpath) = ($l =~ m'^(\d+)=(\d+) ([-#\w]) D:(\d+) *(.*) $'))) {
					print V "$F	LINKFILE fl='$fl' D:'$dpth' \"$urlpath\"\n" if $V>3;
					print V "$F	urlpath=\"$urlpath\"\n" if $V>3;
					&NewU($urlpath,1,$ts);	# Was $dpth, now all hosts start at depth 1
				} else {
					print V "$F	BAD \"$l\"" if $V>2;
				}
			}
			close LINKFILE;
			print V "$F	Done \"$file\"\n" if $V>4;
		} else {
			print V "$F	Can't read \"$file\" ($!)\n" if $V>0;
		}
	}
}

sub Max {
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# This is a dumb loop that finds the max numeric value in a list. Why doesn't #
# perl have this as a builtin?  It has nearly everything else.                #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	local($n) = shift;
	for (@_) {
		$n = $_ if $_ > $n;
		shift;
	}
	return $n;
}

sub NewU {my $F='NewU';
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Add a URI to our list of added/unprocessed URIs.  We keep track of the  min #
# depth,  and  at  the  end, any URIs in the @Left list will be scanned.  The #
# timestamp is currently included, but not actually used,  since  we  fake  a #
# timestamp of zero to force a scan of all new URIs.                          #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	local($urlpath,$dpth,$ts) = @_;
	local($d);
	print V "$F	Add D:$dpth U:$urlpath\n" if $V>3;
	if ($Done{$urlpath}) {
		print V "$F \"$urlpath\" already marked done at depth $Depth{$urlpath}.\n" if $V>2;
		return;
	}
	if (defined($d = $Depth{$urlpath})) {
		print V "$F	Dup Depth{$urlpath}='$Depth{$urlpath}' is '$d'\n" if $V>5;
		if ($Depth{$urlpath} > $dpth) {
			$Depth{$urlpath} = $dpth;	# Note minimum depth for URI
			print V "$F	Depth{$urlpath}=$dpth\n" if $V>3;
		}
	} else {	# New URI; set its depth
		$Depth{$urlpath} = $dpth;	# Note its depth
		print V "newU: Depth{$urlpath}=$dpth\n" if $V>2;
		push @Left, $urlpath;		# List of unprocessed URIs
		print V "$F	New Depth{$urlpath}='$Depth{$urlpath}'\n" if $V>3;
	}
}


sub rejectURL {my $F='rejectURL';
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Test a URL for acceptability.  We look it up in whatever tables of rejected #
# URLs  we're  using  at  the  moment.   The  current tables are %badpath and #
# %Disallow, and maybe we'll add some more soon.                              #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	local($url) = @_;
	local($lx,$pfx);
	print V "$F	? Is URL $url allowed at $currhost?\n" if $V>3;
	if (%badpath) {
		for $pfx (sort keys %badpath) {		# Run thru forbidden paths
			$lx = length($pfx);		# Length of one bad prefix
			print V "$F	Bad path '$pfx' vs '$url'?\n" if $V>2;
			if ($pfx eq substr($url,0,$lx)) {
				print V "$F	\"$url\" ignored (bad path \"$pfx\")\n" if $V>2;
				return "Bad path";
			}
		}
	}
	if (%Disallow) {
		for $pfx (sort keys %Disallow) {	# Run thru disallowed paths
			$lx = length($pfx);	# Length of one disallowed prefix
			print V "$F	? Disallow $lx '$pfx' vs '$url'?\n" if $V>2;
			if ($pfx eq substr($url,0,$lx)) {
				print V "$F	\"$url\" ignored (disallowed \"$pfx\")\n" if $V>2;
				return "Disallowed";
			}
		}
	}
	return "";	# No reason means accepted
}

sub URL {my $F=':URL:';
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Register a new URL for later scanning.  We can do some weeding out here  if #
# we  so desire.  We return 0 if we reject the URL; 1 if we accept it, though #
# callers don't yet use this info. We implement a special ABC kludge here: If #
# the URL ends with .abc, we accept it even if it's beyond the maximum depth. #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	local($urlfull,$depth) = @_;
	local($done,$h,$hh,$hst,$hstpath,$lx,$n,$p,$u,$urlpath,$pfx);
	print V "$F	\"$urlfull\" d=$depth.\n" if $V>2;
	if ($Done{$urlfull}) {
		print V "$F	\"$urlfull\" already marked done at depth $Depth{$urlfull}.\n" if $V>2;
		return 0;
	}
	if (!$followUpLink && ($urlfull =~ /\/\.\.\//)) {
		print V "$F	\"$urlfull\" ignored (/../)\n" if $V>3;
		return 0;
	}
	if ($depth > $hstdepth) {
		if ($urlfull =~ /\.abc$/i) {		# ABC files are special
			if ($depth > $abcdepth) {	# They may be one level deeper
				print V "$F	\"$urlfull\" ignored (depth $depth > $abcdepth && .abc file)\n" if $V>3;
				return 0;
			}
			print V "$F	\"$urlfull\" accepted (depth $depth && .abc file)\n" if $V>2;
		} elsif ($depth >= $hstdepth) {	# Reject non-ABC files at depth limit
			print V "$F	\"$urlfull\" ignored (depth $depth >= $hstdepth)\n" if $V>3;
			return 0;
		}
	}
	print V ">->-> \"$urlfull\" [$depth]\n" if ($depth>$hstdepth && $V>2);
	$urlfull = &URLtrim($urlfull);				# Shorten the URL if possible
	print V "----> $depth '$urlfull'\n" if $V>2;
	if ($urlfull !~ m'^(http|file):'i) {	# Accept only these protocols
		print V "$F	\"$urlfull\" ignored (http|file rule)\n" if $V>2;
		return undef;
	}
	if ($urlfull =~ m'\b(bin|tmp)/'i) {		# Don't try to fetch from bin or tmp directories
		print V "$F	\"$urlfull\" ignored (bin|tmp rule)\n" if $V>2;
		return undef;
	}
	unless ($allowcgi) {
		if ($urlfull =~ m'\bcgi\b'i) {		# Ignore cgi scripts
			print V "$F	\"$urlfull\" ignored (cgi rule)\n" if $V>2;
			return undef;
		}
		if ($urlfull =~ m/[<>\?;#"]/) {		# Ignore URLs that look like HTML or CGI calls
			print V "$F	\"$urlfull\" ignored (HTML/CGI rule)\n" if $V>2;
			return undef;
		}
	}
	if ($urlfull =~ m'/\?\w=\w$') {			# Ignore apache listing URLs
		print V "$F	\"$urlfull\" ignored (/?X=Y\$ rule)\n" if $V>2;
		return undef;
	}
	if ($urlfull =~ m'jc/.*\.hdr$'i) {		# Ignore jc's HDR files
		print V "$F	\"$urlfull\" ignored (hdr rule)\n" if $V>2;
		return undef;
	}
	print V "$F	urlfull=\"$urlfull\"\n" if $V>3;
	if (($p,$h,$urlpath) = ($urlfull =~ m'^(http|ftp)://([^/]+)(.*)$'i)) {	# FTP doesn't work yet
		print V "$F	urlpath=\"$urlpath\"\n" if $V>2;
		unless ($h =~ /^[-_:.\w]*$/) {
			print V "$0: Bogus host \"$h\" ignored.\n" if $V>2;
			return undef;
		}
		$hst = lc($h);
		$hstpath = "$hst$urlpath";
		print V "$F	hstpath=\"$hstpath\"\n" if $V>2;
		if ($depth <= 1) {
			$hh = "http://$hstpath";
			$AllowURL{$hh}++;	# Note that this one is explicitly allowed
			print V "$F	Allow \"$hh\"\n" if $V>4;
		}
#		if (%BadHost) {
#			if ($BadHost{$hst}) {
#				print V "$F	\"$urlfull\" ignored (BadHost \"$hst\")\n" if $V>0;
#				return undef;
#			}
#			if ($BadHost{$h}) {
#				print V "$F	\"$urlfull\" ignored (bad host \"$h\")\n" if $V>0;
#				return undef;
#			}
#		}
		if ($why = rejectURL($hstpath)) {
			print V "$F	'$hstpath' rejected [$why]\n" if $V>1;
			return undef;
		}
#		if ($urlfull =~ m"/mackay/") {	# Bad dir at www.inference.phy.cam.ac.uk
#			print V "$F	### Accepted /mackay/ ###\n";
#			exit -1;
#		}
		print V "$F	h='$h' hst='$hst' currhost='$currhost'\n" if $V>2;
		if ($hst eq $currhost) {
			print V "$F	URI \"$urlpath\" at depth $depth is local.\n" if $V>2;
			&NewU($urlpath,$depth,($now = time));
			return 1;
		} else {
			print V "$F	URI \"$urlpath\" at depth $depth is non-local.\n" if $V>2;
		}
		if ($h =~ m'(__|\.\.)') {
			print V "$F	\"$urlfull\" ignored (host contains \"$1\")\n" if $V>2;
			return undef;
		}
		$n = $h2n{$hst} || 0;
		$h2u{$hst}->[$n] = $urlpath;	# Note URI, not the full URL
		$h2d{$hst}->[$n] = $depth;	# Note each URL's depth
		$h2n{$hst} ++;				# Count the URLs for each host
		$doclinks ++;				# Count the links from the current document
		print V "$F	host $hst URL $h2n{$hst} is \"$urlfull\"\n" if $V>3;
		if ($showlinks) {
			unless ($outlink{"$currURL:$urlfull"}) {	# Don't put out repeats
				&dt();
				push @newchunk, ("$now > D:$depth " . (($hst eq $currhost) ? $urlpath : $urlfull));
#				if ($hst eq $currhost) {
#					push @newchunk, "$now > D:$depth $urlpath";
#				} else {
#					push @newchunk, "$now > D:$depth $urlfull";
#				}
			}
			++ $outlink{"$currURL:$urlfull"};	# Number of times we've encountered this URL
			print V "----> $depth '$urlfull'\n" if $V>1;
		}
	}
	return 1;
}

sub URLenc {my $F='URLenc';
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# do the HTTP encoding to convert URL special chars to %XX.
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	my($c,$d,$l,$v);
	for (@_) {
		$l = $_;	# Copy the arg.
		print "$F	+++ \"$l\"
\n" if $V>4; $l =~ s/([\t\n\r "'%&+<=>])/$URLcode{$1}/eg; $v .= $l; } return $v; } sub W3tmout {my $F='W3tmout'; # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # # Here's an alarm handler for reads from DOC. When a timeout happens, we # # close the DOC file and return, which should cause abandonment of the # # current document. # # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # &dt(); $TOopen = $now - $TMopen; # Time since we opened the current file $TOread = $now - $TMread; # Time since we last read from the file print "$F: Called with TOopen=$TOopen TOread=$TOread sec [HTTPtimeout=$HTTPtimeout]\n" if $V>1; if ($TOread < $HTTPtimeout) { print "$F: Timeout ignored: only $TOread sec since last read [HTTPtimeout=$HTTPtimeout]\n" if $V>1; } else { print "$F: TIMEOUT after $TOopen/$TOread sec [HTTPtimeout=$HTTPtimeout]\n" if $V>1; $endDoc = 1; $w3timedout = 1; } } sub canon { # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # # Canonicalize a name. We upper-case everything, and strip out all funny # # chars. If $articles is enabled, we look for articles initially and after a # # comma, and delete them. # # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # local($name) = @_; local($lcs,$ucs); ($ucs = $name) =~ s/[^A-Z]+//g; # Extract upper-case letters ($lcs = $name) =~ s/[^a-z]+//g; # Extract lower-case letters print V "canon: name=\"$name\" lcs=\"$lcs\" ucs=\"$ucs\"\n" if $V>6; if ($ucs && $lcs) { # Both cases used # $name =~ s/^[^A-Z]+//; # Strip stuff before first upper-case letter print V "canon: Mixed-case \"$name\"\n" if $V>6; } elsif (!$ucs && $lcs) { # All lower case print V "canon: Lower-case \"$name\" curious\n" if $V>6; } elsif ($ucs && !$lcs) { # All upper case print V "canon: Upper-case \"$name\" suspect.\n" if $V>6; } else { # No letters at all print V "canon: Name \"$name\" with no letters rejected.\n" if $V>6; return ''; } if ($articles eq '-') { $name =~ s/^the\s+//i; $name =~ s/^an?\s+//i; $name =~ s/^l[ae]?s?\s+//i; $name =~ s/,s*the\s+//i; $name =~ s/,s*an?\s+//i; $name =~ s/,s*l[ae]?s?\s+//i; } $name = uc($name); # Upper-case everything $name =~ s"&(\w)\w*;"$1"g; # De-htmlize the name # $name =~ s/,.*//; # Discard everything after a comma $name =~ s"\W+""g; # Delete non-alpha chars # $Tname{$name} = 1; # Note that we've seen the name return $name; } sub done {my $F='done'; # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # # Mark a URI/URL as done at a specific depth and time. # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # local($d,$t,$urlpath,$urlfull) = @_; if ($Done{$urlpath}) { print V "$F \"$urlpath\" already marked done at time $Done{$urlpath} depth $Depth{$urlpath}.\n" if $V>0; return; } if ($Done{$urlfull}) { print V "$F \"$urlfull\" already marked done at time $Done{$urlfull} depth $Depth{$urlfull}.\n" if $V>0; return; } $d = $depth unless defined($d) && $d > 0; $t = ($now = time) unless defined($t) && $t > 0; $Depth{$urlfull} = $d; $Done{$urlfull} = $t; print V "$F URLfull \"$urlfull\" marked done at time $t depth $d.\n" if $V>2; $Depth{$urlpath} = $d; $Done{$urlpath} = $t; print V "$F URLpath \"$urlpath\" marked done at time $t depth $d.\n" if $V>2; } sub chunk {my $F="chunk"; # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # # A "chunk" starts with a line giving a timestamp and URL, plus some little # fields giving the URL's depth and what we last did with it, optionally # followed by data about that URL. The 1-char flags that follow the time are: # U new URL, not processed yet. # > link to another URL. # - URL not read for some reason. # # Error trying to read URL. # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # local($depth,$newuline,$olduline,$links,$rest,$titls,$tunes); local($hstpath,$urlpath,$urlfull); local($d,$dt,$fl,$l,$oB,$oL,$oX,$oT,$ts,$x,$y); local($sec,$min,$hour,$mday,$mon,$year); # gmtime() return list print V "CHUNK:\n\t@oldchunk\n" if $V>5; $olduline = shift @oldchunk; print V "$F \"$olduline\"\n" if $V>4; $dt = $oB = $oL = $oX = $oT = ''; $docbytes = $matched = 0; # We now attempt to match several different "U" lines that we have used in # various versions of this bot. if ($olduline =~ m'^(\d+) ([-#\w]) D:(\d+)( B:\d*)( L:\d*)( X:\d*)( T:\d*) (.*)$') { $ts = $1; # Time of last update $fl = $2; # Line-type flag, 'U' for URLs $depth = $3; # Hyperlink depth $oB = $4; # Old byte count $oL = $5; # Old tune count $oX = $6; # Old tune count $oT = $7; # Old title count $urlpath = $8; # URL minus protocol and host $matched = 1; } elsif ($olduline =~ m'^(\d+) ([-#\w]) D:(\d+)( B:\d*)( X:\d*)( T:\d*) (.*)$') { $ts = $1; # Time of last update $fl = $2; # Line-type flag, 'U' for URLs $depth = $3; # Hyperlink depth $oB = $4; # Old byte count $oX = $5; # Old tune count $oT = $6; # Old title count $urlpath = $7; # URL minus protocol and host $matched = 1; } elsif ($olduline =~ m'^(\d+) ([-#\w]) D:(\d+)( B:\d+) (.*)$') { $ts = $1; $fl = $2; $depth = $3; $oB = $4; # Old byte count $urlpath = $5; $matched = 1; } elsif ($olduline =~ m'^(\d+) ([-#\w]) D:(\d+) (.*)$') { $ts = $1; $fl = $2; $depth = $3; $urlpath = $4; $matched = 1; } elsif ($olduline =~ m'^(\d+)=(\d+) ([-#\w]) D:(\d+) (.*)$') { $ts = $1; $dt = $2; $fl = $3; $depth = $4; $urlpath = $5; $matched = 1; } else { print V "$F Line not matched.\n" if $V>4; } if ($matched) { $urlpath =~ s/[\r\s.]+$//; # Trim trailing junk print V "$F dt $fl D=$depth oB=$oB $oL=$oL oX=$oX oT=$oT urlpath=\"$urlpath\"\n" if $V>4; # if ($Done{$urlpath}) { # print V "$F \"$urlpath\" already done at $Done{$urlpath} depth $Depth{$urlpath}.\n" if $V>0; # print V "$F DROP \"$urlpath\"\n" if $V>2; # return; # } if ($urlpath =~ /\.(bak|old|fmt|gif|png|ps|pdf|ppt|swf|midi*|jpe*g|zip|g*z|au|mp\d*|wav|wmv)$/i) { print V "----> '$urlpath' ignored ($1 suffix).\n" if $V>1; return; } $urlpath =~ s":80/"/"; # Drop default port print V "$F urlpath=\"$urlpath\"\n" if $V>2; $hstpath = "$currhost$urlpath"; # host + path print V "$F hstpath=\"$hstpath\"\n" if $V>2; $urlfull = "http://$hstpath"; # Construct full URL print V "$F urlfull=\"$urlfull\"\n" if $V>2; if ($why = rejectURL($hstpath)) { print V "$F Reject '$hstpath'\n" if $V>1; return; } if ($Done{$urlfull}) { print V "$F \"$urlfull\" already done at $Done{$urlfull} depth $Depth{$urlfull}.\n" if $V>0; print V "$F DROP \"$urlfull\"\n" if $V>2; return; } if (defined($d = $Depth{$urlpath})) { print V "$F \"$urlpath\" is at depth $d.\n" if $V>3; if ($d < $depth) { # Adjust URI's depth print V "$F \"$urlpath\" changed from depth $depth to $d.\n" if $V>3; $depth = $d; # URI's Use min depth } } if ($x = defined($y = $Done{$urlfull}) && $y && defined($x)) { # Have we seen this URL already? print V "$F \"$urlfull\" already done at $x\n" if $V>2; @oldchunk = (); # Suppress the chunk entirely return; } elsif ($fl eq '-') { # Old comment lines $newuline = $olduline; @newchunk = @oldchunk; print V "$F Mark \"$urlfull\" done now (-).\n" if $V>3; &done($d,$ts,$urlpath,$urlfull); # Mark this URI/URL done print V "$ts URL \"$urlfull\" marked done now (-).\n" if $V>3; } elsif (@oldchunk && (($x = &dt() - $ts) < $mintime)) { print V "$ts only $x < $mintime sec.\n" if $V>3; $newuline = $olduline; @newchunk = grep(!/- (too soon|rescan|obsolete) /,@oldchunk); unshift @newchunk, "$now - too soon ($x < $mintime)" if $V>2; print V "$F Mark \"$urlfull\" done now ($x < mintime=$mintime).\n" if $V>3; &done($d,$ts,$urlpath,$urlfull); # Mark this URI/URL done print V "$ts URL \"$urlfull\" marked done now ($x < mintime=$mintime)\n" if $V>3; for $l (@newchunk) { # Look for previous link and tune counts if (($links,$tunes) = ($l =~ /(\d+) links, (\d+) ABC tune/)) { $linkcnt += $links; $linkmax = $linkcnt if $linkmax < $linkcnt; $tunecnt += $tunes; $tunemax = $tunecnt if $tunemax < $tunecnt; $filecnt ++ if $tunecnt>0; } elsif (($links,$tunes,$titls) = ($l =~ /(\d+) links, (\d+) tunes, (\d+) titles/)) { $linkcnt += $links; $linkmax = $linkcnt if $linkmax < $linkcnt; $tunecnt += $tunes; $tunemax = $tunecnt if $tunemax < $tunecnt; $titlcnt += $titls; $titlmax = $titlcnt if $titlmax < $titlcnt; $filecnt ++ if $tunecnt>0 || $titlcnt>0; } } } elsif ($x > $maxtime) { print V "$ts $x > maxtime=$maxtime.\n" if $V>3; &dt(); $newuline = "$now U D:$depth$oB$oL$oX$oT $urlpath"; push @newchunk, "$now - rescan ($x > $maxtime)" if $V>5; print V "$F Scan \"$urlfull\" now ($x > maxtime=$maxtime).\n" if $V>3; &scanURL($urlfull); # Rescan it print V "$F Mark \"$urlfull\" done now ($x > maxtime=$maxtime).\n" if $V>3; &done($depth,$now,$urlpath,$urlfull); print V "$now URL \"$urlfull\" marked done now ($x > maxtime=$maxtime)\n" if $V>3; } else { # It's a URL that we should examine print V "$F SCAN \"$urlfull\"\n" if $V>4; &dt(); $newuline = "$now U D:$depth$oB$oL$oX$oT $urlpath"; print V "$F Scan \"$urlfull\" now (else).\n" if $V>3; &scanURL($urlfull); # Scan it print V "$F Mark \"$urlfull\" done now (after scanURL).\n" if $V>3; &done($depth,$now,$urlpath,$urlfull); } if ($maxurls > 0 && $urlcount > $maxurls) { # Debug hook: exit after $maxurls URLs print V "$ts Abort after $urlcount URLs.\n" if $V>0; return; } } elsif ($olduline =~ / T L:(\d+) X:(\d+) T:(\d+) F:(\d+) H:(.*)/) { $linkmax = $1 if $linkmax < $1; $tunemax = $2 if $tunemax < $2; $titlmax = $3 if $titlmax < $3; ++$scancnt; # Count the number of times we've done this host $newuline = $olduline; @newchunk = @oldchunk; } elsif (($ts,$fl,$rest) = ($olduline =~ m'^(\d+) ([-+T]) (.*)$'i)) { print V "$F ts=$ts $fl $rest\n" if $V>4; ($sec,$min,$hour,$mday,$mon,$year) = gmtime($1); if ($oScanY == $year && $oScanM == $mon) { # [jc] 20030331 # We've seen a timestamp for this month; make trivial change to indicate it: $olduline = ""; # "$ts t $rest"; # Note 't' rather than 'T' } $oScanY = $year; # Note year and month of scan $oScanM = $mon; $newuline = $olduline; @newchunk = @oldchunk; } elsif (($ts,$dt,$fl,$rest) = ($olduline =~ m'^(\d+)=(\d+) ([-+T]) (.*)$')) { print V "$F dt $fl $rest\n" if $V>4; $newuline = $olduline; @newchunk = @oldchunk; } elsif (($ts,$dt) = ($olduline =~ m'^(\d+)=(\d+)$')) { print V "$F dt timestamp.\n" if $V>4; $newuline = $olduline; @newchunk = @oldchunk; } else { print V "$F Unmatched:", $olduline, @oldchunk, "\n" if $V>2; if ($saveunmatched) { $newuline = $olduline; @newchunk = @oldchunk; unshift @newchunk, "$now - Unmatched"; } else { $newuline = ''; @newchunk = (); } } if ($newuline || @newchunk) { print HST "\n"; print HST "$newuline\n" if $newuline; print HST @newchunk if @newchunk; print HST "\n" if @newchunk; } @oldchunk = (); @newchunk = (); print V "$F done.\n" if $V>5; } sub endDoc {my $F='endDoc'; # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # # Do whatever is needed at the end of a document. The main thing we do here # # is handle the cache file. If we wrote a tune to it, we need to link it to # # the appropriate place in our cache (http/...). Note that this routine may # # be called in the middle of a document, if we decide to abandon it. # # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # local($why) = @_; local($t) = time - $TMopen; # Length of time we've had the doc open if ($cmdpid > 0) { # Kill the download subprocess. print V "$F Kill-9 child process $cmdpid after $t sec at $now...\n" if $V>3; kill 9, $cmdpid; print V "$F Killed child process $cmdpid after $t sec at $now ($why).\n" if $V>2 && $why ne 'EOF'; $cmdpid = 0; } close DOC; $DOCopen = $inABC = $inHTML = 0; if ($cachetune) { # Did we cache this tune? close CACHE; # Make sure we've finished it if ($doctunes > 0) { # Did we find any tunes? if (-f $cachepath) { # Is there an old version? &Backup($cachepath); } &mkdirs($cachedir) unless -d $cachedir; if (rename($cachetmp,"$cachepath")) { print V "$F Renamed \"$cachetmp\" -> \"$cachepath\"\n" if $V>3; } else { print V "$F Can't move \"$cachetmp\" -> \"$cachepath\" ($!)\n" if $V>0; } } else { print V "$F Not cached (no tunes)\n" if $V>3; } $cachetune = 0; # Make sure we don't "remember" next file $cachepath = ''; } } sub env { # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # # Return an environment value, if it's defined. If not, set it to the 2nd # # arg, and return that value. It's best if the value is a string. # # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # local($name,$dflt) = @_; if (defined $ENV{$name}) { return $ENV{$name}; } else { return($ENV{$name} = $dflt); } } sub errchunk { # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # local($erruline) = @_; &dt(); push @newchunk, $erruline; print V "$P: erruline=\"$erruline\"\n" if $V>3; $newuline = $olduline; print V "$P: newuline=\"$newuline\"\n" if $V>3; } sub getschedule { # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # # Here we try to determine the schedule interval for repeated runs of this # # robot. We first try to extract the interval from a file. If that fails, we # # return a constant; this should only happen during debugging. Note that if # # the value returned is zero, no rescheduling is done. This is useful for # # stopping runaways. Note that the time interval is in minutes. # # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # local($l,$n); $schedfile = 'ABCbot.sch' unless $schedfile; if (open(SCHED,$schedfile)) { while ($l = ) {if ($l =~ /^(\d+)/) {$n = $1; close SCHED; last}} } else { print V "getschedule: Can't read \"$schedfile\" ($!); using 1 min.\n" if $V>0; $n = 1; } print V "getschedule: Return $n min.\n" if $V>2; } sub host {my $F='host'; # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # # Here's the main routine to process one host name.. # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # local($h,$cmd) = @_; local($base,$pfx,$suff); $currhost = lc($h); # Global lower-case name of host being scanned $HostT0{$h} = $HostTO{$currhost}= time; print V "$F Scan host h='$h' currhost='$currhost'\n" if $V>1; %badpath = (); for $pfx (sort keys %BadPath) { # Run thru forbidden paths if ($pfx =~ m"^$currhost/") { # Select bad paths for this host ++$badpath{$pfx}; print V "$F Bad path '$pfx'\n" if $V>1; next; } } for $pfx (sort keys %Disallow) { # Run thru disallowed paths if ($pfx =~ m"^$currhst/") { # Select disallowed paths for this host ++$badpath{$pfx}; print V "$F Disallow '$pfx'\n" if $V>1; next; } } print V "$hsep\n" if $V>2; print V "$F HOST h='$h' currhost='$currhost'\n" if $V>4; &hostunlock if $lfile; # Unlock previous host %Disallow = (); # Forget any disallows for previous host %AllowURI = (); # Find the allows for this host # print V "$F ### AllowURL is empty!\n" unless %AllowURL; print V "$F AllowURL contains URLs\n" if %AllowURL && $V>2; for $allow (keys %AllowURL) { print V "$F Allow \"$allow\"\n" if $V>4; if (($pp,$hh,$uu) = ($allow =~ m"^(\w+)://([-_:.\w]+)(/.*)")) { print V "$F Allow h=\"$h\" vs hh=\"$hh\" (pp=\"$pp\"\n" if $V>4; if ($h eq lc($hh)) { $AllowURI{$uu}++; # Allow this URI for this host print V "$F Allow host \"$h\" uri \"$uu\"\n" if $V>3; } else { print V "$F host \"$h\" uri \"$uu\" don't match\n" if $V>3; } } } $cfile = "cfg/$h"; # Config info this host $hfile = "hst/$h"; # Current data for this host $afile = "add/$h"; # Additional URLs for host $nfile = "new/$h"; # Newly added URLs for host $ofile = "old/$h"; # Backup file name $lfile = "lck/$h"; # Lock file name if (-f $lfile) { # Someone's working on it print V "$F Host $h has a lockfile $lfile\n" if $V>0; return; } unless (open(LCK,">$lfile")) { print V "$F Can't write lfile=\"$lfile\" ($!)\n" if $V>0; next hostfile; } &hostlock($lfile); $TMopen = # Time since we opened the current file $TMread = # Time since we last read from the file &dt(); # Make these default to right now. if (&CheckHost($h)) { print V "$F Host \"$h\" failed checks.\n" if $V>0; print V "$F $hstmsg\n" if $V>0; next hostfile; } if (@robotstxt) { print V "$F Got 'http://$h/robots.txt' file.\n" if $V>0; } if (&Backup($ofile)) { print V "$F Can't backup \"$ofile\" ($!)\n" if $V>2; } if (-f $cfile) { require "cfghost.pm"; &cfgload($cfile,$h); $hstdepth = $DepthHost{$h} || $maxdepth; print V "$F Max depth for $h is $hstdepth.\n" if $V>2; } if (rename($hfile,$ofile)) { print V "$F Moved \"$hfile\" to \"$ofile\"\n" if $V>2; } else { print V "$F Can't rename(\"$hfile\",\"$ofile\") ($!)\n" if $V>0; } unless (open(OLD,$ofile)) { print V "$F Can't read \"$ofile\" ($!)\n" if $V>0; } $OLDopen = 1; print V "$F Reading from \"$ofile\"\n" if $V>2; unless (open(HST,">>$hfile")) { print V "$F Can't write hfile=\"$hfile\" ($!)\n" if $V>0; next hostfile; } $HSTopen = 1; select HST; $| = 1; select V; print V "$F Writing to \"$hfile\"\n" if $V>2; $linkcnt = $tunecnt = $titlcnt = $filecnt = 0; $linkmax = $tunemax = $titlmax = $filemax = 0; print V "$F filecnt=$filecnt for new host $h.\n" if $V>3; &dt(); print HST "$now + start $h\n"; $scancnt = 0; # Number of times we've done this host @oldchunk = (); # One URL and its info @newchunk = (); # New info about this URL # %Depth = (); # Min depth of URIs so far # %Done = (); # List of URIs we've processed # @Left = (); # List of URIs still to handle # print V "$F Emptied \%Done and \@Left\n" if $V>4; # &LoadLinks($afile) if (-f $afile); &LoadLinks($nfile) if (-f $nfile); hostline: while ($l = ) { print V "$F Next host line.\n" if $V>3; next if ($l =~ / \+ (start|done)\b/); # Lines to drop next if ($l =~ m"jc/.*\.hdr$"i); # Ignore jc's HDR files last if ($closeDoc || $finishup); $l =~ s"[\r\s]+$""; # Trim away trailing white stuff print V "====| $l\n" if $V>5; if ($l) { push @oldchunk, $l; # Accumulate lines of one "chunk" } else { &chunk() if @oldchunk; # Process one "chunk" of the host's data } if ($maxurls>0 && $urlcount>$maxurls) { print V "$F hostline: Abort after $urlcount URLs\n" if $V>0; last hostline; } } &chunk() if @oldchunk; # Run thru the added URIs here. Note that if an entry in %addURI has been # undef'd, its name might still be there, and only the value is undefined. print V "$F " . int(@Left) . " URIs left.\n" if $V>3; URI: while (@Left) { # Local URIs discovered in hyperlinks print V "\n" if $V>2; print V "$F There are " . int(@Left) . " local URIs left.\n" if $V>3; if ($maxurls>0 && $urlcount>$maxurls) { print V "$F URI: Abort after $urlcount URLs\n" if $V>0; last URI; } $urlpath = shift @Left; # Get one URI print V "$F urlpath=\"$urlpath\" (" . int(@Left) . " left)\n" if $V>2; $hstpath = "$currhost$urlpath"; print V "$F hstpath=\"$hstpath\"\n" if $V>2; next URI unless $urlpath; # Paranoia: Ignore nulls if ($why = rejectURL($hstpath)) { print V "$F Reject '$hstpath' [$why]\n" if $V>1; next URI; } # if ($urlpath =~ m"/mackay/") { # Bad dir at www.inference.phy.cam.ac.uk # print V "$F ### urlpath with /mackay/ accepted ###\n"; # exit -1; # } if ($Done{$urlpath}) { # Have we done it already? print V "$F \"$urlpath\" already done at $Done{$urlpath} depth $Depth{$urlpath}.\n" if $V>2; print V "$F DROP \"$urlpath\"\n" if $V>2; next URI; } if (($base,$suff) = ($urlpath =~ /^(.*)\.(gif|ps|midi*|jpe*g|zip|g*z|au|mp\d*|wav)$/)) { print V "----> '$urlpath' dropped (suffix).\n" if $V>2; if ($notearchives) { if ($suff eq 'zip') { system "echo ZIP: http://$h/$urlpath >> ZIPfiles" } elsif ($suff =~ /g*z/) { system "echo GZIP http://$h/$urlpath >> ZIPfiles" } } next URI; # Skip this file } unless (defined($dpth = $Depth{$urlpath})) { print V "$F ### U:$urlpath depth unknown.\n" if $V>0; $dpth = 1; # Make a guess } if ($dpth < 1) { print V "$F ### U:$urlpath ignored at depth $dpth.\n" if $V>2; next URI; } print V "$F Add D:$dpth U:$urlpath\n" if $V>3; @oldchunk = ("0 U D:$dpth $urlpath"); &chunk(); } print V "$F No more new URIs for \"$h\".\n" if $V>2; &CloseHST($HostT0{$h},&dt()) if $HSTopen; &Backup($afile); # unlink($afile); &hostunlock if $lfile; if (-d "http/$h") { # Relink the host's cached files $cmd = "nice relink +r 'http/$h'"; print V "$F cmd=\"$cmd\"\n" if $V>2; system "$cmd &"; } } sub hostlock { # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # print LCK "$$ $P\n"; close LCK; print V "$P: LOCKed \"$lfile\"\n" if $V>2; } sub hostunlock { # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # close LCK; unlink($lfile); print V "$P: unLOCK \"$lfile\"\n" if $V>2; $lfile = ''; } sub href {my $F='href'; # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # # Given an href, we decide here how to handle it. The caller must pass us the # # URL from the href, and the string (item) between the '>' and the , in # # case we need to check what's there. The main use we make of the item is to # # check for and reject "parent dir" references. We also look at a few other # # suffixes and decide whether we should load them and scan their contents. If # # the URL is accepted, we pass it to &URL() for later processing. For # # rejected URLs, we just return. # # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # local( $curr, # Current URL $href, # URL pointed to $item, # HTML text associated with $href $incr) # Level increment: 0 for frames, 1 for other URLs = @_; local($a,$l,$p,$s,$u); $incr = 1 unless defined $incr; print V "$F curr=\"$curr\" href=\"$href\" item=\"$item\"\n" if $V>4 || ($href =~ /^\$/); unless ($allowcgi) { if ($href =~ /^(\?)/i) { # Once contained '#' print V "$F Ignore href=\"$href\" (?)\n" if $V>3; return; } if ($href =~ /\bcgi\b/i) { print V "$F Ignore href=\"$href\" (cgi)\n" if $V>3; return; } } if ($href =~ /\.(bak|log|tmp|out)\b/i) { print V "$F Ignore href=\"$href\" (bak|log|tmp|out)\n" if $V>3; return; } if ($base) { print V "$F base=\"$base\" replaces curr=\"$curr\"\n" if $V>4; $curr = $base; } if ($href =~ '/$') { # If final '/', treat as directory print V "$F Treat href=\"$href\" as directory.\n" if $V>5; if ($chkuplinks && ($item =~ /\b(Parent|Home|Back)\b/i)) { print V "$F Ignore href=\"$href\" item=\"$item\"\n" if $V>3; return; } elsif ($href =~ m"^\w*://") { # Full URL print V "$F \"$href\" read at depth $depth.\n" if $V>4; &URL($href,$depth+$incr); } else { # Relative URL $u = &URLhref($curr,$href); print V "$F \"$u\" read at depth $depth.\n" if $V>4; &URL($u,$depth+$incr); } print V "$F Done with directory.\n" if $V>4; return; } # No final '/' on HREF: print V "$F Treat href=\"$href\" as non-directory.\n" if $V>4; if ($href =~ m'(.*)#(\w+)$') { print V "$F Offset href=\"$1\" (#$2)\n" if $V>3; $u = &URLhref($curr,$1); print V "$F \"$u\" URL at depth $depth.\n" if $V>4; &URL($u,$depth+$incr); } else { print V "$F Simple href=\"$1\" (#$2)\n" if $V>4; $u = &URLhref($curr,$href); print V "$F \"$u\" URL at depth $depth.\n" if $V>4; &URL($u,$depth+$incr); } } sub mkdirs {my $F='mkdirs'; # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # # Create a directory tree. This is recursive. We attempt to make the full # # path. If that fails, we trim off the last field, and call ourself to make # # the parent directory. When that returns, we once again try to make the full # # path. We return 1 for success, 0 for failure. # # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # local($p) = @_; local($d,$x); return if -d $p; return if mkdir($p,0775); print V "$F Make dir \"$p\"\n" if $V>2; if (($d,$x) = ($p =~ m"^(.+)/([^/]+)/*$")) { if (-d $d) { print V "$F Dir \"$p\" exists.\n" if $V>3; } elsif (&mkdirs($d)) { print V "$F Made dir \"$p\"\n" if $V>2; } else { print V "$F Can't make dir \"$d\" ($!)\n" if $V>3; } } if (mkdir($p,0775)) { print V "$F Made dir \"$p\"\n" if $V>2; return 1; } else { print V "$F Can not make dir \"$p\" ($!)\n" if $V>0; return 0; } } sub newDoc {my $F='newDoc'; # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # # Given a URL or local file name, this routine attempts to open it and return # # with F containing the file handle. If we succeed, we return 1; a return of # # 0 means that we can't read the object. Note that we "open" a URL by firing # # up a $GetCmd subprocess to fetch it. Yes, this is terribly inefficient, but # # it's the only way I've found to deal with the cases where connect() hangs # # and can't be interrupted. Note also that we pass $GetCmd the +I option, # # which causes it to claim to be Mozilla. Some web servers won't deliver # # pages to anything except a limited set of browsers, so we pretend to be # # one. # # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # local($name) = @_; local($getcmd,$file,$h,$l,$p,$path,$t,$eurl,$x); if ($V>4) { local($p,$c,$l) = caller; print V "$F \"$name\" (from $p/$c/$l)\n"; } elsif ($V>2) { print V "$F \"$name\" [$depth]\n" if $V>4; } ($path = $name) =~ s/^[<\s]+//; if (($p,$h) = ($path =~ m'^(http|file)://([-_:.\w]+?)/'i)) { # Hostname? print V "$F URL contains host \"$h\"\n" if $V>7; } else { $h = $myhost; print V "$F URL contains no host, using \"$h\"\n" if $V>7; } if (-d $path) { # A local directory is special if (-f "$path/index.html") { # index.html kludge $path = "$path/index.html"; print V "$F index: \"$path\"\n" if $V>2; } else { require "HTMLdir.pm"; # Dir-to-HTML module if (HTMLdir(*DOC,$path)) { return 1; } print V "$F Can't read local dir \"$path\" ($!)\n" if $V>0; close DOC; $DOCopen = $inABC = $inHTML = 0; return undef; } } print V "$F Is it a local file?\n" if $V>5; $TMopen = &dt(); # The time that we opened the current file if (open(DOC,$name)) { print V "$F Opened local file \"$name\"\n" if $V>4; return 1; } print V "$F Is it a URL?\n" if $V>5; if ($name !~ m"^http:"i) { # |file|ftp local($p,$c,$l) = caller; print V "$F \"$name\" ignored (from $p/$c/$l)\n"; $TMopen = 0; return undef; } localURL: for $x (@local) { # Is this a local URL? $l = length($x); if (substr($path,0,$l) eq $x) { $file = $path; # Build local path substr($file,0,$l) = $local{$x}; last localURL; } } if ($file) { # Did we get a local path? print V "$F Local file \"$file\"\n" if $V>3; if (-f "$file/index.html") { # index.html kludge $file = "$file/index.html"; print V "$F index: \"$file\"\n" if $V>2; } if (-d $file) { # Directories are scanned require "HTMLdir.pm"; # Dir-to-HTML module if (&HTMLdir(*DOC,$file)) { $URLhdr = 1; $DOCopen = &dt(); $inHTML = 1; $inABC = 0; print V "$F Local dir \"$file\"\n" if $V>2; return 1; } else { print V "$F Can't read local dir \"$file\" ($!)\n" if $V>0; $DOCopen = 0; } } elsif (open(DOC,$file)) { $URLhdr = 0; $DOCopen = 1; $inHTML = ($file =~ /\.html*$/i) ? 1 : 0; $inABC = ($file =~ /\.abc$/i) ? 1 : 0; $x = $inABC ? ' ABC' : $inHTML ? ' HTML' : ''; print V "$F Local$x file \"$file\"\n" if $V>2; return 1; } else { print V "$F Can't read local file \"$file\" ($!)\n" if $V>0; $DOCopen = 0; } } print V "$F currhost=\"$currhost\"\n" if $V>3; print V "$F dfltget=\"$dfltget\"\n" if $V>3; print V "$F ABCtmout=\"$ABCtmout\"\n" if $V>3; print V "$F path=\"$path\"\n" if $V>3; if ($path =~ /%/) { # Is the URL URL-encoded already? $eurl = $path; # If so, use it as-is print V "$F eurl=\"$eurl\"\n" if $V>2; } else { $eurl = &URLenc($path); # URL-encode it print V "$F eurl=\"$eurl\" (encoded)\n" if $V>2; } $getcmd = $Getcmd{"C:$currhost"} || "$dfltget -V$HTTPversion" || "$GetCmd +TH"; $getcmd .= " -T$ABCtmout '$eurl'"; print V "$F getcmd=\"$getcmd\"\n" if $V>2; $w3timedout = 0; if ($HTTPtimeout > 0) { # Timeout in effect? $savsig = $SIG{ALRM}; # Save old alarm routine $HTTPcontime = time; # Note time we did open $SIG{ALRM} = 'W3tmout'; # Establish alarm routine alarm $HTTPtimeout; # Set alarm &dt(); print V "$F Set alarm after $HTTPtimeout sec at $now.\n" if $V>3; $DOCopen = 1; # Triggers close on timeout } # sleep($HTTPdelay) if $HTTPdelay > 0; if ($cmdpid = open(DOC,"$getcmd |")) { print V "$F Process $cmdpid \"$getcmd\" running.\n" if $V>4; if ($w3timedout) {print V "#### TIMEOUT but open succeeded ####\n" if $V>0} $URLhdr = 1; # The +H option produces headers $DOCopen = &dt(); # Triggers close on timeout $inABC = $inHTML = 0; } else { print V "$F \"$getcmd\" failed ($!).\n" if $V>2; print V "$F TIMEOUT, open failed.\n" if $V>3; $DOCopen = $inABC = $inHTML = 0; return 0; } $base = ''; # Need new base URL $t = time - $TMopen; # How long have we had this file open? unless ($cmdpid > 0) { print V "$F \"$path\" failed in $t sec. ($!)\n" if $V>0; close DOC; $DOCopen = $inABC = $inHTML = 0; return undef; } print V "$F \"$path\" $t sec.\n" if (($V>3) && ($t>0)); # If we are caching tunes, we create a temporary file to hold this file's # contents. At the end, if we haven't found any tunes, we will unlink it. if ($cachetunes) { print V "$F Cache \"$lurl\"\n" if $V>2; $cachepath = $lurl; # Convert URL to pathname $cachepath =~ s":*/+"/"g; # Reduce these to just '/' $cachepath =~ s/\s+/_/g; # Convert spaces to underscore $cachepath =~ s/[^-_:.%@\~\/\w]//g; # Get rid of "funny" chars $cachepath =~ s"/$"/index.html"; # If directory, add "index.html" print V "$F cachepath:\"$cachepath\"\n" if $V>2; print V "$F cachetmp: \"$cachetmp\"\n" if $V>3; ($cachedir = $cachepath) =~ s"/[^/]+$""; print V "$F cachedir: \"$cachedir\"\n" if $V>3; if (substr($cachepath,-1,1) eq '/') { print V "$F Don't cache directories.\n" if $V>2; $cachetune = 0; } else { if (open(CACHE,">$cachetmp")) { print V "$F Writing \"$cachetmp\"\n" if $V>3; $cachetune = 1; } else { print V "$F Can't write \"$cachetmp\" ($!)\n"; $cachetune = 0; } } } return 1; } sub saveURLs { # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # # Write the accumulates list of URLs to the appropriate hst/* files. All the # URLs have been split into host and URI portions, and we have the arrays: # $h2n{$host} is the number of URIs for $host # $h2u{$host}->[$n] is the nth URI for $host # $h2d{$host}->[$n] is the depth of each URI (1 for initial URIs) # For each host $h, we open the hst/$h file and append a "U" line for each # URI. These may be redundant, of course, and if so, will be dropped when we # process that host again. # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # local($d,$h,$nfile,$hst,$urlpath,$urlfull,$urls,$x); unless (%h2n) { print V "$P: There are no URLs to save.\n" if $V>2; return; } print V "$P: Save URL list ...\n" if $V>2; host: for $h (sort keys %h2n) { next unless $h; $hst = lc($h); $urls = $h2n{$hst}; print V "$P: Host \"$hst\" has $urls new URLs.\n" if $V>3; next if $urls < 1; $nfile = "new/$hst"; unless (open(NEW,">>$nfile")) { print V "### Can't write nfile=\"$nfile\" ($!) (saveURLs)\n" if $V>0; next host; } print NEW "# $h\n"; # Make sure the file identifies the host for ($n = 0; $n < $urls; $n++) { $urlpath = $h2u{$hst}->[$n] || '/'; $d = $h2d{$hst}->[$n]; $urlfull = "http://$hst$urlpath"; print V "#---> uri $n depth $d host \"$hst\" is \"$urlpath\"\n" if $V>2; if ($x = $Done{$urlfull}) { print V "Already done at $x (saveURLs)\n" if $V>2; } else { &dt(); print NEW "\n0 U D:$d $urlpath\n"; } } close NEW; } print V "$P: Initial URL list done.\n\n" if $V>2; close H; if (defined($host) && $host) { $h2n{$host} = (); # Forget the initial list $h2u{$host} = (); $h2d{$host} = (); } } sub scan {my $F="scan"; # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # # Grovel through a file, looking for hyperlinks or pieces of abc code. Check # # out each of the files listed. Directories cause recursive traversal. Files # # with interesting suffixes are read. This routine is complicated by the need # # to decode HTML as well as plain text. An extra complication is that the # # HTTP headers may include various error indications, and we may not be able # # to get the file at all. # # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # local($URL) = @_; local(*DOC,$endDoc,$DOCopen,$DOCdtype,$DOCstype); # Localize the DOC file local($allow,$disallow,$getcmd,$inABC,$inHTML,$H,$X,$ttl); local($udir,$ubas,$usuf); local($ecod,$emsg,$hlvl,$nhrefs,$tunes,$xx); local($b,$d,$dpth,$h,$href,$i,$item,$l,$line,$lurl,$n,$p,$s,$surl,$t,$u); print V "\n" if $V>1; print V "$F SCAN \"$URL\"\n" if $V>2; if ($t = $Done{$URL}) { $d = $Depth{$URL}; print V "$F \"$URL\" already done at $t depth $d!!!\n" if $V>0; print V "$F DROP \"$urlfull\"\n" if $V>2; return 0; } $DOCopen = $inABC = $inHTML = 0; $tunes = 0; $lurl = &URLtrim($URL); # Long URL may have final '/' unless ($allowcgi) { # Check for CGI if ($lurl =~ /(\bcgi\b|\bbin\b)/) { # Once contained '|\?' print V "$F \"$lurl\" rejected due to \"$1\"\n" if $V>2; return; } } if ($lurl =~ /(#)/) { # Ignore internal anchors print V "$F \"$lurl\" rejected due to \"$1\"\n" if $V>2; return; } print V "====> $depth \"$lurl\"\n" if $V>1; ($surl = $lurl) =~ s"/+$""; # Short URL lacks final '/' # Split the URL into directory/base/suffix: if (($udir,$ubas,$usuf) = ($surl =~ m"(.*/)([^/]*)\.(\w+)$")) { $usuf = lc($usuf); # Suffix is caseless to us } else { $udir = $ubas = $usuf = '' } print V "udir=\"$udir\" ubas=\"$ubas\" usuf=\"$usuf\"\n" if $V>3; if ($surl eq '') { # Shouldn't happen local($p,$c,$l) = caller; print V "$F \"$URL\" (from $p/$c/$l)\n" if $V>0; return 0; } if (($dpth = $Depth{$surl}) && ($dpth <= $depth)) { print V "$F \"$URL\" is marked as depth $dpth.\n" if $V>2; if ($Done{$URL}) { print V "$F \"$URL\" already scanned at depth $dpth.\n" if $V>2; return 1; } } # $Depth{$surl} = $depth; # Note that we've done this URL # print V "$F \"$lurl\" marked as depth $depth.\n" if $V>2; if (%Disallow) { print V "$F Checking disallows ...\n" if $V>4; disallow: for $disallow (keys %Disallow) { print V "$F disallow: \"$disallow\" \n" if $V>4; if (substr($urlpath,0,length($disallow)) eq $disallow) { print V "$F Disallow: \"$disallow\" \n" if $V>4; print V "$F Disallows \"$urlpath\" \n" if $V>4; for $allow (keys %AllowURI) { print V "$F allow: \"$allow\"\n" if $V>4; if (substr($urlpath,0,length($allow)) eq $allow) { print V "$F +++allow: \"$allow\"\n" if $V>4; print V "$F +++ALLOW: \"$urlpath\"\n" if $V>2; last disallow; } else { print V "$F disallow: \"$urlpath\"\n" if $V>2; } } $newuline = ''; @newchunk = (); print V "$F DISALLOW \"$urlpath\"\n" if $V>2; print V "$F (reason: \"$disallow\")\n" if $V>2; return; } } } $currURL = $lurl; # Global copy of current URL $DOCopen = &dt(); # False after document ends $inABC = ($lurl =~ /\.abc$/i) ? 1 : 0; $inHTML = ($lurl =~ /\.\w*html*$/i) ? 1 : 0; $docbytes = 0; # Count the bytes in the current document $doclines = 0; # Count the lines in the current document $doclinks = 0; # Count the links in the current document $doctitls = 0; # Count the titles in the current document $doctunes = 0; # Count the tunes in the current document print V "$F \"$lurl\" ...\n" if $V>4; $TMopen = $TMread = &dt(); unless (&newDoc($lurl)) { $DOCopen = 0; if ($V>3 || ($lurl =~ m"/$")) { &dt(); push @newchunk, "$now # not accessible."; $currURL = ''; } else { $newuline = ''; @newchunk = (); } $TMopen = 0; # The time we opened the current file print V "$F \"$lurl\" not accessible.\n" if $V>1; return 0; } $ignoretune = 0; # If true, ignore all tunes in this file $ignorefile = 0; # If true, ignore the current tune $closeDoc = 0; # If true, close the doc file and contine with next doc # Each time around this loop, we try to get one line from the document, # append it to the remains of the previous line, and decide what to do with # it. Most of the time, we will empty out $line, but in HTML docs we may # return here with a partial line unprocessed. We distinguish the HTTP # headers from the doc's contents, and there are a number of things in the # headers that we look for. buffer: while (!$endDoc && !$ignorefile && !$HTTPalrm && !$finishup) { $i = &scanBuf(); print V "$F Return $i from scanBUF()\n" if $i<2 && $V>2; if ($i eq 0) {return 0} elsif ($i eq 1) {return 1} elsif ($i eq 2) {next buffer} elsif ($i eq 3) {last buffer} else {print V "$F scanBuf() returned $i!!!\n" if $V>0} } if ($w3timedout) {&timedout()} if ($inABC) { print V "$F EOF ends ABC tune at line $doclines.\n" if $V>4; &tune(); @tune = (); $inABC = 0; $X = $ttl = undef; } ++$filecnt if ($doctunes>0 || $doctitls>0); # N.B.: We will only index the tune if there's a title. Since $doctunes # may count tunes for which there's no title, we may be looking at a tune # that will not be indexed. Not that there's a problem with that. if ($V>5) { print V "$F filecnt=$filecnt because doctunes=$doctunes doctitls=$doctitls.\n"; $xx = ($doctunes > 0) ? " in ABC file $filecnt" : ''; push @newchunk, "$now $doclinks links, $doctunes tunes, $doctitls titles$xx." if $doctunes || $doctitls || $doclinks; } $linkcnt += $doclinks; # Total links at this host $tunecnt += $doctunes; # Total tunes at this host $titlcnt += $doctitls; # Total titles at this host print V "$F doclinks=$doclinks linkcnt=$linkcnt doctunes=$doctunes tunecnt=$tunecnt doctitls=$doctitls titlcnt=$titlcnt\n" if $V>2; $linkmax = $linkcnt if $linkmax < $linkcnt; $tunemax = $tunecnt if $tunemax < $tunecnt; $titlmax = $titlcnt if $titlmax < $titlcnt; $t = time - Max($HTTPcontime,$HTTPreadtime); if (($i = ($HTTPdelay - $t)) > 0) { # Do we have a min delay between GETs? print V "Delay $i sec.\n" if $V>2; sleep $i; } if ($endDoc || $HTTPalrm) { # Some disaster detected $reason = "timeout alarm after $t sec"; print V "Close \"$URL\" ($reason)\n" if $V>2; $endDoc = $HTTPalrm = 0; } elsif ($finishup) { $reason = "Told to finishup"; print V "$F $reason ...\n" if $V>2; } else { $reason = 'EOF'; print V "$F EOF on DOC file.\n" if $V>4; } &endDoc("$reason") if $DOCopen; if ($tunes>0 && $V>1) { # ABC line count ($ss,$mm,$hh,$DD,$MM,$CY) = gmtime($now = time); ++$MM; $CY += 1900; $s = ($tunes > 1) ? 'tunes' : 'tune'; print V " \"$lurl\" ==== $tunes abc $s ==== $CY/$MM/$DD $hh:$mm:$ss\n"; } print V "$F D:$depth B:$docbytes L:$doclinks X:$doctunes T:$doctitls $urlpath\n" if $V>2; alarm 0; $SIG{ALRM} = 0; print V "$F Set alarm 0.\n" if $V>4; print V "$F DONE \"$URL\"\n" if $V>2; # print V "\n" if $V>2; return 1; } sub scanBuf {my $F='scanBuf'; # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # # This is the code that handles once through the buffer: loop in scan(). It # # was moved here during an attempt to profile the code. Our return values # # tell the caller to do these things: # # 0: return 0 # 1: return 1 # 2: next buffer # 3: last buffer # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # if ($closeDoc || $ignorefile) { print V "$F Ignoring rest of document.\n" if $V>1 && $ignorefile; print V "$F Closing document.\n" if $V>1 && $closeDoc; return 3; # Abandon this doc } if ($w3timedout) { # Should we give up on this doc? print V "$F TIMEOUT\n" if $V>2; unless ($doclinks || $doctunes || $doctitls) { print V "$F TIMEOUT and nothing interesting found.\n" if $V>2; return 3; # Abandon this doc } print V "$F Timeout ignored because $doclinks links, $doctunes tunes, $doctitls titles.\n" if $V>2; } if (!($b = &Line())) { # Read one line from document print V "$F EOF\n" if $V>2; return 3; } $TMread = time; # Note last time we got data from file print "$F: Set opent1=$TMread after TIMEOUT\n" if $V>1 && $w3timedout; $doclines ++; # count the lines in the document $docbytes += length($b) # Count the bytes in the document unless $URLhdr; if ($doclines > $maxlines && $usuf ne 'abc') { # Should we continue? unless ($inHTML || $inABC || $doctunes) { # For files not of abc type, we require a recognized tune # with maxlines, or we reject it and drop the connection. # This does a lot to prevent grovelling through huge files # of irrelevant stuff. print V "$F Abort after $doclines lines.\n" if $V>1; &endDoc("No ABC in initial $maxlines lines"); return 1; } } $line .= $b; # Add to leftover from last line $line =~ s/[\s\r]+$//; # Discard trailing white stuff if (!$line && $URLhdr) { # Null line - $URLhdr = 0; # Terminates headers print V "$F Null line terminates HTTP header.\n" if $V>2; return 2; # Otherwise ignore it } print V "$F \"$line\"\n" if $V>4; if ($URLhdr) { # Are we within the HTTP headers? print V "$F Header line.\n" if $V>5; if (($DOCdtype,$DOCstype) = ($line =~ m"Content-Type:\s*(.*)/(.*)\s*$"i)) { print V "$F HDR 'Content-Type: $DOCdtype/$DOCstype'\n" if $V>1; $dt = lc($DOCdtype); $st = lc($DOCstype); if ($notearchives) { if ($MIMEtype{"$dt/$st"} eq 'zip') { system "echo zip: $URL >> ZIPfiles"; } elsif ($MIMEtype{"$dt/$st"} eq 'gzip') { system "echo gzip $URL >> ZIPfiles"; } } if ($dt eq '*' && $st eq '*') { # AOLserver actually does this if ($lurl =~ /\.(abc|txt)$/i) { # Fix it for .abc and .txt files $dt = 'text'; $st = 'plain'; } } if (($dt ne 'text' && $dt ne 'application') || ($st eq 'ps') # Ignore PostScript files || ($st eq 'pdf') # Ignore PDF files || ($st eq 'doc') # These are mostly MS doc files || ($st eq 'exe') # These are mostly MS executables ) { print V "$F Type \"$DOCdtype/$DOCstype\" ignored.\n" if $V>2; &endDoc("Ignore $dt/$st files"); $newuline = ''; # This deletes the file from the output @newchunk = (); return 1; # This mostly avoids graphical and compressed files } if ($line =~ /: Can't open .*\((.*)\)\s*$/) { $reason = $1; &dt(); push @newchunk, "$now # Can't open ($reason)"; push @newchunk, @oldchunk; &endDoc("$reason"); print V "$F Return 1 ($reason)\n" if $V>2; return 1; } if ($DOCstype =~ m"html"i) { $inHTML = 1; # We've gotta decode HTML print V "$F HTML file ($DOCdtype/$DOCstype).\n" if $V>4; } else { $inHTML = 0; # It's probably plain text print V "$F TEXT file ($DOCdtype/$DOCstype).\n" if $V>4; } } elsif ($line =~ m"^HTTP/([0-9.]+)\s") { # unless ($Getcmd{"C:$currhost"}) { # $HTTPversion = $1; # print V "$F HTTP version $HTTPversion (from HTTP hdr)\n" if $V>2; # } $getcmd = $Getcmd{"C:$currhost"} || "$dfltget -V$HTTPversion"; unless ($Getcmd{"C:$currhost"}) { $getcmd =~ s/ [_=]V[0-9.]+//; $getcmd = $Getcmd{"C:$currhost"} = "$dfltget -V$HTTPversion"; print V "$F Get command \"$getcmd\" for $currhost (from HTTP header)\n" if $V>2; } } elsif ($line =~ /^Server:\s*(.*)\s*$/) { $Server{$currhost} = $1; print V "$F Server for \"$currhost\" is \"$Server{$currhost}\"\n" if $V>4; } elsif (($hlvl,$ecod,$emsg) = ($line =~ /^(HTTP\/[4-9][\d.]+)s+(\d+)\s+(.*)\s*-->/)) { print V "$F HDR '$hlvl $ecod $emsg'\n" if $V>0; &errchunk("$now # $hlvl err $ecod ($emsg)"); print V "$F Return 1 ($reason)\n" if $V>2; return 1; } elsif ($line =~ /^Location:\s*(.*)/) { # Redirection $u = &URLhref($curr,$1); print V "$F Redirected to \"$u\" at depth $depth.\n" if $V>2; &URL($u,$depth); push @newchunk, "$now R $lurl -> $u" if $V>2; } elsif (($ecod,$emsg) = ($line =~ /^/)) { print V "$F HDR 'Fail with HTTP err' $ecod ($emsg) \"$currURL\"\n" if $V>0; if ($ecod > 400) { if ($purgebad) { print V "$F Purge \"$currURL\" (HTTP err $ecod $emsg)\n" if $V>2; $newuline = ''; @newchunk = (); } else { &errchunk("$now # HTTP err $ecod $emsg ### BAD URI ###"); } } else { &errchunk("$now # HTTP err $ecod $emsg"); } print V "$F Return 1 ($reason)\n" if $V>2; return 1; } elsif ($line =~ /Can't open "(.*)" \((.*)\)/) { $reason = "'Can't open $1 ($2)\n"; print V "$F HDR $reason\n" if $V>2; &errchunk("$now # $reason"); print V "$F Return 1 ($reason)\n" if $V>2; return 1; } elsif ($line =~ /Can't connect to "(.*)" in (\d) sec. \((.*)\)/) { $reason = "Can't connect to $1 in $2 sec. ($2)"; print V "$F HDR $reason\n" if $V>2; &errchunk("$now # $reason ($2)"); print V "$F Return 1 ($reason)\n" if $V>2; return 1; } elsif ($line =~ /Can't connect to "(.*)" \((.*)\)/) { $reason = "Can't connect to $1 ($2)"; print V "$F HDR $reason\n" if $V>2; &errchunk("$now # $reason"); print V "$F Return 1 ($reason)\n" if $V>2; return 1; } else { print V "$F HDR unrecognized \"$line\".\n" if $V>5; # Ignore unknown HTTP headers. } $line = ''; return 2; } # We're not in the HTTP headers. So this must be in the data. We try # to cut out the ABC tunes and store them in @tune, calling &tune() at # every blank line that ends a tune. $len = length($line); print V "$F inABC=$inABC inHTML=$inHTML $len-char line.\n" if $V>4; if ($inABC) { $line =~ s/^\s+//; # Strip out initial white stuff if ($line) { push @tune, $line; # Add the line to the tune } elsif (@tune) { print V "$F Blank line ends ABC tune.\n" if $V>4; &tune(); @tune = (); $X = undef; $inABC = 0; } $line = ''; return 2; } # We're not inside an ABC tune. The main complications here are ABC # %% directives and HTML. We first look for interesting directives: if ($line =~ /^%%(.*)/) { $directive = $1; print V "$F DIR %%$directive\n" if $V>1; if ($directive =~ /^noindex\b/) { # %%noindex says ignore ABC tune(s) $reason = "%%NOINDEX inABC=$inABC"; print V "$F DIR $reason\n" if $V>1; # Should be 0 $ignorefile = 1; $closeDoc = 1; # This may be redundant print V "$F Return 1 ($reason)\n" if $V>2; return 3; # Close down this file } } $nhrefs = 0; $line =~ s/^\s+//; # Strip initial white stuff if ($inHTML) { # Strip out HTML tags and entities if (&scanHTML($line)) { print V "$F scanHTML succeeded.\n" if $V>3; print V "$F line=\"$line\"\n" if $V>3; } else { print V "$F scanHTML failed; abandoning \"$line\"\n" if $V>2; return 2; } } print V "line:$line\n" if $V>4; print V "$F No hrefs.\n" if $V>4; $line =~ s#^##si; # Strip out and tags $line =~ s#^#\n\n#si; # Replace
 tags with double newlines
		if ($line =~ /^4;
			$inHTML = 1;			# Note it's HTML
			$line .= ' ';
			return 2;				# Append another line
		}
		if ($line =~ m"^X:\s*([\d/.]+)") {	# X: line starts a (new) ABC tune
			$X = $1;
			$inABC = 1;
			@tune = ($line);
			++$tunes;				# Count the (possible) ABC tunes
			print V "$F	\"$line\" may be ABC (X:$X)\n" if $V>4;
			$line = '';
#			$line =~ s"^.*"";
#			$line =~ s"^[\r\s]+"";
			return 2;
		}
		if (($H,$ttl) = ($line =~ /^([TPN]):\s*(.*)/)) {	# T: or P: line required
			unless (defined $X) {	# Missing X: line?
				$X = '0';			# Default tune index
				++$tunes;			# Count the (possible) ABC tunes
				@tune = ("X: $X");	# X: is first line
			}
			$line = "$H:__" unless $ttl;	# Missing title?
			$inABC = 1;
			push @tune, $line;
			print V "$F	\"$line\" may be ABC ($H:$ttl)\n" if $V>4;
			print V "$F	\"$line\" may be ABC in HTML.\n" if $V>2 && $inHTML;
			$line = '';
#			$line =~ s"^.*"";
#			$line =~ s"^[\r\s]+"";
			return 2;
		}
		print V "$F	Not an X or T or P line.\n" if $V>6;
		if ($line) {			# Not X: or T: so just add it to the tune
			push @tune, $line;
		}
		if ($line =~ s"^(<.*>)\s*"") {	# Paranoia. (Is this needed?)
			$inHTML = 1;
			print V "$F	HTML tag \"$1\" deleted.\n" if $V>6;
		}
		return 2 if !$line;
		if ($inHTML) {
			$line =~ s"^([^<]+)"";
			print V "$F	Drop \"$1\" from HTML.\n" if $V>4;
		} else {
			print V "$F	Drop \"$line\"\n" if $V>4;
			$line = '';
		}
		print V "$F	\"$line\" left.\n" if $V>7;
		return 2;
}

sub timedout {my $F='timedout';
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# This is called while abandoning a URL, to convert the current "chunk" of  a #
# hst/* file to show the timeout.                                             #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
#	$TOopen = &dt() - $TMopen;	# Time since we opened the current file
#	$TOread = $now  - $TMread;	# Time since we last read from the file
	print V "$F	Timed out at line $doclines.\n" if $V>1;
	push @newchunk, "$now URL $lurl";
	push @newchunk, "$now # TIMEOUT after $TOopen/$TOread sec.";
	&endDoc("Timed out");
	if (@newchunk) {
		print V "$F	Timed out with partial chunk built.\n" if $V>3;
	} else {
		print V "$F	Timed out with nothing, preserving old data\n" if $V>3;
		push @newchunk, @oldchunk if @oldchunk;
	}
}

sub scanHTML {my $F='scanHTML';
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Here we process the current $line looking for interesting HTML  stuff.   We #
# are mostly interested in links of various sorts.  Maybe we should also take #
# care of HTML entities here.                                                 #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	local($attr,$href,$init,$item,$rest);
	print V "$F	\"$line\"\n" if $V>4;
	while (			# Match   (.*)#i))
	||	(($init,$attr,$href,$item,$rest) = ($line =~ m#^(.*?)]*?\s(VALUE)='(HTTP:[^>']*)'[^>]*>(.*?)(.*)#i))
	||	(($init,$attr,$href,$item,$rest) = ($line =~ m#^(.*?)]*?\s(VALUE)="(HTTP:[^>"]*)"[^>]*(>+)(.*)#i))
	||	(($init,$attr,$href,$item,$rest) = ($line =~ m#^(.*?)]*?\s(VALUE)='(HTTP:[^>']*)'[^>]*(>+)(.*)#i))
	||	(($init,$attr,$href,$item,$rest) = ($line =~ m#^(.*?)<(URL):"([^">]+?)">([^<]*)(.*)#i))
	||	(($init,$attr,$href,$item,$rest) = ($line =~ m#^(.*?)<(URL):'([^'>]+?)'>([^<]*)(.*)#i))
	||	(($init,$attr,$href,$item,$rest) = ($line =~ m#^(.*?)<(URL):([^>]+?)(>+)(.*)#i))
	) {
		print V "\n$F: Matched  attr='$attr' href='$href' item='$item'\n" if $V>3;
		unless (HTMLmatch()) {return 0}
		$line = $init . $rest;
	}
	print V "$F	No link left in \"$line\"\n" if $V>3;
	while (						# Match 
		(($init,$href,$rest) = ($line =~ m#^(.*?)]*?SRC\s*=\s*"([^>"]+)"[^>]*>(.*)#i))
	||	(($init,$href,$rest) = ($line =~ m#^(.*?)]*?SRC\s*=\s*'([^>']+)'[^>]*>(.*)#i))
	) {
		print V "\n$F: Matched  href=\"$href\"\n" if $V>3;
		$href =~ s/^\s+//;		# Disregard initial spaces
		$line = "$init,$rest";	# Remove frame from line
		if ($href) {
			$doclinks++;		# Count the links in the current document
			$nhrefs ++;
			&href($lurl,$href,'',0);	# Handle this href later
		}
	}
	print V "$F	No FRAME left in \"$line\"\n" if $V>3;
	if ($line =~ m/4;
		return 0;
	}
	if ($nhrefs) {
		print V "$F	Drop \"$line\" after $nhrefs hrefs removed.\n" if $V>4;
		$line = '';
		return 0;
	}
	if ($line =~ s///i) {
		print V "$F	BASE att=\"$1\"\n" if $V>3;
		&BASE($1);
	}
	$len = length($line);
	print V "$F	No more recognized HTML; $len bytes left.\n" if $V>3;
	return 1;
}

sub HTMLmatch {my $F='HTMLmatch';
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# This routine was split out to help in profiling.  There seems  to  be  some #
# sort  of  problem with the above code getting into an infinite loop at this #
# code.  It uses variables from its caller, scanHTML()  above.   When  we  no #
# longer  want  to do this profiling, we might want to move this code back to #
# replace the above call.                                                     #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	if (!$href) {			# Is this a perl bug?  (It has happened.)
		print V "$F	Matched \"$line\" with null href!!!\n" if $V>2;
		$line = '';			# Abandon the line
		return 0;
	}
	$href =~ s/^\s+//;		# Disregard initial spaces
	$head = $head . $init;	# Add text item to growing line.
	$tail = $rest;
	if ($href) {
		$doclinks++;		# Count the links in the current document
		$nhrefs ++;
		&href($lurl,$href,$item,1);	# Handle this href later
	}
	return 1;
}

sub scanURL {my $F='scanURL';
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Scan one URL for tunes.  Actually, all we do here is pass off most  of  the #
# work  to  scan(), and then examine the results to see if we want to include #
# this URL in the output.                                                     #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	my $urlfull = shift;
	print V "$F	SCAN \"$urlfull\"\n" if $V>2;
	$urlcount ++;		# Count the URLs that we process
	&scan($urlfull);
	if ($depth<2 || $doctunes || $doctitls || !$purgebad) {	# used to include "$doclinks ||"
		$newuline = "$now U D:$depth B:$docbytes L:$doclinks X:$doctunes T:$doctitls $urlpath";
	} else {			# No links, tunes or titles
		$newuline = '';	# This won't be output
		print V "$F	Purge because links=tunes=titles=0 purgebad=$purgebad.\n" if $V>2;
	}
	print V "$F	DONE \"$urlfull\"\n" if $V>2;
}

sub showcalls {
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Dump the function-call stack.  This happens in response to some interrupts, #
# depending on how things are configured at the moment. We can also call this #
# from the debugger.                                                          #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	local($l,$package, $filename, $line, $subroutine);
	print V "$P: Call stack:\n" if $V>0;
	$l = 0;
	while (($package, $filename, $line, $subroutine) = caller($l)) {
		printf V "\tLevel %3d line %5d $filename\tin $subroutine\n",$l,$line if $V>0;
		++$l;
	}
}

sub sigCONT {
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# A CONT signal tells us to abandon the current URL  and  continue  with  the #
# next.   This  is useful when we are hung on a connection, though this isn't #
# much of a problem now that we run $GetCmd as a subprocess.  Just  kill  the #
# $GetCmd process, and we'll continue.                                        #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	print V "sigCONT: CONT signal.\n" if $V>0;
	print V "sigCONT: close DOC ...\n" if $V>4;
	if ($DOCopen) {
		&endDoc();
		print V "sigCONT: closed DOC.\n" if $V>2;
	}
	&showcalls();
}

sub sigINT {
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# An INT signal gets us here, where we do a stack dump and set  the  finishup #
# flag to trigger abandonment of all further URLS.                            #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	print V "sigINT: INT signal.\n" if $V>0;
	&showcalls();
	$finishup = $endDoc = 1;
	&hostunlock if $lfile;
	exit 1;
}

sub sigUSR1 {
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# A USR1 signal just produces a stack dump, and then we continue.
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	print V "sigUSR1: USR1 signal.\n" if $V>0;
	&showcalls();
}

sub tune {my $F='tune';
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# The @tune array contains what looks like an ABC tune.  Extract the critical #
# data from it, and if it passes as a tune, write the data to the output.     #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
	local($H,$l,$K,$k,$M,$m,@N,@P,$t,$T,@T,$Tsrc,$X);
	local($Achords,$Clefs,$Vcnt,%Voices);
	local($comment,$directive);
	local($GBcode,$JCcode,$UDcode);
	print V "$F	$l\n" if $V>4;
	if ($ignoretune) {
		print V "$F	### Tune dropped because $ignoretune=$ignoretune\n" if $V>1;
		$ignoretune = 0;	# Don't ignore the next one.
		return 0;
	}
	if ($ignorefile) {
		print V "$F	### Tune dropped because $ignorefile=$ignorefile\n" if $V>1;
		return 0;
	}
	$X = 0;
	$M = '';
	$K = '';
	@N = ();
	@P = ();
	@T = ();
	$Achords = $Clefs = $Vcnt = 0;
	%Hdrs = ();
	%Voices = ();
line:
	for $l (@tune) {
		print V "l=\"$s\"\n" if $V>7;
		if ($l =~ /^\s*%(.*)/) {			# Comments
			$comment = $1;
			print V "$F	COM %$comment\n" if $V>1;
			if ($comment =~ /^%(.*)/) {	# ABC directive
				$directive = $1;
				print V "$F	DIR %%$directive\n" if $V>1;
				if ($directive =~ /^noindex\b/) {	# %%noindex says ignore ABC tune(s)
					print V "$F	DIR NOINDEX inABC=$inABC\n" if $V>1;
					if ($inABC) {
						print V "$F	DIR NOINDEX inside ABC tune.\n" if $V>1;
						print V "$F	Ignore rest of tune.\n" if $V>1;
						$ignoretune = 1;
						return 0;
					} else {
						print V "$F	DIR NOINDEX outside ABC tune.\n" if $V>1;
						print V "$F	Ignore rest of file.\n" if $V>1;
						$ignorefile = 1;
						return 0;
					}
				} else {
					print V "$F	DIR %%$directive ignored.\n" if $V>2;
					next line;
				}
			}
			print V "$F	COM %$comment ignored.\n" if $V>2;
			next line;
		}
		if ($l =~ m"^X:\s*([\d/.]+)") {	# X: index header
			$X = $1;
			print V "$F	X:'$X'\n" if $V>3;
		} elsif ($l =~ /^N:\s*(.*)/) {	# N: parts header
			unless ($K) {				# Use only within headers
				push @N, $1;
				print V "$F	N:'$1'\n" if $V>3;
			}
		} elsif ($l =~ /^P:\s*(.*)/) {	# P: parts header
			unless ($K) {				# Use only within headers
				push @P, $1;
				print V "$F	P:'$1'\n" if $V>3;
			}
		} elsif ($l =~ /^T:\s*(.*)/) {	# T: title header
			$t = $1 || '__';			# Note title '_' if missing
			$t =~ s/\s*[<%].*//;		# Delete comments, HTML
			$t =~ s/\s+/ /g;			# Reduce white space
			if ($SCDkludge && $T && ($t =~ m'[\d_]+x[\d_]+[A-Z][\d_]*')) {
				print V "$F	Drop SCD title \"$t\"\n" if $V>3;
				next line;
			}
			if ($t) {
				push @T, ($T = $t);		# Accumulate titles
				print V "$F	T:'$T'\n" if $V>3;
			}
		} elsif ($l =~ /^M:\s*(.*)/) {	# M: meter
			next line unless $X || $T;	# Ignore M lines outside tune
			unless ($M) {				# Use only the first meter
				$M = $1;
				$M =~ s/\s*[<%].*//;	# Delete comments, HTML
				$M =~ s/\s+//;			# Strip out white space
				print V "$F	M:'$M'\n" if $V>3;
			}
		} elsif ($l =~ /^K:\s*([^\r\s]*)\s*(.*)/) {	# K: key ends headers
			next line unless $X || $T;	# Ignore K lines outside tune
			next line if $K;	# Is the key already defined?
			$K = $1;			# It's the first key sig
			$k = $2;			# Any excess stuff
			if (($k =~ /\b(treble|alto|tenor|bass)\b/i) || ($k =~ /\b(clef=[GCF])\b/)) {
				$K .= " $1";	# Include clef with key
				$Clefs ++;		# Count all clefs
			}
			print V "$F	K:'$K'\n" if $V>3;
		} elsif ($l =~ /^V:\s*(\w+)/) {	# V: Voice line
			$Hdrs{V}++;					# Note that voices are used
			$Voices{$1}++;				# Note the different voices
			if ($l =~ /\bclef=(treble|alto|bass|G|C|F)\b/) {
				$Clefs ++;				# Count all clefs
			}
		} elsif ($l =~ /^([A-Za-z]):/) {	# V: Other headers
			$Hdrs{$1} ++;
		} elsif ($l =~ /^\d:/) {		# Why do we see this?
			print V "drop: $l.\n" if $V>3;
		} elsif ($K) {					# Collecting music for abcCode()
			if ($l =~ /"[A-G][b#]*[m7]*"/) {	# Look for chords
				$Achords ++;
				$Hdrs{'"'} ++;			# Add as a kind of "Header"
			}
			unless ($l =~ /^\w:/) {		# Ignore things like w: words
				$m .= $l;				# Accumulate music as one string
			}
		}
	}
	print V "$F	EOF after $doclines lines.\n" if $V>6;
	$Vcnt = int(keys %Voices);			# Number of distinct voices found
	print V "$F	$Vcnt voices found in X:'$X'\n" if $Vcnt>0 && $V>1;
	print V "$F	inABC=$inABC K=$K X=$X m=$m\n" if $V>3;
	unless (($inABC && $K) || ($X && $K && $m)) {
		print V "$F	No X line found.\n" if (!$X && $V>3);
		print V "$F	No K line found.\n" if (!$K && $V>3);
		print V "$F	No music found.\n"  if (!$m && $V>3);
		print V "$F	Failed.\n" if $V>3;
		return 0;
	}
	print V "$F	Call abcCode(\"$K\",\"$L\",\"$M\",\"$m\")\n" if $V>5;
	($GBcode,$JCcode,$UDcode) = $abcCode->abcCode($K,$L,$M,$m);
	print V "$F	GBcode=\"$GBcode\" JCcode=\"$JCcode\" UDcode=\"$UDcode\"\n" if $V>4;
	&dt();
	if (@T) {			# Did we find any T: lines?
		for $t (@T) {
			print V "$F	Title from T:$t\n" if $V>1;
		}
		$Tsrc = 'T';
	} elsif (@P) {
		push @T, $P[0];		# If no title lines, use first P: line
		$Tsrc = 'P';
		print V "$F	Title from P:$P[0]\n" if $V>1;
	} elsif (@N) {
		push @T, $N[0];		# If still no title, use first N: line
		print V "$F	Title from N:$N[0]\n" if $V>1;
		$Tsrc = 'N';
	} else {
		print V "$F	No title found in tune X:$X [urlfull=$urlfull]\n" if $V>1;
	}
	$H = join('',sort(keys(%Hdrs)));
	$doctitls += int(@T);
	for $T (@T) {
		$l = "$now X:$X M:$M K:$K";
		if ($Clefs  > 0) {$l .= " C:$Clefs"}
		if ($Vcnt > 0) {$l .= " V:$Vcnt"}
		$l .= " H:$H" if $H;
		$l .= " C1=$JCcode C2=$UDcode T:$T";
		print V "$F	$Tsrc $l\n" if $V>1;
		push @newchunk, $l;
	}
	++$doctunes;
	print "$F	$doctunes in this file.\n" if $V>1;
	$ignoretune = 0;	# If true, ignore all tunes in this file
	return $doctunes;
}