#!/usr/bin/perl # NAME # w3ld - (down)load the contents of a web directory. # SYNOPSIS # w3ld [options].. URL.. # REQUIRES # This program needs the following modules, which should be found # in the same directory that you found this program. push @INC,"$ENV{HOME}/sh",'sh'; require "Vopt.pm"; require "URLopen.pm"; require "HTMLdir.pm"; require "HTTPcon.pm"; require "URLhref.pm"; require "URLtrim.pm"; # DESCRIPTION # This program expects one or more URLs on the command line. Each # URL is treated as a directory and its contents are copied into # the current directory. If the -r ("recursive") option is used, # subdirectories will be identified and copied recursively. # The URLs on the command line should all end with '/', but if one # doesn't, we add '/'. Thus, this program currently can't grab just # a single file. (That's done by w3cat, which should be in the same # place you found this program.) # By "treated as a directory", we mean that the URL's contents are # scanned for hyperlinks. Those hyperlinks are in turn downloaded, # and written to local files that have the same names. # OPTIONS # Options start with '-' or '+'. For some of the options, '+' means # "enable" and '-' means "disable"; for others, there's no such # concept and you may use either flag char. The options that don't # have args may be combined with the next option in the usual way, # so that "-mnr2" is the same as "-m -n -r2". # Default options are set up to whatever I mostly find useful at # the moment, so they may change from time to time. Check below for # what the true defaults are in whatever version you have. # # The following option isn't yet fully implemented; don't use it. # # -b # # Don't do any backup. This is usually the default. # # +b # # Back up existing files. Several backup styles are supported. if # # is '-' or '~', then backups are made by appending to # # the name. Such backups are recursive, i.e., if a backup file # # exists, it will in turn be backed up, resulting in multiple # # flag chars on the end of its name. If is present, it is the # # limit to the number of backups. The in some versions is -b-7, # # which will produce the file and up to 7 backup copies. If # # is '.', then the backup will be done by appending . to the # # name, and only one backup is possible. Thus -b.bak will rename # # foo.c as foo.c.bak, as is conventional of DOS systems. # -c # Don't do a copyright check (described below). # +c # Do a copyright check (default). # -m # Don't copy modification times. Copied files will have mtimes # as of when they are copied. # +m # Set mtimes (modification time) from the "Last-modified" times # for URLs that are copied. # -n # Copy all files, ignoring modification times. # +n # If a file already exists, copy only the newer files. If the web # server doesn't give a coherent date, we will assume the file is # newer and copy it. At verbose level -v3, URLs not copied due to # age are flagged with '<'. # -p # Sents requests via a proxy HTTP server. The should be # the "protocol://host.name:port" portion of a URL. # -r # Recurse to levels. The default, -r1, means to treat only # the top-level URLs as directories. The -r2 option means to try # to determine which of the second-level URLs are directories. # When such a subdirectory is found, a local directory by the # same name is created, and filled with the URL's contents. If # is omitted, -r2 is assumed. # -v[file] # Verbose option. Sets the verbose level to , and if the file # name is present, writes to that file. The default is currently # -v2, which will get you a list of URLs copied, flagged by '>', # and written to STDOUT. Use -v3 if you'd like to see the URLs # that are not copied, flagged by '<' or a reason. # -x\ # Exceptions. Any URLs that match the pattern will be ignored and # not fetched. # ENVIRONMENT # We use the following from the environment: # W3PROXY # The name (or address) and an optional :port for a proxy # gateway. URLs that don't match the W3NOPROXY will be fetched # indirectly via the proxy's web server. # W3NOPROXY # A pattern which is applied to URLs, and if they match, no proxy # is used. That is, any URL that matches W3NOPROXY is considered # local, and we will access it directly. # EXAMPLES # mkdir RFC; cd RFC # w3ld http://ds.internic.net/rfc/ # Download all the RFCs from the InterNIC. # mkdir Scot; cd Scot # w3ld -r2 http://trillian.mit.edu:8739/~jc/abc/Scotland/ # This downloads John Chambers' Scottish Country Dance music into # the local directory 'Scot'. The -r2 is needed because the tune # files are in subdirectories 'jig', 'reel', and so on. # mkdir Scot; cd Scot # w3ld -mr2 http://trillian.mit.edu:8739/~jc/abc/Scotland/ # The same, but preserve the "mtime" modification times. # mkdir Celtic; cd Celtic # w3ld -r2 -mnv3 http://trillian.mit.edu:8739/~jc/abc/Scotland/ \ # http://trillian.mit.edu:8739/~jc/abc/Ireland/ # Similar, but copy only files that are newer than those already # in the local directory. The -v3 causes the output to include # all file names; files not copied are marked "(not copied)". The # two remote directories will be merged into one local Celtic # directory. (If there are two different files by the same name, # the one under Scotland/ will will be lost.) # mkdir celtic; cd celtic # w3ld http://celtic.stanford.edu/pub/tunes/abc.tunes/ # Download the abc tune collections from the Ceolas archive. No # -r option here so we don't go into the abc.tunes.ps/ directory. # mkdir hn; cd hn # w3ld http://home1.swipnet.se/~w-11382/abc/ # Download Henrik Norbeck's abc files. No -r option, so we don't # go into his subdirectories. This gets a lot of big abc files. # LIMITATIONS # So far only the http:// protocol is implemented; ftp://, file:// # and others may appear if I need them. If someone feels like # adding FTP code, you might send me a copy. # DEBUGGING # You can use "perl -dw", of course. Or you can do the following: # setenv V_w3ld 5/tmp/w3ld.out # csh or tcsh users. # export V_w3ld=5/tmp/w3ld.out # ksh or bash users. # This will turn on the "print V" lines and write to /tmp/w3ld.out. # Or use the -v option on the command line. # BUGS # Despite many attempts to detect failure, we still don't optimally # handle all the myriad things that can go wrong. # It isn't technically possible to determine whether a URL names a # directory or a file. Our scheme is to treat a final '/' as the # sign of a directory. Most web servers use this convention when # they produce directory listings. But there are web pages around # that lie, putting '/' on non-directory URLs and leaving '/' off # of directory URLs, and we have no defense against this. # Many servers have a low limit to the number of links that they # can have open simultaneously. This program's "recursive" copy is # truly recursive, and a URL is kept open while lower URLs are # read. So if a server limits links to, say, 4, then -r5 won't # work, because it requires keeping 5 links open. # It's impossible to detect multiply-linked files via HTTP, and we # don't try. The result will be duplicate files with different # names. If you want to fix this, look for my "relink" program. # COPY BLOCKING # This program implements a simple kind of protection to prevent # downloading files, such as those that contain copyrighted data. # If a file foo.bar is a link to the file .Copyright.bar in the # same directory, we will not download foo.bar from another system. # For files without a suffix, the file .Copyright is checked. For # this to be effective, the .Copyright* file must be in the same # directory and must have the same suffix. # For example, if you know that the "MissRowanDavies.abc" is the # name of a file that contains copyrighted data, you can create a # file .Copyright.abc, link it to MissRowanDavies.abc, and a file # by that name will not be downloaded into your directory, even if # newer. # FUTURE # Lots of options are possible. One useful option might be to write # the files to STDOUT, as one big file, prehaps in shar format. Is # it possible to pipe such output to tar and have it do the Right # Thing? # AUTHOR # John Chambers # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # $| = 1; # Unbuffer STDOUT. $exitstat = 0; # Our exit status. ($me = $0) =~ s"^.*/""; # Our name, minus directory. $Vdfl = 2; # Default verbose level. &Vopt($ENV{"V_$me"} || $ENV{"T_$me"} || $ENV{"D_$me"} || $Vdfl); print V "$me: Started ", `date` if $V>1; ### Here are some defaults that you may wish to twiddle: $copyright = 1; # Check for copyright files. $mtimes = 1; # Copy modification times. $newer = 0; # Copy only newer files. $rlevel = 0; # Current recursion level. $rlimit = 2; # Maximum recursion depth allowed. ### Some globals that you probably don't want to change: %ML = ('Jan',0,'Feb',1,'Mar',2,'Apr',3,'May',4,'Jun',5,'Jul',6,'Aug',7,'Sep',8,'Oct',9,'Nov',10,'Dec',11); @ML = (31,28,31,30,31,30,31,31,30,31,30,31); $LMpat = '^(Last-modified):\s*(\w+),\s*(\d+)\s*(\w+)\s*(\d+)\s*(\d+):(\d+):(\d+)\s*(\w+)'; $spm = 60; $mph = 60; $hpd = 24; $dpy = 365; $sph = $spm * $mph; $spd = $sph * $hpd; $spy = $spd * $dpy; ### Run thru the command-line args. for $a (@ARGV) { if (($flg,$opt) = ($a =~ /^([-+])(.*)/)) { while ($opt) { if ($opt =~ s/^b([-~]\d*)//i) { # Backup. if ($flg eq '-') {$bplim = 0} else {&bpinit($1)} } elsif ($opt =~ s/^b(\..*)//i) { # Backup. if ($flg eq '-') {$bplim = 0} else {&bpinit($1)} } elsif ($opt =~ s/^c//i) { # Check for copyright files. $copyright = ($flg eq '+'); } elsif ($opt =~ s/^m//i) { # Copy mtime values. $mtimes = ($flg eq '+'); } elsif ($opt =~ s/^n//i) { # Copy newer files. $newer = ($flg eq '+'); } elsif ($opt =~ s/^p(.*)//i) { # Proxy host. if ($flg eq '-') {$W3nopxy = $1} else {$W3proxy = $1} } elsif ($opt =~ s/^r(\d*)//i) { # Recursion limit. $rlimit = $1 || 2; } elsif ($opt =~ s/^v(\d+)(.*)//i) { # Verbose. &Vopt("$1$2"); } elsif ($opt =~ s/(.)//) { print STDERR "$me: Unknown option \"$1\" ignored.\n" if $V>0; } elsif ($opt =~ s/^x(.*)//i) { # Exceptions. push @Xpats, $1; } } next; } # If it's not an option, treat it as a URL: $a =~ s"/*$"/"; # Make sure there's exactly one final '/' print V "$me: URL \"$a\"\n" if $V>3; $baseurl = $a; $baselen = length($a); &ldir($a,''); } print V "$me: Finished ", `date` if $V>1; exit $exitstat; # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # sub bpinit { local($bpopt) = @_; if ($bpopt =~ /^([-\~])(\d*)/) { $bpchr = $bpsuf = $1; $bplim = int($2) || 7; print V "$me: Backup chr=suf=\"$bpchr\" lim=$bplim.\n" if $V>2; } elsif ($bpopt =~ /\./) { $bpchr = '.'; $bpsuf = $bpopt; $bplim = 1; print V "$me: Backup chr='$bpchr' suf=\"$bpsuf\" lim=$bplim.\n" if $V>2; } else { print V "$me: Backup option \"$bpopt\" not recognized.\n" if $V>0; $bpchr = $bpsuf = ''; $bplim = 0; } } # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # # Extract a directory listing from a server. We make some assumptions # # here about how web servers return directory listings; if they get # # too weird, we won't recognize the contents. The first arg is a URL # # that should be a directory (or perhaps an index.html file that acts # # like a directory). The second argument is the local directory that # # files are to be copied into; we use it to keep track of where we # # are in recursive loads. # # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # sub ldir { local($url,$dir) = @_; local($buf,$href,$htag,$init,$line,$tmp,*U,$URL); ++$rlevel; $URLerr = "Don't know why"; return 0 if !&chkurl($u); if ($dir) { $URL = "$url$dir"; } else { $URL = $url; } if (!&URLopen(U,$URL)) { print V "ldir: Can't open \"$URL\" ($URLerr)\n" if $V>0; $exitstat = 1; close U; # Paranoia. --$rlevel; return 0; } more: while ($buf = ) { print V "ldir: Got \"$buf\"\n" if $V>5; $buf =~ s/\s*$/ /; $line .= $buf; while ($line) { print V "ldir: \"$line\"\n" if $V>5; $line =~ s/^[^<]+//; # Delete junk before first '<'. if ($line =~ s/^<([^>]*)>//) { # Scan for a tag. $htag = $1; print V "ldir: htag=\"$htag\"\n" if $V>4; if ($htag =~ /^a\b/i) { print V "ldir: htag=\"$htag\" is anchor.\n" if $V>3; if ($htag =~ /href="([^"]+)"/i) { $href = $1; print V "ldir: href=\"$href\"\n" if $V>3; next if ($href =~ /^#/); # Skip relative URLs. next if ($href =~ /\/\.\.\//); # Skip upward links. if ($line =~ /^\s*Parent dir/i) { print V "ldir: href=\"$href\" rejected (parent dir)\n" if $V>3; } else { print V "ldir: href=\"$href\" looks good ...\n" if $V>3; if ($href =~ m"/$") { # Final '/'; treat as directory. $U = &URLhref($URL,$href); print V "ldir: New URL: \"$U\"\n" if $V>3; $init = substr($U,0,$baselen); $sdir = substr($U,length($init)); if ($rlevel < $rlimit) { if ($init eq $baseurl) { print V "ldir: sdir=\"$sdir\"\n" if $V>4; if (&chkurl($U)) { print V "ldir: \"$U\" is new URL.\n" if $V>4; &ldir($init,$sdir); } } else { print V "--- Ignoring \"$U\"\n" if $V>1; print V " Notunder \"$baseurl\"\n" if $V>1; } } else { print V "$U ignored (past recursion limit $rlimit)\n" if $V>0; } } else { # No final '/'; treat as file. print V "ldir: File u=\"$URL\" h=\"$href\"\n" if $V>3; &Copy($URL,$dir,$href); } } } else { print V "ldir: htag=\"$htag\" has no href.\n" if $V>3; } } else { print V "ldir: htag=\"$htag\" ignored.\n" if $V>6; } } else { print V "ldir: \"$line\" contains no tag.\n" if $V>5; $line = ''; } } } close U; --$rlevel; } # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # # Examine a URL and decide whether we want to read it. We return 1 # if it looks interesting; 0 if we should ignore it. If the second # arg is present, it is a URL that we "came from"; we might want to # use that as part of the decision.. sub chkurl { local($u,$f) = @_; local($x); if ($u =~ /^(ftp|file|mailto):/i) { print V "$u\t--- Can't handle $1 protocol yet.\n" if $V>2; return 0; } if ($Ufrom{$U}) { print V "$U\tdone already (from:$Ufrom{$U})\n" if $V>2; $Ufrom{$U} .= ' ' . $f if $f; return 0; } for $x (@Xpats) { if ($u =~ $x) { print V "$U\tskipped (/$x/)\n" if $V>1; return 0; } } return 1; } # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # # Given a local file pathname, we check to see whether it is a link # # to a local file whose name starts with ".Copyright" and has the # # same suffix. If so, we return true, otherwise we return 1. # # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # sub copyrightcheck { local($pth) = @_; local($dir,$fil,$bas,$suf,$cpf); print V "copyrightcheck: pth=\"$pth\"\n" if $V>3; if (($dir,$fil) = ($pth =~ m"^(.*/)(.*)")) { } else { $dir = ''; $fil = $pth; } print V "copyrightcheck: dir=\"$dir\" fil=\"$fil\"\n" if $V>3; if (($bas,$suf) = ($fil =~ m"^(.*)(\..*)$")) { $cpf = $dir . '.Copyright' . $suf; } else { $cpf = $dir . '.Copyright'; } print V "copyrightcheck: bas=\"$bas\" suf=\"$suf\"\n" if $V>3; print V "copyrightcheck: cpf=\"$cpf\"\n" if $V>3; if (($dv1,$in1) = stat($cpf)) { print V "copyrightcheck: dv1=$dv1 in1=$in1 cpf=\"$cpf\"\n" if $V>3; if (($dv2,$in2) = stat($pth)) { print V "copyrightcheck: dv2=$dv2 in2=$in2 pth=\"$pth\"\n" if $V>3; if ($dv1 == $dv2 && $in1 == $in2) { print "$pth\tis a copyright file.\n" if $V>2; return 1; } } } return 0; } # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # # Copy a single remote file into a local file. We are passed the base # # URL, the directory, and the HREF. We combine the directory and the # # href to get the local pathname. We might have to use mkdir to # # create the directory; we trust that only the final directory will # # be needed, due to the recursive nature of our work here. # # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # sub Copy { local($u,$d,$h) = @_; local($buf,$dir,$fil,$hdr,$n,$path,$url,*F); local($pro,$err,$msg); local($UWD,$UD0,$UD1,$UM0,$UM1,$UYY,$Uhh,$Umm,$Uss,$Utz,$Usi,$Uts); local($FWD,$FD0,$FD1,$FM0,$FM1,$FYY,$Fhh,$Fmm,$Fss,$Ftz,$Fsi,$Fts); print V "Copy: u=\"$u\"\n" if $V>3; print V "Copy: d=\"$d\"\n" if $V>3; print V "Copy: h=\"$h\"\n" if $V>3; return if !&chkurl($u); $url = &URLhref($u,$h); print V "Copy: url=\"$url\"\n" if $V>3; if (!&URLopen(*F,$url)) { print V "Copy: Can't open \"$url\" ($!)\n" if $V>4; close F; return; } print V "Copy: url=\"$url\" opened.\n" if $V>3; if ($d) { $path = "$d$h"; print V "Copy: path=\"$path\"\n" if $V>3; } else { $path = $h; print V "Copy: path=\"$path\"\n" if $V>3; } while ($URLhdr) { ($buf = ) =~ s/\s+$//; print V "Copy: hdr \"$buf\"\n" if $V>3; if ($buf =~ /^\s*$/) { $URLhdr = 0; # Done with headers. } elsif (($pro,$err,$msg) = ($buf =~ m"(HTTP/[\d.]+)\s*([4-9]\d+)\s+(.*)$")) { print V "$url\t$pro error $err \"$msg\"\n" if $V>0; close F; return; } elsif (!$Usi) { # Only notice one Last-modified line. if (($hdr,$UWD,$UD1,$UMo,$UYY,$Uhh,$Umm,$Uss,$Utz) = ($buf =~ /$LMpat/i)) { print V "Copy: $buf\n" if $V>3; $UM0 = $ML{$UMo}; $UM1 = $UM0 + 1; $UD0 = $UD1 - 1; $Usi = sprintf("%04d%02d%02d%02d%02d%02d",$UYY,$UM1,$UD1,$Uhh,$Umm,$Uss); print V "Copy: $Usi $tz ($url)\n" if $V>3; ($FX1,$FX2,$FX3,$FX4,$FX5,$FX6,$FX7,$FX8,$FX9,$Fts) = stat($path); ($Fss,$Fmm,$Fhh,$FD1,$FM0,$FYY) = gmtime($Fts); $FYY += 1900; $FM1 = $FM0 + 1; $FD0 = $FD1 - 1; $Fsi = sprintf("%04d%02d%02d%02d%02d%02d",$FYY,$FM1,$FD1,$Fhh,$Fmm,$Fss); print V "Copy: $Fsi GMT ($path)\n" if $V>3; if ($Usi le $Fsi) { print V "$url\t< $path\t(not copied)\n" if $V>2; close F; return; } print V "Copy: $url newer than $path\n" if $V>3; } } } if (-f $path) { print V "Copy: $url exists.\n" if $V>3; if ($copyright && ©rightcheck($path)) { print V "$url\tskipped (copyright file)\n" if $V>1; close F; return; } if ($bplim > 0) { print V "Copy: No backup yet; unlink(\"$path\"\n" if $V>2; unlink($path); } else { print V "Copy: No backups; unlink(\"$path\"\n" if $V>3; unlink($path); } } if (!open(O,">$path")) { print V "Copy: Can't write \"$path\" ($!)\n" if $V>4; if (($dir,$fil) = ($path =~ m"^(.*)/(.+)$")) { # Missing directory? print V "Copy: dir=\"$dir\" fil=\"$fil\"\n" if $V>3; if (!mkdir($dir,0777)) { # Try to create the directory. print V "Copy: Can't mkdir \"$dir\" ($!)\n" if $V>0; close F; return; } print V "DIR\t\"$dir\" created.\n" if $V>1; if (!open(O,">$path")) { # Retry the open. print V "Copy: Can't write \"$path\" ($!)\n" if $V>0; close F; return; } } else { close F; return; } } print V "$url\t> $path\t(..." if $V>1; while ($buf = ) { print V "Copy: Got \"$buf\"\n" if $V>5; print O $buf; } print V " copied)\n" if $V>1; close F; close O; if ($Usi && $mtimes) { print V "Copy: Set mtime to $Usi ...\n" if $V>1; $Uts = &ts($UYY,$UM0,$UD1,$Uhh,$Umm,$Uss); utime(time,$Uts,$path); } } # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # # Convert a 6-number time list to a timestamp. This probably only # works on Unix/POSIX systems. It'll be off by a day after 21000301. sub ts { local($YY,$M0,$D1,$hh,$mm,$ss) = @_; local($ly,$ts,$m,$yy); $yy = ($YY > 1900) ? ($YY - 1970) : $YY; # Year since epoch. --$D1; # Adjust for no zero days. $ts = ($yy * $spy); # Secs to start of this year. print "ts: ts=$ts to start of year $yy.\n" if $V>3; $ly = int(($yy + 2) / 4); # Leap years since epoch. $ts += $ly * $spd; # Add a day per leap year. print "ts: ts=$ts for $ly leap years.\n" if $V>3; for ($m=0; $m<$M0; $m++) { # Add months this year. $ts += $ML[$m] * $spd; print "ts: ts=$ts for month $m of this year.\n" if $V>3; } print "ts: ts=$ts for $m months this year.\n" if $V>3; if (($M0 > 2) && ($yy % 4 == 0)) { # If this is a leap year. $ts += $spd; print "ts: ts=$ts for this leap year.\n" if $V>3; } $ts += ($D1 * $spd); # Add days this month. print "ts: ts=$ts for $D1 days this month.\n" if $V>3; $ts += ($hh * $sph); # Add hours in day. print "ts: ts=$ts for $hh hours this day.\n" if $V>3; $ts += ($mm * $spm); # Add minutes in hour. print "ts: ts=$ts for $mm minutes this hour.\n" if $V>3; $ts += $ss; print "ts: ts=$ts for $ss seconds this minute.\n" if $V>3; print "ts: $YY/$M0/$D1 $hh:$mm:$ss => $ts.\n" if $V>3; @X = gmtime($ts); print "ts: $X[5]/$X[4]/$X[3] $X[2]:$X[1]:$X[0] <= $ts.\n" if $V>3; return $ts; }