#!/usr/bin/perl
#
# NAME
#   w3ld - (down)load the contents of a web directory.
#
# SYNOPSIS
#   w3ld [options].. URL..
#
# DESCRIPTION
#   This program expects one or more URLs on the command line.   Each
#   URL  is  treated  as a directory and its contents are copied into
#   the current directory.  If the -r ("recursive") option  is  used,
#   subdirectories will be identified and copied recursively.
#
#   The URLs on the command line should all end with '/', but if  one
#   doesn't, we add '/'. Thus, this program currently can't grab just
#   a single file. (That's done by w3cat, which should be in the same
#   place you found this program.)
#
#   By "treated as a directory", we mean that the URL's contents are
#   scanned for hyperlinks.  Those hyperlinks are in turn downloaded,
#   and written to local files that have the same names.
#
# OPTIONS
#   Options start with '-' or '+'. For some of the options, '+' means
#   "enable"  and  '-'  means  "disable"; for others, there's no such
#   concept and you may use either flag char.  The options that don't
#   have  args may be combined with the next option in the usual way,
#   so that "-mnr2" is the same as "-m -n -r2".
#
#   Default options are set up to whatever I mostly  find  useful  at
#   the moment, so they may change from time to time. Check below for
#   what the true defaults are in whatever version you have.
#
# # The following option isn't yet fully implemented; don't use it.
# # -b
# #   Don't do any backup.  This is usually the default.
# # +b<c><l>
# #   Back up existing files. Several backup styles are supported. if
# #   <c>  is  '-'  or '~', then backups are made by appending <c> to
# #   the name.  Such backups are recursive, i.e., if a  backup  file
# #   exists,  it  will  in  turn be backed up, resulting in multiple
# #   flag chars on the end of its name. If <l> is present, it is the
# #   limit  to the number of backups.  The default in some versions is
# #   which will produce the file and up to 7 backup copies.  If  <c>
# #   is  '.',  then the backup will be done by appending .<l> to the
# #   name, and only one backup is possible.  Thus -b.bak will rename
# #   foo.c as foo.c.bak, as is conventional of DOS systems.
#
#   -c
#     Don't do a copyright check (described below).
#   +c
#     Do a copyright check (default).
#
#   -m
#     Don't copy modification times.  Copied files will have mtimes
#     as of when they are copied.
#   +m
#     Set mtimes (modification time) from the  "Last-modified"  times
#     for URLs that are copied.
#
#   -n
#     Copy all files, ignoring modification times.
#   +n
#     If a file already exists, copy only the newer files. If the web
#     server doesn't give a coherent date, we will assume the file is
#     newer and copy it. At verbose level -v3, URLs not copied due to
#     age are flagged with '<'.
#
#   -p<proxy>
#     Sends requests via a proxy HTTP server.  The <proxy> should be
#     the "host.name:port" portion of a URL.   
#
#   -r<N>
#     Recurse to <N> levels.  The default, -r1, means to  treat  only
#     the top-level URLs as directories.  The -r2 option means to try
#     to determine which of the second-level  URLs  are  directories.
#     When  such  a  subdirectory  is found, a local directory by the
#     same name is created, and filled with the URL's  contents.   If
#     <N> is omitted, -r2 is assumed.
#
#   -v<N>[file]
#     Verbose option.  Sets the verbose level to <N>, and if the file
#     name is present, writes to that file.  The default is currently
#     -v2, which will get you a list of URLs copied, flagged by  '>',
#     and  written  to STDOUT.  Use -v3 if you'd like to see the URLs
#     that are not copied, flagged by '<' or a reason.
#
#   -x<pat>
#     Exceptions. Any URLs that match the pattern will be ignored and
#     not fetched.
#
# EXAMPLES
#
#   mkdir RFC; cd RFC
#   w3ld http://ds.internic.net/rfc/
#     Download all the RFCs from the InterNIC.
#
#   mkdir Scot; cd Scot
#   w3ld -r2 http://trillian.mit.edu:8739/~jc/abc/Scotland/
#     This downloads John Chambers' Scottish Country Dance music into
#     the local directory 'Scot'.  The -r2 is needed because the tune
#     files are in subdirectories 'jig', 'reel', and so on.
#
#   mkdir Scot; cd Scot
#   w3ld -mr2 http://trillian.mit.edu:8739/~jc/abc/Scotland/
#     The same, but preserve the "mtime" modification times.
#
#   mkdir Celtic; cd Celtic
#   w3ld -r2 -mnv3 http://trillian.mit.edu:8739/~jc/abc/Scotland/ \
#                  http://trillian.mit.edu:8739/~jc/abc/Ireland/
#     Similar, but copy only files that are newer than those  already
#     in  the  local directory.  The -v3 causes the output to include
#     all file names; files not copied are marked "(not copied)". The
#     two  remote  directories  will  be merged into one local Celtic
#     directory.  (If there are two different files by the same name,
#     the one under Scotland/ will be lost.)
#
#   mkdir celtic; cd celtic
#   w3ld http://celtic.stanford.edu/pub/tunes/abc.tunes/
#     Download the abc tune collections from the Ceolas archive.   No
#     -r option here so we don't go into the abc.tunes.ps/ directory.
#
#   mkdir hn; cd hn
#   w3ld http://home1.swipnet.se/~w-11382/abc/
#     Download Henrik Norbeck's abc files.  No -r option, so we don't
#     go into his subdirectories.  This gets a lot of big abc files.
#
# LIMITATIONS
#   So far only the http:// protocol is implemented; ftp://,  file://
#   and  others  may  appear  if  I need them.  If someone feels like
#   adding FTP code, you might send me a copy.
#
# DEBUGGING
#    You can use "perl -dw", of course. Or you can do the following:
#     setenv V_w3ld 5/tmp/w3ld.out	# csh or tcsh users.
#     export V_w3ld=5/tmp/w3ld.out	# ksh or bash users.
#   This will turn on the "print V" lines and write to /tmp/w3ld.out.
#   Or use the -v option on the command line.
#
# REQUIRES
#   If the following lines are commented out,  then  I've  read  them
#   into this script, and nothing is required. If these are live perl
#   commands, then you'll have to make sure that these  .pmfiles  are
#   found on your system:
#
#   $H = $ENV{'HOME'} || '.';
#   push(@INC, "$H/sh", "$H/pl");
#   require "HTTPcon.pm";	# To make HTTP connections.
#   require "URLopen.pm";	# To open a URL and get a file handle.
#   require "URLhref.pm";	# URL + HREF -> new URL.
#   require "URLtrim.pm";	# Remove /../ and trailing junk from URL.
#
# BUGS
#   Despite  many attempts to detect failure, we still don't optimally
#   handle all the myriad things that can go wrong.
#
#   It isn't technically possible to determine whether a URL names  a
#   directory  or  a file.  Our scheme is to treat a final '/' as the
#   sign of a directory.  Most web servers use this  convention  when
#   they  produce directory listings.  But there are web pages around
#   that lie, putting '/' on non-directory URLs and leaving  '/'  off
#   of directory URLs, and we have no defense against this.
#
#   Many servers have a low limit to the number of  links  that  they
#   can have open simultaneously.  This program's "recursive" copy is
#   truly recursive, and a URL is kept  open  while  lower  URLs  are
#   read.   So  if  a  server limits links to, say, 4, then -r5 won't
#   work, because it requires keeping 5 links open.
#
#   It's impossible to detect multiply-linked files via HTTP, and  we
#   don't  try.   The  result  will be duplicate files with different
#   names.  If you want to fix this, look for my "relink" program.
#
# COPY BLOCKING
#   This program implements a simple kind of  protection  to  prevent
#   downloading  files,  such as those that contain copyrighted data.
#   If a file foo.bar is a link to the  file  .Copyright.bar  in  the
#   same directory, we will not download foo.bar from another system.
#   For files without a suffix, the file .Copyright is checked.   For
#   this  to  be  effective, the .Copyright* file must be in the same
#   directory and must have the same suffix.
#
#   For example, if you know that the  "MissRowanDavies.abc"  is  the
#   name  of  a file that contains copyrighted data, you can create a
#   file .Copyright.abc, link it to MissRowanDavies.abc, and  a  file
#   by  that name will not be downloaded into your directory, even if
#   newer.
#
# FUTURE
#   Lots of options are possible.  One useful option might be to write
#   the files to STDOUT, as one big file.
#
# AUTHOR
#   <a href="mailto:jc@trillian.mit.edu">John Chambers</a>
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #

$| = 1;					# Unbuffer STDOUT.
$exitstat = 0;			# Our exit status.
($me = $0) =~ s"^.*/"";	# Our name, minus directory.
$Vdfl = 2;				# Default verbose level.
&Voption($ENV{"V_$me"} || $ENV{"T_$me"} || $ENV{"D_$me"} || $Vdfl);
print V "$me: Started ", `date` if $V>1;

### Here are some defaults that you may wish to twiddle:
$copyright = 1;	# Check for copyright files.
$mtimes    = 1;	# Copy modification times.
$newer     = 0;	# Copy only newer files.
$rlevel    = 0;	# Current recursion level.
$rlimit    = 2;	# Maximum recursion depth allowed.

### Some globals that you probably don't want to change:
%ML = ('Jan',0,'Feb',1,'Mar',2,'Apr',3,'May',4,'Jun',5,'Jul',6,'Aug',7,'Sep',8,'Oct',9,'Nov',10,'Dec',11);
@ML = (31,28,31,30,31,30,31,31,30,31,30,31);
$LMpat = '^(Last-modified):\s*(\w+),\s*(\d+)\s*(\w+)\s*(\d+)\s*(\d+):(\d+):(\d+)\s*(\w+)';
$spm = 60; $mph = 60; $hpd = 24; $dpy = 365;
$sph = $spm * $mph; $spd = $sph * $hpd; $spy = $spd * $dpy;

### Run thru the command-line args.
for $a (@ARGV) {
	if (($flg,$opt) = ($a =~ /^([-+])(.*)/)) {
		while ($opt) {			# Options may be bundled, e.g. "-mnr2".
			if ($opt =~ s/^b([-~]\d*)//i) {		# Backup.
				if ($flg eq '-') {$bplim = 0} else {&bpinit($1)}
			} elsif ($opt =~ s/^b(\..*)//i) {	# Backup.
				if ($flg eq '-') {$bplim = 0} else {&bpinit($1)}
			} elsif ($opt =~ s/^c//i) {			# Check for copyright files.
				$copyright = ($flg eq '+');
			} elsif ($opt =~ s/^m//i) {			# Copy mtime values.
				$mtimes = ($flg eq '+');
			} elsif ($opt =~ s/^n//i) {			# Copy newer files.
				$newer = ($flg eq '+');
			} elsif ($opt =~ s/^p(.*)//i) {		# Proxy host.
				$HTTPpxy = $1;
			} elsif ($opt =~ s/^r(\d*)//i) {	# Recursion limit.
				$rlimit = $1 || 2;
			} elsif ($opt =~ s/^v(\d+)(.*)//i) {	# Verbose.
				&Voption("$1$2");
			} elsif ($opt =~ s/^x(.*)//i) {		# Exception pattern.  (This
				push(@Xpats, $1);				# must precede the catch-all
												# below, which shadowed it.)
			} elsif ($opt =~ s/(.)//) {			# Anything else is unknown.
				print STDERR "$me: Unknown option \"$1\" ignored.\n" if $V>0;
			}
		}
		next;
	}
	# If it's not an option, treat it as a URL:
	$a =~ s"/*$"/";		# Make sure there's exactly one final '/'
	print V "$me: URL \"$a\"\n" if $V>3;
	$baseurl = $a;			# &ldir uses these globals to decide which
	$baselen = length($a);	# URLs are "under" the one requested.
	&ldir($a,'');
}

print V "$me: Finished ", `date` if $V>1;
exit $exitstat;

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Parse a backup-style option string and load the $bpchr, $bpsuf and
# $bplim globals.  "-N" or "~N" means recursive backups made by appending
# the flag char, up to N of them (7 if N is omitted).  ".suf" means a
# single backup made by appending the suffix.  Anything else disables
# backups entirely.
sub bpinit {
	local($bpopt) = @_;
	local($c,$n);
	if (($c,$n) = ($bpopt =~ /^([-\~])(\d*)/)) {
		$bpchr = $bpsuf = $c;
		$bplim = int($n) || 7;		# No count given: keep 7 backups.
		print V "$me: Backup chr=suf=\"$bpchr\" lim=$bplim.\n" if $V>2;
	} elsif ($bpopt =~ /\./) {		# ".suf" style: one backup by rename.
		$bpchr = '.';
		$bpsuf = $bpopt;
		$bplim = 1;
		print V "$me: Backup chr='$bpchr' suf=\"$bpsuf\" lim=$bplim.\n" if $V>2;
	} else {						# Unrecognized: turn backups off.
		print V "$me: Backup option \"$bpopt\" not recognized.\n" if $V>0;
		$bpchr = $bpsuf = '';
		$bplim = 0;
	}
}

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Extract a directory listing from a server. We make some assumptions #
# here  about  how web servers return directory listings; if they get #
# too weird, we won't recognize the contents.  The first arg is a URL #
# that should be a directory (or perhaps an index.html file that acts #
# like a directory).  The second argument is the local directory that #
# files  are  to  be copied into; we use it to keep track of where we #
# are in recursive loads.                                             #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
sub ldir {
	local($url,$dir) = @_;
	local($buf,$href,$htag,$init,$line,$sdir,$tmp,*U,$URL);
	++$rlevel;
	$URLerr = "Don't know why";	# Default reason if the open fails oddly.
	if ($dir) {
		$URL = "$url$dir";		# Subdirectory: append to the base URL.
	} else {
		$URL = $url;
	}
	# (Was "return 0 if !&chkurl($u)": $u is never set in this sub, so the
	# check was a no-op, and the early return leaked the $rlevel bump.)
	if (!&chkurl($URL)) {
		--$rlevel;
		return 0;
	}
	if (!&URLopen(U,$URL)) {
		print V "ldir: Can't open \"$URL\" ($URLerr)\n" if $V>0;
		$exitstat = 1;
		close U;	# Paranoia.
		--$rlevel;
		return 0;
	}
	# Accumulate the listing in $line and scan it tag by tag, fetching
	# each <A HREF=...> we like.  defined() so a lone "0" line is data.
	while (defined($buf = <U>)) {
		print V "ldir: Got \"$buf\"\n" if $V>5;
		$buf =~ s/\s*$/ /;			# Fold the newline into one space.
		$line .= $buf;
		while ($line) {
			print V "ldir: \"$line\"\n" if $V>5;
			$line =~ s/^[^<]+//;			# Delete junk before first '<'.
			if ($line =~ s/^<([^>]*)>//) {	# Scan for a tag.
				$htag = $1;
				print V "ldir: htag=\"$htag\"\n" if $V>4;
				if ($htag =~ /^a\b/i) {		# Only anchors interest us.
					print V "ldir: htag=\"$htag\" is anchor.\n" if $V>3;
					if ($htag =~ /href="([^"]+)"/i) {
						$href = $1;
						print V "ldir: href=\"$href\"\n" if $V>3;
						next if ($href =~ /^#/);		# Skip fragment refs.
						next if ($href =~ /\/\.\.\//);	# Skip upward links.
						if ($line =~ /^\s*Parent dir/i) {
							print V "ldir: href=\"$href\" rejected (parent dir)\n" if $V>3;
						} else {
							print V "ldir: href=\"$href\" looks good ...\n" if $V>3;
							if ($href =~ m"/$") {	# Final '/'; treat as directory.
								$U = &URLhref($URL,$href);
								print V "ldir: New URL: \"$U\"\n" if $V>3;
								$init = substr($U,0,$baselen);
								$sdir = substr($U,length($init));
								if ($rlevel < $rlimit) {
									if ($init eq $baseurl) {	# Only descend under the base.
										print V "ldir: sdir=\"$sdir\"\n" if $V>4;
										if (&chkurl($U)) {
											print V "ldir: \"$U\" is new URL.\n" if $V>4;
											&ldir($init,$sdir);
										}
									} else {
										print V "--- Ignoring \"$U\"\n" if $V>1;
										print V "    Notunder \"$baseurl\"\n" if $V>1;
									}
								} else {
									print V "$U ignored (past recursion limit $rlimit)\n" if $V>0;
								}
							} else {		# No final '/'; treat as file.
								print V "ldir: File u=\"$URL\" h=\"$href\"\n" if $V>3;
								&Copy($URL,$dir,$href);
							}
						}
					} else {
						print V "ldir: htag=\"$htag\" has no href.\n" if $V>3;
					}
				} else {
					print V "ldir: htag=\"$htag\" ignored.\n" if $V>6;
				}
			} else {
				print V "ldir: \"$line\" contains no tag.\n" if $V>5;
				$line = '';
			}
		}
	}
	close U;
	--$rlevel;
}

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Examine a URL and decide whether we want to read it.  We return 1
# if it looks interesting; 0 if we should ignore it.  If the second
# arg is present, it is a URL that we "came from"; we might want to
# use that as part of the decision.. 
sub chkurl {
	local($u,$f) = @_;
	local($x);
	# Only HTTP is implemented; reject other protocols outright.
	if ($u =~ /^(ftp|file|mailto):/i) {
		print V "$u\t--- Can't handle $1 protocol yet.\n" if $V>2;
		return 0;
	}
	# Skip URLs we've already seen.  (Was keyed on the global $U, so the
	# check looked at the wrong variable; use the $u argument.)
	if ($Ufrom{$u}) {
		print V "$u\tdone already (from:$Ufrom{$u})\n" if $V>2;
		$Ufrom{$u} .= ' ' . $f if $f;	# Remember another referrer.
		return 0;
	}
	# Skip anything matching a -x exception pattern.
	for $x (@Xpats) {
		if ($u =~ $x) {
			print V "$u\tskipped (/$x/)\n" if $V>1;
			return 0;
		}
	}
	return 1;
}

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Given a local file pathname, we check to see whether it is  a  link #
# to  a  local  file  whose name starts with ".Copyright" and has the #
# same suffix.  If so, we return 1, otherwise we return 0.            #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
sub copyrightcheck {
	local($pth) = @_;
	local($dir,$fil,$bas,$suf,$cpf);
	local($dv1,$in1,$dv2,$in2);		# stat() results; now localized.
	print V "copyrightcheck: pth=\"$pth\"\n" if $V>3;
	# Split the path into directory and file name parts.
	if (($dir,$fil) = ($pth =~ m"^(.*/)(.*)")) {
	} else {
		$dir = ''; $fil = $pth;
	}
	print V "copyrightcheck: dir=\"$dir\" fil=\"$fil\"\n" if $V>3;
	# The guard file is .Copyright with the same suffix (if any) in the
	# same directory.
	if (($bas,$suf) = ($fil =~ m"^(.*)(\..*)$")) {
		$cpf = $dir . '.Copyright' . $suf;
	} else {
		$cpf = $dir . '.Copyright';
	}
	print V "copyrightcheck: bas=\"$bas\" suf=\"$suf\"\n" if $V>3;
	print V "copyrightcheck: cpf=\"$cpf\"\n" if $V>3;
	# The file is "copyrighted" iff it is a hard link to the guard file:
	# same device and same inode number.
	if (($dv1,$in1) = stat($cpf)) {
		print V "copyrightcheck: dv1=$dv1 in1=$in1 cpf=\"$cpf\"\n" if $V>3;
		if (($dv2,$in2) = stat($pth)) {
			print V "copyrightcheck: dv2=$dv2 in2=$in2 pth=\"$pth\"\n" if $V>3;
			if ($dv1 == $dv2 && $in1 == $in2) {
				# (Was a bare print to STDOUT; all other diagnostics go to V.)
				print V "$pth\tis a copyright file.\n" if $V>2;
				return 1;
			}
		}

	}
	return 0;
}

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Copy a single remote file into a local file. We are passed the base #
# URL, the directory, and the HREF.  We combine the directory and the #
# href to get the local pathname.  We might  have  to  use  mkdir  to #
# create  the  directory; we trust that only the final directory will #
# be needed, due to the recursive nature of our work here.            #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
sub Copy {
	local($u,$d,$h) = @_;
	local($buf,$dir,$fil,$hdr,$n,$path,$url,*F);
	local($pro,$err,$msg);
	local($UWD,$UD0,$UD1,$UM0,$UM1,$UMo,$UYY,$Uhh,$Umm,$Uss,$Utz,$Usi,$Uts);
	local($FWD,$FD0,$FD1,$FM0,$FM1,$FYY,$Fhh,$Fmm,$Fss,$Ftz,$Fsi,$Fts);
	local($FX1,$FX2,$FX3,$FX4,$FX5,$FX6,$FX7,$FX8,$FX9);	# stat() filler.
	print V "Copy: u=\"$u\"\n" if $V>3;
	print V "Copy: d=\"$d\"\n" if $V>3;
	print V "Copy: h=\"$h\"\n" if $V>3;
	return if !&chkurl($u);
	$url = &URLhref($u,$h);		# Full URL of the file to fetch.
	print V "Copy: url=\"$url\"\n" if $V>3;
	if (!&URLopen(*F,$url)) {
		print V "Copy: Can't open \"$url\" ($!)\n" if $V>4;
		close F; return;
	}
	print V "Copy: url=\"$url\" opened.\n" if $V>3;
	if ($d) {			# Local path is subdirectory + href name.
		$path = "$d$h";
		print V "Copy: path=\"$path\"\n" if $V>3;
	} else {
		$path = $h;
		print V "Copy: path=\"$path\"\n" if $V>3;
	}
	# &URLopen set $URLhdr if an HTTP response header is coming.  Read and
	# discard it, watching for an error status and a Last-modified date.
	while ($URLhdr) {
		if (!defined($buf = <F>)) {	# EOF inside the header; the server
			$URLhdr = 0;			# closed early.  Treat as header end.
			last;
		}
		$buf =~ s/\s+$//;
		print V "Copy: hdr \"$buf\"\n" if $V>3;
		if ($buf =~ /^\s*$/) {
			$URLhdr = 0;	# Blank line: done with headers.
		} elsif (($pro,$err,$msg) = ($buf =~ m"(HTTP/[\d.]+)\s*([4-9]\d+)\s+(.*)$")) {
			print V "$url\t$pro error $err \"$msg\"\n" if $V>0;
			close F; return;
		} elsif (!$Usi) {	# Only notice one Last-modified line.
			if (($hdr,$UWD,$UD1,$UMo,$UYY,$Uhh,$Umm,$Uss,$Utz) = ($buf =~ /$LMpat/i)) {
				print V "Copy: $buf\n" if $V>3;
				$UM0 = $ML{$UMo}; $UM1 = $UM0 + 1; $UD0 = $UD1 - 1;
				# Sortable YYYYMMDDhhmmss strings let us compare with 'le'.
				$Usi = sprintf("%04d%02d%02d%02d%02d%02d",$UYY,$UM1,$UD1,$Uhh,$Umm,$Uss);
				print V "Copy: $Usi $Utz ($url)\n" if $V>3;
				($FX1,$FX2,$FX3,$FX4,$FX5,$FX6,$FX7,$FX8,$FX9,$Fts) = stat($path);
				($Fss,$Fmm,$Fhh,$FD1,$FM0,$FYY) = gmtime($Fts);
				$FYY += 1900; $FM1 = $FM0 + 1; $FD0 = $FD1 - 1;
				$Fsi = sprintf("%04d%02d%02d%02d%02d%02d",$FYY,$FM1,$FD1,$Fhh,$Fmm,$Fss);
				print V "Copy: $Fsi GMT ($path)\n" if $V>3;
				# NOTE(review): this age test always applies; the -n/$newer
				# flag is set by the options but never consulted here --
				# confirm whether it should gate this block.
				if ($Usi le $Fsi) {
					print V "$url\t< $path\t(not copied)\n" if $V>2;
					close F;
					return;
				}
				print V "Copy: $url newer than $path\n" if $V>3;
			}
		}
	}
	if (-f $path) {
		print V "Copy: $url exists.\n" if $V>3;
		if ($copyright && &copyrightcheck($path)) {	# +c: honor guard files.
			print V "$url\tskipped (copyright file)\n" if $V>1;
			close F; return;
		}
		if ($bplim > 0) {
			# Backups aren't implemented yet; meanwhile just remove it.
			print V "Copy: No backup yet; unlink(\"$path\")\n" if $V>2;
			unlink($path);
		} else {
			print V "Copy: No backups; unlink(\"$path\")\n" if $V>3;
			unlink($path);
		}
	}
	if (!open(O,">$path")) {
		print V "Copy: Can't write \"$path\" ($!)\n" if $V>4;
		if (($dir,$fil) = ($path =~ m"^(.*)/(.+)$")) {	# Missing directory?
			print V "Copy: dir=\"$dir\" fil=\"$fil\"\n" if $V>3;
			if (!mkdir($dir,0777)) {		# Try to create the directory.
				print V "Copy: Can't mkdir \"$dir\" ($!)\n" if $V>0;
				close F; return;
			}
			print V "DIR\t\"$dir\" created.\n" if $V>1;
			if (!open(O,">$path")) {	# Retry the open.
				print V "Copy: Can't write \"$path\" ($!)\n" if $V>0;
				close F; return;
			}
		} else {
			close F; return;
		}
	}
	print V "$url\t> $path\t(..." if $V>1;
	while (defined($buf = <F>)) {	# defined(): a lone "0" line is data too.
		print V "Copy: Got \"$buf\"\n" if $V>5;
		print O $buf;
	}
	print V " copied)\n" if $V>1;
	close F;
	close O;
	if ($Usi && $mtimes) {	# +m: set the local mtime from Last-modified.
		print V "Copy: Set mtime to $Usi ...\n" if $V>4;
		$Uts = &ts($UYY,$UM0,$UD1,$Uhh,$Umm,$Uss);
		utime(time,$Uts,$path);
	}
}
# UWD UD0 UD1 UM1 UM0 UMo UM0 UYY Uhh Umm Uss Utz Usi Uts
# FWD FD0 FD1 FM1 FM0 FMo FM0 FYY Fhh Fmm Fss Ftz Fsi Fts

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Convert a 6-number time list to a timestamp.   This  probably  only
# works on Unix/POSIX systems.  It'll be off by a day after 21000301.
sub ts {
	local($YY,$M0,$D1,$hh,$mm,$ss) = @_;
	local($ly,$ts,$m,$yy,@X);
	$yy = ($YY > 1900) ? ($YY - 1970) : $YY;	# Years since the 1970 epoch.
	--$D1;							# Adjust for no zero days.
	$ts = ($yy * $spy);				# Secs to start of this year.
							print "ts: ts=$ts to start of year $yy.\n" if $V>3;
	# Count leap days for *completed* years only.  1972 (yy=2) is the
	# first leap year, so its day is added starting at yy=3.  (The old
	# int(($yy+2)/4) also counted the current leap year, which made every
	# timestamp in 1972, 1976, ... a day late.)
	$ly = int(($yy + 1) / 4);
	$ts += $ly * $spd;				# Add a day per completed leap year.
							print "ts: ts=$ts for $ly leap years.\n" if $V>3;
	for ($m=0; $m<$M0; $m++) {		# Add months this year.
		$ts += $ML[$m] * $spd;
	}
							print "ts: ts=$ts for $m months this year.\n" if $V>3;
	# This year's Feb 29: leap years are yy = 2, 6, 10, ... (1972+4n),
	# and the extra day counts once we're into March (M0 >= 2).  (The old
	# test, $M0 > 2 && $yy % 4 == 0, picked the wrong years and skipped
	# March.)  Good until the non-leap century year 2100, per the comment
	# above.
	if (($M0 > 1) && (($yy % 4) == 2)) {
		$ts += $spd;
							print "ts: ts=$ts for this leap year.\n" if $V>3;
	}
	$ts += ($D1 * $spd);			# Add days this month.
	$ts += ($hh * $sph);			# Add hours in day.
	$ts += ($mm * $spm);			# Add minutes in hour.
	$ts += $ss;
							print "ts: $YY/$M0/$D1 $hh:$mm:$ss => $ts.\n" if $V>3;
	@X = gmtime($ts);				# Round-trip check, debug only.
							print "ts: $X[5]/$X[4]/$X[3] $X[2]:$X[1]:$X[0] <= $ts.\n" if $V>3;
	return $ts;
}

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Parse a debug option, and set up the global $V verbose level and  V #
# verbose  file.  Note that we also leave the entire option string in #
# the global $Vopt variable, not in a local variable.                 #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
sub Voption {
	($Vopt) = @_;		# Deliberately global; see the comment above.
	if ($Vopt =~ /^(\d)(.+)/) {		# A level digit followed by a file name.
		$V = $1;
		$Vfil = $2;
		open(V,">$Vfil")
			|| die "$0: Can't write \"$Vfil\" ($!)\n";
	} else {
		# A bare level, or nothing usable (fall back to the default);
		# either way the V stream goes to STDOUT.
		$V = ($Vopt =~ /^(\d+)/) ? $1 : $Vdfl;
		open(V,">&STDOUT");
	}
	select V; $| = 1; select STDOUT;	# Unbuffer the V stream.
}

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# NAME                                                                #
#   HTTPcon - make HTTP connection.                                  #
#                                                                     #
# SYNOPSIS                                                            #
#   $stat = &HTTPcon(*F,'fubar.com:1234');                           #
#                                                                     #
# DESCRIPTION                                                         #
#   This  accepts a URL's host:port portion, and attempts to make the #
#   connection.  If successful, we return 1 with F open  to  the  TCP #
#   socket.   If  we fail, we return 0, and F may or may not be open. #
#   (Maybe we should close it.)                                       #
#                                                                     #
# AUTHOR                                                              #
#   <a href="mailto:jc@trillian.mit.edu">John Chambers</a>               #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
sub HTTPcon {
	local(*F,$hp) = @_;
	local($a,$b,$c,$d,$host,$port,$this,$that,$This,$That);
	# Split "host:port"; a bare host name defaults to the HTTP port 80.
	if (($host,$port) = ($hp =~ m"^(.*):(\d+)$")) {
		print V "HTTPcon: host=\"$host\" port=\"$port\"\n" if $V>4;
	} else {
		$host = $hp;
		$port = 80;
		print V "HTTPcon: host=\"$host\" port=$port.\n" if $V>4;
	}
	# Hand-rolled socket constants and sockaddr_in template, predating
	# Socket.pm.  NOTE(review): AF_INET=2 is universal, but SOCK_STREAM
	# is not 1 on every platform -- confirm, or switch to Socket.pm.
	$AF_INET = 2;
	$SOCK_STREAM = 1;
	$sockaddr = 'S n a4 x8';
	($name,$aliases,$proto) = getprotobyname('tcp');
	# A non-numeric port (a service name) gets looked up here.
	($name,$aliases,$port) = getservbyname($port,'tcp')
		unless $port =~ /^\d+$/;
	$thisaddr = "\0\0\0\0";			# Simpler way.
	# NOTE(review): gethostbyname failure is not checked; a bad host name
	# leaves $thataddr undef and the connect below fails less clearly.
	($name,$aliases,$type,$len,$thataddr) = gethostbyname($host);
	$this = pack($sockaddr, $AF_INET, 0, $thisaddr);
	$that = pack($sockaddr, $AF_INET, $port, $thataddr);
	# Dotted-quad forms of both ends, used only in the verbose messages.
	($a,$b,$c,$d) = unpack('C4',$thisaddr); $This = "$a.$b.$c.$d:0";
	($a,$b,$c,$d) = unpack('C4',$thataddr); $That = "$a.$b.$c.$d:$port";
	if (socket(F,$AF_INET,$SOCK_STREAM,$proto)) {
		print V "HTTPcon: Got socket.\n" if $V>3;
	} else {
		print V "HTTPcon: Can't get socket ($!)\n" if $V>1;
		$exitstat = $!;
		return 0;
	}
	# Bind to INADDR_ANY, port 0 (any local port).
	if (bind(F,$this)) {
		print V "HTTPcon: Bind to \"$This\" succeeded.\n" if $V>4;
	} else {
		print V "HTTPcon: Bind to \"$This\" failed ($!)\n" if $V>1;
		$exitstat = $!;
		return 0;
	}
	if (connect(F,$that)) {
		print V "HTTPcon: Connect to \"$That\" succeeded.\n" if $V>4;
	} else {
		print V "HTTPcon: Connect to \"$That\" failed ($!)\n" if $V>1;
		$exitstat = $!;
		return 0;
	}
	select(F); $| = 1; select(STDOUT);	# Unbuffer the socket for the GET.
	return 1;
}

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Here, we take a URL that is a path to something, plus an href  that #
# is  believed  to  be a relative to the URL, and combine them into a #
# URL.  The main point of this routine is to try to  correctly  strip #
# off the last field from the path.                                   #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
sub URLhref {
	local($base,$ref) = @_;
	local($out,$prot,$host,$rest);
	# An href with its own protocol prefix is already absolute.
	if ($ref =~ m"^\w*:") {
		print V "URLhref: URL \"$ref\"\n" if $V>3;
		return $ref;
	}
	if ($ref =~ s"^/+"") {		# Host-relative: keep only scheme://host.
		print V "URLhref: /href \"$ref\"\n" if $V>3;
		if (($prot,$host,$rest) = ($base =~ m"^([^/:]+)://([^/]+)(/.*)$")) {
			print V "URLhref: \"$base\" trimmed \"$rest\"\n" if $V>3;
			$base = "$prot://$host";
		}
	} elsif ($base =~ s"/([^/]+)$"") {	# Relative: drop the final field.
		print V "URLhref: \"$base\" removed \"$1\"\n" if $V>3;
	}
	$base =~ s"/+$"";			# No doubled slash at the join point.
	$out = &URLtrim("$base/$ref");
	print V "URLhref: \"$base\" + \"$ref\" =\n\t\"$out\"\n" if $V>3;
	$out;
}

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# SYNOPSIS
#   URLopen(*FD,$URL) || die "...";
#
# DESCRIPTION
#   This routine accepts a URL and attempts to open it  for  reading.
#   If  successful,  the  return  value is 1, and FD will be the open
#   file (a socket, actually).  The caller can read the data from it.
#   If you're not going to exit after the EOF, it's a good idea to
#   close it when you're done, to prevent the connection from hanging
#   around.
#
# REQUIRES
#   We require the HTTPcon.pm module.
#
# AUTHOR
#   <a href="mailto:jc@trillian.mit.edu">John Chambers</a>
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
sub URLopen {
	local(*F,$url) = @_;
	local($p,$P,$h,$f);
	print V "URLopen: \"$url\"\n" if $V>4;
	return 0 if !&chkurl($url);		# Unwanted URL: don't even try.
	if ($HTTPpxy) {		# -p: send every request through the proxy.
		print V "URLopen: pxy=\"$HTTPpxy\" url=\"$url\"\n" if $V>2;
		if (&HTTPcon(*F,$HTTPpxy)) {
			print V "URLopen: Connected to \"$HTTPpxy\"\n" if $V>3;
			print V "URLopen: Send \"GET $url HTTP/1.0\\n\\n\"\n" if $V>4;
			# Proxies get the full URL on the GET line.
			# NOTE(review): bare "\n\n" rather than the CRLF pairs HTTP
			# specifies; servers of this era accept it -- confirm before
			# pointing this at stricter servers.
			print F "GET $url HTTP/1.0\n\n";
			# $URLhdr tells the caller (&Copy) that response headers follow.
			$URLhdr = 1;    # Note we're in the header.
			return 1;
		}
		print V "URLopen: Can't connect to proxy \"$HTTPpxy\"\n" if $V>0;
		return 0;
	} elsif (($p,$h,$f) = ($url =~ m'^(\w+)://([-_.:\w]+)(/.*)')) {
		print V "URLopen: \"$url\" p=\"$p\" h=\"$h\" f=\"$f\"\n" if $V>3;
		($P = $p) =~ tr/a-z/A-Z/;	# Uppercase the protocol name.
		if ($P eq 'HTTP') {
			print V "URLopen: \"$url\" HTTP protocol\n" if $V>3;
			if (&HTTPcon(*F,$h)) {
				print V "URLopen: Connected to \"$h\"\n" if $V>3;
				print V "URLopen: Send \"GET $f HTTP/1.0\\n\\n\"\n" if $V>4;
				# Direct connection: send only the path part.
				print F "GET $f HTTP/1.0\n\n";
				$URLhdr = 1;	# Note we're in the header.
				return 1;
			}
			print V "URLopen: Can't connect to \"$h\"\n" if $V>0;
			return 0;
		}
		print V "URLopen: \"$url\" can't do protocol \"$p\"\n" if $V>0;
		$exitstat = 254;
		return 0;
	}
	# No protocol prefix: treat the "URL" as a local path.  A directory
	# is listed (once only -- %Dirs remembers it) via a forked &HTMLdir
	# child that fakes a server's HTML index.
	if (-d $url && !$Dirs{$url}) {
		print V "URLopen: Directory \"$url\" ...\n" if $V>4;
		++$Dirs{$url};
		return &HTMLdir(*F,$url);
	}
	if (open(F,$url)) {
		print V "URLopen: Local file \"$url\" opened.\n" if $V>4;
		return 1;
	}
	print V "URLopen: Can't read \"$url\"\n" if $V>0;
	return 0;
}

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# This routine reads a  directory  and  produces  a  simplified  html #
# listing. But you might want to reformat it in some other way that's #
# more useful for your application.                                   #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
sub HTMLdir {
	local(*F,$d) = @_;
	local($child,$f,$p);	# Now localized; these leaked globally before.
	if ($child = open(F,'-|')) {	# Parent: F now reads the child's STDOUT.
		print V "HTMLdir: Child started to list \"$d\"\n" if $V>1;
		return 1;	# Parent returns the open file.
	} else {		# Child reads the directory.
		if (opendir(DIR,$d)) {
			# (Was "Index of $u"; $u is never set here -- use the arg.)
			print "<HEAD><TITLE>Index of $d</TITLE></HEAD><BODY>\n";
			print "<PRE>\n";
			print "<IMG SRC=\"/icons/blank.xbm\" ALT=\"     \"> Name\n";
			# defined() so a file named "0" doesn't end the listing early.
			while (defined($f = readdir(DIR))) {
				if (-d "$d/$f") {
					if ($f eq '.') {		# Ignore self reference.
					} elsif ($f eq '..') {	# Parent reference.
						($p = $d) =~ s"/[^/]+/+$"/";
						print "<!-- p=\"$p\" -->\n";
						print "<IMG SRC=\"/icons/back.gif\" ALT=\"[DIR]\"> <A NAME=\"$f\" HREF=\"$p\">Parent directory</A>\n";
					} else {				# Subdirectory: trailing '/'.
						print "<IMG SRC=\"/icons/menu.gif\" ALT=\"[DIR]\"> <A NAME=\"$f\" HREF=\"$f/\">$f/</A>\n";
					}
				} else {					# Plain file.
					print "<IMG SRC=\"/icons/text.gif\" ALT=\"[TXT]\"> <A NAME=\"$f\" HREF=\"$f\">$f</A>\n";
				}
			}
			closedir(DIR);
			exit 0;
		} else {
			print V "HTMLdir: Can't open directory \"$d\" ($!)\n" if $V>0;
			exit $!;
		}
	}
}

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Shorten a URL in various ways.
sub URLtrim {
	local($url) = @_;
	local($org);
	$org = $url if $V>3;			# Keep the original for the debug note.
	$url =~ s"/*\#.*"";				# Strip off name references.
	$url =~ s"/*\?.*"";				# Strip off qualifiers.
	$url =~ s"/*\%.*"";				# What are these?
	1 while $url =~ s"/\./"/";		# Strip out /./ self-references.
	1 while $url =~ s"/[^/]+/\.\./"/";	# Strip out /foo/../ up-references.
	print V "URLtrim <= \"$org\"\n   => \"$url\"\n" if (($V>3) && ($url ne $org));
	$url;
}
