#!/usr/bin/perl
#!/space/imail/perl/bin/perl
#
# NAME
#   w3cat
#
# SYNOPSIS
#   w3cat [ option | URL | path ]...
#
# REQUIRES
#   perl4 or perl5 and the following modules, which should be found in
#   the same directory where you found this program.
#
push @INC,"$ENV{HOME}/sh",'sh';
require "HTTPcon.pm";   # Makes HTTP connection, sends GET.
require "HTMLdir.pm";   # Produces HTML listing of directory.
require "URLopen.pm";   # Parses URL and returns file handle.
#
# DESCRIPTION
#   Given a list of URLs, this program reads them one at a time,  and
#   writes  their  contents,  catenated  into  one  long  string,  to
#   standard output.  Local file names may be used instead  of  URLs.
#   Directories are output in a simplified HTML format.
#
#   If you want to learn how to do this stuff,  you  can  study  this
#   program.   It  is  useful  as  a  starting point for writing other
#   simple web clients.  It's not nearly as difficult as people would
#   like  you  to believe.  But the socket stuff uses several magical
#   incantations that "you just have to know"; see the required  perl
#   module files for this socket magic.
#
#   This is a webified perl version  of  the  standard  Unix  cat(1)
#   program.
#
# OPTIONS
#   The default setup is to deliver only the data (contents) of a URL
#   and discard the header and tracing information.  Here is the list
#   of our current options.  The options may be combined into a single
#   string,  as  usual, with the qualification that options which take
#   the rest of the string as an arg (+I, O and P) must be the last in
#   the string.  Options may be in any order, and apply to all
#   subsequent URLs unless canceled by another option.
#
#   +D
#     Output the data [default].
#
#   -D
#     Don't output the data.
#
#   +H
#     Include the HTTP header info in the output.
#
#   -H
#     Don't include the HTTP header info in the output [default].
#
#   -I
#     Don't send agent identification (default).
#   +I"agentid"
#     Send the quoted string as the agent identification.   Some  web
#     sites won't talk to you unless you send an acceptable string.
#     If there is no string, we send the string:
#
$dflagentid = "Mozilla/4.0 [en] (compatible; I; Linux 2.0)";
#
#   -O<file>
#     Write the data to <file>.  Default is stdout.
#
#   +P<host>
#     Proxy gateway.  If you are hidden behind a proxy, put the proxy
#     hostname (and optionally :port) in a +P option, and we will try
#     to indirect through the proxy server.
#
#   -P<pattern>
#     Proxy exceptions.  The string should be a pattern.  If
#     a URL matches this pattern, the proxy gateway isn't used.
#
#   -R
#     Ignore redirects (default).
#   +R
#     Follow HTTP "Location:" redirects (after a 302 status).
#
#   +T
#     Enable WWW tracing.  This sets a global flag that causes various
#     routines to produce trace lines.
#     NOTE(review): the example trace line that used to follow here is
#     missing from this copy of the file; restore it from the original.
#     These look like both HTTP header lines and HTML comments.   Some
#     WWW  tools  (such as the "H" html viewer) can show these to tell
#     you which stage of a GET operation we have reached.
#
#   -T<n>
#     Timeout of <n> seconds.  The default is no timeout, meaning that
#     the  underlying  system's  connect() will determine the timeout,
#     if any.
#
#   -T
#     Disable WWW tracing [default].
#
# ENVIRONMENT
#   We use the following from the environment:
#
#   W3PROXY
#     The name (or address)  and  an  optional  :port  for  a  proxy
#     gateway.   URLs  that don't match the W3NOPROXY will be fetched
#     indirectly via the proxy's web server.  If not defined, we will
#     attempt direct TCP connections for all URLs.
#
#   W3NOPROXY
#     A pattern which is applied to URLs, and if they match, no proxy
#     is used.  That is, any URL that matches W3NOPROXY is considered
#     local, and we will access it directly.  If not defined, we will
#     use W3PROXY for all URLs (if it exists).
#
# LIMITATIONS
#   So far only the http:// protocol is implemented; ftp://,  file://
#   and  others  may  appear  if  I  need them.  If someone feels like
#   adding FTP code, you might send me a copy.
#
#   HTTP "redirection" (the "Location:" HTTP header)  is  implemented
#   now  via  the  +R  option.  By default, it is disabled and must be
#   handled by the caller, if desired.  This mainly means that if you
#   omit  the final '/' on a directory name, we will fail.  This is not
#   considered a bug, so it'll probably never be fixed.
#
# DEBUGGING
#   You can use "perl -dw", of course.  Or you can do the following:
#     setenv V_w3cat 5/tmp/w3cat.out       # csh or tcsh users.
#     export V_w3cat=5/tmp/w3cat.out       # ksh or bash users.
#   This will turn on the "print V" lines for $V in the range 0-5, and
#   write the verbose output to /tmp/w3cat.out.
#
# BUGS
#   Despite many attempts to detect failure, we still don't optimally
#   handle all the myriad things that can go wrong.
#
# AUTHOR
#   John Chambers
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #

# Unbuffer both standard streams so data and diagnostics interleave sanely.
$| = 1;
select STDERR; $| = 1;
select STDOUT; $| = 1;

($me = $0) =~ s"^.*/"";     # Program name without its directory.

# Verbosity comes from V_<name> or D_<name>: a single digit, optionally
# followed by the name of a file that receives the verbose output.
$Vopt = $ENV{"V_$me"} || $ENV{"D_$me"} || 1;    # Verbosity.
if (($V,$Vfil) = ($Vopt =~ /^(\d)(.+)/)) {      # Verbose output file.
    open(V,">$Vfil") || die "$0: Can't write \"$Vfil\" ($!)\n";
} else {$V = $Vopt; open(V,">&STDERR")}         # Defaults to stderr.
select V; $| = 1; select STDOUT;
print V "$me: Started ", `date` if $V>1;

#bufsiz  = 10;      # Small for testing.
$bufsiz  = 10000;   # Large for routine use.
$exitstat = 0;      # Set this to get a failure exit status.
$W3hdrs   = 0;      # Whether to output header lines.
$W3data   = 1;      # Whether to output data.

# Each argument is either an option string (leading + or -) or a URL/path.
arg:
for $u (@ARGV) {
    $moved = 0;
    $URLerr = "Don't know why";     # Set by URLopen when failures.
    if (($pfx,$opt) = ($u =~ /^([-+])(.*)/)) {
        # Compare against '' rather than testing truth, so that a leftover
        # "0" is still examined (and reported) instead of silently dropped.
        while ($opt ne '') {    # Each time MUST remove at least one char.
            if ($opt =~ s/^D//i) {          # +D or -D (whether to produce data)
                $W3data = ($pfx eq '+') ? 1 : 0;
                print V ($W3data ? "Do" : "Don't"), " produce data.\n" if $V>1;
            } elsif ($opt =~ s/^H//i) {     # +H or -H (whether to produce headers)
                $W3hdrs = ($pfx eq '+') ? 1 : 0;
                print V ($W3hdrs ? "Do" : "Don't"), " produce headers.\n" if $V>1;
            } elsif ($opt =~ s/^I//i) {     # +I or -I (agent identification)
                if ($pfx eq '-') {
                    $W3agentid = '';
                    print V "$me: No agent identification.\n" if $V>1;
                } else {
                    $W3agentid = $opt || $dflagentid;   # Rest of string is id.
                    print V "$me: Agent \"$W3agentid\"\n" if $V>1;
                    $opt = '';
                }
            } elsif ($opt =~ s/^O//i) {     # +O or -O (output file)
                $outfile = $opt;            # Rest of string is file name.
                $opt = '';
                print V "$me: Output to \"$outfile\"\n" if $V>1;
            } elsif ($opt =~ s/^P//i) {     # +P (proxy host) or -P (exceptions)
                if ($pfx eq '-') {
                    $W3nopxy = $opt;        # Rest of string is pattern.
                    print V "$me: Proxy exceptions are /$W3nopxy/\n" if $V>1;
                } else {
                    $W3proxy = $opt;        # Rest of string is proxy host.
                    print V "$me: Proxy server is $W3proxy.\n" if $V>1;
                }
                $opt = '';
            } elsif ($opt =~ s/^R//i) {     # +R or -R (whether to follow 302 redirects)
                $W302 = ($pfx eq '+') ? 1 : 0;
                print V ($W302 ? "Do" : "Don't"), " follow 302 redirects.\n" if $V>1;
            } elsif ($opt =~ s/^T//i) {     # +T or -T (WWW tracing) or -T<n> (timeout)
                if ($pfx eq '+') {                  # +T enables tracing.
                    $W3trace = 1;
                    print V "Do produce WWW tracing.\n" if $V>1;
                } elsif ($opt =~ s/^(\d+)//) {      # -T\d+ is timeout.
                    $HTTPtimeout = $1;
                    print V "HTTPtimeout = $HTTPtimeout sec.\n" if $V>1;
                } else {                            # -T disables tracing.
                    $W3trace = 0;
                    print V "Don't produce WWW tracing.\n" if $V>1;
                }
            } else {
                print V "$me: unknown option \"$opt\" ignored.\n";
                $opt =~ s/.//;      # Discard this option char.
            }
        }
    } elsif (&URLopen(*U,$u)) {
        print V "$me: Opened \"$u\"\n" if $V>1;
        if ($HTTPtimeout > 0) {
            # Install the handler BEFORE arming the timer, so an early
            # SIGALRM can never hit the default (process-killing) action.
            $savsig = $SIG{ALRM};
            $SIG{ALRM} = 'READalarm';
            alarm $HTTPtimeout;
            print V "$me: Set alarm after $HTTPtimeout sec.\n" if $V>2;
        }
        $statmax = 0;       # Max status code seen.
        $staterr = '';
        # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
        # Here's where we read the data from one URL and write it  to #
        # standard  output.  If you want to do something else with the #
        # data, you should rewrite this loop:                         #
        # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
        if ($outfile && !$outopen) {    # Do we need to open the output?
            print V "$me: Open \"$outfile\"\n" if $V>1;
            if (open(O,">$outfile")) {  # Try to open it for writing.
                print V "$me: Writing \"$outfile\" [$!]\n" if $V>1;
            } else {
                print V "$me: Can't write \"$outfile\" [$!]\n" if $V>0;
                $outfile = '';          # Fall back to stdout.
            }
            $outopen = 1;               # Only ever try the open once.
        }
        print V "$me: Headers (URLhdr=$URLhdr) ...\n" if $V>1;
hdr:    while ($URLhdr && defined($b = <U>)) {  # Read this URL's headers.
            $b =~ s/\s+$//;             # Discard trailing white stuff.
            if ($W3hdrs) {if ($outfile) {print O "$b\n"} else {print "$b\n"}}
            if ($b) {
                # On a non-status line this list assignment leaves
                # $statcode undefined, so the >= 400 test below only
                # fires for the status line itself.
                if (($statcode,$statmsg) = ($b =~ /^HTTP\/[\d.]+\s+(\d+)\s+(.*)/)) {
                    if ($statcode > $statmax) {$statmax = $statcode; $staterr = $statmsg}
                    if ($W302 && $statcode == 302) {
                        # NOTE(review): only "\n" is printed here; the
                        # message text looks lost -- confirm with original.
                        print "\n" if $V>1;
                        $moved = 1;
                    }
                } elsif ($moved && ($b =~ /^Location:\s*(.*)$/)) {
                    # NOTE(review): message text looks lost here too.
                    print "\n" if $V>1;
                    $u = $1;            # Retry this argument with the new URL.
                    close(U);           # Don't leak the redirected connection.
                    redo arg;
                }
                if ($statcode >= 400) {
                    # NOTE(review): trace text looks lost; only "\n" remains.
                    print "\n" if $W3trace;
                }
            } else {
                $URLhdr = 0;            # Blank line ends headers.
            }
        }
        if ($statmax >= 400) {
            exit 1;
        }
        if ($HTTPtimeout > 0) {alarm $HTTPtimeout}
        print V "$me: Headers done.\n" if $V>1;
        if (!$W3data) {     # Data not wanted?
            close(U);       # Close this connection.
            next arg;       # Go on to next URL.
        }
#       if ($W3hdrs) {
#           print V "$me: Writing NL\n" if $V>1;
#           if ($outfile) {print O "\n"} else {print "\n"}
#       }
data:   while ($n = read(U,$b,$bufsiz)) {
            print V "$me: Got $n bytes: \"$b\"\n" if $V>5;
            if (!$W3hdrs && $URLhdr) {  # Suppressing header lines.
                print V "$me: HDR check in \"$b\"\n" if $V>1;
                if ($b =~ s/^.*\r\n\r\n//s) {
                    $URLhdr = 0;        # Found \n\n separator.
                } else {
                    next;               # No separator, discard it all.
                }
            }
            if ($outfile) {print O $b} else {print $b}
            if ($HTTPtimeout) {alarm $HTTPtimeout}  # Restart the timer per chunk.
        }
        if (!defined $n) {
            print V "$me: Can't read \"$u\" ($URLerr)\n" if $V>0;
            # read() does not set $?, so using it here could report
            # success; force a failure status instead.
            $exitstat = 1;
        }
        close(U);           # Done with this URL's connection.
    } else {
        print V "$me: Can't open \"$u\" ($URLerr)\n" if $V>0;
        $exitstat = 1;
    }
}

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# SIGALRM handler, installed when a -T<n> timeout is in effect.  Emits
# a trace line when +T tracing is on, then exits with status -1.
sub READalarm {
    # NOTE(review): $t is computed but unused, and the trace line is just
    # "\n"; the message (presumably reporting $t) looks lost -- confirm.
    my $t = time - $HTTPopentime;
    print "\n" if $W3trace;
    exit -1;
}

# Buffered write errors only surface at close, so check it explicitly.
if ($outopen && $outfile) {
    if (!close(O)) {
        print V "$me: Can't close \"$outfile\" [$!]\n" if $V>0;
        $exitstat = 1 if !$exitstat;
    }
}

exit $exitstat;