#!/usr/bin/perl
#
# NAME
#   w3stat
#
# SYNOPSIS
#   w3stat URL..
#
# DESCRIPTION
#   Given a list of URLs, this program gets their  status  info  (via
#   the  HTTP  "HEAD"  command)  one at a time, and writes the status
#   info to standard output.
#
#   This program is  "standalone";  it  doesn't  need  to  import  any
#   modules  from  anywhere.   So  if you want to learn how to do this
#   stuff, you can study this program. It's not nearly as difficult as
#   people would like you to believe.  But the socket stuff uses a lot
#   of magic incantations that "you just have to know".
#
# OPTIONS
#   None (yet ;-).
#
# LIMITATIONS
#   So far only the http:// protocol is implemented;  ftp://,  file://
#   and others may appear if I need them. If someone feels like adding
#   FTP code, you might send me a copy.
#
# REQUIRES
#   per4 or perl5.
#
# DEBUGGING
#    You can use "perl -dw", of course. Or you can do the following:
#     setenv V_w3stat 5/tmp/w3stat.out	# csh or tcsh users.
#     export V_w3stat=5/tmp/w3stat.out	# ksh or bash users.
#   This will turn on the "print V" lines and write to /tmp/w3stat.out.
#
# BUGS
#   Servers can differ by quite a lot in what info they return. So it
#   may be difficult for your script to make sense of it all.
#
#   The Last-Modified timestamp is returned in a form that is  nearly
#   worthless for any further processing. Why the @#*$^% couldn't the
#   WWW gang have used the ISO bigendian date format?
#
# AUTHOR
#   <a href="mailto:jc@trillian.mit.edu">John Chambers</a>
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #

$| = 1;
($me = $0) =~ s"^.*/"";
$Vopt = $ENV{"V_$me"} || $ENV{"D_$me"} || 1;
if (($V,$Vfil) = ($Vopt =~ /^(\d)(.+)/)) {
	open(V,">$Vfil") || die "$0: Can't write \"$Vfil\" ($!)\n";
} else {$V = 1; open(V,">&STDERR")}
print V "$me: Started ", `date` if $V>1;

#bufsiz =    10;	# Small for testing.
$bufsiz = 10000;	# Large for routine use.
$exitstat = 0;

for $u (@ARGV) {
	$URLerr = "Don't know why";
	if (&URLopen(*U,$u)) {
		# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
		# Here's where we read the data from one URL and write it  to #
		# standard output.  If you want to do something else with the #
		# data, you should rewrite this loop:                         #
		# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
		while ($n = read(U,$buf,$bufsiz)) {
			print V "$me: Got $n bytes: \"$buf\"\n" if $V>5;
			print $buf;
		}
		if (undef $n) {
			print V "$me: Can't read \"$u\" ($URLerr)\n" if $V>0;
			$exitstat = $?;
		}
	} else {
		print V "$me: Can't open \"$u\" ($URLerr)\n" if $V>0;
		$exitstat = 1;
	}
}

exit $exitstat;
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# This routine accepts a URL and attempts to open it for reading.  If #
# successful,  the return value is 1, and FD will be the open file (a #
# socket, actually). The caller can read the data from it.  If you're #
# not  going  to exit after the EOF, t's a good idea to close it when #
# you're done, to prevent the connection from hanging around.         #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
sub URLopen {
	local(*F,$url) = @_;
	local($p,$P,$h,$f);
	print V "URLopen: \"$url\"\n" if $V>4;
	if (($p,$h,$f) = ($url =~ m'^(\w+)://([-_.:\w]+)(/.*)')) {
		print V "URLopen: \"$url\" p=\"$p\" h=\"$h\" f=\"$f\"\n" if $V>2;
		($P = $p) =~ tr/a-z/A-Z/;
		if ($P eq 'HTTP') {
			print V "URLopen: \"$url\" HTTP protocol\n" if $V>2;
			if (&HTTPcon(*F,$h)) {
				print V "URLopen: Connected to \"$h\"\n" if $V>2;
				print V "URLopen: Send \"HEAD $f\" command.\n" if $V>4;
				print F "HEAD $f HTTP/1.0\n\n";
				return 1;
			}
			print V "URLopen: Can't connect to \"$h\"\n" if $V>0;
			return 0;
		}
		$URLerr = "can't do protocol \"$p\"";
		print V "URLopen: Can't open \"$url\" $URLerr\n" if $V>0;
		$exitstat = 254;
		return 0;
	}
	if (-d $url) {
		print V "URLopen: Directory \"$url\" ...\n" if $V>4;
		return &HTMLdir(*F,$url);
	}
	if (open(F,$url)) {
		print V "URLopen: Local file \"$url\" opened.\n" if $V>4;
		return 1;
	}
	$URLerr = "$!";
	print V "URLopen: Can't read \"$url\" ($URLerr)\n" if $V>0;
	$exitstat = int($!);
	return 0;
}

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# This routine reads a  directory  and  produces  a  simplified  html #
# listing. But you might want to reformat it in some other way that's #
# more useful for your application. We do the work in a child process #
# and return with F open to the pipe from the child.                  #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
sub HTMLdir {
	local(*F,$d) = @_;
	if ($child = open(F,'-|')) {	# Parent.
		return 1;	# Parent returns the open file.
	}
	# Child reads the directory.
	if (opendir(DIR,$d)) {
		print "<HEAD><TITLE>Index of $u</TITLE></HEAD><BODY>\n";
		print "<PRE>\n";
		print "<IMG SRC=\"/icons/blank.xbm\" ALT=\"     \"> Name\n";
		while ($f = readdir(DIR)) {
			if (-d "$d/$f") {
				if ($f eq '.') {		# Ignore self reference.
				} elsif ($f eq '..') {	# Perent reference.
					($p = $d) =~ s"/[^/]+/+$"/";
					print "<!-- p=\"$p\" -->\n";
					print "<IMG SRC=\"/icons/back.gif\" ALT=\"[DIR]\"> <A NAME=\"$f\" HREF=\"$p\">Parent directory</A>\n";
				} else {
					print "<IMG SRC=\"/icons/menu.gif\" ALT=\"[DIR]\"> <A NAME=\"$f\" HREF=\"$f/\">$f/</A>\n";
				}
			} else {
				print "<IMG SRC=\"/icons/text.gif\" ALT=\"[TXT]\"> <A NAME=\"$f\" HREF=\"$f\">$f</A>\n";
			}
		}
		exit 0;
	}
	print V "HTMLdir: Can't open directory \"$d\" ($!)\n" if $V>0;
	exit $!;
}

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# This accepts a URL's host:port portion, and attempts  to  make  the #
# connection.   If  successful,  we  return  1 with F open to the TCP #
# socket.  If we fail, we return 0, and F may or  may  not  be  open. #
# (Maybe we should close it.)                                         #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
sub HTTPcon {
	local(*F,$hp) = @_;
	local($host,$port);
	if (($host,$port) = ($hp =~ m"^(.*):(\d+)$")) {
		print V "HTTPcon: host=\"$host\" port=\"$port\"\n" if $V>4;
	} else {
		$host = $hp;
		$port = 80;
		print V "HTTPcon: host=\"$host\" port=$port.\n" if $V>4;
	}
	$AF_INET = 2;
	$SOCK_STREAM = 1;
	$sockaddr = 'S n a4 x8';
	($name,$aliases,$proto) = getprotobyname('tcp');
	($name,$aliases,$port) = getservbyname($port,'tcp')
		unless $port =~ /^\d+$/;
#	$thisaddr = pack('C4',0,0,0,0);	# Complex way.
	$thisaddr = "\0\0\0\0";			# Simpler way.
	($name,$aliases,$type,$len,$thataddr) = gethostbyname($host);
	$this = pack($sockaddr, $AF_INET, 0, $thisaddr);
	$that = pack($sockaddr, $AF_INET, $port, $thataddr);
	if (socket(F,$AF_INET,$SOCK_STREAM,$proto)) {
		print V "HTTPcon: Got socket.\n" if $V>2;
	} else {
		$URLerr = "Can't get socket ($!)";
		print V "HTTPcon: $URLerr\n" if $V>1;
		$exitstat = int($!);
		return 0;
	}
	if (bind(F,$this)) {
		print V "HTTPcon: Bind to \"$this\" succeeded.\n" if $V>4;
	} else {
		$URLerr = "Bind to \"$this\" failed ($!)";
		print V "HTTPcon: $URLerr\n" if $V>1;
		$exitstat = int($!);
		return 0;
	}
	if (connect(F,$that)) {
		print V "HTTPcon: Connect to \"$that\" succeeded.\n" if $V>4;
	} else {
		$URLerr = "Connect to \"$that\" failed ($!)";
		print V "HTTPcon: $URLerr\n" if $V>1;
		$exitstat = int($!);
		return 0;
	}
	select(F); $| = 1; select(STDOUT);
	return 1;
}

