#!/home/jmc/bin/perl
#
#=head1 NAME
#  w3grep - search for patterns in WWW files.
#  
#=head1 SYNOPSIS
#  w3grep  [options] RE [URL|dir]...
#  
#=head1 DESCRIPTION
#  This  program  searches one or more World Wide Web documents for a
#  pattern that matches the RE (Regular Expression), and shows you the
#  matching URLs and lines.
#  
#=head1 OPTIONS
#  Options  start  with  '-'  or  '+'  plus  a  letter, with possibly a
#  parameter (and no embedded spaces).  Some of  the  options  take  an
#  initial '+' to mean "enable" and '-' to mean "disable".  For others,
#  the '-' or '+' is not relevant.  If '+' is shown in the list below,
#  then it is significant.
#  
#  -a
#    Produce a list of all tunes.  In this case, no input list is
#    needed,  just  one  or  more URLs or directories.  (However, if no
#    input files are given, we will still read STDIN, so the parent  or
#    user must give an EOF or we won't terminate.)
#  
#  -d<depth>
#    This restricts the depth of directory searches to <depth>. This is
#    mostly  to  avoid  infinite  loops.   The default is 2.  Detecting
#    recursive loops turns out to be very difficult with  URLs,  so  we
#    don't attempt to do it.
#  
#  -h Produce just the raw "URL:data".
#  +h Produce html output.
#  
#  -i Match the RE case-insensitively.
#  +i Match the RE case-sensitively (default).
#  
#  -l Show only file names.
#  +l Show matching lines (default).
#  
#  -t Produce html list as output.
#  +t Produce html table as output (requires +h).
#  
#=head1 BUGS
#  We haven't yet discovered how to make "streaming" work  with  URLs,
#  so we load the entire file into memory before doing the grep.  This
#  may result in large memory usage.
#  
#=head1 USES
#  When given a URL, this file uses the LWP::Simple modules,  available
#  at  any  <a href="http://www.perl.org/CPAN/CPAN.html">CPAN</a> site.
#  
#=head1 AUTHOR:
#  John Chambers <jc@trillian.mit.edu> "http://trillian.mit.edu/~jc/music/"
#=cut

use LWP::Simple;	# Used by OneURL to fetch non-local URLs.

$| = 1;			# Unbuffer STDOUT so output interleaves sanely with STDERR.
($me = $0) =~ s'.*/'';	# Basename of this program, used in all messages.
# Debug/verbosity level, settable per program via the environment
# (D_<name>, T_<name> or V_<name>, e.g. D_w3grep=3); defaults to 1.
$D = $ENV{"D_$me"} || $ENV{"T_$me"} || $ENV{"V_$me"} || 1;
@URL = ('http://localhost/');	# Default URL.  NOTE(review): the first URL
				# given on the command line lands in slot 0
				# (via $URL[$urls++]) and replaces this default.

$wantcase = 1;	# True means case sensitive.  NOTE(review): appears unused;
		# case handling is actually driven by $I (see the -i option).
$wantline = 1;	# True means show matching lines.
$maxdepth = 2;	# Default depth limit for directories.
$depth    = 0;	# The current depth in directories.

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Walk the command line left to right.  Options take effect as soon as #
# they are seen, so each one only affects the arguments to its right.  #
# The first non-option argument is the RE; every later one is a URL or #
# directory, collected into @URL and processed after this loop.        #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
for my $arg (@ARGV) {
	my ($sign, $name) = ($arg =~ m'^([-+])(.*)'i);
	if (defined $sign) {		# It's an option ('-' or '+' prefix).
		if    ($name =~ m'^A$'i)      {$alltunes  = 1}
		elsif ($name =~ m'^D(\d+)$'i) {$maxdepth  = $1}
		elsif ($name =~ m'^I$'i)      {$I         = ($sign eq '-') ? 'i' : ''}
		elsif ($name =~ m'^L$'i)      {$wantline  = ($sign eq '-') ? 0 : 1}
		elsif ($name =~ m'^H$'i)      {$wanthtml  = ($sign eq '-') ? 0 : 1}
		elsif ($name =~ m'^T$'i)      {$wanttable = ($sign eq '-') ? 0 : 1}
		elsif ($name =~ m'^V(\d+)$'i) {$D         = $1}
		else {
			print STDERR "$me: Unknown option \"$arg\" ignored.\n" if $D>0;
		}
	} elsif (!$RE) {
		$RE = $arg;			# First non-option is the RE.
	} else {
		$URL[$urls++] = $arg;	# Later non-options are URLs.  The first
					# one goes in slot 0, replacing the default.
	}
}

# The RE is mandatory; everything else has a default.
die "Usage: $me [+-options] RE [URL]...\n" if !$RE;

# Process each URL in turn.  Note that OneURL pushes newly-discovered
# hrefs onto the global @URL, so this foreach also visits entries that
# are appended while it is running; that is how remote "recursion" is
# actually carried out.
for $u (@URL) {
	print STDERR "$me: URL=\"$u\"\n" if $D>2;
	&OneURL($u);
}

exit 0;

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Get a URL's contents, and look thru it for either lines that match
# the RE, or lines that contain hrefs to other URLs.  A local
# directory is turned into a synthetic list of href lines; a local
# file is read directly; anything else is fetched with LWP::Simple.
# Hrefs found in the data are either recursed into directly (local
# paths) or appended to the global @URL worklist, up to $maxdepth
# levels deep.
sub OneURL {
	local(@data,$d,$href,$ishtml,$item,$line,$txt,$url);
URL:
	for $url (@_) {
		print STDERR "$me/OneURL: Next file \"$url\"\n" if $D>1;
		if (-d $url) {				# Is it a local directory?
			if (opendir(DOC,$url)) {
				@data = grep !/^\./, readdir DOC;	# Skip dot files.
				for $d (@data) {	# Convert each name to an href line,
					$d =~ s#(.*)#<a href="$url/$1">$1</a>#	# so the scan below sees HTML.
				}
				closedir DOC;
			} else {
				print STDERR "$me: Can't read \"$url\" ($!)\n";
				next URL;
			}
		} elsif (open(DOC,'<',$url)) {	# Is it a local file?
			@data = <DOC>;
			close DOC;
		} else {					# It's not local; try remote.
			print STDERR "$me/OneURL: Load file \"$url\"\n" if $D>1;
			# Fetch exactly once; LWP::Simple::get returns undef on failure.
			# (The original fetched each remote URL twice and discarded one.)
			$txt = LWP::Simple::get($url);
			if (!defined($txt)) {	# Failed; no doc at all.
				print STDERR "$me/OneURL: Can't get \"$url\" ($!)\n" if $D>0;
				next URL;
			}
			if ($txt eq '') {		# Fetched OK, but the doc is empty.
				print STDERR "$me/OneURL: Null file \"$url\" ($!)\n" if $D>0;
				next URL;
			}
			@data = split(/\n/, $txt);
		}
		print STDERR "$me/OneURL: Got ", int(@data), " lines from $url\n" if $D>2;
		$ishtml = 0;				# Not (yet) known to be HTML format.
		print STDERR "$me: url=\"$url\"\n" if $D>2;
		++$depth;
		$url =~ s"/+$"";			# Strip trailing '/' so "$url/$name" gets one slash.
		for $line (@data) {			# Run thru the data one line at a time.
			# Match the RE: an extra case-insensitive try when -i set $I,
			# plus the plain (case-sensitive) try either way.
			if (($I && ($line =~ m"$RE"i)) || ($line =~ $RE)) {
				if ($wantline) {
					if ($wanthtml) {
						if ($wanttable) {	# +t: rows of an HTML table.
							print "<tr><td>$url</td><td>$line</td></tr>\n";
						} else {			# -t: an HTML definition list.
							print "<dt>$url<dd>$line\n";
						}
					} else {
						print "$url:$line\n";	# Raw "URL:data" form.
					}
				} else {
					print "$url\n";			# -l: file names only.
				}
			}
			next if !$line;			# Skip empty lines.
			if (($href,$item) = ($line =~ m#<a .*href="([^"]+)">(.*)</a>#i)) {
				$ishtml = 1;			# It looks like HTML format.
				if ($href =~ '/$') {	# Does the href imply a directory?
					if ($item =~ /Parent Dir/i) {	# Ignore parent directory.
						print STDERR "$me/OneURL: Ignore \"$item\"\n" if $D>3;
					} elsif ($item =~ /^\w*:/) {	# Looks like an absolute URL.
						# NOTE(review): this tests the link TEXT for a scheme;
						# possibly $href was intended — confirm.
						if ($depth < $maxdepth) {
							push @URL, $href;		# Queue for the main loop.
						}
					} else {						# Relative directory name.
						if ($depth < $maxdepth) {
							push @URL, "$url/$href";	# Queue for the main loop.
						}
					}
				} else {
					if ($depth < $maxdepth) {
						push @URL, "$url/$href";	# Queue for the main loop.
					}
				}
			} elsif (-d $line) {		# A bare local directory name.
				&OneURL("$url/$line") if ($depth < $maxdepth);
			} elsif (-f $line) {		# A bare local file name; only .abc files.
				if ($line =~ m'.*/(.*)\.abc$'i) {&OneURL("$url/$line")}
			} else {
				print STDERR "$me/OneURL: No url in \"$line\"\n" if $D>5;
			}
		}
		--$depth;
		# NOTE(review): URLs pushed onto @URL above are processed only after
		# this sub returns (when $depth has been restored), so $maxdepth is
		# only an approximate limit for remote traversal.
	}
}
