#!/home/jmc/bin/perl
#
#=head1 NAME
#  w3grep - search for patterns in WWW files.
#  
#=head1 SYNOPSIS
#  w3grep  [options] RE [URL|dir]...
#  
#=head1 DESCRIPTION
#  This  program  searches one or more World Wide Web documents for a
#  pattern that matches the RE (Regular Expression), and shows you the
#  matching URLs and lines.
#  
#=head1 OPTIONS
#  Options  start  with  '-'  or  '+'  plus  a  letter, with possibly a
#  parameter (and no embedded spaces).  Some of  the  options  take  an
#  initial '+' to mean "enable" and '-' to mean "disable".  For others,
#  the '-' or '+' is not relevant.  If '+' is shown in the list below,
#  then it is significant.
#  
#  -a
#    Produce a list of all tunes.  In this case, no input list is
#    needed,  just  one  or  more URLs or directories.  (However, if no
#    input files are given, we will still read STDIN, so the parent  or
#    user must give an EOF or we won't terminate.)
#  
#  -d<depth>
#    This restricts the depth of directory searches to <depth>. This is
#    mostly  to  avoid  infinite  loops.   The default is 2.  Detecting
#    recursive loops turns out to be very difficult with  URLs,  so  we
#    don't attempt to do it.
#  
#  -h Produce just the raw "URL:data".
#  +h Produce html output.
#  
#  -i Match the RE case-insensitively.
#  +i Match the RE case-sensitively (default).
#  
#  -l Show only file names.
#  +l Show matching lines (default).
#  
#  -t Produce html list as output.
#  +t Produce html table as output (requires +h).
#  
#=head1 BUGS
#  We haven't yet discovered how to make "streaming" work  with  URLs,
#  so we load the entire file into memory before doing the grep.  This
#  may result in large memory usage.
#  
#=head1 USES
#  When given a URL, this file uses the LWP::Simple modules,  available
#  at  any  <a href="http://www.perl.org/CPAN/CPAN.html">CPAN</a> site.
#  
#=head1 AUTHOR:
#  John Chambers <jc@trillian.mit.edu> "http://trillian.mit.edu/~jc/music/"
#=cut

use LWP::Simple;	# Used by OneURL to fetch non-local URLs.

$| = 1;			# Unbuffer STDOUT so output interleaves sanely with STDERR.
($me = $0) =~ s'.*/'';	# Basename of this program, used in all messages.
# Debug/verbosity level, settable per program via the environment
# (D_<name>, T_<name> or V_<name>, e.g. D_w3grep=3); defaults to 1.
$D = $ENV{"D_$me"} || $ENV{"T_$me"} || $ENV{"V_$me"} || 1;
@URL = ('http://localhost/');	# Default URL.  NOTE(review): the first URL
				# given on the command line lands in slot 0
				# (via $URL[$urls++]) and replaces this default.

$wantcase = 1;	# True means case sensitive.  NOTE(review): appears unused;
		# case handling is actually driven by $I (see the -i option).
$wantline = 1;	# True means show matching lines.
$maxdepth = 2;	# Default depth limit for directories.
$depth    = 0;	# The current depth in directories.

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Walk the command line left to right.  Options take effect as soon as #
# they are seen, so each one only affects the arguments to its right.  #
# The first non-option argument is the RE; every later one is a URL or #
# directory, collected into @URL and processed after this loop.        #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
for my $arg (@ARGV) {
	my ($sign, $name) = ($arg =~ m'^([-+])(.*)'i);
	if (defined $sign) {		# It's an option ('-' or '+' prefix).
		if    ($name =~ m'^A$'i)      {$alltunes  = 1}
		elsif ($name =~ m'^D(\d+)$'i) {$maxdepth  = $1}
		elsif ($name =~ m'^I$'i)      {$I         = ($sign eq '-') ? 'i' : ''}
		elsif ($name =~ m'^L$'i)      {$wantline  = ($sign eq '-') ? 0 : 1}
		elsif ($name =~ m'^H$'i)      {$wanthtml  = ($sign eq '-') ? 0 : 1}
		elsif ($name =~ m'^T$'i)      {$wanttable = ($sign eq '-') ? 0 : 1}
		elsif ($name =~ m'^V(\d+)$'i) {$D         = $1}
		else {
			print STDERR "$me: Unknown option \"$arg\" ignored.\n" if $D>0;
		}
	} elsif (!$RE) {
		$RE = $arg;			# First non-option is the RE.
	} else {
		$URL[$urls++] = $arg;	# Later non-options are URLs.  The first
					# one goes in slot 0, replacing the default.
	}
}

# The RE is mandatory; everything else has a default.
die "Usage: $me [+-options] RE [URL]...\n" if !$RE;

# Process each URL in turn.  Note that OneURL pushes newly-discovered
# hrefs onto the global @URL, so this foreach also visits entries that
# are appended while it is running; that is how remote "recursion" is
# actually carried out.
for $u (@URL) {
	print STDERR "$me: URL=\"$u\"\n" if $D>2;
	&OneURL($u);
}

exit 0;

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Get a URL's contents, and look thru it for either lines that match
# the RE, or lines that contain hrefs to other URLs.  A local
# directory is turned into a synthetic list of href lines; a local
# file is read directly; anything else is fetched with LWP::Simple.
# Hrefs found in the data are either recursed into directly (local
# paths) or appended to the global @URL worklist, up to $maxdepth
# levels deep.
sub OneURL {
	local(@data,$d,$href,$ishtml,$item,$line,$txt,$url);
URL:
	for $url (@_) {
		print STDERR "$me/OneURL: Next file \"$url\"\n" if $D>1;
		if (-d $url) {				# Is it a local directory?
			if (opendir(DOC,$url)) {
				@data = grep !/^\./, readdir DOC;	# Skip dot files.
				for $d (@data) {	# Convert each name to an href line,
					$d =~ s#(.*)#<a href="$url/$1">$1</a>#	# so the scan below sees HTML.
				}
				closedir DOC;
			} else {
				print STDERR "$me: Can't read \"$url\" ($!)\n";
				next URL;
			}
		} elsif (open(DOC,'<',$url)) {	# Is it a local file?
			@data = <DOC>;
			close DOC;
		} else {					# It's not local; try remote.
			print STDERR "$me/OneURL: Load file \"$url\"\n" if $D>1;
			# Fetch exactly once; LWP::Simple::get returns undef on failure.
			# (The original fetched each remote URL twice and discarded one.)
			$txt = LWP::Simple::get($url);
			if (!defined($txt)) {	# Failed; no doc at all.
				print STDERR "$me/OneURL: Can't get \"$url\" ($!)\n" if $D>0;
				next URL;
			}
			if ($txt eq '') {		# Fetched OK, but the doc is empty.
				print STDERR "$me/OneURL: Null file \"$url\" ($!)\n" if $D>0;
				next URL;
			}
			@data = split(/\n/, $txt);
		}
		print STDERR "$me/OneURL: Got ", int(@data), " lines from $url\n" if $D>2;
		$ishtml = 0;				# Not (yet) known to be HTML format.
		print STDERR "$me: url=\"$url\"\n" if $D>2;
		++$depth;
		$url =~ s"/+$"";			# Strip trailing '/' so "$url/$name" gets one slash.
		for $line (@data) {			# Run thru the data one line at a time.
			# Match the RE: an extra case-insensitive try when -i set $I,
			# plus the plain (case-sensitive) try either way.
			if (($I && ($line =~ m"$RE"i)) || ($line =~ $RE)) {
				if ($wantline) {
					if ($wanthtml) {
						if ($wanttable) {	# +t: rows of an HTML table.
							print "<tr><td>$url</td><td>$line</td></tr>\n";
						} else {			# -t: an HTML definition list.
							print "<dt>$url<dd>$line\n";
						}
					} else {
						print "$url:$line\n";	# Raw "URL:data" form.
					}
				} else {
					print "$url\n";			# -l: file names only.
				}
			}
			next if !$line;			# Skip empty lines.
			if (($href,$item) = ($line =~ m#<a .*href="([^"]+)">(.*)</a>#i)) {
				$ishtml = 1;			# It looks like HTML format.
				if ($href =~ '/$') {	# Does the href imply a directory?
					if ($item =~ /Parent Dir/i) {	# Ignore parent directory.
						print STDERR "$me/OneURL: Ignore \"$item\"\n" if $D>3;
					} elsif ($item =~ /^\w*:/) {	# Looks like an absolute URL.
						# NOTE(review): this tests the link TEXT for a scheme;
						# possibly $href was intended — confirm.
						if ($depth < $maxdepth) {
							push @URL, $href;		# Queue for the main loop.
						}
					} else {						# Relative directory name.
						if ($depth < $maxdepth) {
							push @URL, "$url/$href";	# Queue for the main loop.
						}
					}
				} else {
					if ($depth < $maxdepth) {
						push @URL, "$url/$href";	# Queue for the main loop.
					}
				}
			} elsif (-d $line) {		# A bare local directory name.
				&OneURL("$url/$line") if ($depth < $maxdepth);
			} elsif (-f $line) {		# A bare local file name; only .abc files.
				if ($line =~ m'.*/(.*)\.abc$'i) {&OneURL("$url/$line")}
			} else {
				print STDERR "$me/OneURL: No url in \"$line\"\n" if $D>5;
			}
		}
		--$depth;
		# NOTE(review): URLs pushed onto @URL above are processed only after
		# this sub returns (when $depth has been restored), so $maxdepth is
		# only an approximate limit for remote traversal.
	}
}
