#!/space/imail/perl/bin/perl

=head1 NAME

HTML - Prototype HTML parser.

=head1 SYNOPSIS

HTML [file]..

=head1 DESCRIPTION

This is the bare bones of a program that reads HTML and splits it into
tags and text.  It passes these chunks to the tag() and txt() routines,
to do with as they wish.  Whatever follows a tag name up to the closing
'>' is printed with an "Atr: " prefix.

Input is read line by line from the named files, or from standard input
when no files are given.  A tag may span several lines.  Line endings
are not stripped, so they remain part of the chunks that are reported.

=head1 OPTIONS

=head1 EXAMPLES

=head1 FILES

=head1 BUGS

The first '>' after a tag name ends the tag, even when that '>' sits
inside a quoted attribute value.

Anything that starts with '<' is reported as a tag, which includes
comments and declarations.

=head1 SEE ALSO

=head1 AUTHOR

John Chambers

=cut

use strict;
use warnings;

$| = 1;    # Autoflush the selected output handle (STDOUT by default).

# True while we are inside a tag whose closing '>' has not been seen yet.
# The flag lives outside parse_line() so that a tag may span input lines.
my $intag = 0;

# Stream the input one line at a time rather than slurping all of it into
# a list first; "defined" keeps a final line of "0" from ending the loop.
while (defined(my $line = <>)) {
    parse_line($line);
}

# parse_line($line)
#
# Split one line of input into chunks and report each of them:
#   * a tag name (the text right after '<', up to whitespace or '>') is
#     passed to tag();
#   * the rest of the tag, up to but excluding '>', is printed as "Atr: ";
#   * everything outside of tags is passed to txt().
# Reads and updates $intag, so calls must be made in input order.
sub parse_line {
    my ($line) = @_;

    # Test the length, not the truth value: a leftover chunk consisting
    # of the single character "0" is still data that must be reported.
    CHUNK:
    while (length $line) {
        if ($intag) {
            if ($line =~ s/^([^>]*)>//) {
                # $1 holds the attributes; the '>' has been consumed.
                print "Atr: $1\n" if length $1;
                $intag = 0;
                next CHUNK;
            }

            # No '>' on this line: all of it belongs to the open tag,
            # which continues on the next line.
            print "Atr: $line\n";
            return;
        }

        if ($line =~ s/^<([^\s>]*)//) {
            # $1 contains an HTML tag name that is now removed.
            # The tag's attributes and the '>' are still in $line.
            tag($1);
            $intag = 1;
            next CHUNK;
        }

        # $line is non-empty and does not start with '<', so this always
        # consumes at least one character: the text up to the next tag.
        $line =~ s/^([^<]+)//;
        txt($1);
    }

    return;
}

# tag($name): called once for every tag name found; prints it.
sub tag {
    my ($t) = @_;
    print "TAG: $t\n";
}

# txt($text): called once for every run of text between tags; prints it.
sub txt {
    my ($t) = @_;
    print "Txt: $t\n";
}