#!/space/imail/perl/bin/perl
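=head1 NAME

HTML - Prototype HTML parser.

=head1 SYNOPSIS

    HTML [file]...

=head1 DESCRIPTION

This is the bare bones of a program that reads HTML from the named
files (or from standard input when no files are given) and splits it
into tags and text. Each tag name is passed to the tag() routine and
each run of text to the txt() routine, to do with as they wish; the
prototype versions simply print what they are given. Attribute text
found between a tag name and its closing '>' is printed directly,
prefixed with "Atr:".

=head1 OPTIONS

=head1 EXAMPLES

=head1 FILES

=head1 BUGS

=head1 SEE ALSO

=head1 AUTHOR

John Chambers

=cut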
# Main program: read the input line by line, split each line into tag
# names, attribute text and plain text, and report every chunk in order.
# Tag names go to tag(), text goes to txt(), attribute text is printed here.
{
    # The pragmas and the parser state live in this bare block so that they
    # stay lexically scoped and do not affect code outside it.
    use strict;
    use warnings;

    # Autoflush the selected output handle (STDOUT unless changed), so each
    # chunk is written as soon as it is recognised.
    $| = 1;

    # True between a tag name and its closing '>'.  Declared outside the
    # line loop because a tag's attributes may continue on following lines.
    my $intag = 0;

    # <> reads the files named on the command line, or STDIN if there are
    # none.  The while form streams the input instead of slurping it all.
    LINE:
    while (my $line = <>) {
        # Test the length, not the truth value: a remaining chunk of "0"
        # is real input and must not end the loop early.
        CHUNK:
        while (length $line) {
            if ($intag) {
                if ($line =~ s/^([^>]*)>//) {
                    # Everything before the closing '>' is attribute text.
                    my $attrs = $1;
                    print "Atr: $attrs\n" if length $attrs;
                    $intag = 0;
                    next CHUNK;
                }
                # No '>' on this line: the whole remainder (line terminator
                # included) is attribute text and the tag is still open.
                print "Atr: $line\n";
                next LINE;
            }
            if ($line =~ s/^<([^\s>]*)//) {
                # The tag name has been removed from $line; the tag's
                # attributes and the '>' are still in $line.
                my $name = $1;
                tag($name);
                $intag = 1;
                next CHUNK;
            }
            # Here $line is non-empty and does not start with '<', so this
            # substitution always consumes at least one character of text
            # (including the line terminator when no further '<' follows).
            if ($line =~ s/^([^<]*)//) {
                my $text = $1;
                txt($text);
            }
        }
    }
}
# tag($name): handle one tag name found in the input.
#   $name - the characters that directly followed a '<', up to the first
#           whitespace character or '>'; may be the empty string.
# This prototype version just prints "TAG: " followed by the name and a
# newline on the currently selected output handle.
sub tag {
    # A lexical keeps the argument private to this call; no package
    # variable is created or temporarily overridden.
    my ($name) = @_;
    print "TAG: $name\n";
}
# txt($text): handle one run of plain text found outside any tag.
#   $text - the text exactly as it appeared in the input; it is not
#           trimmed, so it may contain or end with a line terminator.
# This prototype version just prints "Txt: " followed by the text and a
# newline on the currently selected output handle.
sub txt {
    # A lexical keeps the argument private to this call; no package
    # variable is created or temporarily overridden.
    my ($text) = @_;
    print "Txt: $text\n";
}