#!/usr/bin/perl
#!/space/imail/perl/bin/perl
#
# NAME
#   H_fmt - HTML formatter
#   H_txt - HTML to plain text conversion.
#
# SYNOPSIS
#   H_fmt [option].. [file]..
#   H_txt [option].. [file]..
#
# DESCRIPTION
#   Read in a chunk of HTML, and reformat it. This program functions as a
#   conventional Unix-style filter program, reading from stdin or from files
#   named on the command line, and writing to stdout. Depending on name or
#   the H option, we produce either "pretty-printed" HTML output or plain
#   ASCII text.
#
#   The main use of this program is in making HTML readable to a human. The
#   growing population of HTML tools have this unfortunate characteristic of
#   producing "machine readable" HTML. You don't have to tolerate it. Run it
#   through this program.
#
#   The main formatting we do is to produce indenting (using tabs by default,
#   though you can specify the indent string). We have some tables of HTML
#   tags saying which of them are to start a new line, which are to end the
#   line, and whether to indent the contents. We also do a bit of cleaning up
#   of whitespace.
#
#   There are some special transformations done by this program that were put
#   here for the benefit of specific projects that I've worked on. I've tried
#   to delete each project's goodies as I move on to the next one, but you
#   may find some relics. Use them if you wish. Here are some of the things
#   in the current version:
#
# OPTIONS
#   Options on the command line may start with '-' or '+'. For some options,
#   '-' means "disable" or "no", while '+' means "enable" or "yes". For other
#   options, there is no such distinction, and either may be used. Options
#   with no arg may be combined into one; if an option letter is encountered
#   that takes an arg, the rest of the arg string will be used.
#
#   -C
#       Delete comments.
#   +C
#       Keep comments. Default.
#
#   -LNK
#       Disable the special LNK-project processing.
#       NOTE(review): the option parser in this file sets $LNK = 1 for any
#       LNK option regardless of the leading '-' or '+'; link rewriting is
#       simply off until some LNK option is given.
#   +LNK
#   +LNK=<dir>
#   +LNK.<suf>=<dir>
#       Special processing for hyperlinks. We look for several sorts of
#       hyperlinks, and rewrite them so they will work when the pages are in
#       the <dir> directory. The .<suf>=<dir> form means that files ending
#       with .<suf> are assumed to be found in the directory named. If no
#       suffix is given, the directory will be used for any file without a
#       recognized suffix. Several LNK options may be used to specify
#       directories for different suffixes.
#
#   -I'str'
#       Indent string, used once for each level of indentation. The default
#       is a single tab. For debugging purposes, -I'| ' is very useful,
#       though it gives invalid HTML.
#
#   -H
#       Suppress HTML markup. The result is the "content" text formatted as
#       usual, but without any HTML tags. This is useful for converting HTML
#       to plain text. You may want to feed the result to further text
#       formatters. (+H turns HTML markup back on.)
#
#   -LM<n>
#   -RM<n>
#       (Not fully implemented yet)
#       Define margins. -LM<n> sets the left margin to column <n>; -RM<n>
#       sets the right margin to <n>. The defaults are 0 and 70. Note that
#       we treat the tabs used for indenting as just a single character for
#       this purpose.
#       NOTE(review): the option parser substitutes 3 and 72 when <n> is
#       omitted, which disagrees with the defaults stated above.
#
#   -Vn
#       Verbose level n. The default is -V1, which gives only error messages.
#       A bare -V raises the level by one.
#
# FILES
#   This program functions like a "standard Unix filter". If you give it one
#   or more file names on its command line, it'll read them in sequence, and
#   effectively catenate them into one long file. If there are no file names
#   on the command line, we read from standard input.
#
#   Output is always to standard output, which you can redirect as you wish.
#
# AUTHOR
#   John Chambers
#
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #

# Start-up. The order matters: $me must be set before Vinit (which names the
# log file after it), and both tables must exist before any input is parsed.
$| = 1;                     # Unbuffered output.
($me = $0) =~ s'.*/'';      # Program name without its directory part.
#$V = $ENV{"V_$me"} || $ENV{"D_$me"} || 1;
#$ENV{'PATH'} = '/sbin:/usr/sbin:/usr/bin:/sh:/usr/local/bin';
$exitstat = 0;              # Anyone can leave behind an exit status.
&Vinit($ENV{"V_$me"} || $ENV{"D_$me"} || '1');
&EntTables;                 # Initialize the entity tables.
&TagTables;                 # Initialize the tag tables.
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Initialize assorted global variables:
$html     = ($me =~ /te*xt$/) ? 0 : 1;  # Produce HTML tags (off when our name ends in txt/text).
$ind      = "\t";   # Indent string.
$wantNL   = 1;      # True if we just wrote a newline.
$NI       = 0;      # True if we just wrote indentation.
$comments = 1;      # Produce comments.
$stag     = 1;      # Special handling for %Stag tags?
$txtmax   = 60;     # Length limit for output text lines.

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Initialize some global level-related lists:
$IT[0] = 0;         # Indent level for each tag.
$I[0]  = "";        # Indent strings (zero or more copies of $ind).
$Ilvl  = 0;         # Current indent level.
$T[0]  = "";        # Tags at various levels.
$Tlvl  = 0;         # Current tag level.

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Here we run thru the command-line args, looking for options. Anything that
# doesn't look like an option gets stored in @files for later use.
for my $arg (@ARGV) {
    VC("$me: Arg \"$arg\"") if $V>5;
    if (($flg,$opts) = ($arg =~ /^([-+])(.*)/)) {   # + and - indicate options.
        VC("$me: flg=\"$flg\" opts=\"$opts\"") if $V>5;
        while ($opts =~ /^(.)(.*)/) {
            my $raw = $1;       # Option letter as typed; used in messages.
            $opt  = uc($raw);
            $opts = $2;
            VC("$me: opt=\"$opt\" opts=\"$opts\"") if $V>5;
            if ($opt eq 'C') {
                $comments = ($flg eq '-' ? 0 : 1);  # Whether to produce comments.
            } elsif ($opt eq 'I') {                 # Indentation.
                $ind  = $opts;  # The rest of the arg is the indent string.
                $opts = '';
            } elsif ($opt eq 'H') {                 # Whether to produce HTML tags.
                $html = ($flg eq '-' ? 0 : 1);
            } elsif (($opt eq 'L' or $opt eq 'R') and ($opts =~ s"^M(\d*)"")) {
                # Margins. An explicit 0 is honoured; only a missing number
                # falls back to the built-in value.
                my $col = $1;
                $lmargin = (length($col) ? $col : 3)  if ($opt eq 'L');
                $rmargin = (length($col) ? $col : 72) if ($opt eq 'R');
            } elsif ($opt eq 'L') {
                if ($opts =~ /^NK\b(.*)/i) {        # Rewriting links.
                    $LNK = 1;
                    $LNKopt = $1;
                    VC("$me: LNKopt=\"$LNKopt\"") if $V>2;
                    if (($suf,$dir) = ($LNKopt =~ /^\.(\w+)=(.*)$/)) {
                        VC("$me: LNK option is suf \"$suf\" = dir \"$dir\"") if $V>2;
                        $LNKdir{$suf} = $dir;
                    } elsif (($dir) = ($LNKopt =~ /^=(.*)$/)) {
                        VC("$me: LNK option is dir \"$dir\"") if $V>2;
                        $LNKdir{'='} = $dir;        # '=' is the default-directory key.
                    } else {
                        VC("$me: LNK option \"$LNKopt\".") if $V>2;
                    }
                    $opts = '';
                } else {
                    # The remaining characters are still parsed as options,
                    # exactly as before; only the message text is corrected.
                    print STDERR "$me: Unknown L option \"$raw$opts\" ignored.\n" if $V;
                }
            } elsif ($opt eq 'V') {                 # Verbose level.
                if ($opts =~ s/^(\d)//) {$V = $1} else {++$V}
            } else {
                print STDERR "$me: Unknown option \"$raw\" ignored.\n" if $V;
            }
        }
    } else {
        push @files, $arg;
    }
}

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Here we read the input data, accumulating it in $txt, and calling oneline
# to process as much as it can. Note that oneline may not process all of $txt
# if there is an incomplete tag. When oneline returns, we simply read another
# line and append it to $txt, and then ask oneline "Can you handle it now?"
if (@files) {
    for my $file (@files) {
        if (open(F, '<', $file)) {
            while (defined(my $line = <F>)) {
                $txt .= $line;
                &oneline();
            }
            close(F);
        } else {
            print STDERR "$me: Can't read \"$file\" [$!]\n" if $V;
        }
    }
} else {
    while (defined(my $line = <STDIN>)) {
        $txt .= $line;
        &oneline();
    }
}
if (defined($txt) and length($txt)) {   # Any relic text left over?
    print "$txt";                       # It's probably junk, but print it anyway.
    $wantNL = (substr($txt,-1,1) eq "\n") ? 1 : 0;
    $NI = 0;
}
done:
print "\n" if !$wantNL;
exit $exitstat;

# That's the main program; here are the subroutines:

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# We process as much of $txt as we can. The only case where we return with
# data still in $txt is when we find the start of a HTML tag with no closing
# '>'. The above code will tack on another line, and call us again.
#
# NOTE(review): the three markup-detection patterns in this routine were
# damaged in the copy of the source this edit was made from. They have been
# reconstructed from the surviving fragments and the brace structure; please
# compare them with a pristine copy of the file.
sub oneline {
    while (defined($txt) and length($txt)) {    # length(), so a lone "0" is not dropped.
        if ($txt =~ /^</) {                     # Some kind of markup is next.
            if ($txt =~ /^<!--/) {
                if ($txt =~ s"^(<!--.*?-->)""s) {
                    &comment($1);
                } else {
                    return;     # Return with incomplete comment tag.
                }
            } elsif ($txt =~ s"^</([^>]*)>""s) {
                &closetag($1);
            } elsif ($txt =~ s"^<([^>]*)>""s) {
                &opentag($1);
            } else {
                VL("$me: > not found.") if $V>6;
                return;         # Return with incomplete tag.
            }
        } elsif ($txt =~ s"^(\s+)"") {
            if ($PRE) {
                print $1;       # Whitespace is significant inside <PRE>.
            } elsif (!($wantNL or $NI)) {
                print ' ';      # Collapse a run of whitespace to one blank.
                ++$txtlen;
                $wantNL = $NI = 0;
            }
        } elsif ($txt =~ s"^([^<\s]+)"") {
            &wordout($1);
        } else {                # Defensive; the branches above cover all input.
            print "\n"if $V;
            print $txt;
            $NI = $wantNL = 0;
            $txt = '';
        }
    }
}

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# This is our HTML-entity-to-ANSI-char routine. It is only called when we are
# doing text output; if $html is true, we pass on such entities unchanged.
#
# e2c(name)  - named entity; unknown names come back as "&name;".
# d2c(digits)- decimal reference; x2c("x" . hexdigits) - hex reference.
# The numeric forms only convert code points 0..255; anything larger is
# returned as the original reference instead of being wrapped into an
# unrelated byte, which is what the old signed pack('c') template did.
sub e2c {
    my ($ent) = @_;
    my $val = $HTMLentval{$ent};
    return chr($val) if defined $val;
    return '"' if $ent eq 'quot';       # In case the table lacks this one.
    return "&$ent;";
}
sub d2c {
    my ($ent) = @_;
    my $val = int($ent);
    return $val < 256 ? chr($val) : "&#$ent;";
}
sub x2c {
    my ($ent) = @_;                     # Arrives as "x41"; hex() accepts the x.
    my $val = hex($ent);
    return $val < 256 ? chr($val) : "&#$ent;";
}

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Write one word of content. In text mode the character references in it are
# decoded first. Outside <PRE> we start a fresh indented line when the
# current line is over $txtmax, or when a newline is pending and there is an
# indent string for the current level.
sub wordout {
    my ($word) = @_;
    if (!$html) {
        $word =~ s/\&#(x[0-9A-Fa-f]+);/&x2c($1)/ge;
        $word =~ s/\&#(\d+);/&d2c($1)/ge;
        $word =~ s/\&(\w+);/&e2c($1)/ge;
    }
    unless ($PRE) {
        my $break = 0;
        if ($txtlen > $txtmax) {
            VL("$me: wantNL+I because txtlen=$txtlen > txtmax=$txtmax.") if $V>2;
            $break = 1;
        } elsif ($wantNL and !$NI and $I[$Ilvl]) {
            VL("$me: I because wantNL=$wantNL NI=$NI I[$Ilvl]=\"$I[$Ilvl]\"") if $V>2;
            $break = 1;
        }
        if ($break) {
            &NLI();
            $txtlen = 0;
            $wantNL = 0;
            $NI = 1;
        }
    }
    print $word;
    $wantNL = $NI = 0;
    $txtlen += length($word);
}

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Write a comment (delimiters included) if comments are wanted, or if we are
# inside a SCRIPT element, where the "comment" is really hidden script text.
# (A disabled special case for one particular comment form used to sit here.)
sub comment {
    my ($cmt) = @_;
    print $cmt if $comments || $inscript;
}

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Handle a close tag.
# We scan the tag stack for the matching open tag, and #
# pop the stack back to there, resetting the indentation. The VC() line is to #
# flag mismatches in nesting tags, but we usually don't use it. #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
#
# Param: the tag name as it appeared between "</" and ">".
# Reads the %Ntag, %Xtag, %Itag, %Ftag, %Ltag, %Ttag and %Stag tables, and
# updates @T/$Tlvl, $Ilvl, $PRE, $wantNL and $NI.
sub closetag {
    my ($Tag) = @_;         # Tag in random case.
    my $TAG  = uc($Tag);    # Tag in upper case.
    my $CTAG = "/$TAG";     # Tag with initial '/'.

    if ($PRE) {             # Inside <PRE> everything is passed through as is.
        print "</$Tag>" if $html;
        if ($TAG eq 'PRE') {
            VL("$me/closetag: /PRE") if $V>1;
            $PRE = 0;
        }
        return;
    }
    if ($stag and $Stag{$TAG}) {    # Optional kludge to discard some attributes.
        $Tag = &specialtag($TAG,$Tag,'/');
    }
    if ($Ntag{$TAG} or $Xtag{$TAG}) {   # Is tag nesting type?
        # Unwind the stack to the matching open tag. Nesting tags found on
        # the way are closed properly (recursively); others are just dropped.
        while (($Tlvl > 0) and ($TAG ne $T[$Tlvl])) {
            # VC("<$T[$Tlvl]> closed by </$TAG>");
            if ($Ntag{$T[$Tlvl]}) {
                closetag($T[$Tlvl]);
            } else {
                $Tlvl = 0 if --$Tlvl < 0;
            }
        }
        $Tlvl = 0 if --$Tlvl < 0;       # Pop the matching tag itself.
    } elsif ($Itag{$TAG} > 0) {
        $Ilvl = 0 if --$Ilvl < 0;       # Non-nesting but indenting tag.
    }
    if ($html) {    # Do we need newline + indent before this tag?
        if ($Ftag{$CTAG}) {     # Is tag first on new line?
            VL("$me/closetag: NLI because Ftag{$CTAG}=$Ftag{$CTAG} and wantNL=$wantNL") if $V>2;
            &NLI();             # Newline + previous indent.
        }
        print "</$Tag>";
        $wantNL = $NI = 0;
    }
    $Ilvl = $IT[$Tlvl];     # Restore previous tag's indent level.
    if ($html and $Ltag{$CTAG}) {
        VL("$me/closetag: NLI because Ltag{$CTAG}=$Ltag{$CTAG} and wantNL=$wantNL") if $V>2;
        print "\n";
        $wantNL = 1
    } elsif ($Ttag{$TAG} && $Ltag{$TAG}) {  # Text mode.
        VL("$me/closetag: NLI after $TAG because Ttag{/$TAG}=$Ttag{$CTAG} Ltag{$TAG}=$Ltag{$TAG}") if $V>2;
        &NLI();
    }
}

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Produce newline, unless the output is already at the start of a line.
sub wantNL {
    return if $wantNL;
    VL("$me/wantNL: Produce wantNL with wantNL=$wantNL.") if $V>2;
    print "\n";
    $wantNL = 1
}

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Produce newline and indent.
# Ends the current line if necessary, then writes the indent for the current
# level: the real indent string in HTML mode, a single tab in text mode.
sub NLI {
    unless ($wantNL) {
        VL("$me/NLI: Produce wantNL with wantNL=$wantNL.") if $V>2;
        print "\n";
        $wantNL = 1
    }
    if ($I[$Ilvl]) {
        VL("$me/NLI: Indent I[$Ilvl]=\"$I[$Ilvl]\"") if $V>2;
        print($html ? "$I[$Ilvl]" : "\t");
        $NI = 1;
        $wantNL = 0
    }
}

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Process an open tag. This mostly concerns getting the indentation right, #
# setting Ilvl and Tlvl appropriately. We also do a check for the special #
# hyperlink processing. Note that our param contains the tag and attributes #
# but not the <> delimiters. #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
sub opentag {
    local($Tag) = @_;
    if ($PRE) {             # Inside <PRE> tags are passed through untouched.
        print "<$Tag>" if $html;
        return;
    }
    my ($TAG,$tag,$att);
    # Split into tag name and attributes. The /s lets the attribute part run
    # over several input lines; without it such tags were never recognised.
    unless (($tag,$att) = ($Tag =~ m"^(\S+)\s+(.*)$"s)) {
        $tag = $Tag;
        $att = '';
    }
    $TAG = uc($tag);        # Canonicalize tags to upper case.
    if ($TAG eq 'PRE') {    # Special kludge for PRE tag.
        VL("$me/opentag: PRE") if $V>1;
        $PRE = 1;
        print "\n<$Tag>";
        return;
    }
    if ($Xtag{$TAG}) {      # Exclusive tag?
        if ($T[$Tlvl] eq $TAG or $T[$Tlvl] eq $Xtag{$TAG}) {
            $Tlvl = 0 if --$Tlvl < 0;   # Close previous instance of this tag.
            $Ilvl = $IT[$Tlvl];         # Restore this tag's indent level.
        }
    }
    if ($Ntag{$TAG} or $Xtag{$TAG}) {   # Nesting tag?
        $T[++$Tlvl] = $TAG;             # Add to stack of nesting tags.
    }
    if ($html) {
        if ($Ftag{$TAG}) {
            VL("$me/opentag: NLI because html=$html Ftag{$TAG}=\"$Ftag{$TAG}\"") if $V>2;
            &NLI();
        }
    } elsif ($Ttag{$TAG}) {     # Newline during text mode for these tags.
        VL("$me/opentag: NLI because html=$html Ttag{$TAG}=\"$Ttag{$TAG}\"") if $V>2;
        &NLI();
    }
    if ($stag and $Stag{$TAG}) {    # Optional kludge to discard some attributes.
        $Tag = &specialtag($TAG,$Tag);
    }
    if ($LNK and $Htag{$TAG}) {     # Hyperlink rewriting wanted for this tag.
        $Tag = &Htag($TAG,$tag,$att);
    }
    print "<$Tag>" if $html;
    $NI = 0;
    if ($wantNL = $Ltag{$TAG}) {    # Does this tag end a line?
        if ($wantNL > 0) {          # Positive means it does.
            print "\n";
            $wantNL = 1;
            $txtlen = 0;
        } elsif ($wantNL < 0) {     # Negative means yes if at end of line.
            if ($txt =~ s"^\s*\n\s*"") {
                print "\n";
                $wantNL = 1;
                $txtlen = 0;
            } else {                # Text follows on same line.
                $wantNL = 1;        # Lie to suppress newline.
                $txtlen = length($Tag) + 2;
            }
        }
    }
    if ($Itag{$TAG} > 0) {          # Is this an indenting tag?
        $Ilvl++;
        $I[$Ilvl] = $ind x $Ilvl;
    }
    $IT[$Tlvl] = $Ilvl;             # Note indent level for current tag.
}

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Hyper-link tags need some extra handling. Each of these is a unique kludge. #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Params: upper-case tag, tag as written, attribute string.
# Returns the tag text rebuilt attribute by attribute; the values of quoted
# href= and src= attributes are passed through LNKurl.
sub Htag {
    my ($TAG,$tag,$att) = @_;
    my $new = '';
    while ($att) {
        VC("Htag att=\"$att\"");
        if ($att =~ s/^(\w+)="([^"]*)"\s*//) {      # name="value"
            my ($name,$val) = ($1,$2);
            VC("Htag 1=$name 2=\"$val\"");
            my $key = lc($name);
            $val = &LNKurl($TAG,$val) if ($key eq 'href' or $key eq 'src');
            $new .= " $name=\"$val\"";
        } elsif ($att =~ s/^(\w+)=([^\s]*)\s*//) {  # name=value, unquoted
            $new .= " $1=$2";
            VC("Htag 1=$1 2=$2");
        } else {                                    # Anything else, one word at a time.
            $att =~ s/^([^\s]*)\s*//;
            VC("Htag 1=$1");
            $new .= " $1";
        }
    }
    return "$tag$new";
}

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Rewrite the first quoted 'name.html', 'name.gif' or 'name.js' found in a
# piece of javascript so that it points into the directory configured for
# that suffix (or the default directory). With no directory configured the
# text is returned unchanged; it used to gain a spurious leading '/'.
sub LNKjs {
    my ($x) = @_;
    if (my ($init,$base,$suff,$rest) = ($x =~ /^(.*)'([-.\w]+)\.(html*|gif|js)'(.*)/)) {
        my $dir = $LNKdir{lc($suff)} || $LNKdir{'='} || '';
        return $x if $dir eq '';
        return $init . "'" . $dir . "/$base.$suff" . "'" . $rest;
    }
    return $x;
}

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Smallest and largest of a list of numbers.
sub Min {my $v = shift; for my $n (@_) {$v = $n if $n < $v} return $v}
sub Max {my $v = shift; for my $n (@_) {$v = $n if $n > $v} return $v}

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Some URL rewrite: collapse each "segment/../" pair. The whole parent
# segment is removed (the old pattern took only its last character, turning
# a/bb/../c into a/bc), and a leading "../" is left alone.
# NOTE(review): this is purely textual, so in an absolute URL the host name
# can still be consumed as if it were a path segment (http://host/../x).
sub URL {
    my ($url) = @_;
    1 while $url =~ s{(^|/)(?!\.\.?/)[^/]+/\.\./}{$1};
    return $url
}

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Here we get a tag and its URL.
# We return the URL, possibly modified.
#
# Params: upper-case tag name (unused here) and the URL from the attribute.
# URLs with "scheme://" or a leading "/" are only normalised by URL().
# "javascript:" URLs have a quoted file name rewritten by LNKjs. Everything
# else is prefixed with the directory configured for its suffix, or with the
# default directory. When a "scheme:" was split off, the directory now goes
# between the scheme and the rest; the old code appended the complete URL
# again and so produced "scheme:dir/scheme:rest".
# NOTE(review): non-file schemes such as mailto: are still given a directory
# prefix when one applies; confirm whether they should be left alone.
sub LNKurl {
    my ($TAG,$url) = @_;
    my $id = "$me/LNKurl";
    VC("$id: ../ in URL") if ($url =~ /\.\.\//);
    return URL($url) if ($url =~ m"^\w+://");
    return URL($url) if ($url =~ m"^/");
    my ($p,$uri) = ($url =~ /^(\w+:)(.*)$/);
    if (defined $p) {
        VC("$id protocol \"$p\" \"$uri\"");
        if (uc($p) eq 'JAVASCRIPT:') {
            return &URL($p . &LNKjs($uri));
        }
    } else {
        $p = '';
        $uri = $url;
    }
    if (%LNKdir) {
        VC("$id LNKdir defined.");
        my $dir;
        if ($url =~ /\.(\w+)$/) {
            VC("$id $1 suffix found.");
            if ($dir = $LNKdir{$1}) {
                VC("$id $1 suffix is \"$dir\"");
                return &URL("$p$dir/$uri");
            }
        } else {
            VC("$id no suffix on \"$url\"");
        }
        if ($dir = $LNKdir{'='}) {
            VC("$id dir is \"$dir\"");
            return &URL("$p$dir/$uri");
        } else {
            VC("$id no default directory");
        }
    } else {
        VC("$id no directories at all");
    }
    return &URL($url);
}

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# This is used to implement special-case kludges for some tags. It's entirely
# ad hoc, depending on the needs of the current project.
#
# Params: upper-case tag name, tag text as written, and '/' for a close tag.
# Returns the (possibly edited) tag text. The only live kludge keeps the
# SCRIPT nesting count in $inscript; the count is clamped at zero, like the
# other level counters in this file (the old test lacked the "< 0").
sub specialtag {
    my ($T,$t,$f) = @_;
    if ($T eq 'SCRIPT') {
        if (defined($f) and $f eq '/') {
            $inscript = 0 if --$inscript < 0;
        } else {
            ++$inscript;
        }
        return $t;
    }
#   if ($T eq 'IMG' or $T eq 'TD' or $T eq 'TABLE') {
#       $t =~ s/\s*\bwidth="*\d+%*"*//i;
#       $t =~ s/\s*\bheight="*\d+%*"*//i;
#   }
#   if ($T eq 'AREA') {
#       $t =~ s/\bhref="telnet:(.*?)"/href="<#1:TelnetLink(___)#>"/i;
#   }
    return $t;
}

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Print the @T stack around $T[$t]. This is a debug kludge.
#
# NOTE(review): the original output format was lost (the print statement
# survived only as an empty string); the HTML-comment form below is a
# reconstruction, chosen to match the other verbose messages.
sub PT {
    my ($t) = @_;
    my $t1 = $t > 0 ? $T[$t-1] : undef;     # No wrap-around to the last entry.
    my $t2 = $T[$t];
    my $t3 = $T[$t+1];
    for ($t1,$t2,$t3) {$_ = '' unless defined $_}
    print V "<!-- T[$t]: \"$t1\" [\"$t2\"] \"$t3\" -->\n";
}

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Here's our verbose-message output routines. We're currently set up to write
# all our verbose messages to a log file.
# If we decide to send it to stdout,
# here is where the changes should be made.
#
# All four write to the V handle that Vinit opens.
sub VC {print V "<!-- ",@_," -->\n"}    # Message as HTML comment.
sub VM {print V @_,"<br>\n"}            # Message with break and newline.
sub VL {print V @_,"\n"}                # Message with just newline.
sub VS {print V @_}                     # Message string only.

# Verbose-mode initialization: choose $V if nobody has set it yet, and open
# the V handle on "$me.log", falling back to standard output.
# NOTE(review): the main program passes an initial level taken from the
# V_<name>/D_<name> environment variables, but this routine has never looked
# at its argument; confirm whether the environment is meant to win over the
# tables below before wiring it in.
sub Vinit {
    %Vhost = (      # Max verbose level for remote hosts.
        'localhost' => 1,
    );
    %Vprog = (      # Max verbose level for CGI programs.
        'Session' => 3,
        'Verify'  => 1,
    );
    $V = &Min(($Vprog{$me} || 1), ($Vhost{$ENV{REMOTE_HOST}} || 1)) if !defined($V);
    $Vfile = "$me.log";     # File for verbose output.
    if ($V > 0) {           # Do we need a verbose-output file?
        if ($Vfile) {       # Do we have a verbose-output file name?
            if (!open(V, '>', $Vfile)) {    # Open the verbose-output file.
                # Report on STDERR like every other error in this program,
                # so the complaint does not land in the formatted output.
                print STDERR "$me: Can't write \"$Vfile\" [$!]\n" if $V;
                open(V,">-");
            }
        } else {            # No verbose-output file, use STDOUT.
            open(V,">-");
        }
    }
    select V; $| = 1; select STDOUT;    # Unbuffer the verbose handle.
    if ($V>3) {     # Explain the value of $V ...
        VM("$me: Vprog{$me}=\"$Vprog{$me}\"") if $Vprog{$me};
        VM("$me: Vhost{$ENV{REMOTE_HOST}}=\"$Vhost{$ENV{REMOTE_HOST}}\"")
            if defined $Vhost{$ENV{REMOTE_HOST}};
        VM("$me: V=\"$V\" initially.") if $V>2;
    }
}

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Here's the table of HTML3.2 entity encodings: entity name => Latin-1 code.
# ('Iacute' => 205 was missing from the run 192..255 and has been added.)
sub EntTables {
    %HTMLentval = (
        'quot'   =>  34,    # quotation mark, U+0022
        'amp'    =>  38,    # ampersand, U+0026
        'lt'     =>  60,    # less-than sign, U+003C
        'gt'     =>  62,    # greater-than sign, U+003E
        'nbsp'   => 160,    # no-break space, U+00A0
        'iexcl'  => 161,    # inverted exclamation mark, U+00A1
        'cent'   => 162,    # cent sign, U+00A2
        'pound'  => 163,    # pound sign, U+00A3
        'curren' => 164,    # currency sign, U+00A4
        'yen'    => 165,    # yen sign = yuan sign, U+00A5
        'brvbar' => 166,    # broken vertical bar, U+00A6
        'sect'   => 167,    # section sign, U+00A7
        'uml'    => 168,    # diaeresis = spacing diaeresis, U+00A8
        'copy'   => 169,    # copyright sign, U+00A9
        'ordf'   => 170,    # feminine ordinal indicator, U+00AA
        'laquo'  => 171,    # left-pointing double angle quotation mark, U+00AB
        'not'    => 172,    # not sign, U+00AC
        'shy'    => 173,    # soft hyphen = discretionary hyphen, U+00AD
        'reg'    => 174,    # registered trade mark sign, U+00AE
        'macr'   => 175,    # macron = overline, U+00AF
        'deg'    => 176,    # degree sign, U+00B0
        'plusmn' => 177,    # plus-minus sign, U+00B1
        'sup2'   => 178,    # superscript two = squared, U+00B2
        'sup3'   => 179,    # superscript three = cubed, U+00B3
        'acute'  => 180,    # acute accent = spacing acute, U+00B4
        'micro'  => 181,    # micro sign, U+00B5
        'para'   => 182,    # pilcrow sign = paragraph sign, U+00B6
        'middot' => 183,    # middle dot, U+00B7
        'cedil'  => 184,    # cedilla = spacing cedilla, U+00B8
        'sup1'   => 185,    # superscript one, U+00B9
        'ordm'   => 186,    # masculine ordinal indicator, U+00BA
        'raquo'  => 187,    # right-pointing double angle quotation mark, U+00BB
        'frac14' => 188,    # vulgar fraction one quarter, U+00BC
        'frac12' => 189,    # vulgar fraction one half, U+00BD
        'frac34' => 190,    # vulgar fraction three quarters, U+00BE
        'iquest' => 191,    # inverted question mark, U+00BF
        'Agrave' => 192,    # latin capital letter A with grave, U+00C0
        'Aacute' => 193,    # latin capital letter A with acute, U+00C1
        'Acirc'  => 194,    # latin capital letter A with circumflex, U+00C2
        'Atilde' => 195,    # latin capital letter A with tilde, U+00C3
        'Auml'   => 196,    # latin capital letter A with diaeresis, U+00C4
        'Aring'  => 197,    # latin capital letter A with ring above, U+00C5
        'AElig'  => 198,    # latin capital ligature AE, U+00C6
        'Ccedil' => 199,    # latin capital letter C with cedilla, U+00C7
        'Egrave' => 200,    # latin capital letter E with grave, U+00C8
        'Eacute' => 201,    # latin capital letter E with acute, U+00C9
        'Ecirc'  => 202,    # latin capital letter E with circumflex, U+00CA
        'Euml'   => 203,    # latin capital letter E with diaeresis, U+00CB
        'Igrave' => 204,    # latin capital letter I with grave, U+00CC
        'Iacute' => 205,    # latin capital letter I with acute, U+00CD
        'Icirc'  => 206,    # latin capital letter I with circumflex, U+00CE
        'Iuml'   => 207,    # latin capital letter I with diaeresis, U+00CF
        'ETH'    => 208,    # latin capital letter ETH, U+00D0
        'Ntilde' => 209,    # latin capital letter N with tilde, U+00D1
        'Ograve' => 210,    # latin capital letter O with grave, U+00D2
        'Oacute' => 211,    # latin capital letter O with acute, U+00D3
        'Ocirc'  => 212,    # latin capital letter O with circumflex, U+00D4
        'Otilde' => 213,    # latin capital letter O with tilde, U+00D5
        'Ouml'   => 214,    # latin capital letter O with diaeresis, U+00D6
        'times'  => 215,    # multiplication sign, U+00D7
        'Oslash' => 216,    # latin capital letter O with stroke, U+00D8
        'Ugrave' => 217,    # latin capital letter U with grave, U+00D9
        'Uacute' => 218,    # latin capital letter U with acute, U+00DA
        'Ucirc'  => 219,    # latin capital letter U with circumflex, U+00DB
        'Uuml'   => 220,    # latin capital letter U with diaeresis, U+00DC
        'Yacute' => 221,    # latin capital letter Y with acute, U+00DD
        'THORN'  => 222,    # latin capital letter THORN, U+00DE
        'szlig'  => 223,    # latin small letter sharp s = ess-zed, U+00DF
        'agrave' => 224,    # latin small letter a with grave, U+00E0
        'aacute' => 225,    # latin small letter a with acute, U+00E1
        'acirc'  => 226,    # latin small letter a with circumflex, U+00E2
        'atilde' => 227,    # latin small letter a with tilde, U+00E3
        'auml'   => 228,    # latin small letter a with diaeresis, U+00E4
        'aring'  => 229,    # latin small letter a with ring above, U+00E5
        'aelig'  => 230,    # latin small ligature ae, U+00E6
        'ccedil' => 231,    # latin small letter c with cedilla, U+00E7
        'egrave' => 232,    # latin small letter e with grave, U+00E8
        'eacute' => 233,    # latin small letter e with acute, U+00E9
        'ecirc'  => 234,    # latin small letter e with circumflex, U+00EA
        'euml'   => 235,    # latin small letter e with diaeresis, U+00EB
        'igrave' => 236,    # latin small letter i with grave, U+00EC
        'iacute' => 237,    # latin small letter i with acute, U+00ED
        'icirc'  => 238,    # latin small letter i with circumflex, U+00EE
        'iuml'   => 239,    # latin small letter i with diaeresis, U+00EF
        'eth'    => 240,    # latin small letter eth, U+00F0
        'ntilde' => 241,    # latin small letter n with tilde, U+00F1
        'ograve' => 242,    # latin small letter o with grave, U+00F2
        'oacute' => 243,    # latin small letter o with acute, U+00F3
        'ocirc'  => 244,    # latin small letter o with circumflex, U+00F4
        'otilde' => 245,    # latin small letter o with tilde, U+00F5
        'ouml'   => 246,    # latin small letter o with diaeresis, U+00F6
        'divide' => 247,    # division sign, U+00F7
        'oslash' => 248,    # latin small letter o with stroke, U+00F8
        'ugrave' => 249,    # latin small letter u with grave, U+00F9
        'uacute' => 250,    # latin small letter u with acute, U+00FA
        'ucirc'  => 251,    # latin small letter u with circumflex, U+00FB
        'uuml'   => 252,    # latin small letter u with diaeresis, U+00FC
        'yacute' => 253,    # latin small letter y with acute, U+00FD
        'thorn'  => 254,    # latin small letter thorn, U+00FE
        'yuml'   => 255,    # latin small letter y with diaeresis, U+00FF
    );
}

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Here's a routine to initialize our tag tables. This task was moved into a #
# subroutine so that it can be positioned out of the way, and not clutter up #
# the start of the program. #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
sub TagTables {
    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
    # Here is a table that controls indentation. The usual value is 1, #
    # meaning to indent the contents by 1 level. A few have a value of 0, #
    # meaning to not indent the contents.
Two special kludges: A value of # # '' means to outdent the contents to the left edge; this may be used # # for such tags as
 and