$debug = 1;
$whitespace_significant = 0;
# global variables:
# $line_buffer is line buffer
# $line_count is input line number.
$line_buffer = "";
$line_count = 0;
sub parse_html {
local ($file) = @_;
open (HTML, $file) || die "Could not open $file: $!\nStopped";
&parse_html_stream ();
close (HTML);}
# Global input HTML is the handle to the stream of HTML
sub parse_html_stream {
local ($token, $new);
## initialization
@stack=();
$line_count = 0;
$line_buffer = "";
## application specific initialization
&html_begin_doc();
main:
while (1) {
# if whitespace does not matter, trim any leading space.
if (! $whitespace_significant) {
$line_buffer =~ s/^\s+//;}
# now dispatch on the type of token
if ($line_buffer =~ /^(\s+)/) {
$token = $1;
$line_buffer = $';
&html_whitespace ($token);}
# This will lose if there is more than one comment on the line!
elsif ($line_buffer =~ /^(\)/) {
$token = $1;
$line_buffer = $';
&html_comment ($token);}
elsif ($line_buffer =~ /^(\]*\>)/) {
$token = $1;
$line_buffer = $';
&html_comment ($token);}
elsif ($line_buffer =~ /^(\<\/[^\>]*\>)/) {
$token = $1;
$line_buffer = $';
&html_etag ($token);}
elsif ($line_buffer =~ /^(\<[^!\/][^\>]*\>)/) {
$token = $1;
$line_buffer = $';
&html_tag ($token);}
elsif ($line_buffer =~ /^([^\s<]+)/) {
$token = $1;
$line_buffer = $';
$token = &substitute_entities($token);
&html_content ($token); }
else {
# No valid token in buffer. Maybe it's empty, or maybe there's an
# incomplete tag. So get some more data.
$new = ;
if (! defined ($new)) {last main;}
# if we're trying to find a match for a tag, then get rid of embedded newline
# this is, I think, a kludge
if ($line_buffer =~ /^\ && $line_buffer =~ /\n$/) {
chop $line_buffer;
$line_buffer .= " ";}
$line_buffer .= $new;
$line_count++;}
}
## cleanup
&html_end_doc();
if ($#stack > -1) {
print STDERR "Stack not empty at end of document\n";
&print_html_stack();}
}
sub html_tag {
local ($tag) = @_;
local ($element) = &tag_element ($tag);
local (%attributes) = &tag_attributes ($tag);
# the tag might minimize (be an implicit end) for the previous tag
local ($prev_element);
while (&Minimizes(&stack_top_element(), $element)) {
$prev_element = &stack_pop_element ();
if ($debug) {
print STDERR "MINIMIZING $prev_element with $element on $line_count\n";}
&html_end ($prev_element, 0);}
push (@stack, $tag);
&html_begin ($element, $tag, *attributes);
if (&Empty($element)) {
pop(@stack);
&html_end ($element, 0);}
}
sub html_etag {
local ($tag) = @_;
local ($element) = &tag_element ($tag);
# pop stack until find matching tag. This is probably a bad idea,
# or at least too general.
local ( $prev_element) = &stack_pop_element();
until ($prev_element eq $element) {
if ($debug) {
print STDERR "MINIMIZING $prev_element with /$element on $line_count \n";}
&html_end ($prev_element, 0);
if ($#stack == -1) {
print STDERR "No match found for /$element. You will lose\n";
last;}
$prev_element = &stack_pop_element();}
&html_end ($element, 1);
}
# For each element, the names of elements that minimize it.
# This is of course totally HTML dependent and probably I have it wrong too
$Minimize{"DT"} = "DT:DD";
$Minimize{"DD"} = "DT";
$Minimize{"LI"} = "LI";
$Minimize{"P"} = "P:DT:LI:H1:H2:H3:H4:BLOCKQUOTE:UL:OL:DL";
$Minimize{"TD"} = "TD:TH";
$Minimize{"TH"} = "TD:TH";
# Does element E2 minimize E1?
sub Minimizes {
local ($e1, $e2) = @_;
local ($value) = 0;
foreach $elt (split (":", $Minimize{$e1})) {
if ($elt eq $e2) {$value = 1;}}
$value;}
$Empty{"BASE"} = 1;
$Empty{"BR"} = 1;
$Empty{"HR"} = 1;
$Empty{"IMG"} = 1;
$Empty{"ISINDEX"} = 1;
$Empty{"LINK"} = 1;
$Empty{"META"} = 1;
$Empty{"NEXTID"} = 1;
$Empty{"INPUT"} = 1;
# Empty tags have no content and hence no end tags
sub Empty {
local ($element) = @_;
$Empty{$element};}
sub print_html_stack {
print STDERR "\n ==\n";
foreach $elt (reverse @stack) {print STDERR " $elt\n";}
print STDERR " ==========\n";}
# The element on top of stack, if any.
sub stack_top_element {
if ($#stack >= 0) { &tag_element ($stack[$#stack]);}}
sub stack_pop_element {
&tag_element (pop (@stack));}
# The element from the tag, normalized.
sub tag_element {
local ($tag) = @_;
$tag =~ /<\/?([^\s>]+)/;
local ($element) = $1;
$element =~ tr/a-z/A-Z/;
$element;}
# associative array of the attributes of a tag.
sub tag_attributes {
local ($tag) = @_;
$tag =~ /^<[A-Za-z]+\s+(.*)>$/;
&parse_attributes($1);}
# string should be something like
# KEY="value" KEY2="longer value" KEY3="tags o doom"
# output is an associative array (like a lisp property list)
# attributes names are not case sensitive, do I downcase them
# Maybe (probably) I should substitute for entities when parsing attributes.
sub parse_attributes {
local ($string) = @_;
local (%attributes);
local ($name, $val);
get: while (1) {
if ($string =~ /^ *([A-Za-z]+)=\"([^\"]*)\"/) {
$name = $1;
$val = $2;
$string = $';
$name =~ tr/A-Z/a-z/;
$attributes{$name} = $val; }
elsif ($string =~ /^ *([A-Za-z]+)=(\S*)/) {
$name = $1;
$val = $2;
$string = $';
$name =~ tr/A-Z/a-z/;
$attributes{$name} = $val;}
elsif ($string =~ /^ *([A-Za-z]+)/) {
$name = $1;
$val = "";
$string = $';
$name =~ tr/A-Z/a-z/;
$attributes{$name} = $val;}
else {last;}}
%attributes;}
# Thanks for Carl Gonsalves cgonsalv@mks.com for this code
sub substitute_entities {
local ($string) = @_;
$string =~ s/<//g;
$string =~ s/"/\"/g;
$string =~ s/ / /g;
$string =~ s/¡/¡/g;
$string =~ s/¢/¢/g;
$string =~ s/£/£/g;
$string =~ s/¤/¤/g;
$string =~ s/¥/¥/g;
$string =~ s/¦/¦/g;
$string =~ s/§/§/g;
$string =~ s/¨/¨/g;
$string =~ s/©/©/g;
$string =~ s/ª/ª/g;
$string =~ s/«/«/g;
$string =~ s/¬/¬/g;
$string =~ s///g;
$string =~ s/®/®/g;
$string =~ s/¯/¯/g;
$string =~ s/°/°/g;
$string =~ s/±/±/g;
$string =~ s/²/²/g;
$string =~ s/³/³/g;
$string =~ s/´/´/g;
$string =~ s/µ/µ/g;
$string =~ s/¶/¶/g;
$string =~ s/·/·/g;
$string =~ s/¸/¸/g;
$string =~ s/¹/¹/g;
$string =~ s/º/º/g;
$string =~ s/»/»/g;
$string =~ s/¼/¼/g;
$string =~ s/½/½/g;
$string =~ s/¾/¾/g;
$string =~ s/¿/¿/g;
$string =~ s/À/À/g;
$string =~ s/Á/Á/g;
$string =~ s/Â/Â/g;
$string =~ s/Ã/Ã/g;
$string =~ s/Ä/Ä/g;
$string =~ s/Å/Å/g;
$string =~ s/Æ/Æ/g;
$string =~ s/Ç/Ç/g;
$string =~ s/È/È/g;
$string =~ s/É/É/g;
$string =~ s/Ê/Ê/g;
$string =~ s/Ë/Ë/g;
$string =~ s/Ì/Ì/g;
$string =~ s/Í/Í/g;
$string =~ s/Î/Î/g;
$string =~ s/Ï/Ï/g;
$string =~ s/Ð/Ð/g;
$string =~ s/Ñ/Ñ/g;
$string =~ s/Ò/Ò/g;
$string =~ s/Ó/Ó/g;
$string =~ s/Ô/Ô/g;
$string =~ s/Õ/Õ/g;
$string =~ s/Ö/Ö/g;
$string =~ s/Ø/Ø/g;
$string =~ s/Ù/Ù/g;
$string =~ s/Ú/Ú/g;
$string =~ s/Û/Û/g;
$string =~ s/Ü/Ü/g;
$string =~ s/Ý/Ý/g;
$string =~ s/Þ/Þ/g;
$string =~ s/ß/ß/g;
$string =~ s/à/à/g;
$string =~ s/á/á/g;
$string =~ s/â/â/g;
$string =~ s/ã/ã/g;
$string =~ s/ä/ä/g;
$string =~ s/å/å/g;
$string =~ s/æ/æ/g;
$string =~ s/ç/ç/g;
$string =~ s/è/è/g;
$string =~ s/é/é/g;
$string =~ s/ê/ê/g;
$string =~ s/ë/ë/g;
$string =~ s/ì/ì/g;
$string =~ s/í/í/g;
$string =~ s/î/î/g;
$string =~ s/ï/ï/g;
$string =~ s/ð/ð/g;
$string =~ s/ñ/ñ/g;
$string =~ s/ò/ò/g;
$string =~ s/ó/ó/g;
$string =~ s/ô/ô/g;
$string =~ s/õ/õ/g;
$string =~ s/ö/ö/g;
$string =~ s/÷/÷/g;
$string =~ s/ø/ø/g;
$string =~ s/ù/ù/g;
$string =~ s/ú/ú/g;
$string =~ s/û/û/g;
$string =~ s/ü/ü/g;
$string =~ s/ý/ý/g;
$string =~ s/þ/þ/g;
$string =~ s/ÿ/ÿ/g;
local($ch);
while ($string =~ /([^;]*);/) {
if ($1 == 38){
$ch = "&";
}else{
$ch=sprintf("%c", $1);}
$string =~ s/[^;]*;/$ch/g;
}
# do amperstand conversion last !!!
$string =~ s/&/&/g;
$string;}
@HTML_elements = (
"A",
"ADDRESS",
"B",
"BASE",
"BLINK", # Netscape addition :-(
"BLOCKQUOTE",
"BODY",
"BR",
"CITE",
"CENTER", # Netscape addition :-(
"CODE",
"DD",
"DIR",
"DIV",
"DFN",
"DL",
"DT",
"EM",
"FORM",
"H1", "H2", "H3", "H4", "H5", "H6",
"HEAD",
"HR",
"HTML",
"I",
"ISINDEX",
"IMG",
"INPUT",
"KBD",
"LI",
"LINK",
"MENU",
"META",
"NEXTID",
"OL",
"OPTION",
"P",
"PRE",
"SAMP",
"SELECT",
"STRIKE",
"STRONG",
"TABLE",
"TD",
"TH",
"TR",
"TITLE",
"TEXTAREA",
"TT",
"U",
"UL",
"VAR",
);
sub define_element {
local ($element) = @_;
$Begin{$element} = "Noop";
$End{$element} = "Noop";}
foreach $element (@HTML_elements) {&define_element($element);}
# do nothing
sub Noop {
local ($element, $xxx) = @_;}
# called when a tag begins. Dispatches using Begin
sub html_begin {
local ($element, $tag, *attributes) = @_;
local ($routine) = $Begin{$element};
if ($routine eq "") {
print STDERR "Unknown HTML element $element ($tag) on line $line_count\n";}
else {eval "&$routine;"}}
# called when a tag ends. Explicit is 0 if tag end is because of minimization
# not that you should care.
sub html_end {
local ($element, $explicit) = @_;
local ($routine) = $End{$element};
if ($routine eq "") {
print STDERR "Unknown HTML element \"$element\" (END $explicit) on line $line_count\n";}
else {eval "&$routine(\"$element\", $explicit)";}}
sub html_content {
local ($word) = @_;
}
sub html_whitespace {
local ($whitespace) = @_;}
sub html_comment {
local ($tag) = @_;}
# redefine these for application-specific initialization and cleanup
sub html_begin_doc {}
sub html_end_doc {
print "\n";
}
# return a "true value" when loaded by perl.
1;
rfc.pl 100444 12317 144 5016 6572617254 10710 0 ustar jdavis parc # Routines for HTML handling of an RFC
# load these after loading html-to-ascii.pl because they redefine some things.
# Gosh, it sure would be nice to have an object oriented language for this,
# so I didn't have to duplicate code in both files.
# Jim Davis, July 15 1994
# 3 Aug 94 changed META tag handling.
$lines_per_page = 58;
$columns_per_line = 72;
$left_margin = 3;
# Need this info to generate header lines.
$author = "(no author)";
$status = "Internet Draft";
$title = "(no title)";
$date = "(no date)";
# The values are read from META elements in the HEAD, e.g.
#
#
#
# number of blank lines after header, before text.
$top_skip = 2;
# blank lines before footer
$bottom_skip = 2;
$bottom_margin = $lines_per_page - $bottom_skip - 1 ;
$End{"HEAD"} = "end_head";
sub end_head {
local ($element) = @_;
&set_header_variables_from_meta_tags();
$ignore_text = 0;}
sub set_header_variables_from_meta_tags {
$author = $Variable{"author"};
$status = $Variable{"status"};
$title = $Variable{"title"};
$date = $Variable{"date"};}
# headers are flush left
sub begin_header {
local ($element, $tag) = @_;
$left_margin = 0;
&skip_n_lines ($Skip_Before{$element}, 5);}
sub end_header {
local ($element) = @_;
$left_margin = 3;
&skip_n_lines ($Skip_After{$element});}
# evil kludge
$first_pre = 1;
sub begin_pre {
local ($element, $tag) = @_;
if ($first_pre) {
$left_margin = 0;
$first_pre = 0;}
$whitespace_significant = 1;}
sub end_pre {
local ($element) = @_;
$left_margin = 3;
$whitespace_significant = 0;}
# Called by tformat
sub do_header {
local ($save_left) = $left_margin;
local ($save_right) = $right_margin;
$left_margin = 1; $right_margin = $columns_per_line;
&print_lcr_line ($status, $title, $date);
$left_margin = $save_left; $right_margin = $save_right;
&print_blank_lines ($top_skip);}
sub do_footer {
&print_blank_lines ($bottom_skip);
local ($save_left) = $left_margin;
local ($save_right) = $right_margin;
$left_margin = 1; $right_margin = $columns_per_line;
&print_lcr_line ($author, "", "[Page $page]");
$left_margin = $save_left; $right_margin = $save_right;
print "\014\n";
$page++;}
$End{"BODY"} = "end_document";
sub end_document {
local ($element) = @_;
# might not want to fill the last page
$fill_page_length = $flush_last_page;
&finish_page ();}
1;
tformat.pl 100444 12317 144 6731 6241706116 11604 0 ustar jdavis parc # Simple text formatter
# Jim Davis 17 July 94
# modified 11 Nov 96 to optionally add an extra space at the end of
# a sentence.
# current page, line, and column numbers.
$page = 1;
$line = 1;
$column = 1;
$left_margin = 1;
$right_margin = 72;
# lines on page before footer. or 0 if no limit.
$bottom_margin = 58;
# add newlines to make page be full length?
$fill_page_length = 1;
# should I print an extra space after the end of a sentence?
$add_space_after_sentencep = 0;
sub print_word_wrap {
local ($word) = @_;
if (($column + ($whitespace_significant ? 0 : 1)
+ length ($word) ) > ($right_margin + 1)) {
&fresh_line();}
if ($column > $left_margin && !$whitespace_significant) {
print " ";
$column++;}
print $word;
$column += length ($word);
if ($add_space_after_sentencep) {
$lastchar = substr($word, -1, 1);
if ($lastchar eq "." || $lastchar eq "?") {
print " "; # might exceed margin by 1, oh well
++$column;
}}
}
sub print_whitespace {
local ($char) = @_;
if ($char eq " ") {
$column++;
print " ";}
elsif ($char eq "\t") {
&get_to_column (&tab_column($column));}
elsif ($char eq "\n") {
&new_line();}
else {
die "Unknown whitespace character \"$char\"\nStopped";}
}
sub tab_column {
local ($c) = @_;
(int (($c-1) / 8) + 1) * 8 + 1;}
sub fresh_line {
if ($column > $left_margin) {&new_line();}
while ($column < $left_margin) {
print " "; $column++;}}
sub finish_page {
# Add extra newlines to finish page.
# You might not want to do this on the last page.
if ($fill_page_length) {
while ($line < $bottom_margin) {&cr();}}
&do_footer ();
$line = 1; $column = 1;}
sub start_page {
if ($page != 1) {
&do_header ();}}
sub print_n_chars {
local ($n, $char) = @_;
local ($i);
for ($i = 1; $i <= $n; $i++) {print $char;}
$column += $n;}
# need one NL to end current line, and then N to get N blank lines.
sub skip_n_lines {
local ($n, $room_left) = @_;
if ($bottom_margin > 0 && $line + $room_left >= $bottom_margin) {
&finish_page();
&start_page();}
else {
local ($i);
for ($i = 0; $i <= $n; $i++) {&new_line();}}}
sub new_line {
if ($bottom_margin > 0 && $line >= $bottom_margin) {
&finish_page();
&start_page();}
else {&cr();}
&print_n_chars ($left_margin - 1, " ");}
# used in footer and header where we don't respect the bottom margin.
sub print_blank_lines {
local ($n) = @_;
local ($i);
for ($i = 0; $i < $n; $i++) {&cr();}}
sub cr {
print "\n";
$line++;
$column = 1;}
# left, center, and right tabbed items
sub print_lcr_line {
local ($left, $center, $right) = @_;
&print_tab_left (1, $left);
&print_tab_center (($right_margin - $left_margin) / 2, $center);
&print_tab_right ($right_margin, $right);
&cr();}
sub print_tab_left {
local ($tab_column, $string) = @_;
&get_to_column ($tab_column);
print $string; $column += length ($string);
}
sub print_tab_center {
local ($tab_column, $string) = @_;
&get_to_column ($tab_column - (length($string) / 2));
print $string; $column += length ($string);
}
sub print_tab_right {
local ($tab_column, $string) = @_;
&get_to_column ($tab_column - length($string));
print $string;
$column += length ($string);
}
sub get_to_column {
local ($goal_column) = @_;
if ($column > $goal_column) {print " "; $column++;}
else {
while ($column < $goal_column) {
print " "; $column++;}}}
1;
html-parser.html 100444 12317 144 6260 6572621564 12726 0 ustar jdavis parc
Cheap HTML parser in perl
Cheap HTML parser
Jim Davis
davis@dri.cornell.edu
July 1994
Aug 1998
This is code for doing simple processing on HTML. I know there are
bugs and limitations in the code, but it suffices for simple purposes.
Among the limitations: This is an HTML parser, not an SGML parser - it
does not accept a DTD, rather the model of HTML is built into the
code. Also it does not validate the HTML - it will attempt to parse
invalid documents, and the results are undefined if the document is in
error.
It runs under perl 4.0 patch level 36. I don't
know about other versions of perl.
This directory contains:
- parse-html.pl
- A simple HTML parser written in perl. As it parses the HTML, it calls
routines (which you may redefine) for each tag encountered, and for whitespace and content.
You can redefine these routines so as to process the HTML document.
- html-to-ascii.pl
- Uses the HTML parser to generate a plain ASCII version of an HTML document.
- html-ascii.pl
- The actual routines to generate the ASCII.
- tformat.pl
- A lowlevel text formatter used for generating ASCII. More or less like
a subset of nroff
- html-to-rfc.pl
- Uses the HTML parser to generate a plain ASCII version of an HTML, with
special formatting requirements for Internet drafts and RFCs
- rfc.pl
- Additional routines required for RFC formatting (e.g. page headers and footers)
Generating RFCs from HTML
The RFC format requires there be a header and footer containing, among
other things, the name of the authors, a short title, and so on. You
specify values for these fields with META tags as shown
by the following example.
<META name="status" content="Internet Draft">
<META name="title" content="Internet audio protocol">
<META name="date" content="July 1983">
<META name="author" content="Nixon, Haldeman">
(The META tag is not officially part of HTML, it was proposed by Roy
Fielding.) The tags should be in the HEAD.
Although I don't see that it's required, it seems to be the custom that RFCs have a left margin of three characters, so this code does that too.
Since table processing doesn't work, I suggest you use the PRE tag
for the title page. As a special hack, the very first PRE tag is
not indented.
Known bugs