=head1 NAME

PlexTree::SGML - convert HTML/XML files into compounds and back

=head1 SUMMARY

This plugin registers two augmentation and substitution filters
sgmldec and sgmlenc for converting SGML documents into a compound
representation and back.

There is also an (unfinished) subroutine html_cleanup() for cleaning
up HTML documents represented as compounds.

=head1 DETAILS

An SGML (also XML or HTML) file can be represented as a PlexTree
according to the following rules:

An SGML element is represented by a meta string, where the string
is the name of the element.

Attributes of that SGML element are represented as dictionary entries,
with attribute names as text-string keys, and attribute values stored
as the corresponding text-string values.

Child strings and elements of an SGML element are recursively
represented as list child nodes, i.e. child strings with text tag
represent textual content and child-strings with meta tag represent
SGML elements.

Each SGML processing instruction of the form <?name content?> is
represented as a hyper string "name", with a single text-string list
element "content".

SGML entities, in particular those that cannot be converted into
UTF-8, can be represented as super strings (which do not inlude the &
and ; delimiters).

ASP inserts of the form <%content%> can be represented as para
strings.

If the SGML encoder encounters a trans string, it will be inserted
into the generated SGML file as is. The decoder preserves some
non-SGML format extensions, e.g. Internet Explorer conditional-comment
markers, this way.

TODO: finish documentation

=cut

package PlexTree::SGML;

use 5.014;    # so local $@ doesn't clobber die()
use strict;
no locale;
use Carp;
use PlexTree;

require Exporter;
our @ISA = qw(Exporter);
our @EXPORT_OK = qw(sgmlenc sgmldec html_cleanup);

sub DEBUG () { 0 }   # change to 1 to activate debugging output

# Load table of HTML character entities from
# https://html.spec.whatwg.org/multipage/named-characters.html
# https://html.spec.whatwg.org/entities.json

require PlexTree::entities;

# Convert UTF-8 strings into UTF-8 encoded SGML strings, that is replace
# SGML meta-characters &<> and all 7-bit ASCII control characters
# with equivalent character entities. Optionally replace even all
# non-ASCII UTF-8 sequences with numerical character references (NCRs).
sub utf8_to_sgml {
    use Encode;
    my ($s, $opt) = @_;
    $s =~ s/&/&amp;/g;
    $s =~ s/</&lt;/g;
    $s =~ s/>/&gt;/g;
    $s =~ s/([\x00-\x08\x0b\x0c\x0e-\x1f\x7f])/sprintf("&#%d;", ord($1))/ge;
    if ($opt->{'ncr'}) {
	no bytes;
	my $u;
	$s =~ s/([\xc0-\xdf][\x80-\xbf]|[\xe0-\xef][\x80-\xbf]{2}|[\xf0-\xf7][\x80-\xbf]{3})/$u=$1,Encode::_utf8_on($u),sprintf("&#%d;", ord($u))/ge;
    }
    return $s;
}

sub charref_to_utf8($) {
    my ($s) = @_;

    if ($s =~ /^\#([0-9]+)$/) {
	# decimal numeric character reference
	return pack("U", $1);
    } elsif ($s =~ /^\#[x]([0-9a-fA-F]+)$/) {
	# hexadecimal numeric character reference
	return pack("U", hex($1));
    } else {
	if (exists $PlexTree::SGML::entities{$s}) {
	    return $PlexTree::SGML::entities{$s};
	} else {
	    die("Unknown SGML/HTML entity reference '$s'\n");
	}
    }
}

sub sgml_to_utf8 {
    my ($s) = @_;
    my @r = ();

    while ($s !~ /\G\z/) {
	if ($s =~ /\G([^&]+)/gc) {
	    push @r, $1;
	} elsif ($s =~ /\G&([^<>\'\";&\s]+);/gc) {
	    push @r, charref_to_utf8($1);
	} else {
	    $s =~ /\G(.)/gc;
	    push @r, $1;
	}
    }
    return join('', @r);
}

# SGML encoder

PlexTree::register_filter('sgmlenc', \&sgmlenc);

sub sgmlenc {
    my ($parent, $kref, $arg, $type) = @_;
    my $out = PlexTreeMem->new($parent, $kref);
    my $input = $type eq 'a' ? $parent : $arg->cl(0);
    die("Missing input argument in filter sgmlenc\n") unless $input;
    my $o;
    my %opt;
    my $public_id;
    my $dtd = '';

    # prepare options and preamble
    if ($input->cd(meta('xml'))) {
	$opt{'xml'} = 1;
    }
    if ($arg->cd('ncr')) {
	# output numerical character references instead of
	# non-ASCII UTF-8 sequences
	$opt{'ncr'} = 1;
    }
    if (($o = $input->cd(meta('prescript')))) {
	# handle processing instruction that needs to appear before
	# the DOCTYPE declaration (e.g., for PHP's ob_start())
	$dtd .= join('', render($o, \%opt));
    }
    if ($opt{'xml'} && !$arg->cd('noxmldecl')) {
	$dtd .= "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
    }
    my $doctype = $input->cd('*doctype');
    my $doctype_name = $doctype->str;
    # output DOCTYPE declaration if available
    if ($doctype) {
	$public_id = $doctype->get('public');
	$dtd .= "<!DOCTYPE $doctype_name";
	if (defined $public_id) {
	    $dtd .= " PUBLIC \"$public_id\"";
	} else {
	    $dtd .= " SYSTEM" if $doctype->cd('system');
	}
        my $system_id = $doctype->get('system');
        $dtd .= " \"$system_id\"" if defined $system_id;
	$dtd .= ">\n";
	# If we output HTML or XHTML, special care is needed to
	# deal with implicit #CDATA in <script> and <style>
	$opt{'html'}  = $public_id =~ m'^-//W3C//DTD HTML ' ||
            (!defined $public_id && lc($doctype_name) eq 'html');
	$opt{'xhtml'} = $public_id =~ m'^-//W3C//DTD XHTML ';
    }
    # output document comment
    # (any comment that we want to add at the start of the file must
    # come after the DOCTYPE declaration, because any comment before
    # forces Internet Explorer into quirks mode)
    my $comment = $arg->get('comment');
    if (defined $comment) {
	$comment =~ s/\\/\\\\/g;     # escape backslash
	$comment =~ s/(?<=-)-/\\-/g; # escape -- by inserting backslash
	$dtd .= "<!-- $comment -->\n";
    }

    # read in doctype-related data
    my $doctype_arg = $arg->cd('doctype');
    if (!$doctype_arg) {
        if (defined $public_id) {
            $doctype_arg = $PlexTree::SGML::default_doctypes->cd(text($public_id));
        } elsif (lc($doctype_name) eq 'html') {
            # render HTML5 like HTML 4.01
            $doctype_arg = $PlexTree::SGML::default_doctypes->cd(
                text "-//W3C//DTD HTML 4.01//EN");
        }
        #print $doctype_arg->print if DEBUG();
    }
    if ($doctype_arg) {
	$o = $doctype_arg->cd('empty');
	map { $opt{'empty'}->{$_->str}=1 } grep { $_->tag == TEXT } $o->keys
	    if $o;
    }

    my @s = render($input, \%opt);

    $out->setstr(join('', $dtd, @s));

    return $out;
}

sub render {
    my ($c, $opt) = @_;
    my @s = ();

    if ($c->tag == TEXT) {
	# plain text
	push @s, utf8_to_sgml($c->str, $opt);
    } elsif ($c->tag == META) {
	# element
	# start tag
	my $element = $c->str;
	push @s, '<' . $element;
	# attributes
	foreach my $d ($c->keys_ordered) {
	    next unless $d->tag == TEXT;
	    my $name = $d->str;
	    my $value = $c->cd($d);
	    if ($value->tag == TEXT) {
		push @s, " $name=";
		if ($value->isempty) {
		    $value = $name;
		} else {
		    $value = $value->str;
		}
		$value = utf8_to_sgml($value, $opt);
		if ($value =~ /\'/ >= $value =~ /\"/) {
		    $value =~ s/\"/&quot;/g;
		    push @s, '"' . $value . '"';
		} else {
		    $value =~ s/\'/&apos;/g;
		    push @s, "'" . $value . "'";
		}
	    } elsif ($value->isempty) {
		push @s, " $name";
	    } elsif ($value->tag == HYPER) {
		# SGML/HTML/XML do not permit processing instructions
		# in attribute values, but PHP programmers love them there,
		# so we need to extend our notion of SGML a bit
		push @s, " $name=\"<?", $value->str;
		push @s, ' ' unless $value->str =~ /^=?$/;
		push @s, $value->getl(0), "?>\"";
	    } elsif ($value->tag == PARA) {
		# ASP insert
		push @s, " $name=\"<%", $value->str, "%>\"";
	    } else {
		# $value->debug(undef, {'maxdepth'=>3});
		# print $c->print;
		die("Unexpected tag value " . $value->tag .
		    " encountered in SGML tree in value of attribute ".
		    "'$name' of element '$element'\n");
	    }
	}
	if ($c->listlen) {
	    push @s, '>';
	    # element content
	    foreach my $d ($c->list) {
		if ($d->tag == TEXT) {
		    # plain text
		    my $text = $d->str;
		    my $pcdata = utf8_to_sgml($text, $opt);
		    warn("SGML plaintext has unexpected list elements\n")
			if $d->listlen;
		    # Elements <script> or <style> contain #CDATA in HTML,
		    # but #PCDATA in XHTML, therefore careful consideration
		    # is needed whether to escape &<> with character entities
		    # or not.
		    if (($element eq 'script' || $element eq 'style') &&
			$pcdata ne $text) {
			if ($opt->{'html'}) {
			    # In HTML, <script> and <style> contain #CDATA,
			    # therefore bypass utf8_to_sgml() here.
			    warn("<$element> content contains $1 end tag\n")
				if $text =~ /(<\/$element>)/i;
			    push @s, $text;
			} elsif ($opt->{'xhtml'}) {
			    # In XHTML, <script> and <style> contain #PCDATA,
			    # but for compatibility with HTML we use
			    # CDATA marked sections and bypass utf8_to_sgml().
			    # However those marked sections are not part of
			    # HTML, and therefore they can only be used
			    # if we know how to uncomment them in the language
			    # used in the content (e.g., JavaScript, CSS),
			    # which we look up from the type attribute.
			    # XHTML is quite messy here!
			    my $type = $c->get('type');
			    if (($element eq 'script' &&
                                 (!defined $type || $type eq '' ||
                                  # https://html.spec.whatwg.org/multipage/scripting.html#attr-script-type
                                  $type =~ m!^(?:text|application)/(?:x-)?(?:ecma|java|j|live)script(?:1\.\d)?\z!aai)) ||
                                ($element eq 'style' &&
                                 (!defined $type || $type eq '' || $type =~ m!^text/css\z!aai))) {
                                push @s, ("/*<![CDATA[*/", $text,
					  "/*]]>*/");
			    } else {
				push @s, $pcdata;
				warn("Using character entity in XHTML ".
				     "element <$element> due to unknown ".
				     "type attribute value '$type'.\n");
			    }
			} else {
			    # not HTML or XHTML
			    push @s, $pcdata;
			}
		    } else {
			# not <script>/<style> or no character entities needed
			push @s, $pcdata;
		    }
		} else {
		    push @s, render($d, $opt);
		}
	    }
	    push @s, '</' . $element . '>'; # end tag
	} else {
	    # empty content
	    if ($opt->{'empty'}->{$c->str}) {
		# no end-tag required thanks to DTD
		push @s, ($opt->{'xml'} ? ' />': '>');
	    } else {
		# add end tag
		push @s, '></' . $c->str . '>';
	    }
	}
    } elsif ($c->tag == HYPER) {
	# processing instruction
	push @s, '<?' . $c->str;
	push @s, ' ' unless $c->str =~ /^=?$/;
	push @s, $c->getl(0) .'?>';
    } elsif ($c->tag == SUPER) {
	# named entity
	push @s, '&' . $c->str .';';
    } elsif ($c->tag == PARA) {
	# ASP insert
	push @s, '<%' . $c->str .'%>';
    } elsif ($c->tag == TRANS) {
	# raw SGML
	push @s, $c->str;
    } elsif ($c->tag == CTRL && $c->str eq '' && $c->dirlen == 0) {
	# empty control string can be used to group a list of elements
	# together in a way that will not be visible in the SGML file
	foreach my $d ($c->list) {
	    push @s, render($d, $opt);
	}
    } else {
	# $c->debug(undef, {'maxdepth'=>2});
	die("Unexpected tag value " . $c->tag . " encountered in SGML tree\n");
    }

    return @s;
}

# SGML decoder

PlexTree::register_filter('sgmldec', \&sgmldec);

sub fixcase($$) {
    my ($fixcase, $s) = @_;
    if (defined $fixcase) {
	$s = uc($s) if $fixcase eq 'A';
	$s = lc($s) if $fixcase eq 'a';
    }
    return $s;
}

# This function can be called according to the conventions of a
# PlexTree augmentation or substitution filter.
#
# If called directly by an application, the caller may optionally
# provide additional parameters via the hash reference $param:
#
# $param->{attribute_added}
#            The function &{$param->{attribute_added}}($elem, $name, $value)
#            is called each time after an attribute was added to an element.
#            Call-back parameters:
#              $elem    Compound cursor of the current element
#              $name    Name of the new attribute (string)
#              $value   Value of the new (string)
sub sgmldec {
    my ($parent, $kref, $arg, $type, $param) = @_;
    my $tree = PlexTreeMem->new($parent, $kref);
    my $out = $tree; # our output cursor
    my $input = $type eq 'a' ? $parent : $arg->cl(0);
    die("Missing input argument in filter sgmldec\n") unless $input;
    my $s = $input->str;  # string to be parsed (read-only, do not modify)
    $param = {} unless defined $param;
    my $o;
    my $fixcase = 'a';
    my $public_id;
    my $system_id;
    my $doctype_name;
    my $xml = 0;

    # skip any UTF-8 BOM (0xef 0xbb 0xbf = U+FEFF) at start of file
    # (some Windows tools annoyingly insist on adding one, e.g. Notepad)
    $s =~ /\G\x{ef}\x{bb}\x{bf}/gc;

    # parse preamble
    while ($s !~ /\G\z/) {
	if ($s =~ /\G\s*<\?xml\s+.*?\??>\r?\n?/gc) {
	    # preserve XML declaration
	    $xml = 1;
	    my $xmldecl = $tree->addkey(meta('xml'));
	} elsif ($s =~ /\G<\?(?:([[:alpha:]_:][[:alnum:]\._:-]*)\s|(=|))(.*?)\?>/gcs) {
	    # other processing instruction
	    my $pitarget = $1 || $2;
	    my $pidata = $3;
	    # represent PI as hyper string containing the PI target,
	    # and append a text string with the PI data
	    $tree->addkey(meta 'prescript',
			  (hyper $pitarget)->append(text $pidata));
	} elsif ($s =~ /\G\s*<!DOCTYPE\s+([[:alpha:]_:][[:alnum:]\._:-]*)
                        (?:\s+(?:SYSTEM|
                                 (?:PUBLIC\s+(?:\"([^\"]*)\"|\'([^\']*)\')))
                              (?:\s+(?:\"([^\"]*)\"|\'([^\']*)\'))?
                        )?\s*>\s*/gcxi) {
	    # preserve public DOCTYPE declaration
	    $doctype_name = $1;
	    $public_id = $2 || $3;
	    $system_id = $4 || $5;
	    die("Second DOCTYPE declaration encountered")
		if $tree->cd(meta 'doctype');
	    my $doctype = $tree->addkey(meta 'doctype');
	    $doctype_name = fixcase($fixcase, $doctype_name);
	    $doctype->setstr($doctype_name);
	    if (defined $public_id) {
		$public_id =~ s/\s+/ /g;
		$public_id =~ s/^\s*(.*?)\s*$/$1/g;
		$doctype->addkey('public')->setstr($public_id);
	    }
	    $doctype->addkey('system')->setstr($system_id)
		if defined $system_id;
	} elsif ($s =~ /\G<!(\"[^\"]*\"|\'[^\']*\'|[^\'\">]*)*>\s*/gc) {
	    # skip other declarations (comments, etc.)
	} else {
	    last;
	}
    }

    # read in doctype-related options
    my $doctype = $arg->cd('doctype');
    $public_id = $arg->get('public_id') unless defined $public_id;
    if (!$doctype) {
        if (defined $public_id) {
            $doctype = $PlexTree::SGML::default_doctypes->cd(text $public_id);
            #print $doctype->print;
        } elsif (lc($doctype_name) eq 'html') {
            # parse HTML5 like HTML 4.01
            $doctype = $PlexTree::SGML::default_doctypes->cd(
                text "-//W3C//DTD HTML 4.01//EN");
        }
    }
    my %empty;
    my %nest;
    my %nestentry;
    my %cdata;
    my %insert;
    if ($doctype) {
	$o = $doctype->cd('empty');
	map { $empty{$_->str}=1 } grep { $_->tag == TEXT } $o->keys
	    if $o;
	if ($o = $doctype->cd('nest')) {
	    # iterate over entries in the nest set
	    foreach my $p ($o->keys) {
		my $v = $o->cd($p);
		# iterate over key elements -> parent elements
		foreach my $a ($p->keys) {
		    next unless $a->tag == TEXT;
		    $nestentry{$a->str} = 1;
		    # iterate over value elements -> their possible child el'ts
		    foreach my $b ($v->keys) {
			next unless $b->tag == TEXT;
			$nest{$a->str . '>' . $b->str} = 1;
		    }
		}
	    }
	}
	if ($o = $doctype->cd('insert')) {
	    # iterate over entries in the insert set
	    foreach my $triggers ($o->keys) {
		next unless $triggers->len == 0; # skip comments
		my $actions = $o->cd($triggers);
		# iterate over trigger elements
		foreach my $trigger ($triggers->keys) {
		    next unless $trigger->tag == TEXT;
		    foreach my $parent ($actions->keys) {
			next if $parent->tag == CTRL;
			my $toinsert = $actions->cd($parent);
			my $p = $parent->str;
			$p = "[$p]" if $parent->tag == HYPER;
			$insert{$p . '>' . $trigger->str} =
			    $toinsert->str;
		    }
		}
	    }
	}
	if ($o = $doctype->cd('cdata')) {
	    map { $cdata{$_->str}=1 } grep { $_->tag == TEXT } $o->keys;
	}
	if (defined($o = $doctype->get('fixcase'))) {
	    $fixcase = $o;
	}
    }

    # parser
    my $cdata_content = 0;
    local $@;
    eval {
	my $first = 1;
	while ($s !~ /\G\z/) {
	    if ($s =~ /\G&([^<>\'\";&\s]+);/gc) {
		# character entity
		my $charref = $1;
		die("Unexpected character entity '$charref' " .
		    "before first SGML element tag\n")
		    if $first;
                eval {
                    my $s = charref_to_utf8($charref);
                    $out = $out->append0 if ($out->tag != TEXT);
                    $out->appstr($s);
                };
                if ($@) {
                    # preserve character references that
                    # could not be converted to UTF-8
                    if ($out->tag == TEXT) {
                        $out = $out->parent;
                    }
                    $out->append(super $charref);
                    die($@);
                }
	    } elsif ($s =~ /\G&/gc) {
		# & that is not initiating a character entity
		die("Unexpected '&' " .
		    "before first SGML element tag\n")
		    if $first;
		$out = $out->append0 if ($out->tag != TEXT);
		$out->appstr('&');
	    } elsif ($cdata_content ?
		     ($s =~ /\G((?:(?!<\/).)+)/gcs) :
		     ($s =~ /\G([^<&]+)/gc)) {
		# regular PCDATA or CDATA
		my $cdata = $1;
		print "CDATA$cdata_content: '$cdata'\n" if DEBUG();
		if ($first) {
		    $cdata =~ s/^\s+//;
		    die("Unexpected '$1' before first " .
			"SGML element tag\n")
			if $cdata =~ /^(.{1,10})/s;
		    next;
		}
		$out = $out->append0 if ($out->tag != TEXT);
		$out->appstr($cdata);
		$cdata_content = 0;
	    } elsif ($s =~ /\G<!\[\s*RCDATA\s*\[(.*?)\]\]>/gcs) {
		# RCDATA marked section
		my $rcdata = $1;
		print "RCDATA section: '$rcdata'\n" if DEBUG();
		die("Unexpected RCDATA section before first " .
		    "SGML element tag\n") if $first;
		$out = $out->append0 if ($out->tag != TEXT);
		$out->appstr(sgml_to_utf8($rcdata));
	    } elsif ($s =~ /\G<!\[\s*CDATA\s*\[(.*?)\]\]>/gcs) {
		# CDATA marked section
		my $cdata = $1;
		print "CDATA section: '$cdata'\n" if DEBUG();
		die("Unexpected CDATA section before first " .
		    "SGML element tag\n") if $first;
		$out = $out->append0 if ($out->tag != TEXT);
		$out->appstr($cdata);
	    } elsif ($s =~ /\G<!\[\s*IGNORE\s*\[(.*?)\]\]>/gcs) {
		# ignore IGNORE marked section
	    } elsif ($s =~ /\G(<!--\[if [\w\s!()&|]+\]>)/gc ||
		     $s =~ /\G(<!\[endif\]-->)/gc) {
		# preserve Internet Explorer conditional-comment markers
		# http://msdn.microsoft.com/en-us/library/ms537512%28v=vs.85%29.aspx
		my $cc = $1;
		# close character data
		if ($out->tag == TEXT) {
		    $out = $out->parent;
		}
		$out->append(trans $cc);
	    } elsif ($s =~ /\G<!--(?:[^-]|-[^-])*-->/gc) {
		# skip comment
	    } elsif ($s =~ /\G<\?(?:([[:alpha:]_:][[:alnum:]\._:-]*)\s|(=|))(.*?)\?>/gcs) {
		# processing instruction
		my $pitarget = $1 || $2;
		my $pidata = $3;
		# close character data
		if ($out->tag == TEXT) {
		    $out = $out->parent;
		}
		# represent PI as hyper string containing the PI target,
		# and append a text string with the PI data
		$out->append0->set(hyper $pitarget)->append(text $pidata);
	    } elsif ($s =~ /\G<%(.*?)%>/gcs) {
		# <%...%> insert (used by Microsoft's Active Server Pages, ASP)
		my $data = $1;
		# close character data
		if ($out->tag == TEXT) {
		    $out = $out->parent;
		}
		# represent ASP insert as para string containing the ASP code
		$out->append0->set(para $data);
	    } elsif ($s =~ /\G(<([[:alpha:]_:][[:alnum:]\._:-]*)((?:\s+[[:alpha:]_:][[:alnum:]\._:-]*\s*(?:=\s*\"[^\"]*\"|=\s*\'[^\']*\'|=\s*[^\'\">\s\/]*||=\s*\"<\?.*?\?>\"|=\s*\"<%.*?%>\"|))*)\s*(\/?)>)/gc) {
		# element start tag
		my $starttag = $1;
		my $element = $2;
		my $att = $3;
		my $end = $4;
		$element = fixcase($fixcase, $element);
		# close character data
		if ($out->tag == TEXT) {
		    $out = $out->parent;
		}
		# should we close anything else first?
		while (exists $nestentry{$out->str} &&
		       !exists $nest{$out->str . '>' . $element}) {
		    print "AUTO-END ELEMENT " . $out->str . "\n" if DEBUG();
		    $out = $out->parent;
		    die("SGML element '$element' is not permitted here\n")
			unless $out && $out->tag == META;
		}
		# should we open anything first?
		my $parent = $out->parent;
		my $toinsert;
		while (1) {
		    if ($out->tag == META) {
			$parent = $out->str;
		    } else {
			$parent = '[ROOT]';
		    }
		    $toinsert = $insert{$parent.'>'.$element};
		    last unless defined $toinsert;
		    # append new element
		    my $new;
		    if ($first) {
			$new = $out;
			undef $first;
		    } else {
			$new = $out->append0;
		    }
		    print "AUTO-START ELEMENT $toinsert\n" if DEBUG();
		    $new->settstr(META, $toinsert);
		    $parent = $out;
		    $out = $new;
		}
		# append new element
		my $new;
		if ($first) {
		    $new = $out;
		    undef $first;
		} else {
		    $new = $out->append0;
		}
		$new->settstr(META, $element);
		$cdata_content = $cdata{$element};
		print "START ELEMENT $element\n" if DEBUG();
		# parse attributes
		if (defined $att) {
                    my @order;
		    while ($att !~ /\G\z/) {
			if ($att =~ /\G\s+([[:alpha:]_:][[:alnum:]\._:-]*)\s*=\s*\"<\?(?:([[:alpha:]_:][[:alnum:]\._:-]*)\s|(=|))(.*?)\?>\"/gc) {
			    my $name = $1;
			    my $pitarget = $2 || $3;
			    my $pidata = $4;
			    # represent PI as hyper string containing the PI target,
			    # and append a text string with the PI data
			    $new->addkey($name)->set(hyper $pitarget)->append(text $pidata);
                            push @order, $name;
			} elsif ($att =~ /\G\s+([[:alpha:]_:][[:alnum:]\._:-]*)\s*=\s*\"<%(.*?)%>\"/gc) {
			    my $name = $1;
			    my $data = $2;
			    # represent ASP insert as para string
			    $new->addkey($name)->set(para $data);
                            push @order, $name;
			} elsif ($att =~ /\G\s+([[:alpha:]_:][[:alnum:]\._:-]*)\s*=\s*\"([^\"]*)\"/gc ||
				 $att =~ /\G\s+([[:alpha:]_:][[:alnum:]\._:-]*)\s*=\s*\'([^\']*)\'/gc ||
				 $att =~ /\G\s+([[:alpha:]_:][[:alnum:]\._:-]*)\s*=\s*([a-zA-Z0-9\._:-]+)/gc) {
			    # normal attribute=value pair
			    my $name = $1;
			    my $value = $2;
			    $name = fixcase($fixcase, $name);
			    my $v = $new->addkey($name);
                            push @order, $name;
			    if (defined $value) {
				$v->setstr(sgml_to_utf8($value));
			    }
			    print "  ATTRIBUTE $name='$value'\n" if DEBUG();
                            &{$param->{attribute_added}}($new, $name, $value)
                                if $param->{attribute_added};
			} elsif ($att =~ /\G\s+([[:alpha:]_:][[:alnum:]\._:-]*)/gc) {
			    my $name = $1;
			    $name = fixcase($fixcase, $name);
			    $new->addkey($name);
                            push @order, $name;
			    print "  ATTRIBUTE $name\n" if DEBUG();
                            &{$param->{attribute_added}}($new, $name)
                                if $param->{attribute_added};
			} elsif ($att !~ /\G\s+/gc) {
			    die("Unexpected character sequence encountered in SGML tag attributes '$att'\n");
			}
		    }
                    # record order of attributes as well
                    $new->set_key_order(@order) if @order > 1;
		}
		# consider self-ending start tags
		if ($end eq '/' || exists $empty{$element}) {
		    # element is already finished here
		    $cdata_content = 0;
		    print "SELF-END ELEMENT $element\n" if DEBUG();
		} else {
		    $out = $new;
		}
	    } elsif ($s =~ /\G(<\/([[:alpha:]_:][[:alnum:]\._:-]*)\s*>)/gc) {
		# element end tag
		my $endtag = $1;
		my $element = $2;
		$element = fixcase($fixcase, $element);
		# close character data
		if ($out->tag == TEXT) {
		    $out = $out->parent;
		}
		$cdata_content = 0;
		# close elements until one is encountered that matches $element
		my $leaving;
		while (1) {
		    $leaving = $out;
		    $out = $out->parent;
		    last if $leaving->tag == META && $leaving->str eq $element;
		    print "AUTO-END ELEMENT ".$leaving->str."\n" if DEBUG();
		    die("SGML tag $endtag attempts to close an element ".
			"'$element' that was not open\n")
			unless ($out && $leaving->nid != $tree->nid)
		};
		print "END ELEMENT " . $leaving->str . "\n" if DEBUG();
		if ($leaving->nid == $tree->nid) {
		    # now only whitespace and comments may be left
		    $s =~ /\G\s*(?:<!--([^-]|-[^-])*-->\s*)*/gc;
		    die("Unexpected '$1' after end of top-level " .
			"SGML element '$element'\n")
			if $s =~ /\G(.{1,10})/s;
		}
	    } else {
		die("Unexpected character sequence encountered in SGML file\n");
	    }
	    print "PATH: " . join('/', map {$_->tag == META() &&
						$_->str} $out->path)
		. "\n" if DEBUG();
	}
    };
    if ($@) {
	# exception handling
	my $err = PlexTreeMem->new;
	chomp($@);
	$err->setstr($@);
	#$err->addkey('input_suffix')->setstr($s);
	#$err->addkey('input')->setstr($str->str);
	# determine line and column position of error
	my $char = pos $s;
	my @l = split(/\n/, $s);
	my $line = 0;
	while ($char > 0 && $char - (length($l[$line]) + 1) >= 0) {
	    $char -= length($l[$line]) + 1;
	    $line++;
	}
	$err->addkey('line')->setstr($l[$line]);
	$err->addkey('errrow')->setstr($line + 1);
	$err->addkey('errcol')->setstr($char + 1);
	die($err);
    }

    return $tree;
}

# The DTD used below differes in minor aspect from the W3C spec and is
# slightly more generous to allow for historic practice and some
# broken XHTML generators:
#
# - addition of Netscape's <embed> and <noembed> elements according to
#   http://www.yoyodesign.org/doc/dtd/html4-embed.html.en
#
# - in the XHTML 1.0 spec, <script> and <style> have actually PCDATA
#   content, not CDATA as in HTML and here, so we are a
#   bit more tolerant here, since too few people fully understand all
#   the restrictions of XHTML

our $default_doctypes = c(<<'EOT');
(
  "-//W3C//DTD HTML 4.01//EN"=(
    empty={
       .c='Elements with no content that close themselves',
       area, base, basefont, br, col, frame, hr, img, input, isindex,
       link, meta, param
    },
    insert={
       .c='Opening element A listed here as {A}(B=C) will cause C to be opened first if B is the most recently open element',
       {tr}={table=tbody},
       {.c='%flow;', p, h1, h2, h3, h4, h5, h6, ul, ol, pre, dl, div,
        noscript, blockquote, form, hr, table, fieldset,
        address, noembed, script, tt, i, b, u, s, strike, big, small,
        em, strong, dfn, code, samp, kbd, var, cite, abbr,
        acronym, a, img, applet, object, font, basefont, br,
        script, map, q, sub, sup, span, bdo, iframe, embed,
        input, select, textarea, label, button, +PCDATA
       }={html=body, +ROOT=html},
       {head, body, frameset}={ +ROOT=html },
       {title, base, script, style, meta, link, object}={html=head, +ROOT=html}
    },
    nest={
       .c='If an open element is listed here, then an attempt to open another element than the one listed here under it will close the open element first',
       {head}={title, base, script, style, meta, link, object},
       {title,script,style}=,
       {p,dt}={.c='%inline;', tt, i, b, u, s, strike, big, small,
        em, strong, dfn, code, samp, kbd, var, cite, abbr,
        acronym, a, img, applet, object, font, basefont, br,
        script, map, q, sub, sup, span, bdo, iframe, embed,
       input, select, textarea, label, button, +PCDATA},
       {li,th,td,dd}={.c='%flow;',
            p, h1, h2, h3, h4, h5, h6, ul, ol, pre, dl, div, center,
            noscript, noframes, blockquote, form, isindex, hr, table, fieldset,
            address, noembed, script, tt, i, b, u, s, strike, big, small,
            em, strong, dfn, code, samp, kbd, var, cite, abbr,
            acronym, a, img, applet, object, font, basefont, br,
            script, map, q, sub, sup, span, bdo, iframe, embed,
            input, select, textarea, label, button, +PCDATA},
       {option}={+PCDATA},
       {thead, tfoot, tbody}={tr},
       {colgroup}={col},
       {tr}={th,td}
    },
    cdata={script, style},
    fixcase='a'
  ),
  "-//W3C//DTD HTML 3.2 Final//EN"        =.sr("-//W3C//DTD HTML 4.01//EN"),
  "-//W3C//DTD HTML 4.0//EN"              =.sr("-//W3C//DTD HTML 4.01//EN"),
  "-//W3C//DTD HTML 4.0 Transitional//EN" =.sr("-//W3C//DTD HTML 4.01//EN"),
  "-//W3C//DTD HTML 4.01 Transitional//EN"=.sr("-//W3C//DTD HTML 4.01//EN"),
  "-//W3C//DTD XHTML 1.0 Strict//EN"      =.sr("-//W3C//DTD HTML 4.01//EN"),
  "-//W3C//DTD XHTML 1.0 Transitional//EN"=.sr("-//W3C//DTD HTML 4.01//EN")
)
EOT

sub html_cleanup {
    my ($h) = @_;

    return unless $h->tag == META;
    my $el = lc($h->str);
    # collapse consecutive strings
    for (my $i = 0; $i < $h->listlen - 1; $i++) {
	my $this = $h->cl($i);
	if ($this->tag == TEXT) {
	    my $next;
	    while (($next = $h->cl($i+1)) &&
		   $next->tag == TEXT) {
		$this->setstr($this->str . $next->str);
		$next->cut;
	    }
	}
    }
    if ($el =~ /^pre|script|style$/) {
	# do not touch any whitespace in preformatted and non-HTML text
	return;
    } elsif ($el =~ /^html|head|table|thead|tbody|tr|ul|ol|dl|select$/) {
	# any white-space in PCDATA children and empty PCDATA children
	# can be removed
	foreach my $c ($h->list) {
	    if ($c->tag == TEXT()) {
		my $s = $c->str;
		$s =~ s/^\s+//;
		$s =~ s/\s+$//;
		$s =~ s/\s+/ /;
		if ($s eq '') {
		    $c->cut;
		} else {
		    $c->setstr($s);
		}
	    } else {
		html_cleanup($c);
	    }
	}
    } elsif ($el =~ /^p|h[1-6]|div|center|blockquote|address|tt|i|b|u|s|strike|big|small|em|strong|dfn|code|samp|kbd|var|cite|abbr|acronym|a|font|sub|sup|span|$/) {
	# initial and final whitespace can be removed, intermediate
	# whitespace and strings can be collapsed
	my $c;
	# remove initial whitespace
	if (($c = $h->cl(0)) && $c->tag == TEXT) {
	    my $s = $c->str;
	    $s =~ s/^\s+//;
	    if ($s eq '') {
		$c->cut;
	    } else {
		$c->setstr($s);
	    }
	}
	# remove final whitespace
	if (($c = $h->cl($h->listlen - 1)) && $c->tag == TEXT) {
	    my $s = $c->str;
	    $s =~ s/\s+$//;
	    if ($s eq '') {
		$c->cut;
	    } else {
		$c->setstr($s);
	    }
	}
	# reduce intermediate whitespace
	foreach $c ($h->list) {
	    if ($c->tag == TEXT) {
		my $s = $c->str;
		$s =~ s/\s+/ /g;
		if ($s eq '') {
		    $c->cut;
		} else {
		    $c->setstr($s);
		}
	    }
	}
	# recurse
	foreach $c ($h->list) {
	    html_cleanup($c);
	}
    } else {
	# just recurse
	foreach my $c ($h->list) {
	    html_cleanup($c);
	}
    }
}

1;
