#!/usr/bin/perl
#
# This tool performs a quick syntax check of the two main types of
# input files that ucampas reads: *-b.html and uconfig.txt files.
# To main purpose of this tool is to be used in the pre-commit hook script
# of a version-control system, such that syntax error get caught before
# they reach the repository, to avoid users without a local ucampas
# installation polluting the revision history with their attempts
# to fix syntax errors.
#
# For this reason, this tool reads files to be checked from standard
# input and processes one one file at a time, without accessing the file
# system.
#
# Note that this is not a full HTML validator. It only spots syntax errors
# that ucampas would complain about.

# this software requires Perl 5.14 or newer
use 5.014;  # implies use strict;

use FindBin qw($RealBin);     # find directory where this file is located ...
use lib $RealBin, "$RealBin/perl-PlexTree";     # ... and add it to @INC
use NavTree;
use PlexTree;
use PlexTree::SGML;
use Encode qw(decode encode);
use Encode::Byte;

sub usage {
    print STDERR <<EOT;
Usage:

  $0 file-b.html ...
  cat file-b.html | $0 -s file-b.html

Options:

  -s filename    Read file content from standard input and use provided
                 filename in error messages and to identify file type
                 (*.html or uconfig.txt)

EOT
    exit 1;
}

sub check_html {
    my ($fn, $lines) = @_;
    my $src = text(join('', @{$lines}));
    my %id;

    eval {
	my $arg = PlexTree->new();
	$arg->append(text(join('', @{$lines})));
	$arg->setatt(public_id => '-//W3C//DTD HTML 4.01//EN');
	PlexTree::SGML::sgmldec(undef, undef, $arg, 's',
				{ attribute_added => sub {
				    my ($elem, $name, $value) = @_;
				    if ($name eq 'id') {
					# check for duplicate ID attributes
					if (exists $id{$value}) {
					    my $e1 = $elem->str;
					    my $e0 = $id{$value}->str;
					    die("duplicate id='$value' ",
						"in <$e0> and <$e1>, ",
						"must be unique\n");
					}
					$id{$value} = $elem;
				    }
				  }
				} );
    } or
      die PlexTree::print_error($@, $fn);
}

sub check_uconfig {
    my ($fn, $lines) = @_;
    my $uconfig;

    eval { $uconfig = c(join('', '(', @{$lines}, ' )')); } or
	die(PlexTree::print_error($@, $fn));
}

# Check an array @{$lines} of all lines (binary strings)
# loaded from UTF-8 plaintext file $fn and die with a
# detailed error message if non-UTF-8 characters are encountered.
sub check_utf8 {
    my ($lines, $fn) = @_;
    my $ln = 0;
    my (@error, @error_fixed);

    foreach my $line (@{$lines}) {
	$ln++;
	my $b = $line;
	my $u = decode('UTF-8', $b, Encode::FB_QUIET);
	if (length($b)) {
	    push @error, "$fn:$ln:".(length($u)+1).":Invalid UTF-8 sequence\n";
	    # show malformed UTF-8 sequences as replacement character
	    push @error, encode('UTF-8', decode('UTF-8', $line));
	    # Check if it might have been Windows CP-1252
	    eval {
		$u = decode('cp1252', $line, Encode::FB_CROAK);
	    };
	    push @error_fixed, encode('UTF-8', $u) unless $@;
	    last if @error > 18;
	}
    }
    if (@error) {
	push @error, <<'EOT';

The input contained characters that were not encoded using the
expected Unicode UTF-8 method. Often these are CP1252 characters from
Microsoft Office documents. On Windows you could fix them with the
Notepad++ function “Convert to UTF-8 without BOM”.
Common culprits: ‘’“”•€£°±²³µäè
EOT
	push @error, <<'EOT', @error_fixed if @error_fixed;

After CP-1252 to UTF-8 conversion, these lines would read:

EOT
	die @error;
    }
}

# Check the file content provided in the already opened file handle $fh.
# Refer to the filename $fn in error messages and determine the syntax
# to be checked based on the filename, which must match either *.html or
# uconfig.txt.
sub check_file {
    my ($fh, $fn) = @_;

    # read input file completely and check for valid UTF-8
    my @lines;
    while (<$fh>) {
	push @lines, $_;
    }
    check_utf8(\@lines, $fn);

    if ($fn =~ /\.html?$/i) {
	check_html($fn, \@lines);
    } elsif ($fn =~ /(?:^|\/)uconfig\.txt$/) {
	check_uconfig($fn, \@lines);
    } else {
	die("$fn: unsupported file type, must be *.html or uconfig.txt");
    }
}

my $parse_options = 1;

usage() unless @ARGV;

while (@ARGV) {
    $_ = shift @ARGV;
    if ($parse_options && /^-/) {
	if ($_ eq '-s') {
	    # read file content from standard input
	    my $fn = shift @ARGV;
	    usage() unless $fn;
	    my $fh;
	    open($fh, '<-')
		|| die("Cannot read standard input: $!\n");;
	    check_file($fh, $fn);
	    close($fh);
	} elsif ($_ eq '--') {
	    $parse_options = 0;
	} else {
	    usage();
	}
    } else {
	my $fh;
	open($fh, '<', $_) ||
	    die("Cannot read file '$_': $!\n");
	check_file($fh, $_);
	close($fh);
    }
}
