#!/usr/pkg/bin/perl
eval 'exec /usr/pkg/bin/perl -S $0 ${1+"$@"}'
    if $running_under_some_shell;
##
##  htmlfix -- Fixup HTML markup code
##  Copyright (c) 1997-2000 Ralf S. Engelschall, All Rights Reserved. 
##  Copyright (c) 2000 Denis Barbier
##

require 5.003;

BEGIN { $^W = 0; } # get rid of nasty warnings

use lib "/usr/pkg/lib/wml/perl/lib";
use lib "/usr/pkg/lib/wml/perl/lib/powerpc-netbsd-thread-multi";

use Getopt::Long 2.13;
use Image::Size;
use IO::File 1.06;

#
#   process command line
#
sub usage {
    print STDERR "Usage: htmlfix [options] [file]\n";
    print STDERR "\n";
    print STDERR "Options:\n";
    print STDERR "  -o, --outputfile=<file>  set output file instead of stdout\n";
    print STDERR "  -F, --fix=<fixes>        select which fix to apply\n";
    print STDERR "  -S, --skip=<fixes>       skip specified fixes\n";
    print STDERR "  -v, --verbose            verbose mode\n\n";
    print STDERR "Fixes are a comma separated list of (default is to process them all)\n";
    print STDERR "  imgalt : add ALT attributes to IMG tags\n";
    print STDERR "  imgsize: add WIDTH/HEIGHT attributes to IMG tags\n";
    print STDERR "  summary: add SUMMARY attribute to TABLE tags\n";
    print STDERR "  center : change proprietary CENTER tag to standard DIV tag\n";
    print STDERR "  space  : fix trailing spaces in tags\n";
    print STDERR "  quotes : add missing quotes for attributes and missing '#' character\n           to color attributes\n";
    print STDERR "  indent : indent paragraphs\n";
    print STDERR "  comment: out-comment tags\n";
    print STDERR "  tagcase: perform tag case-conversion\n";
    exit(1);
}
$opt_v = 0;
$opt_o = '-';
$opt_F = 'imgalt,imgsize,summary,center,space,quotes,indent,comment,tagcase';
$opt_S = '';
$Getopt::Long::bundling = 1;
$Getopt::Long::getopt_compat = 0;
if (not Getopt::Long::GetOptions(
    "v|verbose",
    "F|fix=s",
    "S|skip=s",
    "o|outputfile=s")) {
    &usage;
}

sub verbose {
    my ($str) = @_;
    if ($opt_v) {
        print STDERR "** HTMLfix:Verbose: $str\n";
    }
}
sub error {
    my ($str) = @_;
    print STDERR "** HTMLfix:Error: $str\n";
    exit(1);
}
sub warning {
    my ($str) = @_;
    if (not $opt_q) {
        print STDERR "** HTMLfix:Warning: $str\n";
    }
}

#
#   read input file
#
if (($#ARGV == 0 and $ARGV[0] eq '-') or $#ARGV == -1) {
    $in = new IO::Handle;
    $in->fdopen(fileno(STDIN), 'r') || error("cannot load STDIN: $!");
    local ($/) = undef;
    $buffer = <$in>;
    $in->close() || error("cannot close STDIN: $!");
}
elsif ($#ARGV == 0) {
    $in = new IO::File;
    $in->open($ARGV[0]) || error("cannot load $ARGV[0]: $!");
    local ($/) = undef;
    $buffer = <$in>;
    $in->close() || error("cannot close $ARGV[0]: $!");
}
else {
    &usage;
}

#
#   processing loop
#
$bytes = 0;

#
#   Definitions of fixups
#   Some attention has been paid for efficiency in regular expressions,
#   this is why they appear more complicated than needed.
#

#
#   FIXUP 1: add WIDTH/HEIGHT/ALT attributes to <img>-tags
#
sub ProcessImgTag {
    my ($attr) = @_;
    my ($image, $width, $height, $scale);
    my ($Nwidth, $Nheight, $Pwidth, $Pheight);

    if (   $attr =~ m|SRC\s*=\s*"([^"]*)"|is
        or $attr =~ m|SRC\s*=\s*(\S+)|is    ) {
        $image = $1;

        ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,
         $atime,$mtime,$ctime,$blksize,$blocks) = stat($image);
        $bytes += $size;

        #   add WIDTH and HEIGHT to speed up display
        $width  = -1;
        $height = -1;
        $scale  =  1;
        if (   $attr =~ m/WIDTH\s*=\s*([0-9%]+|\*)/is
            or $attr =~ m/WIDTH\s*=\s*"([0-9%]+|\*)"/is) {
            $width = $1;
        }
        if (   $attr =~ m/HEIGHT\s*=\s*([0-9%]+|\*)/is
            or $attr =~ m/HEIGHT\s*=\s*"([0-9%]+|\*)"/is) {
            $height = $1;
        }
        if (   $attr =~ s/SCALE\s*=\s*([0-9]+)%//is
            or $attr =~ s/SCALE\s*=\s*"([0-9]+)%"//is) {
            $scale = $1 / 100;
        }
        if (   $attr =~ s/SCALE\s*=\s*([0-9.]+)//is
            or $attr =~ s/SCALE\s*=\s*"([0-9.]+)"//is) {
            $scale = $1;
        }
        if ($width  eq '*' or $width  == -1 or
            $height eq '*' or $height == -1   ) {
            if (-f $image) {
                ($Pwidth, $Pheight) = Image::Size::imgsize($image);

                #    width given, height needs completed
                if ((not ($width  eq '*' or $width  == -1)) and 
                         ($height eq '*' or $height == -1)     ) {
                    if ($width == $Pwidth) {
                        $Nheight = $Pheight;
                    }
                    else {
                        $Nheight = int(($Pheight/$Pwidth)*$width);
                    }
                }
                #   height given, width needs completed
                elsif ((not ($height eq '*' or $height == -1)) and 
                            ($width  eq '*' or $width  == -1)     ) {
                    if ($height == $Pheight) {
                        $Nwidth = $Pwidth;
                    }
                    else {
                        $Nwidth = int(($Pwidth/$Pheight)*$height);
                    }
                }
                #   both width and height needs completed
                elsif (($height eq '*' or $height == -1) and 
                       ($width  eq '*' or $width  == -1)    ) {
                    $Nwidth  = $Pwidth;
                    $Nheight = $Pheight;
                }

                #   optionally scale the dimensions
                if ($scale != 1) {
                    $Nwidth  = int($Nwidth  * $scale);
                    $Nheight = int($Nheight * $scale);
                }

                #   now set the new values
                if ($width eq '*') {
                    $attr =~ s|(WIDTH\s*=\s*)\S+|$1$Nwidth|is;
                    &verbose("substituting width for $image: ``width=$Nwidth''");
                }
                elsif ($width == -1) {
                    $attr .= " width=$Nwidth";
                    &verbose("adding width for $image: ``width=$Nwidth''");
                }
                if ($height eq '*') {
                    $attr =~ s|(HEIGHT\s*=\s*)\S+|$1$Nheight|is;
                    &verbose("substituting height for $image: ``height=$Nheight''");
                }
                elsif ($height == -1) {
                    $attr .= " height=$Nheight";
                    &verbose("adding height for $image: ``height=$Nheight''");
                }
            }
            else {
                #   complain
                &verbose("cannot complete size of $image: file not found");
                #   and make sure the =* placeholder constructs are removed
                $attr =~ s|WIDTH\s*=\s*\*||is;
                $attr =~ s|HEIGHT\s*=\s*\*||is;
            }
        }
    }

    return $attr;
}
sub Fixup_imgalt {
    $bufferN = '';
    while ($buffer =~ s|^(.*?)(<[iI][mM][gG]\s+)([^>]+?)(/?>)||s) {
        ($pre, $tag, $attr, $end) = ($1, $2, $3, $4);
        if (    $attr !~ m|ALT\s*=\s*"[^"]*"|is
            and $attr !~ m|ALT\s*=\s*\S+|is) {
            &verbose("adding ALT for $image");
            $attr .= ' alt=""';
        }
        $bufferN .= $pre . $tag . $attr . $end;
    }
    $buffer = $bufferN . $buffer;
}
sub Fixup_imgsize {
    $bufferN = '';
    while ($buffer =~ s|^(.*?)(<[iI][mM][gG]\s+)([^>]+?)(/?>)||s) {
        ($pre, $tag, $attr, $end) = ($1, $2, $3, $4);
        $bufferN .= $pre . $tag . &ProcessImgTag($attr) . $end;
    }
    $buffer = $bufferN . $buffer;
}

#
#   FIXUP 2: add summary attribute to <table>-tags
#
sub Fixup_summary {
    &verbose("adding summary attribute to <table>");

    my $last = 0;
    $bufferN = '';
    while ($buffer =~ m|\G(.*?)(<[tT][aA][bB][lL][eE])([^>]*?)(/?>)|gs) {
        $last = pos($buffer);
        ($pre, $begin, $attr, $end) = ($1, $2, $3, $4);

        #   add a SUMMARY="" tag to make HTML lints happy
        if ($attr !~ m|SUMMARY\s*=|i) {
            $attr .= ' summary=""';
        }
        $bufferN .= $pre . $begin . $attr . $end;
    }
    $buffer = $bufferN . substr($buffer, $last);
}

#
#   FIXUP 3: change <center>..</center> to <div align=center>..</div>
#
sub Fixup_center {
    &verbose("replacing <center>..</center> by <div align=center>..</div>");

    $buffer =~ s|<[cC][eE][nN][tT][eE][rR](\s[^>]*)?>|<div align="center"$1>|g;
    $buffer =~ s|</[cC][eE][nN][tT][eE][rR]>|</div>|g;
}

#
#   FIXUP 4: fix trailing space in tags
#
sub Fixup_space {
    &verbose("trailing space in tags");

    #   Only space characters are removed, neither tabs nor newlines
    $buffer =~ s| +>|>|g;
    $buffer =~ s|([^\s])/>|$1 />|g;
}

#
#   FIXUP 5: add quotations to attribute values and
#            add missing '#' char to color attributes 
#
sub Fixup_quotes {
    &verbose("add quotes to attributes");

    my $last = 0;
    $bufferN = '';
    while ($buffer =~ m|\G(.*?)(<[a-zA-Z_][^>]*>)|sg) {
        $last = pos($buffer);
        ($prolog, $tag) = ($1, $2);
        $tag =~ s@([A-Za-z_-]+=)([^\s\"\'><\[]+)(\s|/?>)@$1"$2"$3@sg;
        $tag =~ s|([A-Za-z_-]+=")([0-9A-Fa-f]{6}"[\s/>])|$1#$2|sg;
        $bufferN .= $prolog.$tag;
    }   
    $buffer = $bufferN . substr($buffer, $last);
}


#
#   FIXUP 6: paragraph indentation 
#
sub ProcessIndentContainer {
    my ($attr, $data) = @_;
    my ($num, $size, $pad, $prefix);
    
    #   determine amount of padding
    $num  = 0;
    $size = 4;
    $attr =~ s/num\s*=\s*"?(\d+)"?/$num = $1, ''/ige;
    $attr =~ s/size\s*=\s*"?(\d+)"?/$size = $1, ''/ige;

    #   pad the data
    if ($num > 0) {
        $pad = ' ' x ($num * $size);
        $data =~ s/^/$pad/mg;
    }
    elsif ($num == 0) {
        ($prefix) = ($data =~ m|^\n*([ \t]+)|s);
        if (length($prefix) > 0) {
            $data =~ s/^$prefix//mg;
        }
    }
    return $data;
}
sub Fixup_indent {
    &verbose("paragraph indentation");

    if ($buffer =~ m|<[iI][nN][dD][eE][nN][tT][\s>]|) {
        $bufferN = '';
        while ($buffer =~ s|^(.*?)<indent([^>]*)>(.*?)</indent>||is) {
            ($pre, $attr, $data) = ($1, $2, $3);
            $bufferN .= $pre . &ProcessIndentContainer($attr, $data);
        }
        $buffer = $bufferN . $buffer;
    }
}

#
#   FIXUP 7: out-commenting tags
#
sub Fixup_comment {
    &verbose("remove commenting tags");
    $buffer =~ s|<[a-zA-Z_][a-zA-Z0-9-]*#.*?>||sg;
    $buffer =~ s|</[a-zA-Z_][a-zA-Z0-9-]*#>||sg;
}

#
#   FIXUP 8: tag case translation
#
sub doit_upper {
    ($prefix, $body) = @_;
    $prefix =~ s/^(.+)$/\U$1\E/;
    $body =~ s/(\s+[a-zA-Z][a-zA-Z0-9_-]*)(\s*=\s*[^"\s]+|\s*=\s*"[^"]*"|\/?>|\s)/\U$1\E$2/sg;
    return $prefix.$body;
}
sub doit_lower {
    ($prefix, $body) = @_;
    $prefix =~ s/^(.+)$/\L$1\E/;
    $body =~ s/(\s+[a-zA-Z][a-zA-Z0-9_-]*)(\s*=\s*[^"\s]+|\s*=\s*"[^"]*"|\/?>|\s)/\L$1\E$2/sg;
    return $prefix.$body;
}
sub ProcessTagConv {
    my ($attr, $data) = @_;
    my ($case);
    
    #   determine case translation type
    $case = 'upper';
    $attr =~ s/case\s*=\s*"?(upper|lower)"?/$case = lc($1), ''/ige;

    #   and then translate the data
    if ($case eq 'upper') {
        $data =~ s|(<[a-zA-Z][a-zA-Z0-9_-]*\s*/?>)|\U$1\E|sg;
        $data =~ s|(<[a-zA-Z][a-zA-Z0-9_-]*)(\s+.*?/?>)|&doit_upper($1,$2)|sge;
        $data =~ s|(<\/[a-zA-Z][a-zA-Z0-9_-]*\s*/?>)|\U$1\E|sg;
    }
    else {
        $data =~ s|(<[a-zA-Z][a-zA-Z0-9_-]*\s*/?>)|\L$1\E|sg;
        $data =~ s|(<[a-zA-Z][a-zA-Z0-9_-]*)(\s+.*?>)|&doit_lower($1,$2)|sge;
        $data =~ s|(<\/[a-zA-Z][a-zA-Z0-9_-]*\s*/?>)|\L$1\E|sg;
    }
    return $data;
}
sub Fixup_tagcase {
    &verbose("tag case translation");

    if ($buffer =~ m|<[tT][aA][gG][cC][oO][nN][vV][\s>]|) {
        $bufferN = '';
        while ($buffer =~ s|^(.*?)<tagconv([^>]*)>(.*?)</tagconv>||is) {
            ($pre, $attr, $data) = ($1, $2, $3);
            $bufferN .= $pre . &ProcessTagConv($attr, $data);
        }
        $buffer = $bufferN . $buffer;
    }
}

#
#   process all required fixups
#
foreach (split(',', $opt_S)) {
    $opt_F =~ s/\b$_\b//;
}
foreach (split(',', $opt_F)) {
    $fixup = 'Fixup_' . $_;
    &$fixup if defined (&$fixup);
}

#
#   statistic
#
&verbose("Total amount of images: $bytes bytes");

#
#   write to output file
#
if ($opt_o eq '-') {
    $out = new IO::Handle;
    $out->fdopen(fileno(STDOUT), "w") || error("cannot write into STDOUT: $!");
}
else {
    $out = new IO::File;
    $out->open(">$opt_o") || error("cannot write into $opt_o: $!");
}
$out->print($buffer) || error("cannot write into $opt_o: $!");
$out->close() || error("cannot close $opt_o: $!");

exit(0);

##EOF##
