#!/usr/bin/perl -w

=pod

HTX v0.7 - Hhtml To Xhtml Convertor

Copyright (C) 2004-2008 Jamie Cheetham
    Email: jamie at softham.co.uk

##############################################################

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

##############################################################

For further information see the README file.

=cut

use strict;
use Getopt::Long;

my ($dos, $mac, $multi, $help, $version, $verbose, $tty, $filename, $output) = '';
my $prog_version = '0.7.8';

# Handle command line arguments
GetOptions('dos' => \$dos,
           'mac' => \$mac,
           'multi' => \$multi,
           'tty' => \$tty,
           'help' => \$help,
           'verbose|ver' => \$verbose,
           'version|v' => \$version) || die "usage: htx [-h|-v] [-m|-d] [-ver] [-t] filename [output-filename]\n\n";
version() if ($version);
usage() if ($help);
die "ERROR: Cannot select DOS and Mac text files simultaneously.\n\n" if ($dos && $mac);
die "ERROR: No filename specified.\n\n" if (@ARGV == 0);

# Populate data arrays
my @empty_tags = qw/area base basefont br col frame hr img input isindex link meta param/;
my @min_attr = qw/compact checked declare readonly disabled selected defer ismap nohref noshade nowrap multiple noresize/;
my @old_doctypes1 = ('<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01? Transitional//EN"\s*?>',
                     '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01? Strict//EN"\s*?>',
                     '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01? Frameset//EN"\s*?>');
my @old_doctypes2 = ('<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01? Transitional//EN"'.
                     '\s+"http://www.w3.org/TR/html4/loose.dtd">',
                     '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01? Strict//EN"'.
                     '\s+"http://www.w3.org/TR/html4/strict.dtd">',
                     '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01? Frameset//EN"'.
                     '\s+"http://www.w3.org/TR/html4/frameset.dtd">');
my @new_doctypes = ('<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"'."\n".
                    '  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">',
                    '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"'."\n".
                    '  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">',
                    '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Frameset//EN"'."\n".
                    '  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd">');

# Process each file in turn
foreach my $argument (@ARGV) {
    $filename = $argument;
    if ($multi) {$output = $filename;} else {$output = $ARGV[1] || $filename;}
    if (!-e $filename) {
        warn "WARNING: File not found: $filename\n";
        next;
    }
    if (!-r $filename) {
        warn "WARNING: Unable to read file: $filename\n";
        next;
    }
    if (!-T $filename) {
        warn "WARNING: Cannot process non-text file: $filename\n";
        next;
    }
    convert();
    last if (!$multi);
}

print "Done.\n\n" if (!$tty);

exit;


sub convert {
    print "Opening $filename...\n" if ($verbose);

    # Load the file and slurp it into a string
    open (INPUT, "< $filename") or die "ERROR: Unable to read file: $filename.\n\n";
    my $string = do { local $/; <INPUT> };
    close INPUT;

    print "Processing $filename...\n" if ($verbose);

    # Remove CR characters to change from Windows to Linux line breaks
    $string =~ s/\r//g if ($dos);

    # Replace CR characters to change from Mac to Linux line breaks
    $string =~ s/\r/\n/g if ($mac);

    # Warn about old ICRA data
    if ($verbose) {
        if ($string =~ m/<meta http-equiv="pics-label" /i) {
            warn "WARNING: Old pics-label tag detected. ".
                 "It is recommended that you regenerate it at http://www.icra.org/label/\n";
        }
    }

    # Process each tag containing a = individually, ignoring ones starting with <? or <! or <%
    my @tags = ($string =~ m/(<[^?!%][^>]+?=.+?>)/gm);
    foreach my $tag (@tags) {
        my $new_tag = $tag;
       
        # Double quote unquoted alphanumeric attribute values
        $new_tag =~ s/(\w+?)='([^='"]+?)'([ |>])/$1="$2"$3/g;
        $new_tag =~ s/(\w+?)=([^\s"']+?)([ |>])/$1="$2"$3/g;

        # Make chars between < and =", containing no ", lowercase
        $new_tag =~ s/^(<[^"]+?)="/\L$1\E="/;

        # Make chars between "  and =", containing no ", lowercase
	$new_tag =~ s/"(\s)([^"]+?)="/"$1\L$2\E="/g;

        # Make chars between " and >, containing no ", lowercase
        $new_tag =~ s/("[^"]+?) *?>$/\L$1\E>/;

        # Make the values of the align, valign and shape properties lowercase
        $new_tag =~ s/ (v?)align="(.+?)"/ $1align="\L$2\E"/g;
        $new_tag =~ s/ shape=\"(.+?)"/ shape="\L$1\E"/;

        $string =~ s/\Q$tag\E/$new_tag/g;
    }

    # Make chars between < and >, containing no " and not starting with <! or <? or <%, lowercase
    $string =~ s/(<[^?!%][^"]*?>)/\L$1\E/g;

    # Add closing slash to empty tags
    foreach (@empty_tags) {
        $string =~ s/(<$_.*?)("?) ?>/$1$2 \/>/gs;
    }
    $string =~ s# / /># />#g;

    # Process each and every tag individually, ignoring ones starting with <? or <! or <%
    @tags = ($string =~ m/(<[^?!%][^>]+?>)/gm);
    foreach my $tag (@tags) {
        my $new_tag = $tag;

        # Correct attribute minimization
        foreach (@min_attr) {
            $new_tag =~ s/ $_([ |>])/ $_="$_"$1/g;
        }

        # Make hex colour codes lowercase
        $new_tag =~ s/(\#[A-Fa-f0-9]{3})/\L$1\E/g;
        $new_tag =~ s/(\#[A-Fa-f0-9]{6})/\L$1\E/g;

        # Check for unmatched double quotes
        if (($tag =~ tr/"//) & 1) {warn "WARNING: $tag is potentially invalid\n";}

        $string =~ s/\Q$tag\E/$new_tag/g;
    }

    # Change the name attribute to id in <a> and <map> tags
    $string=~ s/<(a|map)( |.*?)name="(.*?)"( |>)/<$1$2id="$3"$4/g;

    # Change the value of the clear attribute to lowercase in <br> tags
    $string=~ s/<br( |.*?)clear="(.*?)" /<br$1clear="\L$2\E" /g;

    # Update the HTML tag itself
    $string =~ s/<html.*?>/<html xmlns="http:\/\/www.w3.org\/1999\/xhtml" xml:lang="en">/g;

    # Update doctype or add if missing
    for(my $count = 0; $count < 3 ; $count++) {
        last if ($string =~ s/$old_doctypes1[$count]/$new_doctypes[$count]/i);
        last if ($string =~ s/$old_doctypes2[$count]/$new_doctypes[$count]/i);
    }
    if (!($string =~ m/<!DOCTYPE /i)) {$string = $new_doctypes[0]."\n\n".$string;}

    print "Writing $output...\n" if ($verbose);

    # Output the new file
    if ($tty) {print $string;}
    else {
        open (OUTPUT, ">$output") or die "ERROR: Unable to write to $output\n\n";
        binmode OUTPUT;
        print OUTPUT $string;
        close OUTPUT;
    }
}

sub version {
    print "HTX version $prog_version, Copyright (C) 2004-2008 Jamie Cheetham\n";
    print "Softham: http://www.softham.co.uk/\n\n";
    exit;
}

sub usage {
    print << "EOF";
HTML To XHTML Convertor $prog_version, Copyright (C) 2004-2008 Jamie Cheetham

Usage: htx [--dos|--mac] [--verbose] <filename> [<output filename>]
       htx [--dos|--mac] [--verbose] [--multi] <file pattern1> [<file pattern2> ...] 
       htx [--help|--version]

If the output filename isn't specified in single-file mode, the initial file is
overwritten with the updated code.

Options:

  -d --dos
    Convert line breaks from Windows text files to use in Unix/Linux.

  -ma --mac
    Convert line breaks from Mac text files to use in Unix/Linux.

  -mu --multi
    Process multiple files simultaneously and overwrite them.

  -h --help
    Display this help and exit.

  -ver --verbose
    Display extra information while processing.

  -v --version
    Display version number and exit.

EOF
    exit;
}

__END__
