#!/usr/local/bin/perl
#htmlsrpl.pl: HTML-aware search-and-replace; acts either only outside HTML/SGML
#             tags, or only within HTML/SGML tags; can also upper-case tag names
#
# Typical use:
#
#   perl htmlsrpl.pl [options] infile.html > outfile.html
#
# Where options have the form "option=value"; all options should precede
# filename arguments on the command line.  (See the documentation.)
#
# Copyright H. Churchyard 1994, 1995 -- freely redistributable.  This code is
# awk-influenced (so sue me).  Tested under Perl 4 (I'm still not sure whether
# the fact that "s/$x/$y/" is equivalent to "s/$x/$y/e" is a bug or not).
#
#  Version 1.0 12/21/94 -- Preliminary version.
#  Version 1.01 12/22/94 -- Minor bugfix.
#  Version 1.1 1/7/95 -- Added inside=, inmost=, oustside= , etc.  Included in
# htmlchek 4.0 release.
#  Version 1.11 1/22/95 -- Added "Changed!/Unchanged" final status messages.
# Included in htmlchek 4.1 release.
#
eval "exec /usr/local/bin/perl -S $0 $*"
    if $running_under_some_shell; # this emulates #! processing on NIH machines.
#process any FOO=bar switches
$old= ''; $new = ''; $intags = 0; $regexp = 0; $regeval = 0; $upcase = 0;
$lines = 0; $delete = 0; $case = 0; $slash=0; $inmost=''; $inside = '';
$outside = '';
eval '$'.$1.'$2;' while $ARGV[0] =~ /^(old=|new=|intags=|lines=|regexp=|regeval=|upcase=|delete=|case=|slash=|inmost=|inside=|outside=)(.*)/ && shift;
$[ = 1;                 # set array base to 1
$, = ' ';               # set output field separator
$\ = "\n";              # set output record separator
foreach $X (@ARGV) {
    if ($X =~ /^[^=]+=/) {
        print STDERR "Apparent misspelled or badly-placed command-line option $&";
        print STDERR "Attempting to continue anyway...";}}
$filstr = join(' ',@ARGV); $changed = 0;
if ($lines)  {$/ = "\0777"; $* = 1;}
else {$/ = "\n";}
if (($outside) && (!(($inside) || ($inmost)))) {$applyit = 1;}
else {$applyit = 0;}
#
$unpair{'!--'} = 1; $unpair{'!DOCTYPE'} = 1; $unpair{'BASE'} = 1;
$unpair{'BR'} = 1; $unpair{'COMMENT'} = 1; $unpair{'HR'} = 1;
$unpair{'IMG'} = 1; $unpair{'INPUT'} = 1; $unpair{'ISINDEX'} = 1;
$unpair{'LINK'} = 1; $unpair{'META'} = 1; $unpair{'NEXTID'} = 1;
$unpair{'ATOP'} = 1; $unpair{'LEFT'} = 1;
$unpair{'OVER'} = 1; $unpair{'OVERLAY'} = 1; $unpair{'RIGHT'} = 1;
$unpair{'TAB'} = 1; $unpair{'BASEFONT'} = 1; $unpair{'WBR'} = 1;
$nestvar = 0; $numins = 0; $numout = 0;
if ($inmost) {
    $inmost =~ tr/a-z/A-Z/;
    if ($inmost =~ /[^-.a-zA-Z0-9]/) {
        die 'Non-alphanumeric value of inmost= was specified';}
    if (defined $unpair{$inmost}) {
        die "Non-pairing tag $inmost specified as value of inmost=";}}
if ($inside) {
   $numins = (@inarr = split(/,/, $inside));
        for ($i = 1; $i <= $numins; ++$i) {
            $inarr[$i] =~ tr/a-z/A-Z/;
            if ((!$inarr[$i]) || ($inarr[$i] =~ /[^-.a-zA-Z0-9]/)) {
               die 'Non-alphanumeric value of inside= was specified';}
            if (defined $xxin{$inarr[$i]}) {
               die 'Duplicate values of inside= were specified';}
            if (defined $unpair{$inarr[$i]}) {
               die "Non-pairing tag $inarr[$i] specified as value of inside=";}
            else {
               $xxin{$inarr[$i]} = 1;}}}
if ($outside) {
   $numout = (@outarr = split(/,/, $outside));
        for ($i = 1; $i <= $numout; ++$i) {
            $outarr[$i] =~ tr/a-z/A-Z/;
            if ((!$outarr[$i]) || ($outarr[$i] =~ /[^-.a-zA-Z0-9]/)) {
               die 'Non-alphanumeric value of outside= was specified';}
            if (defined $xxout{$outarr[$i]}) {
               die 'Duplicate values of outside= were specified';}
            if (defined $xxin{$outarr[$i]}) {
               die "Tagname $outarr[$i] specified as both outside= and inside=";}
            if (defined $unpair{$outarr[$i]}) {
               die "Non-pairing tag $outarr[$i] specified as value of outside=";}
            else {
               $xxout{$outarr[$i]} = 1;}}}
#
if ((!$old) && (!$upcase)) {die "No `old=' string was specified";}
if (($delete) && (($new) || ($regexp) || ($regeval))) {
     die "Incompatible option specified with `delete=1'";}
if (($regexp) && ($regeval)) {die 'Both regexp=1 and regeval=1 specified';}
if (($case) && (!$delete) && (!$regexp) && (!$regeval)) {
     die 'Option case=1 specified without any of regexp=1, regeval=1, or delete=1 also being specified';}
if ($delete) {$slash=1;}
if (($upcase) || ($delete) || ($slash)) {$intags = 1;}
#
# Main
#
# Variable ``$state'' is one if there is an unresolved `<', zero otherwise.
#  ``$lastbeg'' is zero if no `<' has ocurred in $_, otherwise it points to the
# character immediately after the last `<' encountered.
#
$xRS = "\n"; $state = 0;
while (<>) {
    if ($_ =~ /$xRS$/o) { # strip record separator, allow for last line to
        chop;}            # be unterminated.
    $lastbeg = 0; $currsrch = 1; $txtbeg = 1;
    while ((((substr($_, $currsrch) =~ /[<>]/) eq 1) &&
      ($RSTART = length($`)+1)) != 0) {
        $currsrch = ($currsrch + $RSTART);
        if (substr($_, ($currsrch - 1), 1) eq '<') {
            if ($state) {
                print "\nERROR!";
                die "Multiple `<' without `>' ERROR!";}
            else {
                if (($currsrch > length($_)) ||
                  (substr($_, $currsrch, 1) =~ /^[ \t]$/)) {
                    print "\nERROR!";
                    die "Whitespace after `<': Bad SGML syntax ERROR!";}
                else {
                    if ($currsrch > ($txtbeg + 1)) {
                        if ((!$intags) && (($applyit) || (!(($inmost) ||
                          ($numins) || ($numout))))) {
                            printf "%s", &changeht(substr($_, $txtbeg,
                              ($currsrch - ($txtbeg + 1))));}
                        else {
                            printf "%s", substr($_, $txtbeg,
                              ($currsrch - ($txtbeg + 1)));}}
                $deletit = 0;
                $lastbeg = $currsrch; $state = 1;}}}
        else {
            if (substr($_, ($currsrch - 1), 1) eq '>') {
                if ($state == 0) {
                    next;}        #`>' without `<'
                else {
                    &parsetag($currsrch - 1);
                    if (!$deletit) {printf "%s", '>';}
                    $txtbeg = $currsrch; $state = 0;}}
            else {die 'Internal error, ignore';}}}
#At EOL:
    if ($state == 1) {
        &parsetag(length($_) + 1);}
    elsif ($txtbeg <= length($_)) {
        if ((!$intags) && (($applyit) || (!(($inmost) || ($numins) ||
          ($numout))))) {
            printf "%s", &changeht(substr($_, $txtbeg));}
        else {printf "%s", substr($_, $txtbeg);}}
    if (!(($state) && ($deletit))) {printf "\n";}}
#
#END routine:
#
if ($state) {
    die "Was awaiting a `>' ERROR! at END";}
if ($changed) {
    print STDERR "Changed! on input", $filstr;}
else {
    print STDERR "Unchanged on input", $filstr;}
#
#
sub parsetag {
    local($inp) = @_;
    $docap = $lastbeg;
    if (!$lastbeg) {
        $strx = '' ; $lastbeg = 1;}
    else {$strx= '<';}
    if ($inp != $lastbeg) {
        $str = &upc(substr($_, $lastbeg, ($inp - $lastbeg)));
        if (($oldapply) || (!(($inmost) || ($numins) ||($numout)))) {
            if (($slash) && ($docap) && ($str =~ /^\//))
                {$strx = ($strx . '/'); $str= substr($str, 2);}
            if ($delete) {
                if ($docap) {&getdel($str);}
                if (!$deletit) {printf "%s%s", $strx, $str;}
                else {$changed=1;}}
            else {
                if (($intags) && ($old))
                  {printf "%s%s", $strx, &changeht($str);}
                else {printf "%s%s", $strx, $str;}}}
        else {printf "%s%s", $strx, $str;}}}
#
sub upc {
    local($upcx) = @_;
    if ($docap) {
        $upcx =~ /^[^ \t\n]+/;
        ($tagname = $&) =~ tr/a-z/A-Z/;
        if ($upcase) {$upcx = ($tagname . $');}
        $oldapply = $applyit;
        #tag stack accounting
        if ((($inmost) || ($numins)|| ($numout)) &&
          (!(defined $unpair{$tagname}))) {
            $applyit = 1; $clostag = '';
            if ($tagname !~ /^\//) {
                ++$nestvar;
                $nestarr[$nestvar] = $tagname;}
            else {
                $clostag = substr($tagname,2);
                while ($nestarr[$nestvar] ne $clostag) {
                    --$nestvar;
                    if ($nestvar <= 0) {
                        print "\nERROR!";
                        die "/$clostag tag encountered when apparently not in $clostag element";}}
                --$nestvar;}
            if (($inmost) && ($nestarr[$nestvar] ne $inmost)) {
                $applyit = 0;}
            if ($numins) {
                if ($nestvar < $numins)  {$applyit = 0;}
                else {
                    $mask = 1;
                    $stackstr = (" " . join(" ",@nestarr[1..$nestvar]) . " ");
                    foreach $X (keys %xxin) {
                        if (index($stackstr,(" " . $X . " ")) <= 0) {
                            $mask = 0;}}
                    if (($applyit) && ($mask)) {$applyit = 1;}
                    else {$applyit = 0;}}}
            if (($numout) && ($nestvar)) {
                $mask = 1;
                $stackstr = (" " . join(" ",@nestarr[1..$nestvar]) . " ");
                foreach $X (keys %xxout) {
                ##print $stackstr,"XX",(" " . $X . " ");##debugXX
                    if (index($stackstr,(" " . $X . " ")) > 0) {
                        $mask = 0;}}
                if (($applyit) && ($mask)) {$applyit = 1;}
                else {$applyit = 0;}}
             if ($clostag) {$oldapply = $applyit;}}}
    return $upcx;}
#
sub getdel {
    local($inz) = @_;
    $inz =~ /^[^ \t\n]+/;
    $X = $&;
    if ($case) {
        if ($X =~ /$old/io) {
            $deletit = 1;}}
    else {
        if ($X =~ /$old/o) {
            $deletit = 1;}}}
#
sub changeht {
    local($field) = @_;
    if ($regeval) {
        if ($case) {
            $X = ($field =~ s/$old/$new/eeigo);}
        else {
            $X = ($field =~ s/$old/$new/eego);}
        if ($X) {$changed = 1;}
        return $field;}
    elsif ($regexp) {
        if ($case) {
            $X = ($field =~ s/$old/$new/igo);}
        else {
            $X = ($field =~ s/$old/$new/go);}
        if ($X) {$changed = 1;}
        return $field;}
    else {
        $startf = 1; $newf = '';
        while (($ndx = index(substr($field,$startf),$old)) > 0) {
            $changed = 1;
            $newf = ($newf . substr($field,$startf,($ndx-1)) . $new);
            $startf = ($startf + ($ndx-1) + length($old));}
        $newf = ($newf . substr($field,$startf));
        return $newf;}}
##EOF
