#!/usr/bin/perl

# Conversion of PostgreSQL documentation from Docbook 4.2 sgml-format into
# Docbook 4.2 xml-format.
#
# Based on a script from Jürgen Purtz <juergen@purtz.de>
# 
# The script expands the SGML constructs 'shorttags' and 'empty elements'. Additionally
# it handles one special postgres case.
#

use strict;
use warnings;
use autodie; # die if problem reading or writing a file


# --------------  Input  -------------
# Slurp mode: with the input record separator $/ undefined, a single <> read
# returns the whole file (or all of STDIN when no file name is given on the
# command line) as one string.
my $content;
{
    local $/ = undef;
    $content = <>;
}

# Normalize the upper-case attribute value class="PARAMETER" to lower case.
$content =~ s{ class="PARAMETER"}{ class="parameter"}g;

# --------------  Empty (per definition in DTD) elements  --------------

# List of 'empty' elements in Docbook. Per the DTD they have no content, so the
# SGML sources write them without an end tag,
# eg: <x><xref linkend="sql-createtype"></x> (there is neither '</xref>' nor '/>')
# Afaik PostgreSQL uses only 'xref', 'co' and 'footnoteref'.
# In addition to the Docbook elements we handle the colspec and spanspec
# elements of cals tables.
my $emptyElements =
   'anchor|area|audiodata|beginpage|co|coref|footnoteref|graphic|imagedata|inlinegraphic|sbr|' .
   'textdata|varargs|videodata|void|xref|colspec|spanspec';

# The original notes on this step say that the tool 'osx', which is used in one
# of the following steps, tries to close the empty tags again, which results in
# unwanted additional - and in some cases invalid - CDATA.
# NOTE(review): those notes also ask for the "long notation" of empty elements
# as long as osx is used; what is emitted here is the self-closing form
# '<x .../>' - confirm which one the tool chain needs.

# Rewrite the start tag of every empty element into the self-closing form.
#
#   $text   - document text to process
#   returns - copy of $text in which each '<name ...>' whose name is listed in
#             $emptyElements reads '<name .../>'
#
# Line breaks inside the tag are allowed. Compared with a plain
# '<name\s+...>' match this also
#   * closes elements written without any attribute (e.g. '<void>'); left
#     open, such a tag would later be taken for a regular start tag by the
#     shorttag expansion and be paired with the end tag of its parent;
#   * leaves a tag that already ends in '/>' unchanged instead of turning it
#     into '//>', so some tags may be closed and others not.
sub close_empty_elements {
    my ($text) = @_;

    # The element name must be followed directly by whitespace, '/' or '>',
    # hence 'co' never matches the beginning of 'colspec' or 'coref'.
    # A trailing '/' is matched outside of the capture and written back once.
    $text =~ s{<((?:$emptyElements)(?:\s+[^>]*?)?)/?>}{<$1/>}g;

    return $text;
}

$content = close_empty_elements($content);


# ---------------  Shorttags  ------------------------

# Shield comments from the tag handling that follows: inside every SGML
# comment declaration (<!-- ... -->) each '<' is swapped for '°' and each '>'
# for '§'. The '<!' and '>' delimiting the declaration itself stay as they are.
$content =~ s{<!((?:--(?:[^-]*|-[^-]+)*--\s*))>}{
    my $comment = $1;
    $comment =~ s!<!°!g;
    $comment =~ s!>!§!g;
    "<!" . $comment . ">";
}sge;

# Construct an expression which matches a start tag together with the end tag
# that ACCORDS to it, where the end tag may be the shorttag "</>".
# The idea is to handle the tree of nodes from its leaves to the top with
# one s/<x>...</>/<x>...</x>/g command per level.
# Don't use a greedy pattern. We must match the nearest </>.

# Define the pattern for the (optional, possibly multiple) attributes of a tag:
# whitespace, then any characters up to - but not including - the closing > or />
# example: <x example-attr-name="attr-value" \n b="bb"/>
# Note: the string contains three capture groups of its own, which become part
# of every pattern it is interpolated into.
my $attr = '(\s+(((?!>)(?!/>).)+?))?';

# Define the pattern for shorttags. It matches one innermost element: a start
# tag, then content at no position of which a start tag, an empty tag, an end
# tag or a shorttag begins, then the end tag (shorttag or regular) that follows.
# The tag name is captured as $1, the attribute part as $2 and the content as
# the named capture 'content'. The latter is read back as $+{content}; it is
# not the script variable $content, although the comments inside the pattern
# call it that.
# Flags: /x allows the layout and the comments within the pattern, /s lets '.'
# match line breaks as well.
my $regex;
$regex = qr/ <(\w[\w-]*?)(${attr})>     # regular start-tag. Catch tagname as $1 and attributes as $2
             (?'content'                # catch content in variable $content
              (((                       # negative look ahead:
                (?!<[\w-]+?${attr}>)    #   not a regular start-tag
                (?!<[\w-]+?${attr}\/>)  #   not an empty tag
                (?!<\/[\w-]+?>)         #   not a regular end-tag
                (?!<\/>)                #   not a shorttag
               ).                       # move foreward 
                 ){0,32000}+ )*+        # to overcome the Perl 32K limit, it's neccessary to split
                                        # $content into many chunks. Possessive quantifiers speeds
                                        # up performance.
             )                          #
             (<\/([\w-]+?)?>)           # followed by a shorttag or a regular end-tag
           /xs;

# Perform the expansion of shorttags. Because of the recursive nature of the
# node tree, it is necessary
#   a) to work with a loop which processes the tree from the leaf nodes to the
#      root node, and
#   b) to convert every handled element to some form of regular content which
#      differs from SGML/XML syntax, so that the next pass no longer sees it as
#      tags. We use ° and § as they do not occur in the PostgreSQL docs.
# (There is a way to match recursive REs - but not to replace them, afaik.)

# Each pass rewrites the innermost elements that are still in tag syntax and
# always closes them with the name of their start tag; repeat until a pass
# does not replace anything any more.
1 while $content =~ s/$regex/°$1$2§$+{content}°\/$1§/sg;

# Restore the SGML/XML syntax: turn the placeholders back into < and >.
for ($content) {
    s/°/</g;
    s/§/>/g;
}

print $content;

