#!/bin/bash
: <<END_COMMENT

  Migration of PostgreSQL's documentation from Docbook 4.5
  to Docbook 5.2 format.

  J. Purtz, juergen@purtz.de, September 2022 and June 2024

  Description: see README.md


END_COMMENT


# --------------  individual environment  -------------------

#  --->  Adapt the three variables to your situation!  <---
# Directory holding this tool set. The steps below read from it:
# w3centities-f.ent, docbook.rng, doRealModifications.sh, cdata.pl and
# db4-upgrade.xsl.
ToolDir=~/02_IT/82_pgWork/docbook4_to_docbook5.2
# Root of the original PostgreSQL source tree that serves as input.
# Exactly one FromRoot line should be active; the others are kept as
# ready-to-use alternatives for other PG versions.
#FromRoot=~/02_IT/83_pgSrc/postgresql_13
#FromRoot=~/02_IT/83_pgSrc/postgresql_14
#FromRoot=~/02_IT/83_pgSrc/postgresql_15
#FromRoot=~/02_IT/83_pgSrc/postgresql_16
FromRoot=~/02_IT/83_pgSrc/postgresql_17
# Root of the converted tree. CAUTION: the init step removes this directory
# completely ('rm -rf') and re-creates it as a copy of FromRoot.
ToRoot=~/02_IT/83_pgSrc/postgresql_db5.2

# derived variables: the documentation sub-directories of both trees
FromSgmlDir=${FromRoot}/doc/src/sgml
ToSgmlDir=${ToRoot}/doc/src/sgml


# ------------  control the migration steps  ---------------

# Each switch enables one step of this script; a step runs only if its
# switch has exactly the value 'true'.
# doInit: delete ToRoot, copy FromRoot to it, clean up and patch the Makefile
doInit=true
# doRealModifications: source doRealModifications.sh for the target sgml directory
doRealModifications=true
# doUpgrade: run the db4 --> db5 conversion (xsltproc) over the sgml files
doUpgrade=true
# doValidation: validate postgres.sgml with jing against docbook.rng
doValidation=true
# doSgmlDiff: print filtered diffs between original and converted sgml files
doSgmlDiff=false
# doHtmlDiff: build HTML in both trees and print filtered diffs
doHtmlDiff=false


# ------------------  init  ---------------------------------
# remove previous conversion and re-generate complete file
# structure including git

# Init step: throw away a previous conversion, copy the original source tree
# to ${ToRoot}, add the entity file and the RELAX NG schema, remove generated
# output, (re-)generate missing sgml files and patch the Makefile so that
# validation is done by jing instead of xmllint.
# Reads: doInit, ToolDir, FromRoot, ToRoot, ToSgmlDir.
# Leaves the current directory at ${ToSgmlDir}.
if [[ $doInit == true ]]; then

  echo -e "\nGenerate initial file structure at: ${ToRoot} as copy of: ${FromRoot}"

  # The 'cp' commands below use names relative to the tool directory,
  # so we must not continue somewhere else.
  cd "$ToolDir" || { echo "Cannot change to tool directory: ${ToolDir}" >&2; exit 1; }

  # remove old directory structure and create new one
  if [[ -d $ToRoot ]]; then
    echo -e "Removing complete old conversion"
    # ':?' aborts instead of expanding to an empty path
    rm -rf -- "${ToRoot:?}"
  fi

  echo -e "Creating directory structure and copying existing files"
  cp -r -- "$FromRoot" "$ToRoot" \
    || { echo "Cannot copy ${FromRoot} to ${ToRoot}" >&2; exit 1; }

  # supply entities and Relax NG schema
  cp -- w3centities-f.ent "$ToSgmlDir"
  cp -- docbook.rng "$ToSgmlDir"

  # clean up
  echo -e "Clean up the new directory"
  # The following 'rm' and 'sed -i' commands work on relative names and
  # globs; running them in a wrong directory would destroy unrelated files.
  cd "$ToSgmlDir" || { echo "Cannot change to target directory: ${ToSgmlDir}" >&2; exit 1; }
  rm -f postgres.html postgres.txt postgres-full.xml postgres.epub *.fo *.pdf \
        *.texixml *.texi *.info db2texi.refs
  rm -fr html/ html-stamp man1/ man3/ man7/ man-stamp

  # The newer PG versions don't contain the file 'version.sgml' and others in the tar-ball.
  # If necessary, we generate them: only if the Makefile mentions the file
  # and the file doesn't exist yet.
  for target in version.sgml features-supported.sgml features-unsupported.sgml errcodes-table.sgml \
                keywords-table.sgml wait_event_types.sgml targets-meson.sgml; do
    # -F: the file name is a literal string, not a regular expression
    if grep -qF -- "$target" Makefile && [[ ! -f $target ]]; then
      make "$target"
    fi
  done

  # modify one of our Perl scripts: it shall emit 'xml:id' instead of 'id'
  if [[ -f generate-keywords-table.pl ]]; then
    sed -i -E -e 's/ id=/ xml:id=/' generate-keywords-table.pl
  fi

  # modify Makefile: keep xmllint for output, add Jing for validation
  # newer versions work with postgres-full.xml: $(XMLLINT) $(XMLINCLUDE) --output $@ --noent --valid $<
  # duplicate xmllint line and remove '--valid' from first line
  sed -i -E -e 's/((.* --output .* )--valid (.*))/\2\3\n\1\n/' Makefile
  # old and new PG versions: change validation from xmllint to jing.
  # Every remaining '--valid' line is commented out and followed by a jing
  # recipe line. '\t' (GNU sed) creates the TAB which make requires at the
  # start of a recipe line.
  sed -i -E -e 's/(.* --valid .*)/# Generate output with xmllint; validate with jing\n#\1\n\tjing docbook.rng postgres.sgml/' Makefile

fi


# -----  real modifications  ---------------------------
# Step "real modifications": source the helper script doRealModifications.sh
# from the tool directory and hand over the target sgml directory as its
# first positional parameter. The helper runs in this shell (it is sourced,
# not executed).
# Reads: doRealModifications, ToolDir, ToSgmlDir.
if [[ $doRealModifications == true ]]; then

  echo -e "\nRealModifications ...\n"
  cd "$ToolDir" || { echo "Cannot change to tool directory: ${ToolDir}" >&2; exit 1; }

  # './' is intentional: without a slash bash searches PATH first and could
  # pick up a different file with the same name.
  . ./doRealModifications.sh "$ToSgmlDir"

fi


# -----  db4 --> db5.2  ---------------------------------
# This step contains some dummy changes to cheat xsltproc.
# They are reverted after xsltproc has done its job.
#
# Among others the previous init-step has copied the sgml files to the 'to' directory.
# Here, their modification takes place.

# Upgrade step: convert every sgml file of the target directory from
# DocBook 4 to DocBook 5 with xsltproc and db4-upgrade.xsl.
# For each file:
#   1. copy it to the scratch file tmp.sgml and hide everything the xslt
#      processor would destroy (entities, <pubdate>, CDATA sections),
#   2. run the conversion, writing the result back to the original file name,
#   3. revert the changes of (1) in the result.
# Reads: doUpgrade, ToolDir, FromSgmlDir, ToSgmlDir.
# Scratch files: tmp.sgml and tmp1.sgml within ${ToSgmlDir}.
if [[ $doUpgrade == true ]]; then

  echo -e "1: formal changes; 2: db4 -> db5 migration; 3: revert formal changes\n"

  # All files are addressed relative to the target directory and are
  # rewritten in place. Never continue in a wrong directory.
  cd "$ToSgmlDir" || { echo "Cannot change to target directory: ${ToSgmlDir}" >&2; exit 1; }

  for toFile in *sgml ref/*sgml; do

    # An unmatched glob stays as literal pattern: nothing to convert.
    [[ -f $toFile ]] || continue

    # Scratch files of an aborted earlier run match the glob, but they are
    # not part of the documentation.
    if [[ $toFile == "tmp.sgml" || $toFile == "tmp1.sgml" ]]; then
      continue
    fi

    if [[ $toFile == "filelist.sgml" ]]; then
      # add a line to move parameter entity 'reference' from postgres.sgml to here
      sed -i -E -e 's/(.*allfiles.sgml.*)/\1\n\<!ENTITY reference  SYSTEM "reference.sgml"\>/' "$toFile"
      # Change name of PG's entity 'parallel' to 'parallel-query' because it is a
      # predefined entity. But keep the original filename!
      sed -i -e 's/parallel.*SYSTEM /parallel-query SYSTEM /' "$toFile"
      # add a comment in the file
      sed -i -E -e "s/(parallel-query.*)/\1  <!-- in a global context the entity-name \'parallel\' has a different meaning -->/" "$toFile"
    fi

    # omit some special files from the conversion
    case "$toFile" in
      filelist.sgml | \
      ref/allfiles.sgml | \
      features-supported.sgml | \
      features-unsupported.sgml | \
      errcodes-table.sgml | \
      keywords-table.sgml | \
      version.sgml)
        continue
        ;;
    esac

    # -----  prepare 'tmp.sgml' for intermediate changes
    if [[ $toFile == "postgres.sgml" ]]; then
      # In postgres.sgml we will conserve some entity-definitions
      cat "$toFile" >tmp.sgml

      # - consider the changed entity name
      # - Two external entities in postgres.sgml will be thrown away by the
      #   conversion anyway. We will recover them later.
      sed -i -e 's/&parallel;/\&parallel-query;/' \
             -e 's/%version;//' \
             -e 's/%filelist;//' tmp.sgml

    else
      # Some sgml files are not well-formed, they have multiple root elements.
      # We surround all files with a single <dummy> tag and remove it after
      # the conversion.
      {
        echo "<dummy>"
        cat "$toFile"
        echo "</dummy>"
      } >tmp.sgml
    fi

    # There is no way to hinder the xslt processor to replace entities
    # by their values. Hide them and restore the original syntax
    # later, eg: &version;  -->  _amp_version;  --> &version;
    #
    # If <pubdate> contains a valid day, the xslt processor outputs it in
    # a different syntax. Avoid this by changing the tag-name.
    sed -i -e 's/&/_amp_/g' \
           -e 's/pubdate>/pubdatex>/g' tmp.sgml

    # The xslt migration script removes CDATA (why? how?). The consequence is
    # that the text within CDATA is parsed and leads to many errors.
    # (Adding cdata-section-elements="screen programlisting" had no effect.)
    # We preserve CDATA by surrounding it with start/end comment tags.
    # first step: substitute all '--' within CDATA because they are not allowed in comments
    # (done by cdata.pl, which is not part of this file; the restore step
    # below expects '_minus_' as the substitute)
    perl "$ToolDir/cdata.pl" <tmp.sgml >tmp1.sgml
    mv -f tmp1.sgml tmp.sgml
    # second step: embed all CDATA within a comment
    sed -i -e 's/<\!\[CDATA\[/<!-- <![CDATA[/' \
           -e 's/]]>/]]> -->/' tmp.sgml


    # -----  perform the conversion ------------------------------------------------------
    # btw: switch back from 'tmp.sgml' to '$toFile'
    # the db4->db5 migration step which is developed by the DocBook
    # team; ignore some frequent warnings
    # NOTE: keep the order of the two redirections. '2> >(...)' must come
    # first, otherwise the output of grep ends up in $toFile as well.

    xsltproc --novalid --nodtdattr --encoding "utf-8" "$ToolDir/db4-upgrade.xsl" tmp.sgml \
             2> >(grep -Pv "(Found \w* inside |Converting ulink to (link|uri)\. )") >"$toFile"


    # ------------------------------------------------------------------------------------


    # - restore CDATA by removing comment tags and modified '-'
    # - restore <pubdate>
    # - restore entities
    sed -i -e 's/<\!-- <\!\[CDATA\[/<![CDATA[/' \
           -e 's/]]> -->/]]>/' \
           -e 's/_minus_/-/g' \
           -e 's/pubdatex>/pubdate>/g' \
           -e 's/_amp_/\&/g' "$toFile"

    if [[ $toFile == "postgres.sgml" ]]; then

      # postgres.sgml needs further modifications
      # a) remove first line (the conversion-comment) and
      #    second line (<book...>
      sed -i -e '1,2d' "$toFile"
      # b) insert entity definitions to the head of postgres.sgml
      #    (quoted delimiter: the text is taken literally, no shell expansion)
      cat <<'EOT' >tmp.sgml
<?xml version="1.0" encoding="utf-8"?>
<!-- doc/src/sgml/postgres.sgml -->

<!--
The (outdated) use of DOCTYPE serves merely for the definition
of character entities and parameter entities.
The validation process doesn't use this DTD-syntax, it uses
DocBook's RELAX NG schema where entities are unknown.

The use of character entities (eg: &mdash;) instead of hex-values
supports the readability of the source for authors.

The replacement of parameter entities (eg: %filelist;) by the
more XML-conforming xi:include syntax isn't possible without
major changes in many files:
  - Every xml/sgml-file needs a single root element.
  - In every xml/sgml-file we must re-declare namespace(s).
The reason is that parameter entities perform a plain text
substitution whereas xi:include creates trees and combines them.

-->

<!DOCTYPE book [

<!-- Predefined character entities: html, latin1, MathML, ... -->
<!--  via 'http' ...
<!ENTITY % w3centities PUBLIC
         "-//W3C//ENTITIES Combined Set//EN//XML"
         "http://www.w3.org/2003/entities/2007/w3centities.ent"
>
-->

<!-- ... or by local installations -->
<!ENTITY % w3centities-f PUBLIC
         "-//W3C//ENTITIES Combined Set//EN//XML"
         "w3centities-f.ent"
>
%w3centities-f;

<!-- PG specific entities -->
<!ENTITY % version SYSTEM "version.sgml">
%version;
<!ENTITY % filelist SYSTEM "filelist.sgml">
%filelist;

<!--
Zero-width space.  Use this to allow line breaks at desirable places in
table cells, examples, etc. without causing an unwanted space when the
break is not needed in a wider output rendering.  The name is an abbreviation
for the publicly known entity 'ZeroWidthSpace'.
-->
<!ENTITY zwsp "&#x200B;">

]>

<book xmlns="http://docbook.org/ns/docbook"
      xmlns:xlink="http://www.w3.org/1999/xlink"
      xmlns:xi="http://www.w3.org/2001/XInclude"
      xmlns:m="http://www.w3.org/1998/Math/MathML"
      xml:lang="en"
      xml:id="postgres"
      version="5.2"
      >

EOT

      # c) new head + converted body become the new postgres.sgml
      cat "$toFile" >>tmp.sgml
      cp tmp.sgml "$toFile"

    else
      # remove <dummy> tag: first (a comment), second, and last line
      sed -i -e '1,2d;$d' "$toFile"

    fi

  done # loop over all sgml files

  # remove the latest version of tmp.sgml ('-f': it doesn't exist if the
  # loop had nothing to convert)
  rm -f tmp.sgml

  # ???  for some reason the attribute 'id' within some files is not converted  ????
  for file in ref/alter_conversion.sgml ref/create_conversion.sgml \
              ref/drop_conversion.sgml  keywords-table.sgml
  do
    if [[ -f $file ]]; then
      sed -i -e 's/ id=/ xml:id=/' "$file"
    fi
  done


  # In some PG versions there is a single xml-file. It has a different formatting and
  # needs only few changes. We do the conversion manually: take a fresh copy
  # of the original, delete its second line and rename 'id' to 'xml:id'.
  if [[ -f $FromSgmlDir/standalone-install.xml ]]; then
    toFile=standalone-install.xml
    cp -- "$FromSgmlDir/standalone-install.xml" "$toFile"
    sed -i -e '2d;s/ id=/ xml:id=/' "$toFile"
  fi

fi


# -----  validate the result  ---------------------------
# Validation step: check the converted postgres.sgml against DocBook's
# RELAX NG schema (docbook.rng) with jing. Both files are expected in the
# target sgml directory.
# Reads: doValidation, ToSgmlDir.
if [[ $doValidation == true ]]; then

  echo -e "\nValidation ...\n"
  # jing is called with relative file names; don't validate in a wrong directory
  cd "$ToSgmlDir" || { echo "Cannot change to target directory: ${ToSgmlDir}" >&2; exit 1; }
  jing docbook.rng postgres.sgml

fi

# -----  report SGML differences  ---------------------------
# Report step: print the differences between every original sgml file and
# its converted counterpart. Lines containing 'id=' / 'xml:id=' or
# 'structname' / 'structfield' are suppressed.
# Reads: doSgmlDiff, FromSgmlDir, ToSgmlDir.
if [[ $doSgmlDiff == true ]]; then

  # use 'diff' plus 'grep' to ignore or search for patterns

  echo -e "\nDifferences in SGML ...\n"

  cd "$FromSgmlDir" || { echo "Cannot change to source directory: ${FromSgmlDir}" >&2; exit 1; }

  # check sgml
  for file1 in "$FromSgmlDir"/*.sgml; do # "$FromSgmlDir"/ref/*.sgml; do
    # an unmatched glob stays as literal pattern: nothing to compare
    [[ -f $file1 ]] || continue

    baseName=${file1##*/}
    # files below ref/ have their counterpart below ref/ as well
    if [[ $file1 == */ref/* ]]; then
      file2=$ToSgmlDir/ref/$baseName
    else
      file2=$ToSgmlDir/$baseName
    fi

    printf '\n**********  %s  **********\n' "$baseName"
    # diff -y --suppress-common-lines --width=140 "$file1" "$file2"  |\
    diff -- "$file1" "$file2" \
      | grep -E -v '(xml:id=|id=)' \
      | grep -E -v '(structname|structfield)'
  done
fi


# -----  report HTML differences  ---------------------------
# Report step: build the HTML documentation in the original and in the
# converted tree and print the differences file by file. Lines which contain
# the words 'link' / 'ulink', certain class attributes or diff's
# 'No newline at end of file' are suppressed.
# Reads: doHtmlDiff, FromSgmlDir, ToSgmlDir.
if [[ $doHtmlDiff == true ]]; then

  # 'diff' plus 'grep' to ignore or search for patterns
  echo -e "\nDifferences in HTML ...\n"

  # create html files
  # Set buildHtml to 'false' to compare already existing html files.
  # (A real comparison is used here because '[[ false ]]' is true as well.)
  buildHtml=true
  if [[ $buildHtml == true ]]; then
    # never run 'make' in a wrong directory
    cd "$FromSgmlDir" || { echo "Cannot change to source directory: ${FromSgmlDir}" >&2; exit 1; }
    make html STYLE=website
    cd "$ToSgmlDir" || { echo "Cannot change to target directory: ${ToSgmlDir}" >&2; exit 1; }
    make html STYLE=website
  fi

  # check html files
  for file1 in "$FromSgmlDir"/html/*.html; do
    # an unmatched glob stays as literal pattern: nothing to compare
    [[ -f $file1 ]] || continue

    baseName=${file1##*/}
    file2=$ToSgmlDir/html/$baseName

    printf '\n**********  %s  **********\n' "$baseName"
    #diff -w -y --suppress-common-lines --width=300 "$file1" "$file2" |\
    diff -- "$file1" "$file2" \
      | grep -E -v -w '(link|ulink)' \
      | grep -E -v ' (class="structfield"|class="structname"|class="varname")' \
      | grep -E -v 'No newline at end of file'
  done

fi


# --------------------
# final message, followed by an empty line
printf 'Done\n\n'
# --------------------


