#!/bin/bash
: <<END_COMMENT

  Migration of PostgreSQL's documentation from Docbook 4.5
  to Docbook 5.2 format.

  J. Purtz, juergen@purtz.de, September 2022 and June/July 2024

  Description: see README.md


END_COMMENT


# --------------  individual environment  -------------------

#  --->  Adapt these three locations to your own setup!  <---

# Directory holding this tool set (helper scripts, db4-upgrade.xsl, ...).
ToolDir=~/02_IT/82_pgWork/docbook4_to_docbook5.2

# Root of the PostgreSQL source tree that shall be migrated. The other
# versions are kept as comments; activate exactly the one you need.
# FromRoot=~/02_IT/83_pgSrc/postgresql_13
# FromRoot=~/02_IT/83_pgSrc/postgresql_14
# FromRoot=~/02_IT/83_pgSrc/postgresql_15
# FromRoot=~/02_IT/83_pgSrc/postgresql_16
FromRoot=~/02_IT/83_pgSrc/postgresql_17

# Root of the tree that receives the converted copy. The init step removes
# and re-creates this directory.
ToRoot=~/02_IT/83_pgSrc/postgresql_db5.2

# Derived: location of the documentation sources inside both trees.
FromSgmlDir="$FromRoot/doc/src/sgml"
ToSgmlDir="$ToRoot/doc/src/sgml"


# ------------  control the migration steps  ---------------

# Each switch enables one section of this script; a section runs only when
# its switch has the exact value 'true'.

# conversion
doInit=true                  # re-create the target tree and adjust its Makefile
doRealModifications=true     # source doRealModifications.sh
doUpgrade=true               # run the db4 -> db5.2 conversion on the sgml files
doValidation=true            # validate postgres.sgml with jing

# reports comparing the original with the converted tree
doSgmlDiff=true              # sgml sources
doHtmlDiffSingle=true        # single-page HTML
doHtmlDiffMultiple=true      # multi-page HTML
doTextDiff=true              # plain text
doManDiff=true               # man pages



# ------------------  init  ---------------------------------
# remove previous conversion and re-generate complete file
# structure including git

if [[ $doInit == true ]]; then

  # Init step:
  #   1. replace the target tree by a fresh copy of the source tree,
  #   2. build the reference outputs inside the source tree,
  #   3. remove generated files from the copy and (re-)create generated sgml files,
  #   4. adjust the copied Makefile, a Perl script and the xsl scripts.
  # Every 'cd' is checked: the 'rm' and 'sed -i' commands below must never run
  # in an unintended directory (e.g. the original source tree).

  echo -e "\nGenerate initial file structure at: ${ToRoot} as copy of: ${FromRoot}"
  cd "$ToolDir" || exit 1

  # remove old directory structure and create new one
  if [[ -d $ToRoot ]]; then
    echo -e "Removing complete old conversion"
    rm -rf -- "$ToRoot"
  fi

  echo -e "Creating directory structure and copying existing files"
  cp -r -- "$FromRoot" "$ToRoot"

  # supply entities and Relax NG schema (both files reside in the tool directory)
  cp w3centities-f.ent "$ToSgmlDir"
  cp docbook.rng "$ToSgmlDir"

  # supply files for later comparison with the migrated ones
  cd "$FromSgmlDir" || exit 1
  for target in html postgres.html postgres.txt man; do
    # build only targets which the Makefile mentions and which don't exist as a file
    if grep -q -- "$target" Makefile && [[ ! -f $target ]]; then
      make "$target"
    fi
  done


  # --   clean up   ------
  echo -e "Clean up the new directory"
  cd "$ToSgmlDir" || exit 1
  rm -f postgres.html postgres.txt postgres-full.xml postgres.epub *.fo *.pdf \
        *.texixml *.texi *.info db2texi.refs
  rm -fr html/ html-stamp man1/ man3/ man7/ man-stamp

  # --   Run some Makefile targets   ------
  # The newer PG versions don't contain the file 'version.sgml' and others in the tar-ball.
  # If necessary, we generate them.
  for target in version.sgml features-supported.sgml features-unsupported.sgml errcodes-table.sgml \
                keywords-table.sgml wait_event_types.sgml targets-meson.sgml; do
    if grep -q -- "$target" Makefile && [[ ! -f $target ]]; then
      make "$target"
    fi
  done


  # --   Modify Makefile itself  ------------
  # Note: the sed replacement texts contain literal TAB characters (Makefile recipes).

  if grep -q 'postgres-full.xml' Makefile; then
    # First step: change validation from xmllint to jing for all PG versions (--output keeps alive)
    sed -i -E -e 's/((.* --noent) --valid( .*))/# Generate output with xmllint; validate with jing\n\2\3\n	jing docbook.rng postgres.sgml/' Makefile

    # PG 16+ already use postgres-full.xml. No second step.
  else
    # First step: change validation from xmllint to jing for all PG versions
    sed -i -E -e 's/(.* --noout --valid .*)/# Generate output with xmllint; validate with jing\n#\1\n	jing docbook.rng postgres.sgml/' Makefile

    # introduce postgres-full
    sed -i -E -e 's/(.*jing docbook.rng.*)/\1\n	$(XMLLINT) $(XMLINCLUDE) --output postgres-full.xml --noent postgres.sgml/' Makefile

    # generate single page html
    sed -i -E -e 's/((.*)\$\(XSLTPROC_HTML_FLAGS\) -o )\$\@ \$\(wordlist .*/\1postgres.html stylesheet-html-nochunk.xsl postgres-full.xml/' Makefile

    # generate html
    sed -i -E -e 's/((.*)\$\(XSLTPROC_HTML_FLAGS\) )\$\(wordlist .*/\1stylesheet.xsl postgres-full.xml/' Makefile

    # generate html help
    sed -i -E -e 's/((.*)\$\(XSLTPROCFLAGS\) )\$\(wordlist .*/\1stylesheet-hh.xsl postgres-full.xml/' Makefile

    # generate man
    sed -i -E -e 's/((.*)\$\(XSLTPROC_MAN_FLAGS\) )\$\(wordlist .*/\1stylesheet-man.xsl postgres-full.xml/' Makefile

    # generate pdf
    sed -i -E -e 's/(.*stylesheet-fo.xsl )%.sgml(.*)/\1%-full.xml\2/' Makefile
    # it needs the additional target 'postgres-full.xml'
    sed -i -E -e 's/(.*\%\.pdf:.*)/postgres-full.xml: postgres.sgml $(ALLSGML)\n	$(XMLLINT) $(XMLINCLUDE) --output $@ --noent $<\n\n\1/' Makefile

  fi

  # --   Modify a Perl script   ----------
  # the script shall emit 'xml:id' instead of 'id'
  if [[ -f generate-keywords-table.pl ]]; then
    sed -i -E -e 's/ id=/ xml:id=/' generate-keywords-table.pl
  fi


  # --   add namespace to xsl-scripts  ---
  # sourced in the current shell; the target sgml directory is its argument
  . "$ToolDir/addNamespaceToXsl.sh" "$ToSgmlDir"

fi


# -----  real modifications in sgml files  ---------------------------
if [[ $doRealModifications == true ]]; then

  echo -e "\nRealModifications ...\n"
  # stop if the tool directory is unreachable instead of continuing elsewhere
  cd "$ToolDir" || exit 1

  # The helper is sourced (runs in this shell); the target sgml directory
  # is handed over as its single argument.
  . "$ToolDir/doRealModifications.sh" "$ToSgmlDir"

fi


# -----  db4 --> db5.2  ---------------------------------
# This step contains some dummy changes to cheat xsltproc.
# They are reverted after xsltproc has done its job.
#
# Among others the previous init-step has copied the sgml files to the 'to' directory.
# Here, their modification takes place.

if [[ $doUpgrade == true ]]; then

  # Upgrade step: every sgml file of the target tree is
  #   1. copied to 'tmp.sgml' and masked (entities, <pubdate>, CDATA, root element),
  #   2. converted by db4-upgrade.xsl, writing the result back to the file,
  #   3. unmasked again.
  # All files are changed in place, hence the working directory must be the
  # target sgml directory; abort if it cannot be entered.

  echo -e "1: formal changes; 2: db4 -> db5 migration; 3: revert formal changes\n"
  cd "$ToSgmlDir" || exit 1

  for toFile in *sgml ref/*sgml; do

    # an unmatched glob pattern stays literal: nothing to convert
    [[ -f $toFile ]] || continue

    if [[ $toFile == "filelist.sgml" ]]; then
      # add a line to move parameter entity 'reference' from postgres.sgml to here
      sed -i -E -e 's/(.*allfiles.sgml.*)/\1\n\<!ENTITY reference  SYSTEM "reference.sgml"\>/' "$toFile"
      # Change name of PG's entity 'parallel' to 'parallel-query' because it is a
      # predefined entity. But keep the original filename!
      sed -i -E -e 's/parallel.*SYSTEM /parallel-query SYSTEM /' "$toFile"
      # add a comment in the file
      sed -i -E -e "s/(parallel-query.*)/\1  <!-- in a global context the entity-name \'parallel\' has a different meaning -->/" "$toFile"
    fi

    # omit some special files from the conversion
    case $toFile in
      filelist.sgml             | \
      ref/allfiles.sgml         | \
      features-supported.sgml   | \
      features-unsupported.sgml | \
      errcodes-table.sgml       | \
      keywords-table.sgml       | \
      version.sgml)
        continue
        ;;
    esac

    # -----  prepare 'tmp.sgml' for intermediate changes
    if [[ $toFile == "postgres.sgml" ]]; then
      # In postgres.sgml we will conserve some entity-definitions
      cat "$toFile" >tmp.sgml
      # consider the changed entity name
      sed -i -E -e 's/&parallel;/\&parallel-query;/' tmp.sgml

      # Two external entities in postgres.sgml will be thrown away by the
      # conversion anyway. We will recover them later.
      sed -i -E -e 's/%version;//' tmp.sgml
      sed -i -E -e 's/%filelist;//' tmp.sgml

    else
      # Some sgml files are not well-formed, they have multiple root elements.
      # We surround all files with a single <dummy> tag and remove it after
      # the conversion.
      echo "<dummy>" >tmp.sgml
      cat "$toFile" >>tmp.sgml
      echo "</dummy>" >>tmp.sgml
    fi

    # There is no way to hinder the xslt processor to replace entities
    # by their values. Hide them and restore the original syntax
    # later, eg: &version;  -->  _amp_version;  --> &version;
    sed -i -E -e 's/&/_amp_/g' tmp.sgml

    # If <pubdate> contains a valid day, the xslt processor outputs it in
    # a different syntax. Avoid this by changing the tag-name.
    sed -i -E -e 's/pubdate>/pubdatex>/g' tmp.sgml


    # The xslt migration script removes CDATA (why?). The consequence is
    # that the text within CDATA is parsed and leads to many errors.
    # (Adding cdata-section-elements="screen programlisting" had no effect.)
    # We preserve CDATA by surrounding it with start/end comment tags.
    # first step: substitute all '--' within CDATA because they are not allowed in comments
    perl "$ToolDir/cdata.pl" <tmp.sgml >tmp1.sgml
    rm tmp.sgml
    mv tmp1.sgml tmp.sgml
    # second step: embed all CDATA within a comment
    sed -i -E -e 's/<\!\[CDATA\[/<!-- <![CDATA[/' tmp.sgml
    sed -i -E -e 's/]]>/]]> -->/' tmp.sgml


    # -----  perform the conversion ------------------------------------------------------
    # btw: switch back from 'tmp.sgml' to '$toFile'
    # the db4->db5 migration step which is developed by the DocBook
    # team; ignore some frequent warnings (stderr is filtered by grep,
    # stdout becomes the new content of the file)

    xsltproc --novalid --nodtdattr --encoding "utf-8" "$ToolDir/db4-upgrade.xsl" tmp.sgml \
             2> >(grep -Pv "(Found \w* inside |Converting ulink to (link|uri)\. )") >"$toFile"

    # the conversion merges the two lines '<link\n linkend ...' resp. '<xref\n linkend ...'
    # into one line. Re-split them as good as possible.
    sed -i -E -e 's/(( *).{50,90}(\<link|\<xref)) (linkend=)/\1\n\2\4/' "$toFile"
    if ! [[ $toFile =~ "ref/" ]]; then
      # same for 'replaceable', but: the reference files intentionally contain a lot of long lines
      sed -i -E -e 's/(( *).{50,90}(\<replaceable)) (class=)/\1\n\2\4/' "$toFile"
    fi

    # ------------------------------------------------------------------------------------


    # restore CDATA by removing comment tags and modified '-'
    sed -i -E -e 's/<\!-- <\!\[CDATA\[/<![CDATA[/' "$toFile"
    sed -i -E -e 's/]]> -->/]]>/' "$toFile"
    sed -i -E -e 's/_minus_/-/g'  "$toFile"

    # restore <pubdate>
    sed -i -E -e 's/pubdatex>/pubdate>/g' "$toFile"

    # restore entities
    sed -i -E -e 's/_amp_/\&/g' "$toFile"

    if [[ $toFile == "postgres.sgml" ]]; then

      # postgres.sgml needs further modifications
      # a) remove first line (the conversion-comment) and
      #    second line (<book...>
      sed -i -E -e '1,2d' "$toFile"
      # b) insert entity definitions to the head of postgres.sgml
      #    (quoted delimiter: the template is written literally, without any shell expansion)
      cat <<'EOT' >tmp.sgml
<?xml version="1.0" encoding="utf-8"?>
<!-- doc/src/sgml/postgres.sgml -->

<!--
The (outdated) use of DOCTYPE serves merely for the definition
of character entities and parameter entities.
The validation process doesn't use this DTD-syntax, it uses
DocBook's RELAX NG schema where entities are unknown.

The use of character entities (eg: &mdash;) instead of hex-values
supports the readability of the source for authors.

The replacement of parameter entities (eg: %filelist;) by the
more XML-conforming xi:include syntax isn't possible without
major changes in many files:
  - Every xml/sgml-file needs a single root element.
  - In every xml/sgml-file we must re-declare namespace(s).
The reason is that parameter entities perform a plain text
substitution whereas xi:include creates trees and combines them.

-->

<!DOCTYPE book [

<!-- Predefined character entities: html, latin1, MathML, ... -->
<!--  via 'http' ...
<!ENTITY % w3centities PUBLIC
         "-//W3C//ENTITIES Combined Set//EN//XML"
         "http://www.w3.org/2003/entities/2007/w3centities.ent"
>
-->

<!-- ... or by local installations -->
<!ENTITY % w3centities-f PUBLIC
         "-//W3C//ENTITIES Combined Set//EN//XML"
         "w3centities-f.ent"
>
%w3centities-f;

<!-- PG specific entities -->
<!ENTITY % version SYSTEM "version.sgml">
%version;
<!ENTITY % filelist SYSTEM "filelist.sgml">
%filelist;

<!--
Zero-width space.  Use this to allow line breaks at desirable places in
table cells, examples, etc. without causing an unwanted space when the
break is not needed in a wider output rendering.  The name is an abbreviation
for the publicly known entity 'ZeroWidthSpace'.
-->
<!ENTITY zwsp "&#x200B;">

]>

<book xmlns="http://docbook.org/ns/docbook"
      xmlns:xlink="http://www.w3.org/1999/xlink"
      xmlns:xi="http://www.w3.org/2001/XInclude"
      xmlns:m="http://www.w3.org/1998/Math/MathML"
      xml:lang="en"
      xml:id="postgres"
      version="5.2"
      >

EOT

      cat "$toFile" >>tmp.sgml
      cp tmp.sgml "$toFile"

    else
      # remove <dummy> tag: first (a comment), second, and last line
      sed -i -E -e '1,2d;$d' "$toFile"

    fi

  done # loop over all sgml files

  # remove the latest version of tmp.sgml (-f: it doesn't exist if no file was converted)
  rm -f tmp.sgml

  # ???  for some reason the attribute 'id' within some files is not converted  ????
  for file in ref/alter_conversion.sgml ref/create_conversion.sgml \
              ref/drop_conversion.sgml  keywords-table.sgml
  do
    if [[ -f $file ]]; then
      sed -i -E -e 's/ id=/ xml:id=/' "$file"
    fi
  done


  # In some PG versions there is a single xml-file. It is formatted differently and
  # needs only few changes. We do the conversion manually.
  if [[ -f $FromSgmlDir/standalone-install.xml ]]; then
    toFile=standalone-install.xml
    cp   "$FromSgmlDir/standalone-install.xml" "$toFile"
    sed -i -E -e '2d;s/ id=/ xml:id=/'          "$toFile"
  fi

fi


# -----  validate the result  ---------------------------
if [[ $doValidation == true ]]; then

  echo -e "\nValidation ...\n"
  # Validate the converted postgres.sgml against the RELAX NG schema located
  # in the target sgml directory; without that directory there is nothing
  # to validate.
  cd "$ToSgmlDir" || exit 1
  jing docbook.rng postgres.sgml

fi


# -----  report SGML differences  ---------------------------
if [[ $doSgmlDiff == true ]]; then

  echo -e "\nDifferences in SGML ...\n"
  cd "$ToSgmlDir" || exit 1

  for file1 in *.sgml ref/*.sgml; do
    # an unmatched glob pattern stays literal: nothing to compare
    [[ -f $file1 ]] || continue

    # file1 is a path relative to the sgml directory ('x.sgml' or 'ref/x.sgml');
    # the same relative path addresses the original file in the source tree.
    file2=$FromSgmlDir/$file1

    # headline shows the pure file name without the 'ref/' part
    echo -e '\n**********  '"${file1##*/}"'  **********'
    # use 'diff' (plus 'grep' to ignore or search for patterns)
    diff  --suppress-common-lines --width=140 "$file2" "$file1"  |\
      grep -E -v '(xml:id=|id=)'                             |\
      grep -E -v '(structname|structfield)'                  |\
      grep -E -v '(<ref | linkend=)'                         |\
      grep -E -v '(<xref|<ulink|</ulink|<link|</link)'

  done
fi


# -----  report HTML differences: single page  ---------------------------

if [[ $doHtmlDiffSingle == true ]]; then

  # directories receiving the lynx dumps of the old and the new HTML
  mkdir -p "$ToolDir/htmlOld" "$ToolDir/htmlNew"


  # 'diff' (plus 'grep' to ignore or search for patterns)
  echo -e "\nDifferences in HTML (single file) ...\n"

  # create html files and dump them with lynx;
  # 'make' must run in the respective sgml directory, so stop if 'cd' fails
  cd "$FromSgmlDir" || exit 1
  make postgres.html       # single page
  lynx -dump -width=10000  postgres.html >"$ToolDir/htmlOld/postgres.html.dump"

  cd "$ToSgmlDir" || exit 1
  make postgres.html       # single page
  lynx -dump -width=10000  postgres.html >"$ToolDir/htmlNew/postgres.html.dump"

  # lines containing 'file:' are not reported
  diff "$ToolDir/htmlOld/postgres.html.dump" "$ToolDir/htmlNew/postgres.html.dump" | \
    grep -E -v 'file:'

fi


# -----  report HTML differences: multiple pages  ---------------------------

if [[ $doHtmlDiffMultiple == true ]]; then

  # The dump directories are needed here as well; create them so that this
  # step also works when the single-page step is switched off.
  mkdir -p "$ToolDir/htmlOld" "$ToolDir/htmlNew"

  # 'diff' (plus 'grep' to ignore or search for patterns)
  echo -e "\nDifferences in HTML (multiple files) ...\n"

  # create html files; 'make' must run in the respective sgml directory
  cd "$FromSgmlDir" || exit 1
  make html STYLE=website  # multiple pages
  cd "$ToSgmlDir" || exit 1
  make html STYLE=website  # multiple pages

  # the list of pages is taken from the original tree
  cd "$FromSgmlDir/html" || exit 1

  for file1 in *.html; do
    # an unmatched glob pattern stays literal: nothing to compare
    [[ -f $file1 ]] || continue

    echo -e "\n**********  $file1  **********"
    lynx -dump -width=10000 "$FromSgmlDir/html/$file1" >"$ToolDir/htmlOld/$file1.dump"
    lynx -dump -width=10000 "$ToSgmlDir/html/$file1"   >"$ToolDir/htmlNew/$file1.dump"

    diff "$ToolDir/htmlOld/$file1.dump" "$ToolDir/htmlNew/$file1.dump"  | \
         grep -E -v -w '(link|ulink|file:)'
  done

fi

# -----  report differences: pure text  ---------------------------

if [[ $doTextDiff == true ]]; then

  echo -e "\nDifferences in pure text  ...\n"

  # create text files in both trees; 'make' must run in the respective
  # sgml directory, so stop if 'cd' fails
  cd "$FromSgmlDir" || exit 1
  make postgres.txt
  cd "$ToSgmlDir" || exit 1
  make postgres.txt
  diff "$FromSgmlDir/postgres.txt" "$ToSgmlDir/postgres.txt"

fi

# -----  report differences: man pages  ---------------------------

if [[ $doManDiff == true ]]; then

  # 'diff' (plus 'grep' to ignore or search for patterns)
  echo -e "\nDifferences in man pages  ...\n"

  # create man files in both trees; 'touch' and 'make' must run in the
  # respective sgml directory, so stop if 'cd' fails
  cd "$FromSgmlDir" || exit 1
  touch postgres.sgml # sometimes necessary
  make man
  cd "$ToSgmlDir" || exit 1
  touch postgres.sgml # sometimes necessary
  make man

  # the list of pages is taken from the converted tree (current directory)
  for file1 in man1/* man3/* man7/*; do
    # an unmatched glob pattern stays literal: nothing to compare
    [[ -e $file1 ]] || continue

    echo -e "\n**********  $file1  **********"
    diff "$FromSgmlDir/$file1" "$ToSgmlDir/$file1"
  done

fi


# --------------------
# final message, followed by one empty line
printf 'Done\n\n'
# --------------------


