#!/usr/bin/perl 
#
# This is a generic hack to process CSV files.  This is limited to
# processing only one-liner files, i.e. those where no field has
# a newline within the data.
#

use Pg;
use Date::Parse;

# Ultra-crude options handling!
# 
#   load-csv.pl [options] dbname tablename
#
# Options can be:
#   --debug
#   --clean		Cleans table before loading records.
#   --fields=<fname>,<fname>,<fname>,...
#   --nfields <num> or -n <num> - number of fields.
#   --user <name>  or -u <name> - username for database
#

$debug = 0;
$clean = 0;
$num_fields = 0;
$named_fields = false;
$dbname = "";
$tablename = "";
$datestyle = "EURO";
$username = "$ENV{USER}";
for ( $i=0; $i<=$#ARGV; $i++ ) {
  print "$ARGV[$i]\n" if ( $debug );
  if ($ARGV[$i] eq "--debug") {
    $debug = 1;
  }
  elsif ($ARGV[$i] eq "--clean") {
    $clean = 1;
  }
  elsif ($ARGV[$i] eq "--mdy") {
    $datestyle = "US";
  }
  elsif ($ARGV[$i] eq "-u" || $ARGV[$i] eq "--user" ) {
    $username = "$ARGV[++$i]";
  }
  elsif ($ARGV[$i] eq "-n" || $ARGV[$i] eq "--nfields" ) {
    $num_fields = "$ARGV[++$i]";
  }
  elsif ($ARGV[$i] eq "--fields" ) {
    @fnames = split( /,/, $ARGV[++$i] );
    $num_fields = $#fnames + 1;
    $named_fields = true;
  }
  elsif ( $ARGV[$i] =~ /^-/ ) {
    print "Unknown option \"$ARGV[$i]\"\n";
  }
  else {
    $dbname = $ARGV[$i] if ( "$dbname" eq "" );
    $tablename = $ARGV[$i] if ( "$dbname" ne "" );
  }
}

print "$debug\n$clean\n$dbname\n$tablename\n$username\n$num_fields\n$named_fields\n"; # if ( $debug );
for ($i=0; $i<$num_fields; $i++) {
  print "\@fnames\[$i\] = " . @fnames[$i] . "\n" if ( $debug );
}

$conn = Pg::connectdb("dbname=$dbname user=$ENV{USER}");
die "Couldn\'t connect to database!" if ( $conn->status == PGRES_CONNECTION_BAD );

#if ( $num_fields <= 0 ) {
  $query = "SELECT attname FROM pg_attribute WHERE attrelid =";
  $query .= " (SELECT oid FROM pg_class WHERE relname='$tablename')";
  $query .= " AND attnum>0";
  $query .= " ORDER BY attnum; ";
  $result = $conn->exec( $query );
  $status = $result->resultStatus;
  print "$status \t$query\n" if ( $debug );
  die "ERROR: $query\n" . $conn->errorMessage . "\n" if ( $status >= PGRES_BAD_RESPONSE );
  # Get list of field names to be loaded.
  $num_fields = ($num_fields > 0 && $num_fields < $result->ntuples ? $num_fields : $result->ntuples);
  if ( $named_fields == false ) {
    for ($i=0; $i<$num_fields; $i++) {
      @fnames[$i] = $result->getvalue($i,0);
      print "\@fnames\[$i\] = " . @fnames[$i] . "\n"; # if ( $debug );
    }
  }
  print "$num_fields fields.\n" if ( $debug );
#}

$query = "SET DATESTYLE TO '$datestyle';\n";
$query .= "DELETE FROM $tablename;\n" if ( $clean );
$query .= "SELECT version();";
$result = $conn->exec( $query );
$status = $result->resultStatus;
# print "$status \t$query\n";
die "ERROR: $query\n" . $conn->errorMessage . "\n" if ( $status >= PGRES_BAD_RESPONSE );
$pg_version = $result->getvalue(0,0);

@blank_fields = ();
for ($i=0; $i<$num_fields; $i++) {
  @blank_fields[$i] = "";
}

sub clean_annoyances {

  my $fld = @_[0];
  print "unclean: $fld >>$debug<<\n" if ( $debug );

  # Trim leading and trailing whitespace
  $fld =~ s/^\s*(\S*)\s*$/\1/gm ;

  # Now replace the 0x0d with nothing;
  $fld =~ s/[\x0D]//gm ;

  # Replace the 0x91 and 0x92 characters with single-quotes
  $fld =~ s/[\x91\x92]/\'/gm ;

  # Replace the 0x93 and 0x94 characters with double-quotes
  $fld =~ s/[\x93\x94]/\"/gm ;

  # Replace the 0x96 with a dash
  $fld =~ s/[\x96]/-/gm ;

  # Replace the 0x85 with three dots
  $fld =~ s/[\x85]/.../gm ;

  # Replace the 0xA9 with &copy;
  $fld =~ s/[\xA9]/&copy;/gm ;

  # Replace the 0xAE with &reg;
  $fld =~ s/[\xAE]/&reg;/gm ;

  # Replace the &#61623; with a *
  $fld =~ s/&#61623;/*/gm ;

  # Replace the &#61485; with a *
  $fld =~ s/&#61485;/<br>/gm ;

  # Replace the &#39; with '
  $fld =~ s/&#39;/'/gm ;

  # Finally replace each ' with ''
  $fld =~ s/\'/\'\'/g ;
  print "cleaned: $fld\n" if ( $debug );

  return $fld;
}


sub write_record {
  $query = "INSERT INTO $tablename";
  $vquery = " VALUES( ";
  $query .= " (" if ( $named_fields != false );
  $vquery = ")$vquery" if ( $named_fields != false );
  for ( $i=0; $i<$num_fields; $i++ ) {
    $query  .= (( $i == 0 ? "" : ", ") . @fnames[$i])  if ( $named_fields != false );
    $fval = clean_annoyances(@_[$i]);
    $vquery .= ( $i == 0 ? "" : ", ") . ($fval eq "" ? "NULL" : "'$fval'" );
  }
  $query .= "$vquery );";
  $result = $conn->exec( $query );
  $status = $result->resultStatus;
  print STDERR "$status \t$query\n" if ( $debug );
  print STDERR "ERROR - line $line: $query\n" . $conn->errorMessage . "\n" if ( $status >= PGRES_BAD_RESPONSE );
}

$line = 0;
$state = "pre";
@fields = @blank_fields;
$i = 0;
$last_data = "";

# Rip the bits between the tags
while ( <STDIN> ) {

  do {
    print "$i $state: $_\n" if ( $debug );
    /,/ && do {
      if ( $state eq "pre" ) {
        @fields[$i++] = $last_data;
        $last_data = "";
        next;
      }
      if ( $state eq "inquote" ) {
        @fields[$i] .= ',';
        next;
      }
      if ( $state eq "outquote" ) {
        $state = "pre";
        $last_data = "";
        $i++;
      }
    };
    
    /"/ && do {                   # "
      if ( $state eq "pre" ) {
        $state = "inquote";
        @fields[$i] = "";
        next;
      }
      if ( $state eq "inquote" ) {
        $state = "outquote";
#        @fields[$i] .= $last_data;
        next;
      }
      if ( $state eq "outquote" ) {
        $state = "inquote";
        @fields[$i] .= '"';
        next;
      }
    };
    /[^\",]/ && do {
      @fields[$i] .= $_ if ( $state eq "inquote" );
    };
    $last_data = $_;

  } foreach split( /([,"])/igms, $_ );  # "

  print "$i $state: $last_data\n" if ( $debug );
  @fields[$i] = $last_data if ( $state eq "pre" && $last_data ne "" );
  print @fields[$i] . "\n" if ( $debug );

  if ( $state eq "inquote" ) {
  }
  else {
    $line++;
    $result = write_record( @fields );

    # Be prepared for the next record
	$state = "pre";
    @fields = @blank_fields;
    $i = 0;
    $last_data = "";
  }
}

