From 98f38aff440ad683aa1bd7a30fd960f1d0101191 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20V=C3=A9rit=C3=A9?= Date: Sun, 31 Dec 2023 14:47:05 +0100 Subject: [PATCH v4] Support backslash-dot on a line by itself as valid data in COPY FROM...CSV --- doc/src/sgml/ref/copy.sgml | 6 +-- doc/src/sgml/ref/psql-ref.sgml | 4 -- src/backend/commands/copyfromparse.c | 57 +++++++--------------------- src/bin/psql/copy.c | 32 +++++++++++----- src/test/regress/expected/copy.out | 18 +++++++++ src/test/regress/sql/copy.sql | 12 ++++++ 6 files changed, 67 insertions(+), 62 deletions(-) diff --git a/doc/src/sgml/ref/copy.sgml b/doc/src/sgml/ref/copy.sgml index 18ecc69c33..e719053e3d 100644 --- a/doc/src/sgml/ref/copy.sgml +++ b/doc/src/sgml/ref/copy.sgml @@ -761,11 +761,7 @@ COPY count format, \., the end-of-data marker, could also appear as a data value. To avoid any misinterpretation, a \. data value appearing as a lone entry on a line is automatically - quoted on output, and on input, if quoted, is not interpreted as the - end-of-data marker. If you are loading a file created by another - application that has a single unquoted column and might have a - value of \., you might need to quote that value in the - input file. + quoted on output. diff --git a/doc/src/sgml/ref/psql-ref.sgml b/doc/src/sgml/ref/psql-ref.sgml index cc7d797159..d94e3cacfc 100644 --- a/doc/src/sgml/ref/psql-ref.sgml +++ b/doc/src/sgml/ref/psql-ref.sgml @@ -1119,10 +1119,6 @@ INSERT INTO tbl1 VALUES ($1, $2) \bind 'first value' 'second value' \g destination, because all data must pass through the client/server connection. For large amounts of data the SQL command might be preferable. - Also, because of this pass-through method, \copy - ... from in CSV mode will erroneously - treat a \. data value alone on a line as an - end-of-input marker. diff --git a/src/backend/commands/copyfromparse.c b/src/backend/commands/copyfromparse.c index f553734582..67e046d450 100644 --- a/src/backend/commands/copyfromparse.c +++ b/src/backend/commands/copyfromparse.c @@ -136,14 +136,6 @@ if (1) \ } \ } else ((void) 0) -/* Undo any read-ahead and jump out of the block. */ -#define NO_END_OF_COPY_GOTO \ -if (1) \ -{ \ - input_buf_ptr = prev_raw_ptr + 1; \ - goto not_end_of_copy; \ -} else ((void) 0) - /* NOTE: there's a copy of this in copyto.c */ static const char BinarySignature[11] = "PGCOPY\n\377\r\n\0"; @@ -1137,7 +1129,6 @@ CopyReadLineText(CopyFromState cstate) bool result = false; /* CSV variables */ - bool first_char_in_line = true; bool in_quote = false, last_was_esc = false; char quotec = '\0'; @@ -1335,10 +1326,9 @@ CopyReadLineText(CopyFromState cstate) } /* - * In CSV mode, we only recognize \. alone on a line. This is because - * \. is a valid CSV data value. + * In CSV mode, backslash is a normal character. */ - if (c == '\\' && (!cstate->opts.csv_mode || first_char_in_line)) + if (c == '\\' && !cstate->opts.csv_mode) { char c2; @@ -1371,21 +1361,15 @@ CopyReadLineText(CopyFromState cstate) if (c2 == '\n') { - if (!cstate->opts.csv_mode) - ereport(ERROR, - (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), - errmsg("end-of-copy marker does not match previous newline style"))); - else - NO_END_OF_COPY_GOTO; + ereport(ERROR, + (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), + errmsg("end-of-copy marker does not match previous newline style"))); } else if (c2 != '\r') { - if (!cstate->opts.csv_mode) - ereport(ERROR, - (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), - errmsg("end-of-copy marker corrupt"))); - else - NO_END_OF_COPY_GOTO; + ereport(ERROR, + (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), + errmsg("end-of-copy marker corrupt"))); } } @@ -1396,12 +1380,9 @@ CopyReadLineText(CopyFromState cstate) if (c2 != '\r' && c2 != '\n') { - if (!cstate->opts.csv_mode) - ereport(ERROR, - (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), - errmsg("end-of-copy marker corrupt"))); - else - NO_END_OF_COPY_GOTO; + ereport(ERROR, + (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), + errmsg("end-of-copy marker corrupt"))); } if ((cstate->eol_type == EOL_NL && c2 != '\n') || @@ -1425,7 +1406,7 @@ CopyReadLineText(CopyFromState cstate) result = true; /* report EOF */ break; } - else if (!cstate->opts.csv_mode) + else { /* * If we are here, it means we found a backslash followed by @@ -1433,23 +1414,11 @@ CopyReadLineText(CopyFromState cstate) * after a backslash is special, so we skip over that second * character too. If we didn't do that \\. would be * considered an eof-of copy, while in non-CSV mode it is a - * literal backslash followed by a period. In CSV mode, - * backslashes are not special, so we want to process the - * character after the backslash just like a normal character, - * so we don't increment in those cases. + * literal backslash followed by a period. */ input_buf_ptr++; } } - - /* - * This label is for CSV cases where \. appears at the start of a - * line, but there is more text after it, meaning it was a data value. - * We are more strict for \. in CSV mode because \. could be a data - * value, while in non-CSV mode, \. cannot be a data value. - */ -not_end_of_copy: - first_char_in_line = false; } /* end of outer loop */ /* diff --git a/src/bin/psql/copy.c b/src/bin/psql/copy.c index dbbbdb8898..f31a7acbb6 100644 --- a/src/bin/psql/copy.c +++ b/src/bin/psql/copy.c @@ -620,20 +620,34 @@ handleCopyIn(PGconn *conn, FILE *copystream, bool isbinary, PGresult **res) /* current line is done? */ if (buf[buflen - 1] == '\n') { - /* check for EOF marker, but not on a partial line */ - if (at_line_begin) + /* + * When at the beginning of the line, check for EOF marker. + * If the marker is found and the data is inlined, + * we must stop at this point. If not, the \. line can be + * sent to the server, and we let it decide whether + * it's an EOF or not depending on the format: in + * basic TEXT, \. is going to be interpreted as an EOF, in + * CSV, it will not. + */ + if (at_line_begin && copystream == pset.cur_cmd_source) { - /* - * This code erroneously assumes '\.' on a line alone - * inside a quoted CSV string terminates the \copy. - * https://www.postgresql.org/message-id/E1TdNVQ-0001ju-GO@wrigleys.postgresql.org - * - * https://www.postgresql.org/message-id/bfcd57e4-8f23-4c3e-a5db-2571d09208e2@beta.fastmail.com - */ if ((linelen == 3 && memcmp(fgresult, "\\.\n", 3) == 0) || (linelen == 4 && memcmp(fgresult, "\\.\r\n", 4) == 0)) { copydone = true; + /* + * Remove the EOF marker from the data sent, unless + * using the obsolete v2 protocol that needs it. + * In the case of CSV, the EOF marker must be + * removed, otherwise it would be interpreted by + * the server as valid data. + */ + if (copystream == pset.cur_cmd_source && + PQprotocolVersion(conn) >= 3) + { + *fgresult='\0'; + buflen -= linelen; + } } } diff --git a/src/test/regress/expected/copy.out b/src/test/regress/expected/copy.out index b48365ec98..645e849bbd 100644 --- a/src/test/regress/expected/copy.out +++ b/src/test/regress/expected/copy.out @@ -32,6 +32,24 @@ select * from copytest except select * from copytest2; -------+------+-------- (0 rows) +--- test unquoted \. as data inside CSV +-- do not use copy out to export the data, as it would quote \. +\o :filename +\qecho line1 +\qecho '\\.' +\qecho line2 +\o +-- get the data back in with copy +truncate copytest2; +copy copytest2(test) from :'filename' csv; +select test from copytest2 order by test collate "C"; + test +------- + \. + line1 + line2 +(3 rows) + -- test header line feature create temp table copytest3 ( c1 int, diff --git a/src/test/regress/sql/copy.sql b/src/test/regress/sql/copy.sql index 43d2e906dd..68a58740cb 100644 --- a/src/test/regress/sql/copy.sql +++ b/src/test/regress/sql/copy.sql @@ -38,6 +38,18 @@ copy copytest2 from :'filename' csv quote '''' escape E'\\'; select * from copytest except select * from copytest2; +--- test unquoted \. as data inside CSV +-- do not use copy out to export the data, as it would quote \. +\o :filename +\qecho line1 +\qecho '\\.' +\qecho line2 +\o +-- get the data back in with copy +truncate copytest2; +copy copytest2(test) from :'filename' csv; +select test from copytest2 order by test collate "C"; + -- test header line feature -- 2.34.1