diff -c -r ../../pgbf/root/REL8_0_STABLE/pgsql/doc/src/sgml/ref/copy.sgml ./doc/src/sgml/ref/copy.sgml *** ../../pgbf/root/REL8_0_STABLE/pgsql/doc/src/sgml/ref/copy.sgml Mon Jan 3 19:39:53 2005 --- ./doc/src/sgml/ref/copy.sgml Sun Feb 20 19:18:54 2005 *************** *** 496,508 **** CSV mode will both recognize and produce CSV files with quoted values containing embedded carriage returns and line feeds. Thus ! the files are not strictly one line per table row like text-mode ! files. However, PostgreSQL will reject ! COPY input if any fields contain embedded line ! end character sequences that do not match the line ending ! convention used in the CSV file itself. It is generally safer to ! import data containing embedded line end characters using the ! text or binary formats rather than CSV. --- 496,503 ---- CSV mode will both recognize and produce CSV files with quoted values containing embedded carriage returns and line feeds. Thus ! the files are not strictly one line per table row as are text-mode ! files. *************** *** 513,518 **** --- 508,515 ---- might encounter some files that cannot be imported using this mechanism, and COPY might produce files that other programs cannot process. + It is generally safer to import data using the text or binary formats, + if possible, rather than using CSV format. diff -c -r ../../pgbf/root/REL8_0_STABLE/pgsql/src/backend/commands/copy.c ./src/backend/commands/copy.c *** ../../pgbf/root/REL8_0_STABLE/pgsql/src/backend/commands/copy.c Fri Dec 31 16:59:41 2004 --- ./src/backend/commands/copy.c Sun Feb 20 13:40:56 2005 *************** *** 98,104 **** static EolType eol_type; /* EOL type of input */ static int client_encoding; /* remote side's character encoding */ static int server_encoding; /* local encoding */ - static bool embedded_line_warning; /* these are just for error messages, see copy_in_error_callback */ static bool copy_binary; /* is it a binary copy? 
*/ --- 98,103 ---- *************** *** 140,145 **** --- 139,145 ---- char *delim, char *null_print, bool csv_mode, char *quote, char *escape, List *force_notnull_atts); static bool CopyReadLine(void); + static bool CopyReadLineCSV(char * quote, char * escape); static char *CopyReadAttribute(const char *delim, const char *null_print, CopyReadResult *result, bool *isnull); static char *CopyReadAttributeCSV(const char *delim, const char *null_print, *************** *** 1191,1197 **** attr = tupDesc->attrs; num_phys_attrs = tupDesc->natts; attr_count = list_length(attnumlist); - embedded_line_warning = false; /* * Get info about the columns we need to process. --- 1191,1196 ---- *************** *** 1718,1724 **** ListCell *cur; /* Actually read the line into memory here */ ! done = CopyReadLine(); /* * EOF at start of line means we're done. If we see EOF after --- 1717,1723 ---- ListCell *cur; /* Actually read the line into memory here */ ! done = csv_mode ? CopyReadLineCSV(quote, escape) : CopyReadLine(); /* * EOF at start of line means we're done. If we see EOF after *************** *** 2194,2199 **** --- 2193,2448 ---- return result; } + /* + * Read a line for CSV copy mode. Differences from standard mode: + * . CR and NL are not special inside quoted fields - they just get added + * to the buffer. + * . \ is not magical except as the start of the end of data marker.
+ * Returns true at EOF or after the end-of-copy marker, false otherwise. + */ + + static bool + CopyReadLineCSV(char * quote, char * escape) + { + bool result; + bool change_encoding = (client_encoding != server_encoding); + int c; + int mblen; + int j; + unsigned char s[2]; + char *cvt; + bool in_quote = false, last_was_esc = false; + char quotec = quote[0]; + char escapec = escape[0]; + + s[1] = 0; + + /* ignore special escape processing if it's the same as quote */ + if (quotec == escapec) + escapec = '\0'; + + /* reset line_buf to empty */ + line_buf.len = 0; + line_buf.data[0] = '\0'; + line_buf.cursor = 0; + + /* mark that encoding conversion hasn't occurred yet */ + line_buf_converted = false; + + /* set default status */ + result = false; + + /* + * In this loop we only care for detecting newlines (\r and/or \n) + * and the end-of-copy marker (\.). These four + * characters, and only these four, are assumed the same in frontend + * and backend encodings. We do not assume that second and later bytes + * of a frontend multibyte character couldn't look like ASCII characters. + * + * What about the encoding implications of the quote / escape chars? + * + * However, CR and NL characters that are inside a quoted field are + * not special, and are simply a part of the data value. The parsing rule + * used is a bit rough and ready, but probably adequate for our purposes. + */ + + for (;;) + { + c = CopyGetChar(); + if (c == EOF) + { + result = true; + break; + } + + /* + * Dealing with quotes and escapes here is mildly tricky. If the + * quote char is also the escape char, there's no problem - we + * just use the char as a toggle. If they are different, we need + * to ensure that we only take account of an escape inside a quoted + * field and immediately preceding a quote char, and not the + * second in an escape-escape sequence. + */ + + if (in_quote && c == escapec) + last_was_esc = ! last_was_esc; + + if (c == quotec && ! last_was_esc) + in_quote = !
in_quote; + + if (c != escapec) + last_was_esc = false; + + /* + * updating the line count for embedded CR and/or LF chars is + * necessarily a little fragile - this test is probably about + * the best we can do. + */ + if (in_quote && c == (eol_type == EOL_CR ? '\r' : '\n')) + copy_lineno++; + + if (!in_quote && c == '\r') + { + if (eol_type == EOL_NL) + ereport(ERROR, + (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), + errmsg("unquoted carriage return found in CSV data"), + errhint("Use quoted CSV field to represent carriage return."))); + /* Check for \r\n on first line, _and_ handle \r\n. */ + if (eol_type == EOL_UNKNOWN || eol_type == EOL_CRNL) + { + int c2 = CopyPeekChar(); + + if (c2 == '\n') + { + CopyDonePeek(c2, true); /* eat newline */ + eol_type = EOL_CRNL; + } + else + { + /* found \r, but no \n */ + if (eol_type == EOL_CRNL) + ereport(ERROR, + (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), + errmsg("unquoted carriage return found in CSV data"), + errhint("Use quoted CSV field to represent carriage return."))); + + /* + * if we got here, it is the first line and we didn't + * get \n, so put it back + */ + CopyDonePeek(c2, false); + eol_type = EOL_CR; + } + } + break; + } + if (!in_quote && c == '\n') + { + if (eol_type == EOL_CR || eol_type == EOL_CRNL) + ereport(ERROR, + (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), + errmsg("unquoted newline found in CSV data"), + errhint("Use quoted CSV field to represent newline."))); + eol_type = EOL_NL; + break; + } + + /* \ is only potentially magical at the start of a line */ + if (line_buf.len == 0 && c == '\\') + { + int c2 = CopyPeekChar(); + + if (c2 == EOF) + { + result = true; + + CopyDonePeek(c2, true); /* eat it - do we need to?
*/ + + break; + } + if (c2 == '.') + { + + CopyDonePeek(c2, true); /* so we can keep calling GetChar() */ + + if (eol_type == EOL_CRNL) + { + c = CopyGetChar(); + if (c == '\n') + ereport(ERROR, + (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), + errmsg("end-of-copy marker does not match previous newline style"))); + if (c != '\r') + ereport(ERROR, + (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), + errmsg("end-of-copy marker corrupt"))); + } + c = CopyGetChar(); + if (c != '\r' && c != '\n') + ereport(ERROR, + (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), + errmsg("end-of-copy marker corrupt"))); + if ((eol_type == EOL_NL && c != '\n') || + (eol_type == EOL_CRNL && c != '\n') || + (eol_type == EOL_CR && c != '\r')) + ereport(ERROR, + (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), + errmsg("end-of-copy marker does not match previous newline style"))); + + /* + * In protocol version 3, we should ignore anything + * after \. up to the protocol end of copy data. (XXX + * maybe better not to treat \. as special?) + */ + if (copy_dest == COPY_NEW_FE) + { + while (c != EOF) + c = CopyGetChar(); + } + result = true; /* report EOF */ + break; + } + + CopyDonePeek(c2, false); /* not a dot, so put it back */ + + } + + appendStringInfoCharMacro(&line_buf, c); + + /* + * When client encoding != server, must be careful to read the + * extra bytes of a multibyte character exactly, since the encoding + * might not ensure they don't look like ASCII. When the encodings + * are the same, we need not do this, since no server encoding we + * use has ASCII-like following bytes. + */ + if (change_encoding) + { + s[0] = c; + mblen = pg_encoding_mblen(client_encoding, s); + for (j = 1; j < mblen; j++) + { + c = CopyGetChar(); + if (c == EOF) + { + result = true; + break; + } + appendStringInfoCharMacro(&line_buf, c); + } + if (result) + break; /* out of outer loop */ + } + } /* end of outer loop */ + + /* + * Done reading the line. Convert it to server encoding.
+ * + * Note: set line_buf_converted to true *before* attempting conversion; + * this prevents infinite recursion during error reporting should + * pg_client_to_server() issue an error, due to copy_in_error_callback + * again attempting the same conversion. We'll end up issuing the message + * without conversion, which is bad but better than nothing ... + */ + line_buf_converted = true; + + if (change_encoding) + { + cvt = (char *) pg_client_to_server((unsigned char *) line_buf.data, + line_buf.len); + if (cvt != line_buf.data) + { + /* transfer converted data back to line_buf */ + line_buf.len = 0; + line_buf.data[0] = '\0'; + appendBinaryStringInfo(&line_buf, cvt, strlen(cvt)); + } + } + + return result; + } + /*---------- * Read the value of a single attribute, performing de-escaping as needed. * *************** *** 2369,2402 **** for (;;) { - /* handle multiline quoted fields */ - if (in_quote && line_buf.cursor >= line_buf.len) - { - bool done; - - switch (eol_type) - { - case EOL_NL: - appendStringInfoString(&attribute_buf, "\n"); - break; - case EOL_CR: - appendStringInfoString(&attribute_buf, "\r"); - break; - case EOL_CRNL: - appendStringInfoString(&attribute_buf, "\r\n"); - break; - case EOL_UNKNOWN: - /* shouldn't happen - just keep going */ - break; - } - - copy_lineno++; - done = CopyReadLine(); - if (done && line_buf.len == 0) - break; - start_cursor = line_buf.cursor; - } - end_cursor = line_buf.cursor; if (line_buf.cursor >= line_buf.len) break; --- 2618,2623 ---- *************** *** 2629,2653 **** !use_quote && (c = *test_string) != '\0'; test_string += mblen) { - /* - * We don't know here what the surrounding line end characters - * might be. It might not even be under postgres' control. So - * we simple warn on ANY embedded line ending character. - * - * This warning will disappear when we make line parsing field-aware, - * so that we can reliably read in embedded line ending characters - * regardless of the file's line-end context.
- * - */ - - if (!embedded_line_warning && (c == '\n' || c == '\r') ) - { - embedded_line_warning = true; - elog(WARNING, - "CSV fields with embedded linefeed or carriage return " - "characters might not be able to be reimported"); - } - if (c == delimc || c == quotec || c == '\n' || c == '\r') use_quote = true; if (!same_encoding) --- 2850,2855 ----