Index: src/backend/commands/copy.c =================================================================== RCS file: /cvsroot/pgsql/src/backend/commands/copy.c,v retrieving revision 1.255 diff -c -c -r1.255 copy.c *** src/backend/commands/copy.c 22 Nov 2005 18:17:08 -0000 1.255 --- src/backend/commands/copy.c 27 Dec 2005 02:10:18 -0000 *************** *** 76,94 **** /* * This struct contains all the state variables used throughout a COPY ! * operation. For simplicity, we use the same struct for all variants ! * of COPY, even though some fields are used in only some cases. * ! * A word about encoding considerations: encodings that are only supported on ! * the client side are those where multibyte characters may have second or ! * later bytes with the high bit not set. When scanning data in such an ! * encoding to look for a match to a single-byte (ie ASCII) character, ! * we must use the full pg_encoding_mblen() machinery to skip over ! * multibyte characters, else we might find a false match to a trailing ! * byte. In supported server encodings, there is no possibility of ! * a false match, and it's faster to make useless comparisons to trailing ! * bytes than it is to invoke pg_encoding_mblen() to skip over them. ! * client_only_encoding is TRUE when we have to do it the hard way. */ typedef struct CopyStateData { --- 76,94 ---- /* * This struct contains all the state variables used throughout a COPY ! * operation. For simplicity, we use the same struct for all variants of COPY, ! * even though some fields are used in only some cases. * ! * Multi-byte encodings: all supported client-side encodings encode multi-byte ! * characters by having the first byte's high bit set. Subsequent bytes of the ! * character can have the high bit not set. When scanning data in such an ! * encoding to look for a match to a single-byte (ie ASCII) character, we must ! * use the full pg_encoding_mblen() machinery to skip over multibyte ! 
* characters, else we might find a false match to a trailing byte. In ! * supported server encodings, there is no possibility of a false match, and ! * it's faster to make useless comparisons to trailing bytes than it is to ! * invoke pg_encoding_mblen() to skip over them. encoding_embeds_ascii is TRUE ! * when we have to do it the hard way. */ typedef struct CopyStateData { *************** *** 101,107 **** EolType eol_type; /* EOL type of input */ int client_encoding; /* remote side's character encoding */ bool need_transcoding; /* client encoding diff from server? */ ! bool client_only_encoding; /* encoding not valid on server? */ /* parameters from the COPY command */ Relation rel; /* relation to copy to or from */ --- 101,107 ---- EolType eol_type; /* EOL type of input */ int client_encoding; /* remote side's character encoding */ bool need_transcoding; /* client encoding diff from server? */ ! bool encoding_embeds_ascii; /* ASCII can be non-first byte? */ /* parameters from the COPY command */ Relation rel; /* relation to copy to or from */ *************** *** 160,165 **** --- 160,230 ---- typedef CopyStateData *CopyState; + /* + * These macros centralize code used to process line_buf and raw_buf buffers. + * They are macros because they often do continue/break control and to avoid + * function call overhead in tight COPY loops. + * + * We must use "if (1)" because "do {} while(0)" overrides the continue/break + * processing. See http://www.cit.gu.edu.au/~anthony/info/C/C.macros. + */ + + /* + * This keeps the character read at the top of the loop in the buffer + * even if there is more than one read-ahead. 
+ */ + #define IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(extralen) \ + if (1) \ + { \ + if (raw_buf_ptr + (extralen) >= copy_buf_len && !hit_eof) \ + { \ + raw_buf_ptr = prev_raw_ptr; /* undo fetch */ \ + need_data = true; \ + continue; \ + } \ + } else + + + /* This consumes the remainder of the buffer and breaks */ + #define IF_NEED_REFILL_AND_EOF_BREAK(extralen) \ + if (1) \ + { \ + if (raw_buf_ptr + (extralen) >= copy_buf_len && hit_eof) \ + { \ + if (extralen) \ + raw_buf_ptr = copy_buf_len; /* consume the partial character */ \ + /* backslash just before EOF, treat as data char */ \ + result = true; \ + break; \ + } \ + } else + + + /* + * Transfer any approved data to line_buf; must do this to be sure + * there is some room in raw_buf. + */ + #define REFILL_LINEBUF \ + if (1) \ + { \ + if (raw_buf_ptr > cstate->raw_buf_index) \ + { \ + appendBinaryStringInfo(&cstate->line_buf, \ + cstate->raw_buf + cstate->raw_buf_index, \ + raw_buf_ptr - cstate->raw_buf_index); \ + cstate->raw_buf_index = raw_buf_ptr; \ + } \ + } else + + /* Undo any read-ahead and jump out of the block. */ + #define NO_END_OF_COPY_GOTO \ + if (1) \ + { \ + raw_buf_ptr = prev_raw_ptr + 1; \ + goto not_end_of_copy; \ + } else + + static const char BinarySignature[11] = "PGCOPY\n\377\r\n\0"; *************** *** 169,175 **** static void CopyFrom(CopyState cstate); static bool CopyReadLine(CopyState cstate); static bool CopyReadLineText(CopyState cstate); - static bool CopyReadLineCSV(CopyState cstate); static int CopyReadAttributesText(CopyState cstate, int maxfields, char **fieldvals); static int CopyReadAttributesCSV(CopyState cstate, int maxfields, --- 234,239 ---- *************** *** 940,946 **** /* Set up encoding conversion info */ cstate->client_encoding = pg_get_client_encoding(); cstate->need_transcoding = (cstate->client_encoding != GetDatabaseEncoding()); ! 
cstate->client_only_encoding = PG_ENCODING_IS_CLIENT_ONLY(cstate->client_encoding); cstate->copy_dest = COPY_FILE; /* default */ --- 1004,1011 ---- /* Set up encoding conversion info */ cstate->client_encoding = pg_get_client_encoding(); cstate->need_transcoding = (cstate->client_encoding != GetDatabaseEncoding()); ! /* See Multibyte encoding comment above */ ! cstate->encoding_embeds_ascii = PG_ENCODING_IS_CLIENT_ONLY(cstate->client_encoding); cstate->copy_dest = COPY_FILE; /* default */ *************** *** 1970,1979 **** cstate->line_buf_converted = false; /* Parse data and transfer into line_buf */ ! if (cstate->csv_mode) ! result = CopyReadLineCSV(cstate); ! else ! result = CopyReadLineText(cstate); if (result) { --- 2035,2041 ---- cstate->line_buf_converted = false; /* Parse data and transfer into line_buf */ ! result = CopyReadLineText(cstate); if (result) { *************** *** 2048,2089 **** } /* ! * CopyReadLineText - inner loop of CopyReadLine for non-CSV mode ! * ! * If you need to change this, better look at CopyReadLineCSV too */ static bool CopyReadLineText(CopyState cstate) { - bool result; char *copy_raw_buf; int raw_buf_ptr; int copy_buf_len; ! bool need_data; ! bool hit_eof; ! char s[2]; ! s[1] = 0; ! /* set default status */ ! result = false; /* * The objective of this loop is to transfer the entire next input line * into line_buf. Hence, we only care for detecting newlines (\r and/or * \n) and the end-of-copy marker (\.). * ! * For backwards compatibility we allow backslashes to escape newline ! * characters. Backslashes other than the end marker get put into the ! * line_buf, since CopyReadAttributesText does its own escape processing. * ! * These four characters, and only these four, are assumed the same in ! * frontend and backend encodings. * ! * For speed, we try to move data to line_buf in chunks rather than one ! * character at a time. raw_buf_ptr points to the next character to ! 
* examine; any characters from raw_buf_index to raw_buf_ptr have been ! * determined to be part of the line, but not yet transferred to line_buf. * * For a little extra speed within the loop, we copy raw_buf and * raw_buf_len into local variables. --- 2110,2162 ---- } /* ! * CopyReadLineText - inner loop of CopyReadLine for text mode */ static bool CopyReadLineText(CopyState cstate) { char *copy_raw_buf; int raw_buf_ptr; int copy_buf_len; ! bool need_data = false; ! bool hit_eof = false; ! bool result = false; ! char mblen_str[2]; ! /* CSV variables */ ! bool first_char_in_line = true; ! bool in_quote = false, ! last_was_esc = false; ! char quotec = '\0'; ! char escapec = '\0'; ! if (cstate->csv_mode) ! { ! quotec = cstate->quote[0]; ! escapec = cstate->escape[0]; ! /* ignore special escape processing if it's the same as quotec */ ! if (quotec == escapec) ! escapec = '\0'; ! } ! mblen_str[1] = '\0'; /* * The objective of this loop is to transfer the entire next input line * into line_buf. Hence, we only care for detecting newlines (\r and/or * \n) and the end-of-copy marker (\.). * ! * In CSV mode, \r and \n inside a quoted field are just part of the data ! * value and are put in line_buf. We keep just enough state to know if we ! * are currently in a quoted field or not. * ! * These four characters, and the CSV escape and quote characters, are ! * assumed the same in frontend and backend encodings. * ! * For speed, we try to move data from raw_buf to line_buf in chunks ! * rather than one character at a time. raw_buf_ptr points to the next ! * character to examine; any characters from raw_buf_index to raw_buf_ptr ! * have been determined to be part of the line, but not yet transferred ! * to line_buf. * * For a little extra speed within the loop, we copy raw_buf and * raw_buf_len into local variables. 
*************** *** 2091,2118 **** copy_raw_buf = cstate->raw_buf; raw_buf_ptr = cstate->raw_buf_index; copy_buf_len = cstate->raw_buf_len; - need_data = false; /* flag to force reading more data */ - hit_eof = false; /* flag indicating no more data available */ for (;;) { int prev_raw_ptr; char c; ! /* Load more data if needed */ if (raw_buf_ptr >= copy_buf_len || need_data) { ! /* ! * Transfer any approved data to line_buf; must do this to be sure ! * there is some room in raw_buf. ! */ ! if (raw_buf_ptr > cstate->raw_buf_index) ! { ! appendBinaryStringInfo(&cstate->line_buf, ! cstate->raw_buf + cstate->raw_buf_index, ! raw_buf_ptr - cstate->raw_buf_index); ! cstate->raw_buf_index = raw_buf_ptr; ! } /* * Try to read some more data. This will certainly reset --- 2164,2188 ---- copy_raw_buf = cstate->raw_buf; raw_buf_ptr = cstate->raw_buf_index; copy_buf_len = cstate->raw_buf_len; for (;;) { int prev_raw_ptr; char c; ! /* ! * Load more data if needed. Ideally we would just force four bytes ! * of read-ahead and avoid the many calls to ! * IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(), but the COPY_OLD_FE ! * protocol does not allow us to read too far ahead or we might ! * read into the next data, so we read-ahead only as far as we know ! * we can. One optimization would be to read-ahead four bytes here ! * if cstate->copy_dest != COPY_OLD_FE, but it hardly seems worth it, ! * considering the size of the buffer. ! */ if (raw_buf_ptr >= copy_buf_len || need_data) { ! REFILL_LINEBUF; /* * Try to read some more data. This will certainly reset *************** *** 2139,2472 **** prev_raw_ptr = raw_buf_ptr; c = copy_raw_buf[raw_buf_ptr++]; ! if (c == '\r') ! { ! /* Check for \r\n on first line, _and_ handle \r\n. */ ! if (cstate->eol_type == EOL_UNKNOWN || ! cstate->eol_type == EOL_CRNL) ! { ! /* ! * If need more data, go back to loop top to load it. ! * ! * Note that if we are at EOF, c will wind up as '\0' because ! * of the guaranteed pad of raw_buf. ! */ ! 
if (raw_buf_ptr >= copy_buf_len && !hit_eof) ! { ! raw_buf_ptr = prev_raw_ptr; /* undo fetch */ ! need_data = true; ! continue; ! } ! c = copy_raw_buf[raw_buf_ptr]; ! ! if (c == '\n') ! { ! raw_buf_ptr++; /* eat newline */ ! cstate->eol_type = EOL_CRNL; /* in case not set yet */ ! } ! else ! { ! /* found \r, but no \n */ ! if (cstate->eol_type == EOL_CRNL) ! ereport(ERROR, ! (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), ! errmsg("literal carriage return found in data"), ! errhint("Use \"\\r\" to represent carriage return."))); ! ! /* ! * if we got here, it is the first line and we didn't find ! * \n, so don't consume the peeked character ! */ ! cstate->eol_type = EOL_CR; ! } ! } ! else if (cstate->eol_type == EOL_NL) ! ereport(ERROR, ! (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), ! errmsg("literal carriage return found in data"), ! errhint("Use \"\\r\" to represent carriage return."))); ! /* If reach here, we have found the line terminator */ ! break; ! } ! ! if (c == '\n') ! { ! if (cstate->eol_type == EOL_CR || cstate->eol_type == EOL_CRNL) ! ereport(ERROR, ! (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), ! errmsg("literal newline found in data"), ! errhint("Use \"\\n\" to represent newline."))); ! cstate->eol_type = EOL_NL; /* in case not set yet */ ! /* If reach here, we have found the line terminator */ ! break; ! } ! ! if (c == '\\') { /* ! * If need more data, go back to loop top to load it. */ ! if (raw_buf_ptr >= copy_buf_len) { ! if (hit_eof) ! { ! /* backslash just before EOF, treat as data char */ ! result = true; ! break; ! } ! raw_buf_ptr = prev_raw_ptr; /* undo fetch */ ! need_data = true; ! continue; } /* ! * In non-CSV mode, backslash quotes the following character even ! * if it's a newline, so we always advance to next character */ ! c = copy_raw_buf[raw_buf_ptr++]; ! ! if (c == '.') ! { ! if (cstate->eol_type == EOL_CRNL) ! { ! if (raw_buf_ptr >= copy_buf_len && !hit_eof) ! { ! raw_buf_ptr = prev_raw_ptr; /* undo fetch */ ! need_data = true; ! continue; ! } ! 
/* if hit_eof, c will become '\0' */ ! c = copy_raw_buf[raw_buf_ptr++]; ! if (c == '\n') ! ereport(ERROR, ! (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), ! errmsg("end-of-copy marker does not match previous newline style"))); ! if (c != '\r') ! ereport(ERROR, ! (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), ! errmsg("end-of-copy marker corrupt"))); ! } ! if (raw_buf_ptr >= copy_buf_len && !hit_eof) ! { ! raw_buf_ptr = prev_raw_ptr; /* undo fetch */ ! need_data = true; ! continue; ! } ! /* if hit_eof, c will become '\0' */ ! c = copy_raw_buf[raw_buf_ptr++]; ! if (c != '\r' && c != '\n') ! ereport(ERROR, ! (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), ! errmsg("end-of-copy marker corrupt"))); ! if ((cstate->eol_type == EOL_NL && c != '\n') || ! (cstate->eol_type == EOL_CRNL && c != '\n') || ! (cstate->eol_type == EOL_CR && c != '\r')) ! ereport(ERROR, ! (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), ! errmsg("end-of-copy marker does not match previous newline style"))); ! ! /* ! * Transfer only the data before the \. into line_buf, then ! * discard the data and the \. sequence. ! */ ! if (prev_raw_ptr > cstate->raw_buf_index) ! appendBinaryStringInfo(&cstate->line_buf, ! cstate->raw_buf + cstate->raw_buf_index, ! prev_raw_ptr - cstate->raw_buf_index); ! cstate->raw_buf_index = raw_buf_ptr; ! result = true; /* report EOF */ ! break; ! } ! } ! ! /* ! * Do we need to be careful about trailing bytes of multibyte ! * characters? (See note above about client_only_encoding) ! * ! * We assume here that pg_encoding_mblen only looks at the first byte ! * of the character! ! */ ! if (cstate->client_only_encoding) ! { ! int mblen; ! ! s[0] = c; ! mblen = pg_encoding_mblen(cstate->client_encoding, s); ! if (raw_buf_ptr + (mblen - 1) > copy_buf_len) ! { ! if (hit_eof) ! { ! /* consume the partial character (conversion will fail) */ ! raw_buf_ptr = copy_buf_len; ! result = true; ! break; ! } ! raw_buf_ptr = prev_raw_ptr; /* undo fetch */ ! need_data = true; ! continue; ! } ! raw_buf_ptr += mblen - 1; ! } ! 
} /* end of outer loop */ ! ! /* ! * Transfer any still-uncopied data to line_buf. ! */ ! if (raw_buf_ptr > cstate->raw_buf_index) ! { ! appendBinaryStringInfo(&cstate->line_buf, ! cstate->raw_buf + cstate->raw_buf_index, ! raw_buf_ptr - cstate->raw_buf_index); ! cstate->raw_buf_index = raw_buf_ptr; ! } ! ! return result; ! } ! ! /* ! * CopyReadLineCSV - inner loop of CopyReadLine for CSV mode ! * ! * If you need to change this, better look at CopyReadLineText too ! */ ! static bool ! CopyReadLineCSV(CopyState cstate) ! { ! bool result; ! char *copy_raw_buf; ! int raw_buf_ptr; ! int copy_buf_len; ! bool need_data; ! bool hit_eof; ! char s[2]; ! bool in_quote = false, last_was_esc = false; - char quotec = cstate->quote[0]; - char escapec = cstate->escape[0]; - - /* ignore special escape processing if it's the same as quotec */ - if (quotec == escapec) - escapec = '\0'; - - s[1] = 0; - - /* set default status */ - result = false; - - /* - * The objective of this loop is to transfer the entire next input line - * into line_buf. Hence, we only care for detecting newlines (\r and/or - * \n) and the end-of-copy marker (\.). - * - * In CSV mode, \r and \n inside a quoted field are just part of the data - * value and are put in line_buf. We keep just enough state to know if we - * are currently in a quoted field or not. - * - * These four characters, and the CSV escape and quote characters, are - * assumed the same in frontend and backend encodings. - * - * For speed, we try to move data to line_buf in chunks rather than one - * character at a time. raw_buf_ptr points to the next character to - * examine; any characters from raw_buf_index to raw_buf_ptr have been - * determined to be part of the line, but not yet transferred to line_buf. - * - * For a little extra speed within the loop, we copy raw_buf and - * raw_buf_len into local variables. 
- */ - copy_raw_buf = cstate->raw_buf; - raw_buf_ptr = cstate->raw_buf_index; - copy_buf_len = cstate->raw_buf_len; - need_data = false; /* flag to force reading more data */ - hit_eof = false; /* flag indicating no more data available */ - - for (;;) - { - int prev_raw_ptr; - char c; - - /* Load more data if needed */ - if (raw_buf_ptr >= copy_buf_len || need_data) - { - /* - * Transfer any approved data to line_buf; must do this to be sure - * there is some room in raw_buf. - */ - if (raw_buf_ptr > cstate->raw_buf_index) - { - appendBinaryStringInfo(&cstate->line_buf, - cstate->raw_buf + cstate->raw_buf_index, - raw_buf_ptr - cstate->raw_buf_index); - cstate->raw_buf_index = raw_buf_ptr; - } - - /* - * Try to read some more data. This will certainly reset - * raw_buf_index to zero, and raw_buf_ptr must go with it. - */ - if (!CopyLoadRawBuf(cstate)) - hit_eof = true; - raw_buf_ptr = 0; - copy_buf_len = cstate->raw_buf_len; /* ! * If we are completely out of data, break out of the loop, ! * reporting EOF. */ ! if (copy_buf_len <= 0) ! { ! result = true; ! break; ! } ! need_data = false; ! } ! ! /* OK to fetch a character */ ! prev_raw_ptr = raw_buf_ptr; ! c = copy_raw_buf[raw_buf_ptr++]; ! ! /* ! * If character is '\\' or '\r', we may need to look ahead below. ! * Force fetch of the next character if we don't already have it. We ! * need to do this before changing CSV state, in case one of these ! * characters is also the quote or escape character. ! * ! * Note: old-protocol does not like forced prefetch, but it's OK here ! * since we cannot validly be at EOF. ! */ ! if (c == '\\' || c == '\r') ! { ! if (raw_buf_ptr >= copy_buf_len && !hit_eof) ! { ! raw_buf_ptr = prev_raw_ptr; /* undo fetch */ ! need_data = true; ! continue; ! } } ! /* ! * Dealing with quotes and escapes here is mildly tricky. If the quote ! * char is also the escape char, there's no problem - we just use the ! * char as a toggle. If they are different, we need to ensure that we ! 
* only take account of an escape inside a quoted field and ! * immediately preceding a quote char, and not the second in a ! * escape-escape sequence. ! */ ! if (in_quote && c == escapec) ! last_was_esc = !last_was_esc; ! if (c == quotec && !last_was_esc) ! in_quote = !in_quote; ! if (c != escapec) ! last_was_esc = false; ! ! /* ! * Updating the line count for embedded CR and/or LF chars is ! * necessarily a little fragile - this test is probably about the best ! * we can do. (XXX it's arguable whether we should do this at all --- ! * is cur_lineno a physical or logical count?) ! */ ! if (in_quote && c == (cstate->eol_type == EOL_NL ? '\n' : '\r')) ! cstate->cur_lineno++; ! ! if (c == '\r' && !in_quote) { /* Check for \r\n on first line, _and_ handle \r\n. */ if (cstate->eol_type == EOL_UNKNOWN || --- 2209,2257 ---- prev_raw_ptr = raw_buf_ptr; c = copy_raw_buf[raw_buf_ptr++]; ! if (cstate->csv_mode) { /* ! * If character is '\\' or '\r', we may need to look ahead below. ! * Force fetch of the next character if we don't already have it. We ! * need to do this before changing CSV state, in case one of these ! * characters is also the quote or escape character. ! * ! * Note: old-protocol does not like forced prefetch, but it's OK here ! * since we cannot validly be at EOF. */ ! if (c == '\\' || c == '\r') { ! IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0); } /* ! * Dealing with quotes and escapes here is mildly tricky. If the quote ! * char is also the escape char, there's no problem - we just use the ! * char as a toggle. If they are different, we need to ensure that we ! * only take account of an escape inside a quoted field and ! * immediately preceding a quote char, and not the second in a ! * escape-escape sequence. */ ! if (in_quote && c == escapec) ! last_was_esc = !last_was_esc; ! if (c == quotec && !last_was_esc) ! in_quote = !in_quote; ! if (c != escapec) last_was_esc = false; /* ! * Updating the line count for embedded CR and/or LF chars is ! 
* necessarily a little fragile - this test is probably about the best ! * we can do. (XXX it's arguable whether we should do this at all --- ! * is cur_lineno a physical or logical count?) */ ! if (in_quote && c == (cstate->eol_type == EOL_NL ? '\n' : '\r')) ! cstate->cur_lineno++; } ! /* Process \r */ ! if (c == '\r' && (!cstate->csv_mode || !in_quote)) { /* Check for \r\n on first line, _and_ handle \r\n. */ if (cstate->eol_type == EOL_UNKNOWN || *************** *** 2478,2489 **** * Note that if we are at EOF, c will wind up as '\0' because * of the guaranteed pad of raw_buf. */ ! if (raw_buf_ptr >= copy_buf_len && !hit_eof) ! { ! raw_buf_ptr = prev_raw_ptr; /* undo fetch */ ! need_data = true; ! continue; ! } c = copy_raw_buf[raw_buf_ptr]; if (c == '\n') --- 2263,2271 ---- * Note that if we are at EOF, c will wind up as '\0' because * of the guaranteed pad of raw_buf. */ ! IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0); ! ! /* get next char */ c = copy_raw_buf[raw_buf_ptr]; if (c == '\n') *************** *** 2497,2505 **** if (cstate->eol_type == EOL_CRNL) ereport(ERROR, (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), ! errmsg("unquoted carriage return found in data"), ! errhint("Use quoted CSV field to represent carriage return."))); ! /* * if we got here, it is the first line and we didn't find * \n, so don't consume the peeked character --- 2279,2290 ---- if (cstate->eol_type == EOL_CRNL) ereport(ERROR, (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), ! errmsg(!cstate->csv_mode ? ! "literal carriage return found in data" : ! "unquoted carriage return found in data"), ! errhint(!cstate->csv_mode ? ! "Use \"\\r\" to represent carriage return." : ! "Use quoted CSV field to represent carriage return."))); /* * if we got here, it is the first line and we didn't find * \n, so don't consume the peeked character *************** *** 2510,2559 **** else if (cstate->eol_type == EOL_NL) ereport(ERROR, (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), ! 
errmsg("unquoted carriage return found in CSV data"), ! errhint("Use quoted CSV field to represent carriage return."))); /* If reach here, we have found the line terminator */ break; } ! if (c == '\n' && !in_quote) { if (cstate->eol_type == EOL_CR || cstate->eol_type == EOL_CRNL) ereport(ERROR, (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), ! errmsg("unquoted newline found in data"), ! errhint("Use quoted CSV field to represent newline."))); cstate->eol_type = EOL_NL; /* in case not set yet */ /* If reach here, we have found the line terminator */ break; } /* ! * In CSV mode, we only recognize \. at start of line */ ! if (c == '\\' && cstate->line_buf.len == 0) { char c2; ! /* ! * If need more data, go back to loop top to load it. ! */ ! if (raw_buf_ptr >= copy_buf_len) ! { ! if (hit_eof) ! { ! /* backslash just before EOF, treat as data char */ ! result = true; ! break; ! } ! raw_buf_ptr = prev_raw_ptr; /* undo fetch */ ! need_data = true; ! continue; ! } ! /* ! * Note: we do not change c here since we aren't treating \ as ! * escaping the next character. */ c2 = copy_raw_buf[raw_buf_ptr]; --- 2295,2343 ---- else if (cstate->eol_type == EOL_NL) ereport(ERROR, (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), ! errmsg(!cstate->csv_mode ? ! "literal carriage return found in data" : ! "unquoted carriage return found in data"), ! errhint(!cstate->csv_mode ? ! "Use \"\\r\" to represent carriage return." : ! "Use quoted CSV field to represent carriage return."))); /* If reach here, we have found the line terminator */ break; } ! /* Process \n */ ! if (c == '\n' && (!cstate->csv_mode || !in_quote)) { if (cstate->eol_type == EOL_CR || cstate->eol_type == EOL_CRNL) ereport(ERROR, (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), ! errmsg(!cstate->csv_mode ? ! "literal newline found in data" : ! "unquoted newline found in data"), ! errhint(!cstate->csv_mode ? ! "Use \"\\n\" to represent newline." : ! 
"Use quoted CSV field to represent newline."))); cstate->eol_type = EOL_NL; /* in case not set yet */ /* If reach here, we have found the line terminator */ break; } /* ! * In CSV mode, we only recognize \. alone on a line. This is ! * because \. is a valid CSV data value. */ ! if (c == '\\' && (!cstate->csv_mode || first_char_in_line)) { char c2; ! IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0); ! IF_NEED_REFILL_AND_EOF_BREAK(0); ! /* ----- ! * get next character ! * Note: we do not change c so if it isn't \., we can fall ! * through and continue processing for client encoding. ! * ----- */ c2 = copy_raw_buf[raw_buf_ptr]; *************** *** 2568,2662 **** */ if (cstate->eol_type == EOL_CRNL) { ! if (raw_buf_ptr >= copy_buf_len && !hit_eof) ! { ! raw_buf_ptr = prev_raw_ptr; /* undo fetch */ ! need_data = true; ! continue; ! } /* if hit_eof, c2 will become '\0' */ c2 = copy_raw_buf[raw_buf_ptr++]; if (c2 == '\n') ! ereport(ERROR, ! (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), ! errmsg("end-of-copy marker does not match previous newline style"))); ! if (c2 != '\r') ! ereport(ERROR, ! (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), ! errmsg("end-of-copy marker corrupt"))); ! } ! if (raw_buf_ptr >= copy_buf_len && !hit_eof) ! { ! raw_buf_ptr = prev_raw_ptr; /* undo fetch */ ! need_data = true; ! continue; } /* if hit_eof, c2 will become '\0' */ c2 = copy_raw_buf[raw_buf_ptr++]; if (c2 != '\r' && c2 != '\n') ! ereport(ERROR, ! (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), ! errmsg("end-of-copy marker corrupt"))); if ((cstate->eol_type == EOL_NL && c2 != '\n') || (cstate->eol_type == EOL_CRNL && c2 != '\n') || (cstate->eol_type == EOL_CR && c2 != '\r')) ereport(ERROR, (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), errmsg("end-of-copy marker does not match previous newline style"))); /* * Transfer only the data before the \. into line_buf, then * discard the data and the \. sequence. */ if (prev_raw_ptr > cstate->raw_buf_index) ! 
appendBinaryStringInfo(&cstate->line_buf, cstate->raw_buf + cstate->raw_buf_index, prev_raw_ptr - cstate->raw_buf_index); cstate->raw_buf_index = raw_buf_ptr; result = true; /* report EOF */ break; } } /* ! * Do we need to be careful about trailing bytes of multibyte ! * characters? (See note above about client_only_encoding) * ! * We assume here that pg_encoding_mblen only looks at the first byte ! * of the character! */ ! if (cstate->client_only_encoding) { int mblen; ! s[0] = c; ! mblen = pg_encoding_mblen(cstate->client_encoding, s); ! if (raw_buf_ptr + (mblen - 1) > copy_buf_len) ! { ! if (hit_eof) ! { ! /* consume the partial character (will fail below) */ ! raw_buf_ptr = copy_buf_len; ! result = true; ! break; ! } ! raw_buf_ptr = prev_raw_ptr; /* undo fetch */ ! need_data = true; ! continue; ! } raw_buf_ptr += mblen - 1; } } /* end of outer loop */ /* * Transfer any still-uncopied data to line_buf. */ ! if (raw_buf_ptr > cstate->raw_buf_index) ! { ! appendBinaryStringInfo(&cstate->line_buf, ! cstate->raw_buf + cstate->raw_buf_index, ! raw_buf_ptr - cstate->raw_buf_index); ! cstate->raw_buf_index = raw_buf_ptr; ! } return result; } --- 2352,2466 ---- */ if (cstate->eol_type == EOL_CRNL) { ! /* Get the next character */ ! IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0); /* if hit_eof, c2 will become '\0' */ c2 = copy_raw_buf[raw_buf_ptr++]; + if (c2 == '\n') ! { ! if (!cstate->csv_mode) ! ereport(ERROR, ! (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), ! errmsg("end-of-copy marker does not match previous newline style"))); ! else ! NO_END_OF_COPY_GOTO; ! } ! else if (c2 != '\r') ! { ! if (!cstate->csv_mode) ! ereport(ERROR, ! (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), ! errmsg("end-of-copy marker corrupt"))); ! else ! NO_END_OF_COPY_GOTO; ! } } + + /* Get the next character */ + IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0); /* if hit_eof, c2 will become '\0' */ c2 = copy_raw_buf[raw_buf_ptr++]; + if (c2 != '\r' && c2 != '\n') ! { ! if (!cstate->csv_mode) ! ereport(ERROR, ! 
(errcode(ERRCODE_BAD_COPY_FILE_FORMAT), ! errmsg("end-of-copy marker corrupt"))); ! else ! NO_END_OF_COPY_GOTO; ! } ! if ((cstate->eol_type == EOL_NL && c2 != '\n') || (cstate->eol_type == EOL_CRNL && c2 != '\n') || (cstate->eol_type == EOL_CR && c2 != '\r')) + { ereport(ERROR, (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), errmsg("end-of-copy marker does not match previous newline style"))); + } /* * Transfer only the data before the \. into line_buf, then * discard the data and the \. sequence. */ if (prev_raw_ptr > cstate->raw_buf_index) ! appendBinaryStringInfo(&cstate->line_buf, ! cstate->raw_buf + cstate->raw_buf_index, prev_raw_ptr - cstate->raw_buf_index); cstate->raw_buf_index = raw_buf_ptr; result = true; /* report EOF */ break; } + else if (!cstate->csv_mode) + /* + * If we are here, it means we found a backslash followed by + * something other than a period. In non-CSV mode, anything + * after a backslash is special, so we skip over that second + * character too. If we didn't do that, \\. would be + * considered an end-of-copy marker, while in non-CSV mode it is a + * literal backslash followed by a period. In CSV mode, + * backslashes are not special, so we want to process the + * character after the backslash just like a normal character, + * so we don't increment in those cases. + */ + raw_buf_ptr++; } /* ! * This label is for CSV cases where \. appears at the start of a line, ! * but there is more text after it, meaning it was a data value. ! * We are more strict for \. in CSV mode because \. could be a data ! * value, while in non-CSV mode, \. cannot be a data value. ! */ ! not_end_of_copy: ! ! /* ! * Process all bytes of a multi-byte character as a group. * ! * We only support multi-byte sequences where the first byte ! * has the high-bit set, so as an optimization we can avoid ! * this block entirely if it is not set. */ ! if (cstate->encoding_embeds_ascii && IS_HIGHBIT_SET(c)) { int mblen; ! mblen_str[0] = c; ! 
/* All our encodings only read the first byte to get the length */ ! mblen = pg_encoding_mblen(cstate->client_encoding, mblen_str); ! IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(mblen - 1); ! IF_NEED_REFILL_AND_EOF_BREAK(mblen - 1); raw_buf_ptr += mblen - 1; } + first_char_in_line = false; } /* end of outer loop */ /* * Transfer any still-uncopied data to line_buf. */ ! REFILL_LINEBUF; return result; } *************** *** 3150,3156 **** * safe, because in valid backend encodings, extra bytes of a * multibyte character never look like ASCII. */ ! if (cstate->client_only_encoding) mblen = pg_encoding_mblen(cstate->client_encoding, string); CopySendData(cstate, string, mblen); break; --- 2954,2960 ---- * safe, because in valid backend encodings, extra bytes of a * multibyte character never look like ASCII. */ ! if (cstate->encoding_embeds_ascii && IS_HIGHBIT_SET(c)) mblen = pg_encoding_mblen(cstate->client_encoding, string); CopySendData(cstate, string, mblen); break; *************** *** 3196,3202 **** use_quote = true; break; } ! if (cstate->client_only_encoding) mblen = pg_encoding_mblen(cstate->client_encoding, tstring); else mblen = 1; --- 3000,3006 ---- use_quote = true; break; } ! if (cstate->encoding_embeds_ascii && IS_HIGHBIT_SET(c)) mblen = pg_encoding_mblen(cstate->client_encoding, tstring); else mblen = 1; *************** *** 3210,3216 **** { if (use_quote && (c == quotec || c == escapec)) CopySendChar(cstate, escapec); ! if (cstate->client_only_encoding) mblen = pg_encoding_mblen(cstate->client_encoding, string); else mblen = 1; --- 3014,3020 ---- { if (use_quote && (c == quotec || c == escapec)) CopySendChar(cstate, escapec); ! if (cstate->encoding_embeds_ascii && IS_HIGHBIT_SET(c)) mblen = pg_encoding_mblen(cstate->client_encoding, string); else mblen = 1;