Index: org/postgresql/core/Encoding.java
===================================================================
RCS file: /usr/local/cvsroot/pgjdbc/pgjdbc/org/postgresql/core/Encoding.java,v
retrieving revision 1.16
diff -u -c -r1.16 Encoding.java
*** org/postgresql/core/Encoding.java 17 Jul 2004 07:39:41 -0000 1.16
--- org/postgresql/core/Encoding.java 8 Aug 2004 23:00:50 -0000
***************
*** 261,268 ****
/**
* Custom byte[] -> String conversion routine for UTF-8 only.
! * This is about 30% faster than using the String(byte[],int,int,String)
! * ctor, at least under JDK 1.4.2.
*
* @param data the array containing UTF8-encoded data
* @param offset the offset of the first byte in data
to decode from
--- 261,270 ----
/**
* Custom byte[] -> String conversion routine for UTF-8 only.
! * This is about twice as fast as using the String(byte[],int,int,String)
! * ctor, at least under JDK 1.4.2. The extra checks for illegal representations
! * add about 10-15% overhead but seem worth it given the number of SQL_ASCII
! * databases out there.
*
* @param data the array containing UTF8-encoded data
* @param offset the offset of the first byte in data
to decode from
***************
*** 270,276 ****
* @return a decoded string
* @throws IOException if something goes wrong
*/
! private synchronized String decodeUTF8(byte[] data, int offset, int length) throws IOException {
char[] cdata = decoderArray;
if (cdata.length < length)
cdata = decoderArray = new char[length];
--- 272,278 ----
* @return a decoded string
* @throws IOException if something goes wrong
*/
! public synchronized String decodeUTF8(byte[] data, int offset, int length) throws IOException {
char[] cdata = decoderArray;
if (cdata.length < length)
cdata = decoderArray = new char[length];
***************
*** 282,309 ****
try {
while (in < end) {
int ch = data[in++] & 0xff;
if (ch < 0x80) {
! // Length 1: \u00000 .. \u0007f
} else if (ch < 0xe0) {
! // Length 2: \u00080 .. \u007ff
ch = ((ch & 0x1f) << 6);
ch = ch | (data[in++] & 0x3f);
! } else {
! // Length 3: \u00800 .. \u0ffff
ch = ((ch & 0x0f) << 12);
ch = ch | ((data[in++] & 0x3f) << 6);
ch = ch | (data[in++] & 0x3f);
}
- cdata[out++] = (char)ch;
}
} catch (ArrayIndexOutOfBoundsException a) {
! throw new IOException("UTF-8 string representation was truncated");
}
!
// Check if we ran past the end without seeing an exception.
if (in > end)
! throw new IOException("UTF-8 string representation was truncated");
!
return new String(cdata, 0, out);
}
--- 284,389 ----
try {
while (in < end) {
int ch = data[in++] & 0xff;
+
+ // Convert UTF-8 to 31-bit codepoint.
if (ch < 0x80) {
! // 0xxxxxxx -- length 1.
! } else if (ch < 0xc0) {
! // 10xxxxxx -- illegal!
! throw new IOException("Illegal UTF-8 input (initial byte is 10xxxxxx)");
} else if (ch < 0xe0) {
! // 110xxxxx 10xxxxxx
ch = ((ch & 0x1f) << 6);
+ if ((data[in] & 0xc0) != 0x80)
+ throw new IOException("Illegal UTF-8 input (byte 2 of 2 not 10xxxxxx)");
ch = ch | (data[in++] & 0x3f);
! } else if (ch < 0xf0) {
! // 1110xxxx 10xxxxxx 10xxxxxx
ch = ((ch & 0x0f) << 12);
+ if ((data[in] & 0xc0) != 0x80)
+ throw new IOException("Illegal UTF-8 input (byte 2 of 3 not 10xxxxxx)");
+ ch = ch | ((data[in++] & 0x3f) << 6);
+ if ((data[in] & 0xc0) != 0x80)
+ throw new IOException("Illegal UTF-8 input (byte 3 of 3 not 10xxxxxx)");
+ ch = ch | (data[in++] & 0x3f);
+ } else if (ch < 0xf8) {
+ // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+ ch = ((ch & 0x07) << 18);
+ if ((data[in] & 0xc0) != 0x80)
+ throw new IOException("Illegal UTF-8 input (byte 2 of 4 not 10xxxxxx)");
+ ch = ch | ((data[in++] & 0x3f) << 12);
+ if ((data[in] & 0xc0) != 0x80)
+ throw new IOException("Illegal UTF-8 input (byte 3 of 4 not 10xxxxxx)");
+ ch = ch | ((data[in++] & 0x3f) << 6);
+ if ((data[in] & 0xc0) != 0x80)
+ throw new IOException("Illegal UTF-8 input (byte 4 of 4 not 10xxxxxx)");
+ ch = ch | (data[in++] & 0x3f);
+ } else if (ch < 0xfc) {
+ // 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+ // nb: should never happen in theory, but might as well accept it anyway --
+ // perhaps something is generating non-minimal UTF-8 output.
+ ch = ((ch & 0x03) << 24);
+ if ((data[in] & 0xc0) != 0x80)
+ throw new IOException("Illegal UTF-8 input (byte 2 of 5 not 10xxxxxx)");
+ ch = ch | ((data[in++] & 0x3f) << 18);
+ if ((data[in] & 0xc0) != 0x80)
+ throw new IOException("Illegal UTF-8 input (byte 3 of 5 not 10xxxxxx)");
+ ch = ch | ((data[in++] & 0x3f) << 12);
+ if ((data[in] & 0xc0) != 0x80)
+ throw new IOException("Illegal UTF-8 input (byte 4 of 5 not 10xxxxxx)");
+ ch = ch | ((data[in++] & 0x3f) << 6);
+ if ((data[in] & 0xc0) != 0x80)
+ throw new IOException("Illegal UTF-8 input (byte 5 of 5 not 10xxxxxx)");
+ ch = ch | (data[in++] & 0x3f);
+ } else if (ch < 0xfe) {
+ // 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+ // nb: should never happen in theory, but might as well accept it anyway --
+ // perhaps something is generating non-minimal UTF-8 output.
+ ch = ((ch & 0x01) << 30);
+ if ((data[in] & 0xc0) != 0x80)
+ throw new IOException("Illegal UTF-8 input (byte 2 of 6 not 10xxxxxx)");
+ ch = ch | ((data[in++] & 0x3f) << 24);
+ if ((data[in] & 0xc0) != 0x80)
+ throw new IOException("Illegal UTF-8 input (byte 3 of 6 not 10xxxxxx)");
+ ch = ch | ((data[in++] & 0x3f) << 18);
+ if ((data[in] & 0xc0) != 0x80)
+ throw new IOException("Illegal UTF-8 input (byte 4 of 6 not 10xxxxxx)");
+ ch = ch | ((data[in++] & 0x3f) << 12);
+ if ((data[in] & 0xc0) != 0x80)
+ throw new IOException("Illegal UTF-8 input (byte 5 of 6 not 10xxxxxx)");
ch = ch | ((data[in++] & 0x3f) << 6);
+ if ((data[in] & 0xc0) != 0x80)
+ throw new IOException("Illegal UTF-8 input (byte 6 of 6 not 10xxxxxx)");
ch = ch | (data[in++] & 0x3f);
+ } else {
+ throw new IOException("Illegal UTF-8 input (initial byte is 1111111x)");
+ }
+
+ // Convert 31-bit codepoint to UTF-16
+ if (ch > 0x10ffff)
+ throw new IOException("Illegal UTF-8 input (final value out of range: " + ch + ")");
+
+ if (ch > 0xffff) {
+ // Use a surrogate pair to represent it.
+ ch -= 0x10000; // ch is now 0..fffff (20 bits)
+ cdata[out++] = (char) (0xd800 + (ch >> 10)); // top 10 bits
+ cdata[out++] = (char) (0xdc00 + (ch & 0x3ff)); // bottom 10 bits
+ } else if (ch >= 0xd800 && ch < 0xe000) {
+ // Not allowed to encode the surrogate range directly.
+ throw new IOException("Illegal UTF-8 input (final value is a surrogate value: " + ch + ")");
+ } else {
+ // Normal case.
+ cdata[out++] = (char) ch;
}
}
} catch (ArrayIndexOutOfBoundsException a) {
! throw new IOException("UTF-8 input was truncated");
}
!
// Check if we ran past the end without seeing an exception.
if (in > end)
! throw new IOException("UTF-8 input was truncated");
!
return new String(cdata, 0, out);
}
Index: org/postgresql/test/jdbc2/DatabaseEncodingTest.java
===================================================================
RCS file: /usr/local/cvsroot/pgjdbc/pgjdbc/org/postgresql/test/jdbc2/DatabaseEncodingTest.java,v
retrieving revision 1.2
diff -u -c -r1.2 DatabaseEncodingTest.java
*** org/postgresql/test/jdbc2/DatabaseEncodingTest.java 27 Jul 2004 05:03:04 -0000 1.2
--- org/postgresql/test/jdbc2/DatabaseEncodingTest.java 8 Aug 2004 23:00:50 -0000
***************
*** 1,16 ****
package org.postgresql.test.jdbc2;
import org.postgresql.test.TestUtil;
import junit.framework.TestCase;
import java.sql.*;
/*
! * Test case for Dario's encoding problems.
! * Ensure the driver's own utf-8 decode method works.
*/
public class DatabaseEncodingTest extends TestCase
{
private Connection con;
public DatabaseEncodingTest(String name)
{
--- 1,23 ----
package org.postgresql.test.jdbc2;
import org.postgresql.test.TestUtil;
+ import org.postgresql.core.Encoding;
+ import java.io.IOException;
+ import java.util.Arrays;
import junit.framework.TestCase;
import java.sql.*;
/*
! * Test case for various encoding problems.
! *
! * Ensure that we can do a round-trip of all server-supported unicode
! * values without trashing them, and that bad character encodings are
! * detected.
*/
public class DatabaseEncodingTest extends TestCase
{
private Connection con;
+ private final Encoding utf8Encoding = Encoding.getJVMEncoding("UTF-8");
public DatabaseEncodingTest(String name)
{
***************
*** 66,73 ****
rs.close();
// Create data.
! // NB: we only test up to d800 as code points above that are
! // reserved for surrogates in UTF-16
PreparedStatement insert = con.prepareStatement("INSERT INTO testdbencoding(unicode_ordinal, unicode_string) VALUES (?,?)");
for (int i = 1; i < 0xd800; i += STEP) {
int count = (i+STEP) > 0xd800 ? 0xd800-i : STEP;
--- 73,81 ----
rs.close();
// Create data.
! // NB: we avoid d800-dfff as that range is reserved for surrogates in UTF-16.
! // We also do not test codepoints at or above U+10000 as the server doesn't correctly
! // support them (yet).
PreparedStatement insert = con.prepareStatement("INSERT INTO testdbencoding(unicode_ordinal, unicode_string) VALUES (?,?)");
for (int i = 1; i < 0xd800; i += STEP) {
int count = (i+STEP) > 0xd800 ? 0xd800-i : STEP;
***************
*** 82,87 ****
--- 90,108 ----
assertEquals(1, insert.executeUpdate());
}
+ for (int i = 0xe000; i < 0x10000; i += STEP) {
+ int count = (i+STEP) > 0x10000 ? 0x10000-i : STEP;
+ char[] testChars = new char[count];
+ for (int j = 0; j < count; ++j)
+ testChars[j] = (char)(i+j);
+
+ String testString = new String(testChars);
+
+ insert.setInt(1, i);
+ insert.setString(2, testString);
+ assertEquals(1, insert.executeUpdate());
+ }
+
con.commit();
// Check data.
***************
*** 99,103 ****
--- 120,267 ----
assertEquals(dumpString(testString), dumpString(rs.getString(2)));
}
+
+ for (int i = 0xe000; i < 0x10000; i += STEP) {
+ assertTrue(rs.next());
+ assertEquals(i, rs.getInt(1));
+
+ int count = (i+STEP) > 0x10000 ? 0x10000-i : STEP;
+ char[] testChars = new char[count];
+ for (int j = 0; j < count; ++j)
+ testChars[j] = (char)(i+j);
+
+ String testString = new String(testChars);
+
+ assertEquals(dumpString(testString), dumpString(rs.getString(2)));
+ }
+ }
+
+ public void testUTF8Decode() throws Exception {
+ // Tests for our custom UTF-8 decoder.
+
+ for (int ch = 0; ch < 0x110000; ++ch) {
+ if (ch >= 0xd800 && ch < 0xe000)
+ continue; // Surrogate range.
+
+ String testString;
+ if (ch >= 0x10000) {
+ testString = new String(new char[] {
+ (char) (0xd800 + ((ch-0x10000) >> 10)),
+ (char) (0xdc00 + ((ch-0x10000) & 0x3ff)) });
+ } else {
+ testString = new String(new char[] { (char)ch });
+ }
+
+ byte[] jvmEncoding = testString.getBytes("UTF-8");
+ String jvmDecoding = new String(jvmEncoding, 0, jvmEncoding.length, "UTF-8");
+ String ourDecoding = utf8Encoding.decode(jvmEncoding, 0, jvmEncoding.length);
+
+ assertEquals(testString, jvmDecoding);
+ assertEquals(testString, ourDecoding);
+ }
+ }
+
+ public void testBadUTF8Decode() throws Exception {
+ byte[][] badSequences = new byte[][] {
+ // One-byte illegal sequences
+ { (byte)0x80 }, // First byte may not be 10xxxxxx
+
+ // Two-byte illegal sequences
+ { (byte)0xc0, (byte)0x00 }, // Second byte must be 10xxxxxx
+
+ // Three-byte illegal sequences
+ { (byte)0xe0, (byte)0x00 }, // Second byte must be 10xxxxxx
+ { (byte)0xe0, (byte)0x80, (byte)0x00 }, // Third byte must be 10xxxxxx
+ { (byte)0xed, (byte)0xa0, (byte)0x80 }, // Not allowed to encode the range d800..dfff
+
+ // Four-byte illegal sequences
+ { (byte)0xf0, (byte)0x00 }, // Second byte must be 10xxxxxx
+ { (byte)0xf0, (byte)0x80, (byte)0x00 }, // Third byte must be 10xxxxxx
+ { (byte)0xf0, (byte)0x80, (byte)0x80, (byte)0x00 }, // Fourth byte must be 10xxxxxx
+
+ // Five-byte illegal sequences
+ { (byte)0xf8, (byte)0x00 }, // Second byte must be 10xxxxxx
+ { (byte)0xf8, (byte)0x80, (byte)0x00 }, // Third byte must be 10xxxxxx
+ { (byte)0xf8, (byte)0x80, (byte)0x80, (byte)0x00 }, // Fourth byte must be 10xxxxxx
+ { (byte)0xf8, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x00 }, // Fifth byte must be 10xxxxxx
+ { (byte)0xf8, (byte)0x88, (byte)0x80, (byte)0x80, (byte)0x80 }, // Resulting value must be < U+110000
+
+ // Six-byte illegal sequences
+ { (byte)0xfc, (byte)0x00 }, // Second byte must be 10xxxxxx
+ { (byte)0xfc, (byte)0x80, (byte)0x00 }, // Third byte must be 10xxxxxx
+ { (byte)0xfc, (byte)0x80, (byte)0x80, (byte)0x00 }, // Fourth byte must be 10xxxxxx
+ { (byte)0xfc, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x00 }, // Fifth byte must be 10xxxxxx
+ { (byte)0xfc, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x00 }, // Sixth byte must be 10xxxxxx
+ { (byte)0xfc, (byte)0x80, (byte)0x88, (byte)0x80, (byte)0x80, (byte)0x80 }, // Resulting value must be < U+110000
+
+ // Seven-byte illegal sequences
+ { (byte)0xfe }, // Can't have a seven-byte sequence.
+
+ // Eight-byte illegal sequences
+ { (byte)0xff }, // Can't have an eight-byte sequence.
+ };
+
+ byte[] paddedSequence = new byte[32];
+ for (int i = 0; i < badSequences.length; ++i) {
+ byte[] sequence = badSequences[i];
+
+ try {
+ String str = utf8Encoding.decode(sequence, 0, sequence.length);
+ fail("Expected an IOException on sequence " + i + ", but decoded to <" + str + ">");
+ } catch (IOException ioe) {}
+
+ // Try it with padding.
+ Arrays.fill(paddedSequence, (byte)0);
+ System.arraycopy(sequence, 0, paddedSequence, 0, sequence.length);
+
+ try {
+ String str = utf8Encoding.decode(paddedSequence, 0, paddedSequence.length);
+ fail("Expected an IOException on sequence " + i + ", but decoded to <" + str + ">");
+ } catch (IOException ioe) {}
+ }
+ }
+
+ public void testTruncatedUTF8Decode() throws Exception {
+ byte[][] shortSequences = new byte[][] {
+ { (byte)0xc0 }, // Second byte must be present
+
+ { (byte)0xe0 }, // Second byte must be present
+ { (byte)0xe0, (byte)0x80 }, // Third byte must be present
+
+ { (byte)0xf0 }, // Second byte must be present
+ { (byte)0xf0, (byte)0x80 }, // Third byte must be present
+ { (byte)0xf0, (byte)0x80, (byte)0x80 }, // Fourth byte must be present
+
+ { (byte)0xfc }, // Second byte must be present
+ { (byte)0xfc, (byte)0x80 }, // Third byte must be present
+ { (byte)0xfc, (byte)0x80, (byte)0x80 }, // Fourth byte must be present
+ { (byte)0xfc, (byte)0x80, (byte)0x80, (byte)0x80 }, // Fifth byte must be present
+ { (byte)0xfc, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80 }, // Sixth byte must be present
+
+ { (byte)0xf8 }, // Second byte must be present
+ { (byte)0xf8, (byte)0x80 }, // Third byte must be present
+ { (byte)0xf8, (byte)0x80, (byte)0x80 }, // Fourth byte must be present
+ { (byte)0xf8, (byte)0x80, (byte)0x80, (byte)0x80 }, // Fifth byte must be present
+ };
+
+ byte[] paddedSequence = new byte[32];
+ for (int i = 0; i < shortSequences.length; ++i) {
+ byte[] sequence = shortSequences[i];
+
+ try {
+ String str = utf8Encoding.decode(sequence, 0, sequence.length);
+ fail("Expected an IOException on sequence " + i + ", but decoded to <" + str + ">");
+ } catch (IOException ioe) {}
+
+
+ // Try it with padding and a truncated length.
+ Arrays.fill(paddedSequence, (byte)0);
+ System.arraycopy(sequence, 0, paddedSequence, 0, sequence.length);
+
+ try {
+ String str = utf8Encoding.decode(paddedSequence, 0, sequence.length);
+ fail("Expected an IOException on sequence " + i + ", but decoded to <" + str + ">");
+ } catch (IOException ioe) {}
+ }
}
}