? build.local.properties
? org/postgresql/core/UTF8Encoding.java
Index: org/postgresql/core/Encoding.java
===================================================================
RCS file: /usr/local/cvsroot/pgjdbc/pgjdbc/org/postgresql/core/Encoding.java,v
retrieving revision 1.20
diff -c -r1.20 Encoding.java
*** org/postgresql/core/Encoding.java 11 Jan 2005 08:25:43 -0000 1.20
--- org/postgresql/core/Encoding.java 29 Jun 2005 14:04:37 -0000
***************
*** 23,29 ****
*/
public class Encoding
{
-
private static final Encoding DEFAULT_ENCODING = new Encoding(null);
/*
--- 23,28 ----
***************
*** 36,41 ****
--- 35,41 ----
// encodings found in backend/util/mb/encnames.c
encodings.put("SQL_ASCII", new String[] { "ASCII", "us-ascii" });
encodings.put("UNICODE", new String[] { "UTF-8", "UTF8" });
+ encodings.put("UTF8", new String[] { "UTF-8", "UTF8" }); // 8.1's canonical name for UNICODE changed.
encodings.put("LATIN1", new String[] { "ISO8859_1" });
encodings.put("LATIN2", new String[] { "ISO8859_2" });
encodings.put("LATIN3", new String[] { "ISO8859_3" });
***************
*** 75,86 ****
}
private final String encoding;
- private final boolean utf8;
! private Encoding(String encoding)
{
this.encoding = encoding;
- this.utf8 = (encoding != null && (encoding.equals("UTF-8") || encoding.equals("UTF8")));
}
/**
--- 75,84 ----
}
private final String encoding;
! protected Encoding(String encoding)
{
this.encoding = encoding;
}
/**
***************
*** 93,99 ****
*/
public static Encoding getJVMEncoding(String jvmEncoding) {
if (isAvailable(jvmEncoding))
! return new Encoding(jvmEncoding);
else
return defaultEncoding();
}
--- 91,102 ----
*/
public static Encoding getJVMEncoding(String jvmEncoding) {
if (isAvailable(jvmEncoding))
! {
! if (jvmEncoding.equals("UTF-8") || jvmEncoding.equals("UTF8"))
! return new UTF8Encoding(jvmEncoding);
! else
! return new Encoding(jvmEncoding);
! }
else
return defaultEncoding();
}
***************
*** 175,183 ****
if (encoding == null)
return new String(encodedString, offset, length);
- if (utf8)
- return decodeUTF8(encodedString, offset, length);
-
return new String(encodedString, offset, length, encoding);
}
--- 178,183 ----
***************
*** 251,316 ****
}
}
- private char[] decoderArray = new char[1024];
-
- /**
- * Custom byte[] -> String conversion routine for UTF-8 only.
- * This is about 30% faster than using the String(byte[],int,int,String)
- * ctor, at least under JDK 1.4.2.
- *
- * @param data the array containing UTF8-encoded data
- * @param offset the offset of the first byte in data to decode from
- * @param length the number of bytes to decode
- * @return a decoded string
- * @throws IOException if something goes wrong
- */
- private synchronized String decodeUTF8(byte[] data, int offset, int length)
- throws IOException {
- char[] cdata = decoderArray;
- if (cdata.length < length)
- cdata = decoderArray = new char[length];
-
- int in = offset;
- int out = 0;
- int end = length + offset;
-
- try
- {
- while (in < end)
- {
- int ch = data[in++] & 0xff;
- if (ch < 0x80)
- {
- // Length 1: \u00000 .. \u0007f
- }
- else if (ch < 0xe0)
- {
- // Length 2: \u00080 .. \u007ff
- ch = ((ch & 0x1f) << 6);
- ch = ch | (data[in++] & 0x3f);
- }
- else
- {
- // Length 3: \u00800 .. \u0ffff
- ch = ((ch & 0x0f) << 12);
- ch = ch | ((data[in++] & 0x3f) << 6);
- ch = ch | (data[in++] & 0x3f);
- }
- cdata[out++] = (char)ch;
- }
- }
- catch (ArrayIndexOutOfBoundsException a)
- {
- throw new IOException("UTF-8 string representation was truncated");
- }
-
- // Check if we ran past the end without seeing an exception.
- if (in > end)
- throw new IOException("UTF-8 string representation was truncated");
-
- return new String(cdata, 0, out);
- }
-
public String toString() {
return (encoding == null ? "" : encoding);
}
--- 251,256 ----
Index: org/postgresql/test/jdbc2/DatabaseEncodingTest.java
===================================================================
RCS file: /usr/local/cvsroot/pgjdbc/pgjdbc/org/postgresql/test/jdbc2/DatabaseEncodingTest.java,v
retrieving revision 1.5
diff -c -r1.5 DatabaseEncodingTest.java
*** org/postgresql/test/jdbc2/DatabaseEncodingTest.java 11 Jan 2005 08:25:48 -0000 1.5
--- org/postgresql/test/jdbc2/DatabaseEncodingTest.java 29 Jun 2005 14:04:37 -0000
***************
*** 12,21 ****
import org.postgresql.test.TestUtil;
import junit.framework.TestCase;
import java.sql.*;
/*
! * Test case for Dario's encoding problems.
! * Ensure the driver's own utf-8 decode method works.
*/
public class DatabaseEncodingTest extends TestCase
{
--- 12,28 ----
import org.postgresql.test.TestUtil;
import junit.framework.TestCase;
import java.sql.*;
+ import org.postgresql.core.Encoding;
+ import org.postgresql.PGConnection;
+ import java.io.IOException;
+ import java.util.Arrays;
/*
! * Test case for various encoding problems.
! *
! * Ensure that we can do a round-trip of all server-supported unicode
! * values without trashing them, and that bad encodings are
! * detected.
*/
public class DatabaseEncodingTest extends TestCase
{
***************
*** 26,32 ****
super(name);
}
! private static final int STEP = 300;
// Set up the fixture for this testcase: a connection to a database with
// a table for this test.
--- 33,39 ----
super(name);
}
! private static final int STEP = 100;
// Set up the fixture for this testcase: a connection to a database with
// a table for this test.
***************
*** 68,84 ****
Statement stmt = con.createStatement();
ResultSet rs = stmt.executeQuery("SELECT getdatabaseencoding()");
assertTrue(rs.next());
! if (!"UNICODE".equals(rs.getString(1)))
{
rs.close();
return ; // not a UNICODE database.
}
rs.close();
// Create data.
! // NB: we only test up to d800 as code points above that are
! // reserved for surrogates in UTF-16
PreparedStatement insert = con.prepareStatement("INSERT INTO testdbencoding(unicode_ordinal, unicode_string) VALUES (?,?)");
for (int i = 1; i < 0xd800; i += STEP)
{
--- 75,95 ----
Statement stmt = con.createStatement();
ResultSet rs = stmt.executeQuery("SELECT getdatabaseencoding()");
assertTrue(rs.next());
!
! String dbEncoding = rs.getString(1);
! if (!dbEncoding.equals("UNICODE") && !dbEncoding.equals("UTF8"))
{
+ System.err.println("DatabaseEncodingTest: Skipping UNICODE database tests as test database encoding is " + dbEncoding);
rs.close();
return ; // not a UNICODE database.
}
rs.close();
+ boolean testHighUnicode = TestUtil.haveMinimumServerVersion(con, "8.1");
+
// Create data.
! // NB: we avoid d800-dfff as those are reserved for surrogates in UTF-16
PreparedStatement insert = con.prepareStatement("INSERT INTO testdbencoding(unicode_ordinal, unicode_string) VALUES (?,?)");
for (int i = 1; i < 0xd800; i += STEP)
{
***************
*** 94,102 ****
--- 105,149 ----
assertEquals(1, insert.executeUpdate());
}
+ for (int i = 0xe000; i < 0x10000; i += STEP)
+ {
+ int count = (i + STEP) > 0x10000 ? 0x10000 - i : STEP;
+ char[] testChars = new char[count];
+ for (int j = 0; j < count; ++j)
+ testChars[j] = (char)(i + j);
+
+ String testString = new String(testChars);
+
+ insert.setInt(1, i);
+ insert.setString(2, testString);
+ assertEquals(1, insert.executeUpdate());
+ }
+
+ if (testHighUnicode) {
+ for (int i = 0x10000; i < 0x110000; i += STEP)
+ {
+ int count = (i + STEP) > 0x110000 ? 0x110000 - i : STEP;
+ char[] testChars = new char[count*2];
+ for (int j = 0; j < count; ++j) {
+ testChars[j*2] = (char)(0xd800 + ((i + j - 0x10000) >> 10));
+ testChars[j*2+1] = (char)(0xdc00 + ((i + j - 0x10000) & 0x3ff));
+ }
+
+ String testString = new String(testChars);
+
+ insert.setInt(1, i);
+ insert.setString(2, testString);
+
+ //System.err.println("Inserting: " + dumpString(testString));
+
+ assertEquals(1, insert.executeUpdate());
+ }
+ }
+
con.commit();
// Check data.
+ stmt.setFetchSize(1);
rs = stmt.executeQuery("SELECT unicode_ordinal, unicode_string FROM testdbencoding ORDER BY unicode_ordinal");
for (int i = 1; i < 0xd800; i += STEP)
{
***************
*** 110,116 ****
String testString = new String(testChars);
! assertEquals(dumpString(testString), dumpString(rs.getString(2)));
}
}
}
--- 157,323 ----
String testString = new String(testChars);
! assertEquals("Test string: " + dumpString(testString), dumpString(testString), dumpString(rs.getString(2)));
! }
!
! for (int i = 0xe000; i < 0x10000; i += STEP)
! {
! assertTrue(rs.next());
! assertEquals(i, rs.getInt(1));
!
! int count = (i + STEP) > 0x10000 ? 0x10000 - i : STEP;
! char[] testChars = new char[count];
! for (int j = 0; j < count; ++j)
! testChars[j] = (char)(i + j);
!
! String testString = new String(testChars);
!
! assertEquals("Test string: " + dumpString(testString), dumpString(testString), dumpString(rs.getString(2)));
! }
!
! if (testHighUnicode) {
! for (int i = 0x10000; i < 0x110000; i += STEP)
! {
! assertTrue(rs.next());
! assertEquals(i, rs.getInt(1));
!
! int count = (i + STEP) > 0x110000 ? 0x110000 - i : STEP;
! char[] testChars = new char[count*2];
! for (int j = 0; j < count; ++j) {
! testChars[j*2] = (char)(0xd800 + ((i + j - 0x10000) >> 10));
! testChars[j*2+1] = (char)(0xdc00 + ((i + j - 0x10000) & 0x3ff));
! }
!
! String testString = new String(testChars);
!
! assertEquals("Test string: " + dumpString(testString), dumpString(testString), dumpString(rs.getString(2)));
! }
! }
! }
!
! public void testUTF8Decode() throws Exception {
! // Tests for our custom UTF-8 decoder.
!
! Encoding utf8Encoding = Encoding.getJVMEncoding("UTF-8");
!
! for (int ch = 0; ch < 0x110000; ++ch) {
! if (ch >= 0xd800 && ch < 0xe000)
! continue; // Surrogate range.
!
! String testString;
! if (ch >= 0x10000) {
! testString = new String(new char[] {
! (char) (0xd800 + ((ch-0x10000) >> 10)),
! (char) (0xdc00 + ((ch-0x10000) & 0x3ff)) });
! } else {
! testString = new String(new char[] { (char)ch });
! }
!
! byte[] jvmEncoding = testString.getBytes("UTF-8");
! String jvmDecoding = new String(jvmEncoding, 0, jvmEncoding.length, "UTF-8");
! String ourDecoding = utf8Encoding.decode(jvmEncoding, 0, jvmEncoding.length);
!
! assertEquals("Test string: " + dumpString(testString), dumpString(testString), dumpString(jvmDecoding));
! assertEquals("Test string: " + dumpString(testString), dumpString(testString), dumpString(ourDecoding));
! }
! }
!
! public void testBadUTF8Decode() throws Exception {
! Encoding utf8Encoding = Encoding.getJVMEncoding("UTF-8");
!
! byte[][] badSequences = new byte[][] {
! // One-byte illegal sequences
! { (byte)0x80 }, // First byte may not be 10xxxxxx
!
! // Two-byte illegal sequences
! { (byte)0xc0, (byte)0x00 }, // Second byte must be 10xxxxxx
! { (byte)0xc0, (byte)0x80 }, // Can't represent a value < 0x80
!
! // Three-byte illegal sequences
! { (byte)0xe0, (byte)0x00 }, // Second byte must be 10xxxxxx
! { (byte)0xe0, (byte)0x80, (byte)0x00 }, // Third byte must be 10xxxxxx
! { (byte)0xe0, (byte)0x80, (byte)0x80 }, // Can't represent a value < 0x800
! { (byte)0xed, (byte)0xa0, (byte)0x80 }, // Not allowed to encode the range d800..dfff
!
! // Four-byte illegal sequences
! { (byte)0xf0, (byte)0x00 }, // Second byte must be 10xxxxxx
! { (byte)0xf0, (byte)0x80, (byte)0x00 }, // Third byte must be 10xxxxxx
! { (byte)0xf0, (byte)0x80, (byte)0x80, (byte)0x00 }, // Fourth byte must be 10xxxxxx
! { (byte)0xf0, (byte)0x80, (byte)0x80, (byte)0x80 }, // Can't represent a value < 0x10000
!
! // Five-byte illegal sequences
! { (byte)0xf8 }, // Can't have a five-byte sequence.
!
! // Six-byte illegal sequences
! { (byte)0xfc }, // Can't have a six-byte sequence.
!
! // Seven-byte illegal sequences
! { (byte)0xfe }, // Can't have a seven-byte sequence.
!
! // Eight-byte illegal sequences
! { (byte)0xff }, // Can't have an eight-byte sequence.
! };
!
! byte[] paddedSequence = new byte[32];
! for (int i = 0; i < badSequences.length; ++i) {
! byte[] sequence = badSequences[i];
!
! try {
! String str = utf8Encoding.decode(sequence, 0, sequence.length);
! fail("Expected an IOException on sequence " + i + ", but decoded to <" + str + ">");
! } catch (IOException ioe) {
! // Expected exception.
! }
!
! // Try it with padding.
! Arrays.fill(paddedSequence, (byte)0);
! System.arraycopy(sequence, 0, paddedSequence, 0, sequence.length);
!
! try {
! String str = utf8Encoding.decode(paddedSequence, 0, paddedSequence.length);
! fail("Expected an IOException on sequence " + i + ", but decoded to <" + str + ">");
! } catch (IOException ioe) {
! // Expected exception.
! }
! }
! }
!
! public void testTruncatedUTF8Decode() throws Exception {
! Encoding utf8Encoding = Encoding.getJVMEncoding("UTF-8");
!
! byte[][] shortSequences = new byte[][] {
! { (byte)0xc0 }, // Second byte must be present
!
! { (byte)0xe0 }, // Second byte must be present
! { (byte)0xe0, (byte)0x80 }, // Third byte must be present
!
! { (byte)0xf0 }, // Second byte must be present
! { (byte)0xf0, (byte)0x80 }, // Third byte must be present
! { (byte)0xf0, (byte)0x80, (byte)0x80 }, // Fourth byte must be present
! };
!
! byte[] paddedSequence = new byte[32];
! for (int i = 0; i < shortSequences.length; ++i) {
! byte[] sequence = shortSequences[i];
!
! try {
! String str = utf8Encoding.decode(sequence, 0, sequence.length);
! fail("Expected an IOException on sequence " + i + ", but decoded to <" + str + ">");
! } catch (IOException ioe) {
! // Expected exception.
! }
!
!
! // Try it with padding and a truncated length.
! Arrays.fill(paddedSequence, (byte)0);
! System.arraycopy(sequence, 0, paddedSequence, 0, sequence.length);
!
! try {
! String str = utf8Encoding.decode(paddedSequence, 0, sequence.length);
! fail("Expected an IOException on sequence " + i + ", but decoded to <" + str + ">");
! } catch (IOException ioe) {
! // Expected exception.
! }
}
}
}
*** /dev/null Thu Jan 1 12:00:00 1970
--- org/postgresql/core/UTF8Encoding.java Thu Jun 30 02:03:28 2005
***************
*** 0 ****
--- 1,157 ----
+ package org.postgresql.core;
+
+ import java.io.IOException;
+ import org.postgresql.util.GT;
+
+ class UTF8Encoding extends Encoding {
+ UTF8Encoding(String jvmEncoding) {
+ super(jvmEncoding);
+ }
+
+ private static final int MIN_2_BYTES = 0x80;
+ private static final int MIN_3_BYTES = 0x800;
+ private static final int MIN_4_BYTES = 0x10000;
+ private static final int MAX_CODE_POINT = 0x10ffff;
+
+ private char[] decoderArray = new char[1024];
+
+ // helper for decode
+ private final static void checkByte(int ch, int pos, int len) throws IOException {
+ if ((ch & 0xc0) != 0x80)
+ throw new IOException(GT.tr("Illegal UTF-8 sequence: byte {0} of {1} byte sequence is not 10xxxxxx: {2}",
+ new Object[] { new Integer(pos), new Integer(len), new Integer(ch) }));
+ }
+
+ private final static void checkMinimal(int ch, int minValue) throws IOException {
+ if (ch >= minValue)
+ return;
+
+ int actualLen;
+ switch (minValue) {
+ case MIN_2_BYTES:
+ actualLen = 2;
+ break;
+ case MIN_3_BYTES:
+ actualLen = 3;
+ break;
+ case MIN_4_BYTES:
+ actualLen = 4;
+ break;
+ default:
+ throw new IllegalArgumentException("unexpected minValue passed to checkMinimal: " + minValue);
+ }
+
+ int expectedLen;
+ if (ch < MIN_2_BYTES)
+ expectedLen = 1;
+ else if (ch < MIN_3_BYTES)
+ expectedLen = 2;
+ else if (ch < MIN_4_BYTES)
+ expectedLen = 3;
+ else
+ throw new IllegalArgumentException("unexpected ch passed to checkMinimal: " + ch);
+
+ throw new IOException(GT.tr("Illegal UTF-8 sequence: {0} bytes used to encode a {1} byte value: {2}",
+ new Object[] { new Integer(actualLen), new Integer(expectedLen), new Integer(ch) }));
+ }
+
+ /**
+ * Custom byte[] -> String conversion routine for UTF-8 only.
+ * This is about twice as fast as using the String(byte[],int,int,String)
+ * ctor, at least under JDK 1.4.2. The extra checks for illegal representations
+ * add about 10-15% overhead, but they seem worth it given the number of SQL_ASCII
+ * databases out there.
+ *
+ * @param data the array containing UTF8-encoded data
+ * @param offset the offset of the first byte in data to decode from
+ * @param length the number of bytes to decode
+ * @return a decoded string
+ * @throws IOException if something goes wrong
+ */
+ public synchronized String decode(byte[] data, int offset, int length) throws IOException {
+ char[] cdata = decoderArray;
+ if (cdata.length < length)
+ cdata = decoderArray = new char[length];
+
+ int in = offset;
+ int out = 0;
+ int end = length + offset;
+
+ try
+ {
+ while (in < end)
+ {
+ int ch = data[in++] & 0xff;
+
+ // Convert UTF-8 to 21-bit codepoint.
+ if (ch < 0x80) {
+ // 0xxxxxxx -- length 1.
+ } else if (ch < 0xc0) {
+ // 10xxxxxx -- illegal!
+ throw new IOException(GT.tr("Illegal UTF-8 sequence: initial byte is {0}: {1}",
+ new Object[] { "10xxxxxx", new Integer(ch) }));
+ } else if (ch < 0xe0) {
+ // 110xxxxx 10xxxxxx
+ ch = ((ch & 0x1f) << 6);
+ checkByte(data[in], 2, 2);
+ ch = ch | (data[in++] & 0x3f);
+ checkMinimal(ch, MIN_2_BYTES);
+ } else if (ch < 0xf0) {
+ // 1110xxxx 10xxxxxx 10xxxxxx
+ ch = ((ch & 0x0f) << 12);
+ checkByte(data[in], 2, 3);
+ ch = ch | ((data[in++] & 0x3f) << 6);
+ checkByte(data[in], 3, 3);
+ ch = ch | (data[in++] & 0x3f);
+ checkMinimal(ch, MIN_3_BYTES);
+ } else if (ch < 0xf8) {
+ // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+ ch = ((ch & 0x07) << 18);
+ checkByte(data[in], 2, 4);
+ ch = ch | ((data[in++] & 0x3f) << 12);
+ checkByte(data[in], 3, 4);
+ ch = ch | ((data[in++] & 0x3f) << 6);
+ checkByte(data[in], 4, 4);
+ ch = ch | (data[in++] & 0x3f);
+ checkMinimal(ch, MIN_4_BYTES);
+ } else {
+ throw new IOException(GT.tr("Illegal UTF-8 sequence: initial byte is {0}: {1}",
+ new Object[] { "11111xxx", new Integer(ch) }));
+ }
+
+ if (ch > MAX_CODE_POINT)
+ throw new IOException(GT.tr("Illegal UTF-8 sequence: final value is out of range: {0}",
+ new Integer(ch)));
+
+ // Convert 21-bit codepoint to Java chars:
+ // 0..ffff are represented directly as a single char
+ // 10000..10ffff are represented as a "surrogate pair" of two chars
+ // See: http://java.sun.com/developer/technicalArticles/Intl/Supplementary/
+
+ if (ch > 0xffff) {
+ // Use a surrogate pair to represent it.
+ ch -= 0x10000; // ch is now 0..fffff (20 bits)
+ cdata[out++] = (char) (0xd800 + (ch >> 10)); // top 10 bits
+ cdata[out++] = (char) (0xdc00 + (ch & 0x3ff)); // bottom 10 bits
+ } else if (ch >= 0xd800 && ch < 0xe000) {
+ // Not allowed to encode the surrogate range directly.
+ throw new IOException(GT.tr("Illegal UTF-8 sequence: final value is a surrogate value: {0}",
+ new Integer(ch)));
+ } else {
+ // Normal case.
+ cdata[out++] = (char) ch;
+ }
+ }
+ }
+ catch (ArrayIndexOutOfBoundsException a)
+ {
+ throw new IOException("Illegal UTF-8 sequence: multibyte sequence was truncated");
+ }
+
+ // Check if we ran past the end without seeing an exception.
+ if (in > end)
+ throw new IOException("Illegal UTF-8 sequence: multibyte sequence was truncated");
+
+ return new String(cdata, 0, out);
+ }
+ }