? build.local.properties ? org/postgresql/core/UTF8Encoding.java Index: org/postgresql/core/Encoding.java =================================================================== RCS file: /usr/local/cvsroot/pgjdbc/pgjdbc/org/postgresql/core/Encoding.java,v retrieving revision 1.20 diff -c -r1.20 Encoding.java *** org/postgresql/core/Encoding.java 11 Jan 2005 08:25:43 -0000 1.20 --- org/postgresql/core/Encoding.java 29 Jun 2005 14:04:37 -0000 *************** *** 23,29 **** */ public class Encoding { - private static final Encoding DEFAULT_ENCODING = new Encoding(null); /* --- 23,28 ---- *************** *** 36,41 **** --- 35,41 ---- // encodings found in backend/util/mb/encnames.c encodings.put("SQL_ASCII", new String[] { "ASCII", "us-ascii" }); encodings.put("UNICODE", new String[] { "UTF-8", "UTF8" }); + encodings.put("UTF8", new String[] { "UTF-8", "UTF8" }); // 8.1's canonical name for UNICODE changed. encodings.put("LATIN1", new String[] { "ISO8859_1" }); encodings.put("LATIN2", new String[] { "ISO8859_2" }); encodings.put("LATIN3", new String[] { "ISO8859_3" }); *************** *** 75,86 **** } private final String encoding; - private final boolean utf8; ! private Encoding(String encoding) { this.encoding = encoding; - this.utf8 = (encoding != null && (encoding.equals("UTF-8") || encoding.equals("UTF8"))); } /** --- 75,84 ---- } private final String encoding; ! protected Encoding(String encoding) { this.encoding = encoding; } /** *************** *** 93,99 **** */ public static Encoding getJVMEncoding(String jvmEncoding) { if (isAvailable(jvmEncoding)) ! return new Encoding(jvmEncoding); else return defaultEncoding(); } --- 91,102 ---- */ public static Encoding getJVMEncoding(String jvmEncoding) { if (isAvailable(jvmEncoding)) ! { ! if (jvmEncoding.equals("UTF-8") || jvmEncoding.equals("UTF8")) ! return new UTF8Encoding(jvmEncoding); ! else ! return new Encoding(jvmEncoding); ! 
} else return defaultEncoding(); } *************** *** 175,183 **** if (encoding == null) return new String(encodedString, offset, length); - if (utf8) - return decodeUTF8(encodedString, offset, length); - return new String(encodedString, offset, length, encoding); } --- 178,183 ---- *************** *** 251,316 **** } } - private char[] decoderArray = new char[1024]; - - /** - * Custom byte[] -> String conversion routine for UTF-8 only. - * This is about 30% faster than using the String(byte[],int,int,String) - * ctor, at least under JDK 1.4.2. - * - * @param data the array containing UTF8-encoded data - * @param offset the offset of the first byte in data to decode from - * @param length the number of bytes to decode - * @return a decoded string - * @throws IOException if something goes wrong - */ - private synchronized String decodeUTF8(byte[] data, int offset, int length) - throws IOException { - char[] cdata = decoderArray; - if (cdata.length < length) - cdata = decoderArray = new char[length]; - - int in = offset; - int out = 0; - int end = length + offset; - - try - { - while (in < end) - { - int ch = data[in++] & 0xff; - if (ch < 0x80) - { - // Length 1: \u00000 .. \u0007f - } - else if (ch < 0xe0) - { - // Length 2: \u00080 .. \u007ff - ch = ((ch & 0x1f) << 6); - ch = ch | (data[in++] & 0x3f); - } - else - { - // Length 3: \u00800 .. \u0ffff - ch = ((ch & 0x0f) << 12); - ch = ch | ((data[in++] & 0x3f) << 6); - ch = ch | (data[in++] & 0x3f); - } - cdata[out++] = (char)ch; - } - } - catch (ArrayIndexOutOfBoundsException a) - { - throw new IOException("UTF-8 string representation was truncated"); - } - - // Check if we ran past the end without seeing an exception. - if (in > end) - throw new IOException("UTF-8 string representation was truncated"); - - return new String(cdata, 0, out); - } - public String toString() { return (encoding == null ? 
"" : encoding); } --- 251,256 ---- Index: org/postgresql/test/jdbc2/DatabaseEncodingTest.java =================================================================== RCS file: /usr/local/cvsroot/pgjdbc/pgjdbc/org/postgresql/test/jdbc2/DatabaseEncodingTest.java,v retrieving revision 1.5 diff -c -r1.5 DatabaseEncodingTest.java *** org/postgresql/test/jdbc2/DatabaseEncodingTest.java 11 Jan 2005 08:25:48 -0000 1.5 --- org/postgresql/test/jdbc2/DatabaseEncodingTest.java 29 Jun 2005 14:04:37 -0000 *************** *** 12,21 **** import org.postgresql.test.TestUtil; import junit.framework.TestCase; import java.sql.*; /* ! * Test case for Dario's encoding problems. ! * Ensure the driver's own utf-8 decode method works. */ public class DatabaseEncodingTest extends TestCase { --- 12,28 ---- import org.postgresql.test.TestUtil; import junit.framework.TestCase; import java.sql.*; + import org.postgresql.core.Encoding; + import org.postgresql.PGConnection; + import java.io.IOException; + import java.util.Arrays; /* ! * Test case for various encoding problems. ! * ! * Ensure that we can do a round-trip of all server-supported unicode ! * values without trashing them, and that bad encodings are ! * detected. */ public class DatabaseEncodingTest extends TestCase { *************** *** 26,32 **** super(name); } ! private static final int STEP = 300; // Set up the fixture for this testcase: a connection to a database with // a table for this test. --- 33,39 ---- super(name); } ! private static final int STEP = 100; // Set up the fixture for this testcase: a connection to a database with // a table for this test. *************** *** 68,84 **** Statement stmt = con.createStatement(); ResultSet rs = stmt.executeQuery("SELECT getdatabaseencoding()"); assertTrue(rs.next()); ! if (!"UNICODE".equals(rs.getString(1))) { rs.close(); return ; // not a UNICODE database. } rs.close(); // Create data. ! // NB: we only test up to d800 as code points above that are ! 
// reserved for surrogates in UTF-16 PreparedStatement insert = con.prepareStatement("INSERT INTO testdbencoding(unicode_ordinal, unicode_string) VALUES (?,?)"); for (int i = 1; i < 0xd800; i += STEP) { --- 75,95 ---- Statement stmt = con.createStatement(); ResultSet rs = stmt.executeQuery("SELECT getdatabaseencoding()"); assertTrue(rs.next()); ! ! String dbEncoding = rs.getString(1); ! if (!dbEncoding.equals("UNICODE") && !dbEncoding.equals("UTF8")) { + System.err.println("DatabaseEncodingTest: Skipping UNICODE database tests as test database encoding is " + dbEncoding); rs.close(); return ; // not a UNICODE database. } rs.close(); + boolean testHighUnicode = TestUtil.haveMinimumServerVersion(con, "8.1"); + // Create data. ! // NB: we avoid d800-dfff as those are reserved for surrogates in UTF-16 PreparedStatement insert = con.prepareStatement("INSERT INTO testdbencoding(unicode_ordinal, unicode_string) VALUES (?,?)"); for (int i = 1; i < 0xd800; i += STEP) { *************** *** 94,102 **** --- 105,149 ---- assertEquals(1, insert.executeUpdate()); } + for (int i = 0xe000; i < 0x10000; i += STEP) + { + int count = (i + STEP) > 0x10000 ? 0x10000 - i : STEP; + char[] testChars = new char[count]; + for (int j = 0; j < count; ++j) + testChars[j] = (char)(i + j); + + String testString = new String(testChars); + + insert.setInt(1, i); + insert.setString(2, testString); + assertEquals(1, insert.executeUpdate()); + } + + if (testHighUnicode) { + for (int i = 0x10000; i < 0x110000; i += STEP) + { + int count = (i + STEP) > 0x110000 ? 
0x110000 - i : STEP; + char[] testChars = new char[count*2]; + for (int j = 0; j < count; ++j) { + testChars[j*2] = (char)(0xd800 + ((i + j - 0x10000) >> 10)); + testChars[j*2+1] = (char)(0xdc00 + ((i + j - 0x10000) & 0x3ff)); + } + + String testString = new String(testChars); + + insert.setInt(1, i); + insert.setString(2, testString); + + //System.err.println("Inserting: " + dumpString(testString)); + + assertEquals(1, insert.executeUpdate()); + } + } + con.commit(); // Check data. + stmt.setFetchSize(1); rs = stmt.executeQuery("SELECT unicode_ordinal, unicode_string FROM testdbencoding ORDER BY unicode_ordinal"); for (int i = 1; i < 0xd800; i += STEP) { *************** *** 110,116 **** String testString = new String(testChars); ! assertEquals(dumpString(testString), dumpString(rs.getString(2))); } } } --- 157,323 ---- String testString = new String(testChars); ! assertEquals("Test string: " + dumpString(testString), dumpString(testString), dumpString(rs.getString(2))); ! } ! ! for (int i = 0xe000; i < 0x10000; i += STEP) ! { ! assertTrue(rs.next()); ! assertEquals(i, rs.getInt(1)); ! ! int count = (i + STEP) > 0x10000 ? 0x10000 - i : STEP; ! char[] testChars = new char[count]; ! for (int j = 0; j < count; ++j) ! testChars[j] = (char)(i + j); ! ! String testString = new String(testChars); ! ! assertEquals("Test string: " + dumpString(testString), dumpString(testString), dumpString(rs.getString(2))); ! } ! ! if (testHighUnicode) { ! for (int i = 0x10000; i < 0x110000; i += STEP) ! { ! assertTrue(rs.next()); ! assertEquals(i, rs.getInt(1)); ! ! int count = (i + STEP) > 0x110000 ? 0x110000 - i : STEP; ! char[] testChars = new char[count*2]; ! for (int j = 0; j < count; ++j) { ! testChars[j*2] = (char)(0xd800 + ((i + j - 0x10000) >> 10)); ! testChars[j*2+1] = (char)(0xdc00 + ((i + j - 0x10000) & 0x3ff)); ! } ! ! String testString = new String(testChars); ! ! assertEquals("Test string: " + dumpString(testString), dumpString(testString), dumpString(rs.getString(2))); ! 
} ! } ! } ! ! public void testUTF8Decode() throws Exception { ! // Tests for our custom UTF-8 decoder. ! ! Encoding utf8Encoding = Encoding.getJVMEncoding("UTF-8"); ! ! for (int ch = 0; ch < 0x110000; ++ch) { ! if (ch >= 0xd800 && ch < 0xe000) ! continue; // Surrogate range. ! ! String testString; ! if (ch >= 0x10000) { ! testString = new String(new char[] { ! (char) (0xd800 + ((ch-0x10000) >> 10)), ! (char) (0xdc00 + ((ch-0x10000) & 0x3ff)) }); ! } else { ! testString = new String(new char[] { (char)ch }); ! } ! ! byte[] jvmEncoding = testString.getBytes("UTF-8"); ! String jvmDecoding = new String(jvmEncoding, 0, jvmEncoding.length, "UTF-8"); ! String ourDecoding = utf8Encoding.decode(jvmEncoding, 0, jvmEncoding.length); ! ! assertEquals("Test string: " + dumpString(testString), dumpString(testString), dumpString(jvmDecoding)); ! assertEquals("Test string: " + dumpString(testString), dumpString(testString), dumpString(ourDecoding)); ! } ! } ! ! public void testBadUTF8Decode() throws Exception { ! Encoding utf8Encoding = Encoding.getJVMEncoding("UTF-8"); ! ! byte[][] badSequences = new byte[][] { ! // One-byte illegal sequences ! { (byte)0x80 }, // First byte may not be 10xxxxxx ! ! // Two-byte illegal sequences ! { (byte)0xc0, (byte)0x00 }, // Second byte must be 10xxxxxx ! { (byte)0xc0, (byte)0x80 }, // Can't represent a value < 0x80 ! ! // Three-byte illegal sequences ! { (byte)0xe0, (byte)0x00 }, // Second byte must be 10xxxxxx ! { (byte)0xe0, (byte)0x80, (byte)0x00 }, // Third byte must be 10xxxxxx ! { (byte)0xe0, (byte)0x80, (byte)0x80 }, // Can't represent a value < 0x800 ! { (byte)0xed, (byte)0xa0, (byte)0x80 }, // Not allowed to encode the range d800..dfff ! ! // Four-byte illegal sequences ! { (byte)0xf0, (byte)0x00 }, // Second byte must be 10xxxxxx ! { (byte)0xf0, (byte)0x80, (byte)0x00 }, // Third byte must be 10xxxxxx ! { (byte)0xf0, (byte)0x80, (byte)0x80, (byte)0x00 }, // Fourth byte must be 10xxxxxx ! 
{ (byte)0xf0, (byte)0x80, (byte)0x80, (byte)0x80 }, // Can't represent a value < 0x10000 ! ! // Five-byte illegal sequences ! { (byte)0xf8 }, // Can't have a five-byte sequence. ! ! // Six-byte illegal sequences ! { (byte)0xfc }, // Can't have a six-byte sequence. ! ! // Seven-byte illegal sequences ! { (byte)0xfe }, // Can't have a seven-byte sequence. ! ! // Eight-byte illegal sequences ! { (byte)0xff }, // Can't have an eight-byte sequence. ! }; ! ! byte[] paddedSequence = new byte[32]; ! for (int i = 0; i < badSequences.length; ++i) { ! byte[] sequence = badSequences[i]; ! ! try { ! String str = utf8Encoding.decode(sequence, 0, sequence.length); ! fail("Expected an IOException on sequence " + i + ", but decoded to <" + str + ">"); ! } catch (IOException ioe) { ! // Expected exception. ! } ! ! // Try it with padding. ! Arrays.fill(paddedSequence, (byte)0); ! System.arraycopy(sequence, 0, paddedSequence, 0, sequence.length); ! ! try { ! String str = utf8Encoding.decode(paddedSequence, 0, paddedSequence.length); ! fail("Expected an IOException on sequence " + i + ", but decoded to <" + str + ">"); ! } catch (IOException ioe) { ! // Expected exception. ! } ! } ! } ! ! public void testTruncatedUTF8Decode() throws Exception { ! Encoding utf8Encoding = Encoding.getJVMEncoding("UTF-8"); ! ! byte[][] shortSequences = new byte[][] { ! { (byte)0xc0 }, // Second byte must be present ! ! { (byte)0xe0 }, // Second byte must be present ! { (byte)0xe0, (byte)0x80 }, // Third byte must be present ! ! { (byte)0xf0 }, // Second byte must be present ! { (byte)0xf0, (byte)0x80 }, // Third byte must be present ! { (byte)0xf0, (byte)0x80, (byte)0x80 }, // Fourth byte must be present ! }; ! ! byte[] paddedSequence = new byte[32]; ! for (int i = 0; i < shortSequences.length; ++i) { ! byte[] sequence = shortSequences[i]; ! ! try { ! String str = utf8Encoding.decode(sequence, 0, sequence.length); ! fail("Expected an IOException on sequence " + i + ", but decoded to <" + str + ">"); ! 
} catch (IOException ioe) { ! // Expected exception. ! } ! ! ! // Try it with padding and a truncated length. ! Arrays.fill(paddedSequence, (byte)0); ! System.arraycopy(sequence, 0, paddedSequence, 0, sequence.length); ! ! try { ! String str = utf8Encoding.decode(paddedSequence, 0, sequence.length); ! fail("Expected an IOException on sequence " + i + ", but decoded to <" + str + ">"); ! } catch (IOException ioe) { ! // Expected exception. ! } } } } *** /dev/null Thu Jan 1 12:00:00 1970 --- org/postgresql/core/UTF8Encoding.java Thu Jun 30 02:03:28 2005 *************** *** 0 **** --- 1,157 ---- + package org.postgresql.core; + + import java.io.IOException; + import org.postgresql.util.GT; + + class UTF8Encoding extends Encoding { + UTF8Encoding(String jvmEncoding) { + super(jvmEncoding); + } + + private static final int MIN_2_BYTES = 0x80; + private static final int MIN_3_BYTES = 0x800; + private static final int MIN_4_BYTES = 0x10000; + private static final int MAX_CODE_POINT = 0x10ffff; + + private char[] decoderArray = new char[1024]; + + // helper for decode + private final static void checkByte(int ch, int pos, int len) throws IOException { + if ((ch & 0xc0) != 0x80) + throw new IOException(GT.tr("Illegal UTF-8 sequence: byte {0} of {1} byte sequence is not 10xxxxxx: {2}", + new Object[] { new Integer(pos), new Integer(len), new Integer(ch) })); + } + + private final static void checkMinimal(int ch, int minValue) throws IOException { + if (ch >= minValue) + return; + + int actualLen; + switch (minValue) { + case MIN_2_BYTES: + actualLen = 2; + break; + case MIN_3_BYTES: + actualLen = 3; + break; + case MIN_4_BYTES: + actualLen = 4; + break; + default: + throw new IllegalArgumentException("unexpected minValue passed to checkMinimal: " + minValue); + } + + int expectedLen; + if (ch < MIN_2_BYTES) + expectedLen = 1; + else if (ch < MIN_3_BYTES) + expectedLen = 2; + else if (ch < MIN_4_BYTES) + expectedLen = 3; + else + throw new IllegalArgumentException("unexpected 
ch passed to checkMinimal: " + ch); + + throw new IOException(GT.tr("Illegal UTF-8 sequence: {0} bytes used to encode a {1} byte value: {2}", + new Object[] { new Integer(actualLen), new Integer(expectedLen), new Integer(ch) })); + } + + /** + * Custom byte[] -> String conversion routine for UTF-8 only. + * This is about twice as fast as using the String(byte[],int,int,String) + * ctor, at least under JDK 1.4.2. The extra checks for illegal representations + * add about 10-15% overhead, but they seem worth it given the number of SQL_ASCII + * databases out there. + * + * @param data the array containing UTF8-encoded data + * @param offset the offset of the first byte in data to decode from + * @param length the number of bytes to decode + * @return a decoded string + * @throws IOException if something goes wrong + */ + public synchronized String decode(byte[] data, int offset, int length) throws IOException { + char[] cdata = decoderArray; + if (cdata.length < length) + cdata = decoderArray = new char[length]; + + int in = offset; + int out = 0; + int end = length + offset; + + try + { + while (in < end) + { + int ch = data[in++] & 0xff; + + // Convert UTF-8 to 21-bit codepoint. + if (ch < 0x80) { + // 0xxxxxxx -- length 1. + } else if (ch < 0xc0) { + // 10xxxxxx -- illegal! 
+ throw new IOException(GT.tr("Illegal UTF-8 sequence: initial byte is {0}: {1}", + new Object[] { "10xxxxxx", new Integer(ch) })); + } else if (ch < 0xe0) { + // 110xxxxx 10xxxxxx + ch = ((ch & 0x1f) << 6); + checkByte(data[in], 2, 2); + ch = ch | (data[in++] & 0x3f); + checkMinimal(ch, MIN_2_BYTES); + } else if (ch < 0xf0) { + // 1110xxxx 10xxxxxx 10xxxxxx + ch = ((ch & 0x0f) << 12); + checkByte(data[in], 2, 3); + ch = ch | ((data[in++] & 0x3f) << 6); + checkByte(data[in], 3, 3); + ch = ch | (data[in++] & 0x3f); + checkMinimal(ch, MIN_3_BYTES); + } else if (ch < 0xf8) { + // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + ch = ((ch & 0x07) << 18); + checkByte(data[in], 2, 4); + ch = ch | ((data[in++] & 0x3f) << 12); + checkByte(data[in], 3, 4); + ch = ch | ((data[in++] & 0x3f) << 6); + checkByte(data[in], 4, 4); + ch = ch | (data[in++] & 0x3f); + checkMinimal(ch, MIN_4_BYTES); + } else { + throw new IOException(GT.tr("Illegal UTF-8 sequence: initial byte is {0}: {1}", + new Object[] { "11111xxx", new Integer(ch) })); + } + + if (ch > MAX_CODE_POINT) + throw new IOException(GT.tr("Illegal UTF-8 sequence: final value is out of range: {0}", + new Integer(ch))); + + // Convert 21-bit codepoint to Java chars: + // 0..ffff are represented directly as a single char + // 10000..10ffff are represented as a "surrogate pair" of two chars + // See: http://java.sun.com/developer/technicalArticles/Intl/Supplementary/ + + if (ch > 0xffff) { + // Use a surrogate pair to represent it. + ch -= 0x10000; // ch is now 0..fffff (20 bits) + cdata[out++] = (char) (0xd800 + (ch >> 10)); // top 10 bits + cdata[out++] = (char) (0xdc00 + (ch & 0x3ff)); // bottom 10 bits + } else if (ch >= 0xd800 && ch < 0xe000) { + // Not allowed to encode the surrogate range directly. + throw new IOException(GT.tr("Illegal UTF-8 sequence: final value is a surrogate value: {0}", + new Integer(ch))); + } else { + // Normal case. 
+ cdata[out++] = (char) ch; + } + } + } + catch (ArrayIndexOutOfBoundsException a) + { + throw new IOException("Illegal UTF-8 sequence: multibyte sequence was truncated"); + } + + // Check if we ran past the end without seeing an exception. + if (in > end) + throw new IOException("Illegal UTF-8 sequence: multibyte sequence was truncated"); + + return new String(cdata, 0, out); + } + }