Re: BUG #2895: Private Use Unicode character crashes server when using ILIKE

From: Tom Lane <tgl(at)sss(dot)pgh(dot)pa(dot)us>
To: Michael Fuhr <mike(at)fuhr(dot)org>, James Russell <internationalhobo(at)gmail(dot)com>, pgsql-bugs(at)postgresql(dot)org
Subject: Re: BUG #2895: Private Use Unicode character crashes server when using ILIKE
Date: 2007-01-24 17:15:04
Message-ID: 2876.1169658904@sss.pgh.pa.us
Views: Raw Message | Whole Thread | Download mbox | Resend email
Thread:
Lists: pgsql-bugs

I wrote:
> Nonetheless, the code is certainly giving wrong answers for 4-byte
> characters. Will go fix...

I've applied the attached patch to the 8.1 branch, and related patches to
all other supported branches.

regards, tom lane

Index: wchar.c
===================================================================
RCS file: /cvsroot/pgsql/src/backend/utils/mb/wchar.c,v
retrieving revision 1.47.2.4
diff -c -r1.47.2.4 wchar.c
*** wchar.c 22 Aug 2006 12:11:38 -0000 1.47.2.4
--- wchar.c 24 Jan 2007 16:16:27 -0000
***************
*** 345,362 ****
}

/*
! * convert UTF8 string to pg_wchar (UCS-2)
! * caller should allocate enough space for "to"
* len: length of from.
* "from" not necessarily null terminated.
*/
static int
pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
{
- unsigned char c1,
- c2,
- c3;
int cnt = 0;

while (len > 0 && *from)
{
--- 345,363 ----
}

/*
! * convert UTF8 string to pg_wchar (UCS-4)
! * caller must allocate enough space for "to", including a trailing zero!
* len: length of from.
* "from" not necessarily null terminated.
*/
static int
pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
{
int cnt = 0;
+ uint32 c1,
+ c2,
+ c3,
+ c4;

while (len > 0 && *from)
{
***************
*** 365,390 ****
*to = *from++;
len--;
}
! else if ((*from & 0xe0) == 0xc0 && len >= 2)
{
c1 = *from++ & 0x1f;
c2 = *from++ & 0x3f;
! *to = c1 << 6;
! *to |= c2;
len -= 2;
}
! else if ((*from & 0xe0) == 0xe0 && len >= 3)
{
c1 = *from++ & 0x0f;
c2 = *from++ & 0x3f;
c3 = *from++ & 0x3f;
! *to = c1 << 12;
! *to |= c2 << 6;
! *to |= c3;
len -= 3;
}
else
{
*to = *from++;
len--;
}
--- 366,404 ----
*to = *from++;
len--;
}
! else if ((*from & 0xe0) == 0xc0)
{
+ if (len < 2)
+ break; /* drop trailing incomplete char */
c1 = *from++ & 0x1f;
c2 = *from++ & 0x3f;
! *to = (c1 << 6) | c2;
len -= 2;
}
! else if ((*from & 0xf0) == 0xe0)
{
+ if (len < 3)
+ break; /* drop trailing incomplete char */
c1 = *from++ & 0x0f;
c2 = *from++ & 0x3f;
c3 = *from++ & 0x3f;
! *to = (c1 << 12) | (c2 << 6) | c3;
len -= 3;
}
+ else if ((*from & 0xf8) == 0xf0)
+ {
+ if (len < 4)
+ break; /* drop trailing incomplete char */
+ c1 = *from++ & 0x07;
+ c2 = *from++ & 0x3f;
+ c3 = *from++ & 0x3f;
+ c4 = *from++ & 0x3f;
+ *to = (c1 << 18) | (c2 << 12) | (c3 << 6) | c4;
+ len -= 4;
+ }
else
{
+ /* treat a bogus char as length 1; not ours to raise error */
*to = *from++;
len--;
}
***************
*** 396,407 ****
}

/*
! * returns the byte length of a UTF8 character pointed to by s
*/
int
pg_utf_mblen(const unsigned char *s)
{
! int len = 1;

if ((*s & 0x80) == 0)
len = 1;
--- 410,429 ----
}

/*
! * Return the byte length of a UTF8 character pointed to by s
! *
! * Note: in the current implementation we do not support UTF8 sequences
! * of more than 4 bytes; hence do NOT return a value larger than 4.
! * We return "1" for any leading byte that is either flat-out illegal or
! * indicates a length larger than we support.
! *
! * pg_utf2wchar_with_len(), utf2ucs(), pg_utf8_islegal(), and perhaps
! * other places would need to be fixed to change this.
*/
int
pg_utf_mblen(const unsigned char *s)
{
! int len;

if ((*s & 0x80) == 0)
len = 1;
***************
*** 411,421 ****
len = 3;
else if ((*s & 0xf8) == 0xf0)
len = 4;
else if ((*s & 0xfc) == 0xf8)
len = 5;
else if ((*s & 0xfe) == 0xfc)
len = 6;
! return (len);
}

static int
--- 433,447 ----
len = 3;
else if ((*s & 0xf8) == 0xf0)
len = 4;
+ #ifdef NOT_USED
else if ((*s & 0xfc) == 0xf8)
len = 5;
else if ((*s & 0xfe) == 0xfc)
len = 6;
! #endif
! else
! len = 1;
! return len;
}

static int

In response to

Browse pgsql-bugs by date

  From Date Subject
Next Message Tom Lane 2007-01-24 17:24:33 Re: Function returns wrong data after datatype change
Previous Message James Becerra 2007-01-24 16:10:38 BUG #2929: Error opening 5432 port