LIKE optimization in UTF-8 and locale-C

From: ITAGAKI Takahiro <itagaki(dot)takahiro(at)oss(dot)ntt(dot)co(dot)jp>
To: pgsql-hackers(at)postgresql(dot)org, pgsql-patches(at)postgresql(dot)org
Subject: LIKE optimization in UTF-8 and locale-C
Date: 2007-03-22 05:41:35
Message-ID: 20070322143734.62FA.ITAGAKI.TAKAHIRO@oss.ntt.co.jp
Views: Raw Message | Whole Thread | Download mbox | Resend email
Thread:
Lists: pgsql-hackers pgsql-patches

Hello,

I found LIKE operators are slower on multi-byte encoding databases
than single-byte encoding ones. It comes from difference between
MatchText() and MBMatchText().

We've had an optimization for single-byte encodings using
pg_database_encoding_max_length() == 1 test. I'll propose to extend it
in UTF-8 with locale-C case. All of trailing bytes are different from first
bytes in UTF-8 multi-byte characters, so we can use functions for single-bytes
and byte-wise comparison in the case. With the attached patch, the performance
of UTF-8 LIKE operators are pushed up to near other single-bytes encodings.

Databases initialized with locale-C are widely used in Japan, because
Japanese locale are broken in almost of platforms. Japanese user can
choose EUC-jp or UTF-8 as a server encoding, but I think UTF-8 will be
more and more used in the future.

---- test ----

initdb --no-locale --encoding=<encoding>

[HEAD]
SQL_ASCII : 7171 ms / 7203 ms / 7187 ms
LATIN1 : 7172 ms / 7156 ms / 7141 ms
UTF8 : 16235 ms / 16281 ms / 16281 ms
EUC_JP : 17454 ms / 17453 ms / 17438 ms

[with patch]
SQL_ASCII : 7062 ms / 7125 ms / 7125 ms
LATIN1 : 7047 ms / 7063 ms / 7047 ms
UTF8 : 7188 ms / 7234 ms / 7235 ms
EUC_JP : 17468 ms / 17453 ms / 17453 ms

CREATE OR REPLACE FUNCTION test() RETURNS INTEGER AS $$
DECLARE
cnt integer;
BEGIN
FOR i IN 1..1000 LOOP
SELECT count(*) INTO cnt FROM item WHERE i_title LIKE '%BABABABABARIBA%' LIMIT 50;
END LOOP;
RETURN cnt;
END;
$$ LANGUAGE plpgsql;

SELECT count(*) FROM item; -- borrowed from DBT-1 (TPC-W)
count
-------
10000
(1 row)

---- patch ----

Index: src/backend/utils/adt/like.c
===================================================================
--- src/backend/utils/adt/like.c (head)
+++ src/backend/utils/adt/like.c (working copy)
@@ -21,6 +21,7 @@

#include "mb/pg_wchar.h"
#include "utils/builtins.h"
+#include "utils/pg_locale.h"


#define LIKE_TRUE 1
@@ -119,6 +120,13 @@


/*
+ * true iff match functions for single-byte characters are available.
+ */
+#define sb_match_available() \
+ (pg_database_encoding_max_length() == 1 || \
+ (lc_collate_is_c() && GetDatabaseEncoding() == PG_UTF8))
+
+/*
* interface routines called by the function manager
*/

@@ -138,7 +146,7 @@
p = VARDATA(pat);
plen = (VARSIZE(pat) - VARHDRSZ);

- if (pg_database_encoding_max_length() == 1)
+ if (sb_match_available())
result = (MatchText(s, slen, p, plen) == LIKE_TRUE);
else
result = (MBMatchText(s, slen, p, plen) == LIKE_TRUE);
@@ -162,7 +170,7 @@
p = VARDATA(pat);
plen = (VARSIZE(pat) - VARHDRSZ);

- if (pg_database_encoding_max_length() == 1)
+ if (sb_match_available())
result = (MatchText(s, slen, p, plen) != LIKE_TRUE);
else
result = (MBMatchText(s, slen, p, plen) != LIKE_TRUE);
@@ -186,7 +194,7 @@
p = VARDATA(pat);
plen = (VARSIZE(pat) - VARHDRSZ);

- if (pg_database_encoding_max_length() == 1)
+ if (sb_match_available())
result = (MatchText(s, slen, p, plen) == LIKE_TRUE);
else
result = (MBMatchText(s, slen, p, plen) == LIKE_TRUE);
@@ -210,7 +218,7 @@
p = VARDATA(pat);
plen = (VARSIZE(pat) - VARHDRSZ);

- if (pg_database_encoding_max_length() == 1)
+ if (sb_match_available())
result = (MatchText(s, slen, p, plen) != LIKE_TRUE);
else
result = (MBMatchText(s, slen, p, plen) != LIKE_TRUE);
@@ -275,7 +283,7 @@
int slen,
plen;

- if (pg_database_encoding_max_length() == 1)
+ if (sb_match_available())
{
s = NameStr(*str);
slen = strlen(s);
@@ -316,7 +324,7 @@
int slen,
plen;

- if (pg_database_encoding_max_length() == 1)
+ if (sb_match_available())
{
s = NameStr(*str);
slen = strlen(s);
@@ -357,7 +365,7 @@
int slen,
plen;

- if (pg_database_encoding_max_length() == 1)
+ if (sb_match_available())
{
s = VARDATA(str);
slen = (VARSIZE(str) - VARHDRSZ);
@@ -393,7 +401,7 @@
int slen,
plen;

- if (pg_database_encoding_max_length() == 1)
+ if (sb_match_available())
{
s = VARDATA(str);
slen = (VARSIZE(str) - VARHDRSZ);
@@ -429,7 +437,7 @@
text *esc = PG_GETARG_TEXT_P(1);
text *result;

- if (pg_database_encoding_max_length() == 1)
+ if (sb_match_available())
result = do_like_escape(pat, esc);
else
result = MB_do_like_escape(pat, esc);

Regards,
---
ITAGAKI Takahiro
NTT Open Source Software Center

Responses

Browse pgsql-hackers by date

  From Date Subject
Next Message Jeremy Drake 2007-03-22 05:47:44 Re: patch adding new regexp functions
Previous Message Tom Lane 2007-03-22 05:32:53 Re: patch adding new regexp functions

Browse pgsql-patches by date

  From Date Subject
Next Message Jeremy Drake 2007-03-22 05:47:44 Re: patch adding new regexp functions
Previous Message Tom Lane 2007-03-22 05:32:53 Re: patch adding new regexp functions