[PATCH] RFC: Add length parameterised dmetaphone functions

From: Christian Marie <christian(at)ponies(dot)io>
To: pgsql-hackers(at)postgresql(dot)org
Subject: [PATCH] RFC: Add length parameterised dmetaphone functions
Date: 2015-11-06 05:00:49
Message-ID: 1446786049-6677-1-git-send-email-christian@ponies.io
Views: Raw Message | Whole Thread | Download mbox | Resend email
Thread:
Lists: pgsql-hackers

A developer I work with was trying to use dmetaphone to group people names into
equivalence classes. He found that many long names would be grouped together
when they shouldn't be, this turned out to be because dmetaphone has an
undocumented upper bound on its output length, of four. This is obviously
impractical for many use cases.

This patch addresses this by adding and documenting an optional argument to
dmetaphone and dmetaphone_alt that specifies the maximum output length. This
makes it possible to use dmetaphone on much longer inputs.

Backwards compatibility is catered for by making the new argument optional,
defaulting to the old, hard-coded value of four. We now have:

dmetaphone(text source) returns text
dmetaphone(text source, int max_output_length) returns text
dmetaphone_alt(text source) returns text
dmetaphone_alt(text source, int max_output_length) returns text

---
contrib/fuzzystrmatch/Makefile | 2 +-
contrib/fuzzystrmatch/dmetaphone.c | 46 ++++++++++++++------
contrib/fuzzystrmatch/fuzzystrmatch--1.0--1.1.sql | 12 ++++++
contrib/fuzzystrmatch/fuzzystrmatch--1.0.sql | 44 -------------------
contrib/fuzzystrmatch/fuzzystrmatch--1.1.sql | 52 +++++++++++++++++++++++
contrib/fuzzystrmatch/fuzzystrmatch.control | 2 +-
doc/src/sgml/fuzzystrmatch.sgml | 18 ++++++--
7 files changed, 115 insertions(+), 61 deletions(-)
create mode 100644 contrib/fuzzystrmatch/fuzzystrmatch--1.0--1.1.sql
delete mode 100644 contrib/fuzzystrmatch/fuzzystrmatch--1.0.sql
create mode 100644 contrib/fuzzystrmatch/fuzzystrmatch--1.1.sql

diff --git a/contrib/fuzzystrmatch/Makefile b/contrib/fuzzystrmatch/Makefile
index 0327d9510a50..fee85ad1ec36 100644
--- a/contrib/fuzzystrmatch/Makefile
+++ b/contrib/fuzzystrmatch/Makefile
@@ -4,7 +4,7 @@ MODULE_big = fuzzystrmatch
OBJS = fuzzystrmatch.o dmetaphone.o $(WIN32RES)

EXTENSION = fuzzystrmatch
-DATA = fuzzystrmatch--1.0.sql fuzzystrmatch--unpackaged--1.0.sql
+DATA = fuzzystrmatch--1.1.sql fuzzystrmatch--unpackaged--1.0.sql fuzzystrmatch--1.0--1.1.sql
PGFILEDESC = "fuzzystrmatch - similarities and distance between strings"

ifdef USE_PGXS
diff --git a/contrib/fuzzystrmatch/dmetaphone.c b/contrib/fuzzystrmatch/dmetaphone.c
index 147c8501ee89..7afcf0aee841 100644
--- a/contrib/fuzzystrmatch/dmetaphone.c
+++ b/contrib/fuzzystrmatch/dmetaphone.c
@@ -115,10 +115,28 @@ The remaining code is authored by Andrew Dunstan <amdunstan(at)ncshp(dot)org> and
#include <assert.h>

/* prototype for the main function we got from the perl module */
-static void DoubleMetaphone(char *, char **);
+static void DoubleMetaphone(char *, char **, int);

#ifndef DMETAPHONE_MAIN

+/* Fetch and validate the optional second function argument, which should be a
+ * maximum length for output
+ *
+ * Defaults to 4.
+ *
+ * Introduced in 1.1
+ */
+inline int
+fetch_max_len_arg(PG_FUNCTION_ARGS)
+{
+ int max_len = (PG_NARGS() > 1) ? PG_GETARG_INT32(1) : 4;
+ if(max_len < 1)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("invalid max output length, should be >= 1")));
+ return max_len;
+}
+
/*
* The PostgreSQL visible dmetaphone function.
*/
@@ -128,6 +146,7 @@ PG_FUNCTION_INFO_V1(dmetaphone);
Datum
dmetaphone(PG_FUNCTION_ARGS)
{
+ int max_len;
text *arg;
char *aptr,
*codes[2],
@@ -139,8 +158,9 @@ dmetaphone(PG_FUNCTION_ARGS)
#endif
arg = PG_GETARG_TEXT_P(0);
aptr = text_to_cstring(arg);
+ max_len = fetch_max_len_arg(fcinfo);

- DoubleMetaphone(aptr, codes);
+ DoubleMetaphone(aptr, codes, max_len);
code = codes[0];
if (!code)
code = "";
@@ -157,6 +177,7 @@ PG_FUNCTION_INFO_V1(dmetaphone_alt);
Datum
dmetaphone_alt(PG_FUNCTION_ARGS)
{
+ int max_len;
text *arg;
char *aptr,
*codes[2],
@@ -168,8 +189,9 @@ dmetaphone_alt(PG_FUNCTION_ARGS)
#endif
arg = PG_GETARG_TEXT_P(0);
aptr = text_to_cstring(arg);
+ max_len = fetch_max_len_arg(fcinfo);

- DoubleMetaphone(aptr, codes);
+ DoubleMetaphone(aptr, codes, max_len);
code = codes[1];
if (!code)
code = "";
@@ -390,7 +412,7 @@ MetaphAdd(metastring *s, char *new_str)


static void
-DoubleMetaphone(char *str, char **codes)
+DoubleMetaphone(char *str, char **codes, int max_len)
{
int length;
metastring *original;
@@ -427,7 +449,7 @@ DoubleMetaphone(char *str, char **codes)
}

/* main loop */
- while ((primary->length < 4) || (secondary->length < 4))
+ while ((primary->length < max_len) || (secondary->length < max_len))
{
if (current >= length)
break;
@@ -1410,11 +1432,11 @@ DoubleMetaphone(char *str, char **codes)
}


- if (primary->length > 4)
- SetAt(primary, 4, '\0');
+ if (primary->length > max_len)
+ SetAt(primary, max_len, '\0');

- if (secondary->length > 4)
- SetAt(secondary, 4, '\0');
+ if (secondary->length > max_len)
+ SetAt(secondary, max_len, '\0');

*codes = primary->str;
*++codes = secondary->str;
@@ -1428,13 +1450,13 @@ DoubleMetaphone(char *str, char **codes)

/* just for testing - not part of the perl code */

-main(int argc, char **argv)
+void main(int argc, char **argv)
{
char *codes[2];

- if (argc > 1)
+ if (argc > 2)
{
- DoubleMetaphone(argv[1], codes);
+ DoubleMetaphone(argv[1], codes, atoi(argv[2]));
printf("%s|%s\n", codes[0], codes[1]);
}
}
diff --git a/contrib/fuzzystrmatch/fuzzystrmatch--1.0--1.1.sql b/contrib/fuzzystrmatch/fuzzystrmatch--1.0--1.1.sql
new file mode 100644
index 000000000000..b00112c6bc9d
--- /dev/null
+++ b/contrib/fuzzystrmatch/fuzzystrmatch--1.0--1.1.sql
@@ -0,0 +1,12 @@
+/* contrib/fuzzystrmatch/fuzzystrmatch--1.0--1.1.sql */
+
+-- complain if script is sourced in psql, rather than via ALTER EXTENSION
+\echo Use "ALTER EXTENSION fuzzystrmatch UPDATE TO '1.1'" to load this file. \quit
+
+CREATE FUNCTION dmetaphone (text, int) RETURNS text
+AS 'MODULE_PATHNAME', 'dmetaphone'
+LANGUAGE C IMMUTABLE STRICT;
+
+CREATE FUNCTION dmetaphone_alt (text, int) RETURNS text
+AS 'MODULE_PATHNAME', 'dmetaphone_alt'
+LANGUAGE C IMMUTABLE STRICT;
diff --git a/contrib/fuzzystrmatch/fuzzystrmatch--1.0.sql b/contrib/fuzzystrmatch/fuzzystrmatch--1.0.sql
deleted file mode 100644
index 1cf9b6188cc0..000000000000
--- a/contrib/fuzzystrmatch/fuzzystrmatch--1.0.sql
+++ /dev/null
@@ -1,44 +0,0 @@
-/* contrib/fuzzystrmatch/fuzzystrmatch--1.0.sql */
-
--- complain if script is sourced in psql, rather than via CREATE EXTENSION
-\echo Use "CREATE EXTENSION fuzzystrmatch" to load this file. \quit
-
-CREATE FUNCTION levenshtein (text,text) RETURNS int
-AS 'MODULE_PATHNAME','levenshtein'
-LANGUAGE C IMMUTABLE STRICT;
-
-CREATE FUNCTION levenshtein (text,text,int,int,int) RETURNS int
-AS 'MODULE_PATHNAME','levenshtein_with_costs'
-LANGUAGE C IMMUTABLE STRICT;
-
-CREATE FUNCTION levenshtein_less_equal (text,text,int) RETURNS int
-AS 'MODULE_PATHNAME','levenshtein_less_equal'
-LANGUAGE C IMMUTABLE STRICT;
-
-CREATE FUNCTION levenshtein_less_equal (text,text,int,int,int,int) RETURNS int
-AS 'MODULE_PATHNAME','levenshtein_less_equal_with_costs'
-LANGUAGE C IMMUTABLE STRICT;
-
-CREATE FUNCTION metaphone (text,int) RETURNS text
-AS 'MODULE_PATHNAME','metaphone'
-LANGUAGE C IMMUTABLE STRICT;
-
-CREATE FUNCTION soundex(text) RETURNS text
-AS 'MODULE_PATHNAME', 'soundex'
-LANGUAGE C IMMUTABLE STRICT;
-
-CREATE FUNCTION text_soundex(text) RETURNS text
-AS 'MODULE_PATHNAME', 'soundex'
-LANGUAGE C IMMUTABLE STRICT;
-
-CREATE FUNCTION difference(text,text) RETURNS int
-AS 'MODULE_PATHNAME', 'difference'
-LANGUAGE C IMMUTABLE STRICT;
-
-CREATE FUNCTION dmetaphone (text) RETURNS text
-AS 'MODULE_PATHNAME', 'dmetaphone'
-LANGUAGE C IMMUTABLE STRICT;
-
-CREATE FUNCTION dmetaphone_alt (text) RETURNS text
-AS 'MODULE_PATHNAME', 'dmetaphone_alt'
-LANGUAGE C IMMUTABLE STRICT;
diff --git a/contrib/fuzzystrmatch/fuzzystrmatch--1.1.sql b/contrib/fuzzystrmatch/fuzzystrmatch--1.1.sql
new file mode 100644
index 000000000000..e0eba457892b
--- /dev/null
+++ b/contrib/fuzzystrmatch/fuzzystrmatch--1.1.sql
@@ -0,0 +1,52 @@
+/* contrib/fuzzystrmatch/fuzzystrmatch--1.0.sql */
+
+-- complain if script is sourced in psql, rather than via CREATE EXTENSION
+\echo Use "CREATE EXTENSION fuzzystrmatch" to load this file. \quit
+
+CREATE FUNCTION levenshtein (text,text) RETURNS int
+AS 'MODULE_PATHNAME','levenshtein'
+LANGUAGE C IMMUTABLE STRICT;
+
+CREATE FUNCTION levenshtein (text,text,int,int,int) RETURNS int
+AS 'MODULE_PATHNAME','levenshtein_with_costs'
+LANGUAGE C IMMUTABLE STRICT;
+
+CREATE FUNCTION levenshtein_less_equal (text,text,int) RETURNS int
+AS 'MODULE_PATHNAME','levenshtein_less_equal'
+LANGUAGE C IMMUTABLE STRICT;
+
+CREATE FUNCTION levenshtein_less_equal (text,text,int,int,int,int) RETURNS int
+AS 'MODULE_PATHNAME','levenshtein_less_equal_with_costs'
+LANGUAGE C IMMUTABLE STRICT;
+
+CREATE FUNCTION metaphone (text,int) RETURNS text
+AS 'MODULE_PATHNAME','metaphone'
+LANGUAGE C IMMUTABLE STRICT;
+
+CREATE FUNCTION soundex(text) RETURNS text
+AS 'MODULE_PATHNAME', 'soundex'
+LANGUAGE C IMMUTABLE STRICT;
+
+CREATE FUNCTION text_soundex(text) RETURNS text
+AS 'MODULE_PATHNAME', 'soundex'
+LANGUAGE C IMMUTABLE STRICT;
+
+CREATE FUNCTION difference(text,text) RETURNS int
+AS 'MODULE_PATHNAME', 'difference'
+LANGUAGE C IMMUTABLE STRICT;
+
+CREATE FUNCTION dmetaphone (text) RETURNS text
+AS 'MODULE_PATHNAME', 'dmetaphone'
+LANGUAGE C IMMUTABLE STRICT;
+
+CREATE FUNCTION dmetaphone_alt (text) RETURNS text
+AS 'MODULE_PATHNAME', 'dmetaphone_alt'
+LANGUAGE C IMMUTABLE STRICT;
+
+CREATE FUNCTION dmetaphone (text, int) RETURNS text
+AS 'MODULE_PATHNAME', 'dmetaphone'
+LANGUAGE C IMMUTABLE STRICT;
+
+CREATE FUNCTION dmetaphone_alt (text, int) RETURNS text
+AS 'MODULE_PATHNAME', 'dmetaphone_alt'
+LANGUAGE C IMMUTABLE STRICT;
diff --git a/contrib/fuzzystrmatch/fuzzystrmatch.control b/contrib/fuzzystrmatch/fuzzystrmatch.control
index e257f0961171..6b2832ae8d95 100644
--- a/contrib/fuzzystrmatch/fuzzystrmatch.control
+++ b/contrib/fuzzystrmatch/fuzzystrmatch.control
@@ -1,5 +1,5 @@
# fuzzystrmatch extension
comment = 'determine similarities and distance between strings'
-default_version = '1.0'
+default_version = '1.1'
module_pathname = '$libdir/fuzzystrmatch'
relocatable = true
diff --git a/doc/src/sgml/fuzzystrmatch.sgml b/doc/src/sgml/fuzzystrmatch.sgml
index f26bd90dfc5f..3898d16af22c 100644
--- a/doc/src/sgml/fuzzystrmatch.sgml
+++ b/doc/src/sgml/fuzzystrmatch.sgml
@@ -210,7 +210,9 @@ test=# SELECT metaphone('GUMBO', 4);

<synopsis>
dmetaphone(text source) returns text
+dmetaphone(text source, int max_output_length) returns text
dmetaphone_alt(text source) returns text
+dmetaphone_alt(text source, int max_output_length) returns text
</synopsis>

<para>
@@ -218,14 +220,24 @@ dmetaphone_alt(text source) returns text
</para>

<para>
+ max_output_length defaults to 4 and must be greater than 1
+ </para>
+
+ <para>
Example:
</para>

<screen>
-test=# select dmetaphone('gumbo');
- dmetaphone
+postgres=# select dmetaphone('unicorns');
+ dmetaphone
+------------
+ ANKR
+(1 row)
+
+postgres=# select dmetaphone('unicorns', 8);
+ dmetaphone
------------
- KMP
+ ANKRNS
(1 row)
</screen>
</sect2>
--
2.6.2

Responses

Browse pgsql-hackers by date

  From Date Subject
Next Message Robert Haas 2015-11-06 05:32:27 Re: NOTIFY in Background Worker
Previous Message Kyotaro HORIGUCHI 2015-11-06 04:58:21 Re: Bitmap index scans use of filters on available columns