From 8161ca49ae2044e004d3f36c04f60b03e97f4071 Mon Sep 17 00:00:00 2001
From: Jeff Davis <jeff@j-davis.com>
Date: Wed, 19 Nov 2025 13:24:38 -0800
Subject: [PATCH v13 1/2] fuzzystrmatch: use pg_ascii_toupper().

fuzzystrmatch is designed for ASCII, so there is no need to rely on the
global LC_CTYPE setting.

TODO: what about the \xc7 (C WITH CEDILLA) case in dmetaphone? Also,
what should the behavior be for soundex()?

Discussion: https://postgr.es/m/450ceb6260cad30d7afdf155d991a9caafee7c0d.camel@j-davis.com
---
 contrib/fuzzystrmatch/dmetaphone.c    | 45 +++++++++++++++++++++++++--
 contrib/fuzzystrmatch/fuzzystrmatch.c | 43 ++++++++++++++-----------
 2 files changed, 67 insertions(+), 21 deletions(-)

diff --git a/contrib/fuzzystrmatch/dmetaphone.c b/contrib/fuzzystrmatch/dmetaphone.c
index 227d8b11ddc..9a4e5ae7e0e 100644
--- a/contrib/fuzzystrmatch/dmetaphone.c
+++ b/contrib/fuzzystrmatch/dmetaphone.c
@@ -98,6 +98,7 @@ The remaining code is authored by Andrew Dunstan <amdunstan@ncshp.org> and
 
 #include "postgres.h"
 
+#include "mb/pg_wchar.h"
 #include "utils/builtins.h"
 
 /* turn off assertions for embedded function */
@@ -116,6 +117,9 @@ The remaining code is authored by Andrew Dunstan <amdunstan@ncshp.org> and
 #include <assert.h>
 #include <ctype.h>
 
+#define SMALL_LETTER_C_WITH_CEDILLA		'\xe7'
+#define CAPITAL_LETTER_C_WITH_CEDILLA	'\xc7'
+
 /* prototype for the main function we got from the perl module */
 static void DoubleMetaphone(char *str, char **codes);
 
@@ -282,9 +286,46 @@ static void
 MakeUpper(metastring *s)
 {
 	char	   *i;
+	bool		c_with_cedilla;
+
+	/*
+	 * Uppercase ASCII letters in place, independent of LC_CTYPE. C WITH
+	 * CEDILLA is uppercased as well, because DoubleMetaphone() matches \xc7.
+	 *
+	 * XXX: Only works in single-byte encodings that encode lowercase C WITH
+	 * CEDILLA as \xe7. Should have proper multibyte support.
+	 *
+	 * NB: WIN1256 encodes only the lowercase C WITH CEDILLA, but for metaphone
+	 * we can still "uppercase" it to \xc7 here so that it's recognized later.
+	 */
+	switch (GetDatabaseEncoding())
+	{
+		case PG_LATIN1:
+		case PG_LATIN2:
+		case PG_LATIN3:
+		case PG_LATIN5:
+		case PG_LATIN8:
+		case PG_LATIN9:
+		case PG_LATIN10:
+		case PG_WIN1250:
+		case PG_WIN1252:
+		case PG_WIN1254:
+		case PG_WIN1256:
+		case PG_WIN1258:
+			c_with_cedilla = true;
+			break;
+		default:
+			c_with_cedilla = false;
+			break;
+	}
 
 	for (i = s->str; *i; i++)
-		*i = toupper((unsigned char) *i);
+	{
+		if (c_with_cedilla && *i == SMALL_LETTER_C_WITH_CEDILLA)
+			*i = CAPITAL_LETTER_C_WITH_CEDILLA;
+		else
+			*i = pg_ascii_toupper((unsigned char) *i);
+	}
 }
 
 
@@ -463,7 +504,7 @@ DoubleMetaphone(char *str, char **codes)
 					current += 1;
 				break;
 
-			case '\xc7':		/* C with cedilla */
+			case CAPITAL_LETTER_C_WITH_CEDILLA:
 				MetaphAdd(primary, "S");
 				MetaphAdd(secondary, "S");
 				current += 1;
diff --git a/contrib/fuzzystrmatch/fuzzystrmatch.c b/contrib/fuzzystrmatch/fuzzystrmatch.c
index e7cc314b763..319302af0e4 100644
--- a/contrib/fuzzystrmatch/fuzzystrmatch.c
+++ b/contrib/fuzzystrmatch/fuzzystrmatch.c
@@ -62,7 +62,7 @@ static const char *const soundex_table = "01230120022455012623010202";
 static char
 soundex_code(char letter)
 {
-	letter = toupper((unsigned char) letter);
+	letter = pg_ascii_toupper((unsigned char) letter);
 	/* Defend against non-ASCII letters */
 	if (letter >= 'A' && letter <= 'Z')
 		return soundex_table[letter - 'A'];
@@ -122,16 +122,21 @@ static const char _codes[26] = {
 static int
 getcode(char c)
 {
-	if (isalpha((unsigned char) c))
-	{
-		c = toupper((unsigned char) c);
-		/* Defend against non-ASCII letters */
-		if (c >= 'A' && c <= 'Z')
-			return _codes[c - 'A'];
-	}
+	c = pg_ascii_toupper((unsigned char) c);
+	/* Only A-Z have a code; non-letters and non-ASCII bytes yield 0 */
+	if (c >= 'A' && c <= 'Z')
+		return _codes[c - 'A'];
+
 	return 0;
 }
 
+/* Locale-independent isalpha(): true only for ASCII A-Z and a-z */
+static bool
+ascii_isalpha(char c)
+{
+	return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
+}
+
 #define isvowel(c)	(getcode(c) & 1)	/* AEIOU */
 
 /* These letters are passed through unchanged */
@@ -301,18 +306,18 @@ metaphone(PG_FUNCTION_ARGS)
  * accessing the array directly... */
 
 /* Look at the next letter in the word */
-#define Next_Letter (toupper((unsigned char) word[w_idx+1]))
+#define Next_Letter (pg_ascii_toupper((unsigned char) word[w_idx+1]))
 /* Look at the current letter in the word */
-#define Curr_Letter (toupper((unsigned char) word[w_idx]))
+#define Curr_Letter (pg_ascii_toupper((unsigned char) word[w_idx]))
 /* Go N letters back. */
 #define Look_Back_Letter(n) \
-	(w_idx >= (n) ? toupper((unsigned char) word[w_idx-(n)]) : '\0')
+	(w_idx >= (n) ? pg_ascii_toupper((unsigned char) word[w_idx-(n)]) : '\0')
 /* Previous letter.  I dunno, should this return null on failure? */
 #define Prev_Letter (Look_Back_Letter(1))
 /* Look two letters down.  It makes sure you don't walk off the string. */
 #define After_Next_Letter \
-	(Next_Letter != '\0' ? toupper((unsigned char) word[w_idx+2]) : '\0')
-#define Look_Ahead_Letter(n) toupper((unsigned char) Lookahead(word+w_idx, n))
+	(Next_Letter != '\0' ? pg_ascii_toupper((unsigned char) word[w_idx+2]) : '\0')
+#define Look_Ahead_Letter(n) pg_ascii_toupper((unsigned char) Lookahead(word+w_idx, n))
 
 
 /* Allows us to safely look ahead an arbitrary # of letters */
@@ -340,7 +345,7 @@ Lookahead(char *word, int how_far)
 #define Phone_Len	(p_idx)
 
 /* Note is a letter is a 'break' in the word */
-#define Isbreak(c)	(!isalpha((unsigned char) (c)))
+#define Isbreak(c)	(!ascii_isalpha((unsigned char) (c)))
 
 
 static void
@@ -379,7 +384,7 @@ _metaphone(char *word,			/* IN */
 
 	/*-- The first phoneme has to be processed specially. --*/
 	/* Find our first letter */
-	for (; !isalpha((unsigned char) (Curr_Letter)); w_idx++)
+	for (; !ascii_isalpha((unsigned char) (Curr_Letter)); w_idx++)
 	{
 		/* On the off chance we were given nothing but crap... */
 		if (Curr_Letter == '\0')
@@ -478,7 +483,7 @@ _metaphone(char *word,			/* IN */
 		 */
 
 		/* Ignore non-alphas */
-		if (!isalpha((unsigned char) (Curr_Letter)))
+		if (!ascii_isalpha((unsigned char) (Curr_Letter)))
 			continue;
 
 		/* Drop duplicates, except CC */
@@ -731,7 +736,7 @@ _soundex(const char *instr, char *outstr)
 	Assert(outstr);
 
 	/* Skip leading non-alphabetic characters */
-	while (*instr && !isalpha((unsigned char) *instr))
+	while (*instr && !ascii_isalpha((unsigned char) *instr))
 		++instr;
 
 	/* If no string left, return all-zeroes buffer */
@@ -742,12 +747,12 @@ _soundex(const char *instr, char *outstr)
 	}
 
 	/* Take the first letter as is */
-	*outstr++ = (char) toupper((unsigned char) *instr++);
+	*outstr++ = (char) pg_ascii_toupper((unsigned char) *instr++);
 
 	count = 1;
 	while (*instr && count < SOUNDEX_LEN)
 	{
-		if (isalpha((unsigned char) *instr) &&
+		if (ascii_isalpha((unsigned char) *instr) &&
 			soundex_code(*instr) != soundex_code(*(instr - 1)))
 		{
 			*outstr = soundex_code(*instr);
-- 
2.43.0

