From 8d9ae434c2924a2cea14f521f6c3f2e9b168c0fe Mon Sep 17 00:00:00 2001
From: Masahiko Sawada <sawada.mshk@gmail.com>
Date: Thu, 25 Jun 2026 10:03:44 -0700
Subject: [PATCH v3] Optimize UUID parsing using SIMD.

string_to_uuid() parsed the input one character at a time in a scalar
loop. Add a fast path that recognizes the two common shapes -- a bare
string of 32 hexadecimal digits and the canonical 8x-4x-4x-4x-12x
form, each optionally wrapped in braces -- compacts them into 32
contiguous hex digits, and decodes them with the SIMD-aware
hex_decode_safe().

Any other shape, or any decoding error, is handed off to the original
scalar parser (now string_to_uuid_scalar()), so the accepted grammar
and the error messages are unchanged.  In particular,
hex_decode_safe() silently skips whitespace while the UUID grammar
does not, so the fast path also rejects results shorter than UUID_LEN
and falls back to the scalar parser, which reports the error.

Regression tests are added for the shapes handled by the fast path and
for invalid inputs that must still be rejected.

Reviewed-by: Bharath Rupireddy <bharath.rupireddyforpostgres@gmail.com>
Reviewed-by: Haibo Yan <tristan.yim@gmail.com>
Discussion: https://postgr.es/m/CAD21AoCqeR4UQU77Q_yOMNNzJ7AVeiO5QZT+4HnzPm4Wm-e02Q@mail.gmail.com
---
 src/backend/utils/adt/uuid.c       | 101 +++++++++++++++++++++++++++--
 src/test/regress/expected/uuid.out |  55 ++++++++++++++++
 src/test/regress/sql/uuid.sql      |  16 +++++
 3 files changed, 167 insertions(+), 5 deletions(-)

diff --git a/src/backend/utils/adt/uuid.c b/src/backend/utils/adt/uuid.c
index 6ee3752ac78..bbfecd26d49 100644
--- a/src/backend/utils/adt/uuid.c
+++ b/src/backend/utils/adt/uuid.c
@@ -19,7 +19,9 @@
 #include "common/hashfn.h"
 #include "lib/hyperloglog.h"
 #include "libpq/pqformat.h"
+#include "nodes/miscnodes.h"
 #include "port/pg_bswap.h"
+#include "utils/builtins.h"
 #include "utils/fmgrprotos.h"
 #include "utils/guc.h"
 #include "utils/skipsupport.h"
@@ -122,13 +124,10 @@ uuid_out(PG_FUNCTION_ARGS)
 }
 
 /*
- * We allow UUIDs as a series of 32 hexadecimal digits with an optional dash
- * after each group of 4 hexadecimal digits, and optionally surrounded by {}.
- * (The canonical format 8x-4x-4x-4x-12x, where "nx" means n hexadecimal
- * digits, is the only one used for output.)
+ * General UUID parser.
  */
 static void
-string_to_uuid(const char *source, pg_uuid_t *uuid, Node *escontext)
+string_to_uuid_scalar(const char *source, pg_uuid_t *uuid, Node *escontext)
 {
 	const char *src = source;
 	bool		braces = false;
@@ -177,6 +176,98 @@ syntax_error:
 					"uuid", source)));
 }
 
+/*
+ * Fast path for the common UUID shapes, built on our SIMD-aware hex decoder.
+ *
+ * This handles a bare string of 32 hex digits and the canonical
+ * 8x-4x-4x-4x-12x form (where "nx" means n hex digits), each optionally
+ * wrapped in braces. Any other shape, or any decoding error, is handed off to
+ * string_to_uuid_scalar() so that parsing and error reporting stay identical
+ * to the scalar implementation.
+ */
+#ifndef	USE_NO_SIMD
+static void
+string_to_uuid_fast(const char *source, pg_uuid_t *uuid, Node *escontext)
+{
+	const char *body = source;
+	size_t		len = strlen(source);
+	const char *hexsrc = NULL;
+	char		hexbuf[32];
+	uint64		written;
+	ErrorSaveContext esctx = {T_ErrorSaveContext};
+
+	/* Strip one optional surrounding brace pair */
+	if (len >= 2 && source[0] == '{' && source[len - 1] == '}')
+	{
+		body = source + 1;
+		len -= 2;
+	}
+
+	if (len == 32)
+	{
+		/*
+		 * Body is 32 characters, presumably all hex digits -- decode straight
+		 * from the input. hex_decode_safe() is given a length of 32, so it
+		 * never touches the trailing NUL or '}'.
+		 */
+		hexsrc = body;
+	}
+	else if (len == 36 && body[8] == '-' && body[13] == '-' &&
+			 body[18] == '-' && body[23] == '-')
+	{
+		/*
+		 * Canonical 8x-4x-4x-4x-12x form; compact it into hexbuf with
+		 * fixed-offset copies, dropping the dashes.
+		 */
+		memcpy(&hexbuf[0], &body[0], 8);
+		memcpy(&hexbuf[8], &body[9], 4);
+		memcpy(&hexbuf[12], &body[14], 4);
+		memcpy(&hexbuf[16], &body[19], 4);
+		memcpy(&hexbuf[20], &body[24], 12);
+		hexsrc = hexbuf;
+	}
+
+	if (hexsrc == NULL)
+	{
+		/* Uncommon shape; let the general parser handle it */
+		string_to_uuid_scalar(source, uuid, escontext);
+		return;
+	}
+
+	/*
+	 * Decode the UUID hex data using our hex decoder that is SIMD-aware. We
+	 * give it a private error context so that a decode failure is swallowed
+	 * here and reported by the scalar path instead, keeping the error message
+	 * identical.
+	 */
+	written = hex_decode_safe(hexsrc, 32, (char *) uuid->data, (Node *) &esctx);
+
+	/*
+	 * Fall back to the scalar path on any error. We must also reject a short
+	 * result: hex_decode_safe() skips whitespace, so it can succeed yet write
+	 * fewer than UUID_LEN bytes, whereas the UUID grammar forbids whitespace.
+	 */
+	if (esctx.error_occurred || written != UUID_LEN)
+		string_to_uuid_scalar(source, uuid, escontext);
+}
+#endif							/* !USE_NO_SIMD */
+
+/*
+ * We allow UUIDs as a series of 32 hexadecimal digits with an optional dash
+ * after each group of 4 hexadecimal digits, and optionally surrounded by {}.
+ * (The canonical format 8x-4x-4x-4x-12x, where "nx" means n hexadecimal
+ * digits, is the only one used for output.)
+ */
+static void
+string_to_uuid(const char *source, pg_uuid_t *uuid, Node *escontext)
+{
+#ifdef USE_NO_SIMD
+	string_to_uuid_scalar(source, uuid, escontext);
+#else							/* !USE_NO_SIMD */
+	string_to_uuid_fast(source, uuid, escontext);
+#endif							/* USE_NO_SIMD */
+}
+
 Datum
 uuid_recv(PG_FUNCTION_ARGS)
 {
diff --git a/src/test/regress/expected/uuid.out b/src/test/regress/expected/uuid.out
index 9c5dda9e9ab..928e71c7ad3 100644
--- a/src/test/regress/expected/uuid.out
+++ b/src/test/regress/expected/uuid.out
@@ -340,5 +340,60 @@ SELECT v = v::bytea::uuid as matched FROM gen_random_uuid() v;
  t
 (1 row)
 
+-- Test UUID shapes that the parser uses the SIMD path.
+SELECT '5b35380a-7143-4912-9b55-f322699c6770'::uuid;
+                 uuid                 
+--------------------------------------
+ 5b35380a-7143-4912-9b55-f322699c6770
+(1 row)
+
+SELECT '{5b35380a-7143-4912-9b55-f322699c6770}'::uuid;
+                 uuid                 
+--------------------------------------
+ 5b35380a-7143-4912-9b55-f322699c6770
+(1 row)
+
+SELECT '5b35380a714349129b55f322699c6770'::uuid;
+                 uuid                 
+--------------------------------------
+ 5b35380a-7143-4912-9b55-f322699c6770
+(1 row)
+
+SELECT '{5b35380a714349129b55f322699c6770}'::uuid;
+                 uuid                 
+--------------------------------------
+ 5b35380a-7143-4912-9b55-f322699c6770
+(1 row)
+
+-- Test if the UUID parser using SIMD optimization correctly rejects invalid UUID
+-- string format.
+SELECT '5b35380a714349129b55f32  99c6770'::uuid;
+ERROR:  invalid input syntax for type uuid: "5b35380a714349129b55f32  99c6770"
+LINE 1: SELECT '5b35380a714349129b55f32  99c6770'::uuid;
+               ^
+SELECT '5b35380a-7143-4912-9b55-f322699c67  '::uuid;
+ERROR:  invalid input syntax for type uuid: "5b35380a-7143-4912-9b55-f322699c67  "
+LINE 1: SELECT '5b35380a-7143-4912-9b55-f322699c67  '::uuid;
+               ^
+SELECT '  35380a-7143-4912-9b55-f322699c6770'::uuid;
+ERROR:  invalid input syntax for type uuid: "  35380a-7143-4912-9b55-f322699c6770"
+LINE 1: SELECT '  35380a-7143-4912-9b55-f322699c6770'::uuid;
+               ^
+SELECT 'AZ35380a-7143-4912-9b55-f322699c6770'::uuid;
+ERROR:  invalid input syntax for type uuid: "AZ35380a-7143-4912-9b55-f322699c6770"
+LINE 1: SELECT 'AZ35380a-7143-4912-9b55-f322699c6770'::uuid;
+               ^
+SELECT '{AZ35380a-7143-4912-9b55-f322699c6770}'::uuid;
+ERROR:  invalid input syntax for type uuid: "{AZ35380a-7143-4912-9b55-f322699c6770}"
+LINE 1: SELECT '{AZ35380a-7143-4912-9b55-f322699c6770}'::uuid;
+               ^
+SELECT '{AZ35380a714349129b55f322699c6770}'::uuid;
+ERROR:  invalid input syntax for type uuid: "{AZ35380a714349129b55f322699c6770}"
+LINE 1: SELECT '{AZ35380a714349129b55f322699c6770}'::uuid;
+               ^
+SELECT '{AZ35380a714349129b55f322699c67  }'::uuid;
+ERROR:  invalid input syntax for type uuid: "{AZ35380a714349129b55f322699c67  }"
+LINE 1: SELECT '{AZ35380a714349129b55f322699c67  }'::uuid;
+               ^
 -- clean up
 DROP TABLE guid1, guid2, guid3 CASCADE;
diff --git a/src/test/regress/sql/uuid.sql b/src/test/regress/sql/uuid.sql
index 8cc2ad40614..d67d3d2ded9 100644
--- a/src/test/regress/sql/uuid.sql
+++ b/src/test/regress/sql/uuid.sql
@@ -161,5 +161,21 @@ SELECT '\x019a2f859ced7225b99d9c55044a2563'::bytea::uuid;
 SELECT '\x1234567890abcdef'::bytea::uuid; -- error
 SELECT v = v::bytea::uuid as matched FROM gen_random_uuid() v;
 
+-- Test UUID shapes that the parser uses the SIMD path.
+SELECT '5b35380a-7143-4912-9b55-f322699c6770'::uuid;
+SELECT '{5b35380a-7143-4912-9b55-f322699c6770}'::uuid;
+SELECT '5b35380a714349129b55f322699c6770'::uuid;
+SELECT '{5b35380a714349129b55f322699c6770}'::uuid;
+
+-- Test if the UUID parser using SIMD optimization correctly rejects invalid UUID
+-- string format.
+SELECT '5b35380a714349129b55f32  99c6770'::uuid;
+SELECT '5b35380a-7143-4912-9b55-f322699c67  '::uuid;
+SELECT '  35380a-7143-4912-9b55-f322699c6770'::uuid;
+SELECT 'AZ35380a-7143-4912-9b55-f322699c6770'::uuid;
+SELECT '{AZ35380a-7143-4912-9b55-f322699c6770}'::uuid;
+SELECT '{AZ35380a714349129b55f322699c6770}'::uuid;
+SELECT '{AZ35380a714349129b55f322699c67  }'::uuid;
+
 -- clean up
 DROP TABLE guid1, guid2, guid3 CASCADE;
-- 
2.54.0

