From 741afead540b219a2a3e82f0db6840e8ff93b0eb Mon Sep 17 00:00:00 2001
From: John Naylor <jcnaylor@gmail.com>
Date: Wed, 26 Dec 2018 21:34:34 -0500
Subject: [PATCH v4 2/2] Dispatch keyword lookup on the first character.

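Restrict the binary search to the range of keywords that share the first
character of the word being looked up, using a per-letter table of range
bounds.  In addition, use a heuristic to improve the chances of finding
the keyword on the first probe: for each legal first character where it
makes sense, choose a common keyword and use its index as the first
middle index of the binary search.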
---
 src/common/keywords.c           | 30 ++++++++++---
 src/include/common/keywords.h   | 10 ++++-
 src/pl/plpgsql/src/pl_scanner.c |  6 +--
 src/tools/gen_keywords.pl       | 78 +++++++++++++++++++++++++++++++++
 4 files changed, 114 insertions(+), 10 deletions(-)

diff --git a/src/common/keywords.c b/src/common/keywords.c
index b0e5a721b6..207bc4274f 100644
--- a/src/common/keywords.c
+++ b/src/common/keywords.c
@@ -118,12 +118,14 @@ int
 ScanKeywordLookupOffset(const char *string_to_lookup,
 						const char *kw_strings,
 						const uint16 *kw_offsets,
-						int num_keywords)
+						const ScanKeywordRange *kw_ranges)
 {
 	int			len,
-				i;
+				i,
+				range_idx;
 	char		word[NAMEDATALEN];
 	const uint16 *low;
+	const uint16 *middle;
 	const uint16 *high;
 
 	len = strlen(string_to_lookup);
@@ -145,17 +147,32 @@ ScanKeywordLookupOffset(const char *string_to_lookup,
 	}
 	word[len] = '\0';
 
+	/* XXX assumes keywords can't start with '_' */
+	range_idx = (int) word[0] - 'a';
+
+	if (word[0] < 'a' || word[0] > 'z'
+		|| kw_ranges[range_idx].lower == PG_UINT16_MAX)
+		return -1;
+
 	/*
 	 * Now do a binary search using plain strcmp() comparison.
 	 */
-	low = kw_offsets;
-	high = kw_offsets + (num_keywords - 1);
+	low = kw_offsets + kw_ranges[range_idx].lower;
+	high = kw_offsets + kw_ranges[range_idx].upper;
+
+	if (kw_ranges[range_idx].middle != PG_UINT16_MAX)
+	{
+		middle = kw_offsets + kw_ranges[range_idx].middle;
+	}
+	else
+	{
+		middle = low + (high - low) / 2;
+	}
+
 	while (low <= high)
 	{
-		const uint16 *middle;
 		int			difference;
 
-		middle = low + (high - low) / 2;
 		difference = strcmp(kw_strings + *middle, word);
 		if (difference == 0)
 			return middle - kw_offsets;
@@ -163,6 +180,7 @@ ScanKeywordLookupOffset(const char *string_to_lookup,
 			low = middle + 1;
 		else
 			high = middle - 1;
+		middle = low + (high - low) / 2;
 	}
 
 	return -1;
diff --git a/src/include/common/keywords.h b/src/include/common/keywords.h
index 201d0fcc7a..348773c682 100644
--- a/src/include/common/keywords.h
+++ b/src/include/common/keywords.h
@@ -35,6 +35,14 @@ typedef struct ScanKeywordAux
 	char		category;		/* see codes above */
 } ScanKeywordAux;
 
+/* Lower and upper indexes, and first guess, for a range of keywords. */
+typedef struct ScanKeywordRange
+{
+	uint16		lower;			/* first index, or PG_UINT16_MAX if no keywords */
+	uint16		middle;			/* first probe, or PG_UINT16_MAX if no guess */
+	uint16		upper;			/* last index of the range (inclusive) */
+} ScanKeywordRange;
+
 #ifndef FRONTEND
 extern PGDLLIMPORT const ScanKeyword ScanKeywords[];
 extern PGDLLIMPORT const int NumScanKeywords;
@@ -51,6 +59,6 @@ extern const ScanKeyword *ScanKeywordLookup(const char *text,
 int ScanKeywordLookupOffset(const char *string_to_lookup,
 						const char *kw_strings,
 						const uint16 *kw_offsets,
-						int num_keywords);
+						const ScanKeywordRange *kw_ranges);
 
 #endif							/* KEYWORDS_H */
diff --git a/src/pl/plpgsql/src/pl_scanner.c b/src/pl/plpgsql/src/pl_scanner.c
index 0b2f331117..808bf465a0 100644
--- a/src/pl/plpgsql/src/pl_scanner.c
+++ b/src/pl/plpgsql/src/pl_scanner.c
@@ -261,7 +261,7 @@ plpgsql_yylex(void)
 						 (kwnum = ScanKeywordLookupOffset(aux1.lval.word.ident,
 												 pl_unreserved_kw_strings,
 												 pl_unreserved_kw_offsets,
-												 num_unreserved_keywords)) >= 0)
+												 pl_unreserved_kw_ranges)) >= 0)
 				{
 					aux1.lval.keyword = pl_unreserved_kw_strings
 										+ pl_unreserved_kw_offsets[kwnum];
@@ -293,7 +293,7 @@ plpgsql_yylex(void)
 					(kwnum = ScanKeywordLookupOffset(aux1.lval.str,
 											 pl_unreserved_kw_strings,
 											 pl_unreserved_kw_offsets,
-											 num_unreserved_keywords)) >= 0)
+											 pl_unreserved_kw_ranges)) >= 0)
 				{
 					aux1.lval.keyword = pl_unreserved_kw_strings
 										+ pl_unreserved_kw_offsets[kwnum];
@@ -319,7 +319,7 @@ plpgsql_yylex(void)
 						 (kwnum = ScanKeywordLookupOffset(aux1.lval.word.ident,
 												 pl_unreserved_kw_strings,
 												 pl_unreserved_kw_offsets,
-												 num_unreserved_keywords)) >= 0)
+												 pl_unreserved_kw_ranges)) >= 0)
 				{
 					aux1.lval.keyword = pl_unreserved_kw_strings
 										+ pl_unreserved_kw_offsets[kwnum];
diff --git a/src/tools/gen_keywords.pl b/src/tools/gen_keywords.pl
index 1faa14ffca..6678bfc72f 100644
--- a/src/tools/gen_keywords.pl
+++ b/src/tools/gen_keywords.pl
@@ -99,6 +99,84 @@ for my $i (0..$#keywords - 1)
 	  if ($keywords[$i] cmp $keywords[$i + 1]) >= 0;
 }
 
+# The most common core keyword for each first letter that has a clear winner,
+# preferably one near the middle of its range.  Only the keys are consulted.
+my %FIRST_GUESS = (
+	and => 1,
+	commit => 1, # demo for WIP, since it's a plpgsql unreserved keyword
+	delete => 1,
+	exists => 1,
+	from => 1,
+	group => 1,
+	having => 1,
+	in => 1,
+	join => 1,
+	limit => 1,
+	not => 1,
+	or => 1,
+	select => 1,
+	then => 1,
+	update => 1,
+	values => 1,
+	where => 1,
+);
+
+# Save the min and max indexes and a first guess middle index for each range
+# of keywords starting with the same first character.  Indexes are positions
+# in @keywords.  This relies on @keywords being sorted, so that all keywords
+# sharing a first character are adjacent and each range is a single
+# contiguous run of indexes.
+my %range;
+
+for my $i (0 .. $#keywords)
+{
+	my $name = $keywords[$i];
+	my $char = substr($name, 0, 1);
+
+	# Only 'a' through 'z' get a slot in the emitted table, and the lookup
+	# function rejects any other first character, so a keyword starting
+	# with something else could never be found.  Fail at build time rather
+	# than silently generating an unreachable keyword.
+	die "keyword '$name' does not start with a character in a-z\n"
+	  if $char !~ /\A[a-z]\z/;
+
+	if (exists $range{$char})
+	{
+		# Another keyword for a known character extends its range.
+		$range{$char}{upper} = $i;
+	}
+	else
+	{
+		# The first keyword for a character opens a one-element range
+		# that has no first guess yet.
+		$range{$char} = { lower => $i, middle => undef, upper => $i };
+	}
+
+	# Save the index if the keyword is our first guess for the current
+	# first character.  If several guesses share a first character, the
+	# one that sorts last wins.
+	if (exists $FIRST_GUESS{$name})
+	{
+		$range{$char}{middle} = $i;
+	}
+}
+
+# Emit array of the lower, middle and upper indexes that we saved earlier.
+# To keep the array small, we count from 'a', the lowest character that can
+# start a keyword.  Slots with no value are emitted as PG_UINT16_MAX.
+# XXX assumes keywords can't start with '_'
+printf $kwdef "static const ScanKeywordRange %skw_ranges[] = {\n\t", $prefix;
+foreach my $c ('a'..'z')
+{
+	my $char = $range{$c};
+
+	printf $kwdef "{%s, %s, %s},\n\t",
+	  defined($char->{lower}) ? $char->{lower} : 'PG_UINT16_MAX',
+	  defined($char->{middle}) ? $char->{middle} : 'PG_UINT16_MAX',
+	  defined($char->{upper}) ? $char->{upper} : 'PG_UINT16_MAX';
+}
+print $kwdef "};\n\n";
+
 # Emit an array of numerical offsets which will be used to index into the
 # keyword string.
 printf $kwdef "static const uint16 %skw_offsets[] = {\n\t", $prefix;
-- 
2.17.1

