From 3fbb6dae2ec260024fbf048e7dd4b23449053c5b Mon Sep 17 00:00:00 2001
From: Andres Freund <andres@anarazel.de>
Date: Sun, 28 Jan 2018 16:15:42 -0800
Subject: [PATCH 2/3] Improve bit perturbation in TupleHashTableHash.

The changes in b81b5a96f424531b97cdd1dba97d9d1b9c9d372e did not fully
address the issue, because the bit-mixing of the IV into the final
hash-key didn't prevent clustering in the input-data survive in the
output data.

This didn't cause a lot of problems because of the additional growth
conditions added d4c62a6b623d6eef88218158e9fa3cf974c6c7e5. But as we
want to rain that in due to those causing problems, this needs to be
fixed.

Author: Andres Freund
Discussion: https://postgr.es/m/20171127185700.1470.20362@wrigleys.postgresql.org
Backpatch: 10, where simplehash was introduced
---
 src/backend/executor/execGrouping.c        | 11 +++++++++--
 src/test/regress/expected/groupingsets.out | 30 +++++++++++++++++-------------
 src/test/regress/sql/groupingsets.sql      |  7 ++++---
 3 files changed, 30 insertions(+), 18 deletions(-)

diff --git a/src/backend/executor/execGrouping.c b/src/backend/executor/execGrouping.c
index 058ee688041..8e8dbb1f205 100644
--- a/src/backend/executor/execGrouping.c
+++ b/src/backend/executor/execGrouping.c
@@ -23,6 +23,7 @@
 #include "executor/executor.h"
 #include "miscadmin.h"
 #include "utils/lsyscache.h"
+#include "utils/hashutils.h"
 #include "utils/memutils.h"
 
 static uint32 TupleHashTableHash(struct tuplehash_hash *tb, const MinimalTuple tuple);
@@ -326,7 +327,7 @@ BuildTupleHashTable(int numCols, AttrNumber *keyColIdx,
 	 * underestimated.
 	 */
 	if (use_variable_hash_iv)
-		hashtable->hash_iv = hash_uint32(ParallelWorkerNumber);
+		hashtable->hash_iv = murmurhash32(ParallelWorkerNumber);
 	else
 		hashtable->hash_iv = 0;
 
@@ -510,7 +511,13 @@ TupleHashTableHash(struct tuplehash_hash *tb, const MinimalTuple tuple)
 		}
 	}
 
-	return hashkey;
+	/*
+	 * The way hashes are combined above, among each other and with the IV,
+	 * doesn't lead to good bit perturbation. As the IV's goal is to lead to
+	 * achieve that, perform a round of hashing of the combined hash -
+	 * resulting in near perfect perturbation.
+	 */
+	return murmurhash32(hashkey);
 }
 
 /*
diff --git a/src/test/regress/expected/groupingsets.out b/src/test/regress/expected/groupingsets.out
index cbfdbfd8563..d21a494a9dd 100644
--- a/src/test/regress/expected/groupingsets.out
+++ b/src/test/regress/expected/groupingsets.out
@@ -1183,29 +1183,33 @@ explain (costs off)
 -- simple rescan tests
 select a, b, sum(v.x)
   from (values (1),(2)) v(x), gstest_data(v.x)
- group by grouping sets (a,b);
+ group by grouping sets (a,b)
+ order by 1, 2, 3;
  a | b | sum 
 ---+---+-----
- 2 |   |   6
  1 |   |   3
+ 2 |   |   6
+   | 1 |   3
    | 2 |   3
    | 3 |   3
-   | 1 |   3
 (5 rows)
 
 explain (costs off)
   select a, b, sum(v.x)
     from (values (1),(2)) v(x), gstest_data(v.x)
-   group by grouping sets (a,b);
-                QUERY PLAN                
-------------------------------------------
- HashAggregate
-   Hash Key: gstest_data.a
-   Hash Key: gstest_data.b
-   ->  Nested Loop
-         ->  Values Scan on "*VALUES*"
-         ->  Function Scan on gstest_data
-(6 rows)
+   group by grouping sets (a,b)
+   order by 3, 1, 2;
+                             QUERY PLAN                              
+---------------------------------------------------------------------
+ Sort
+   Sort Key: (sum("*VALUES*".column1)), gstest_data.a, gstest_data.b
+   ->  HashAggregate
+         Hash Key: gstest_data.a
+         Hash Key: gstest_data.b
+         ->  Nested Loop
+               ->  Values Scan on "*VALUES*"
+               ->  Function Scan on gstest_data
+(8 rows)
 
 select *
   from (values (1),(2)) v(x),
diff --git a/src/test/regress/sql/groupingsets.sql b/src/test/regress/sql/groupingsets.sql
index b28d8217c12..eb680286030 100644
--- a/src/test/regress/sql/groupingsets.sql
+++ b/src/test/regress/sql/groupingsets.sql
@@ -342,12 +342,13 @@ explain (costs off)
 
 select a, b, sum(v.x)
   from (values (1),(2)) v(x), gstest_data(v.x)
- group by grouping sets (a,b);
+ group by grouping sets (a,b)
+ order by 1, 2, 3;
 explain (costs off)
   select a, b, sum(v.x)
     from (values (1),(2)) v(x), gstest_data(v.x)
-   group by grouping sets (a,b);
-
+   group by grouping sets (a,b)
+   order by 3, 1, 2;
 select *
   from (values (1),(2)) v(x),
        lateral (select a, b, sum(v.x) from gstest_data(v.x) group by grouping sets (a,b)) s;
-- 
2.15.1.354.g95ec6b1b33.dirty