From 037f3534f5274eb7bcdb5adee262b5af624175e2 Mon Sep 17 00:00:00 2001
From: Kyotaro Horiguchi <horiguchi.kyotaro@lab.ntt.co.jp>
Date: Mon, 12 Mar 2018 15:52:18 +0900
Subject: [PATCH 2/4] introduce dynahash pruning

---
 src/backend/utils/hash/dynahash.c | 169 +++++++++++++++++++++++++++++++++-----
 src/include/utils/catcache.h      |  12 +++
 src/include/utils/hsearch.h       |  21 ++++-
 3 files changed, 182 insertions(+), 20 deletions(-)

diff --git a/src/backend/utils/hash/dynahash.c b/src/backend/utils/hash/dynahash.c
index 5281cd5410..a5b4979662 100644
--- a/src/backend/utils/hash/dynahash.c
+++ b/src/backend/utils/hash/dynahash.c
@@ -88,6 +88,7 @@
 #include "access/xact.h"
 #include "storage/shmem.h"
 #include "storage/spin.h"
+#include "utils/catcache.h"
 #include "utils/dynahash.h"
 #include "utils/memutils.h"
 
@@ -184,6 +185,10 @@ struct HASHHDR
 	long		ssize;			/* segment size --- must be power of 2 */
 	int			sshift;			/* segment shift = log2(ssize) */
 	int			nelem_alloc;	/* number of entries to allocate at once */
+	bool		prunable;		/* true if entries may be pruned */
+	HASH_PRUNE_CB	prune_cb;	/* pruning callback, NULL to remove directly */
+	int		   *memory_target;	/* pointer to memory target, in kilobytes */
+	int		   *prune_min_age;	/* pointer to min age; negative disables */
 
 #ifdef HASH_STATISTICS
 
@@ -227,16 +232,18 @@ struct HTAB
 	int			sshift;			/* segment shift = log2(ssize) */
 };
 
+#define HASHELEMENT_SIZE(ctlp) MAXALIGN((ctlp)->prunable ? sizeof(PRUNABLE_HASHELEMENT) : sizeof(HASHELEMENT))
+
 /*
  * Key (also entry) part of a HASHELEMENT
  */
-#define ELEMENTKEY(helem)  (((char *)(helem)) + MAXALIGN(sizeof(HASHELEMENT)))
+#define ELEMENTKEY(helem, ctlp)  (((char *)(helem)) + HASHELEMENT_SIZE(ctlp))
 
 /*
  * Obtain element pointer given pointer to key
  */
-#define ELEMENT_FROM_KEY(key)  \
-	((HASHELEMENT *) (((char *) (key)) - MAXALIGN(sizeof(HASHELEMENT))))
+#define ELEMENT_FROM_KEY(key, ctlp)										\
+	((HASHELEMENT *) (((char *) (key)) - HASHELEMENT_SIZE(ctlp)))
 
 /*
  * Fast MOD arithmetic, assuming that y is a power of 2 !
@@ -257,6 +264,7 @@ static HASHSEGMENT seg_alloc(HTAB *hashp);
 static bool element_alloc(HTAB *hashp, int nelem, int freelist_idx);
 static bool dir_realloc(HTAB *hashp);
 static bool expand_table(HTAB *hashp);
+static bool prune_entries(HTAB *hashp);
 static HASHBUCKET get_hash_entry(HTAB *hashp, int freelist_idx);
 static void hdefault(HTAB *hashp);
 static int	choose_nelem_alloc(Size entrysize);
@@ -497,6 +505,25 @@ hash_create(const char *tabname, long nelem, HASHCTL *info, int flags)
 		hctl->entrysize = info->entrysize;
 	}
 
+	/*
+	 * Set up pruning.  Settings the caller leaves NULL fall back to syscache's.
+	 */
+	if (flags & HASH_PRUNABLE)
+	{
+		hctl->prunable = true;
+		hctl->prune_cb = info->prune_cb;
+		if (info->memory_target)
+			hctl->memory_target = info->memory_target;
+		else
+			hctl->memory_target = &syscache_memory_target;
+		if (info->prune_min_age)
+			hctl->prune_min_age = info->prune_min_age;
+		else
+			hctl->prune_min_age = &syscache_prune_min_age;
+	}
+	else
+		hctl->prunable = false;
+
 	/* make local copies of heavily-used constant fields */
 	hashp->keysize = hctl->keysize;
 	hashp->ssize = hctl->ssize;
@@ -982,7 +1009,7 @@ hash_search_with_hash_value(HTAB *hashp,
 	while (currBucket != NULL)
 	{
 		if (currBucket->hashvalue == hashvalue &&
-			match(ELEMENTKEY(currBucket), keyPtr, keysize) == 0)
+			match(ELEMENTKEY(currBucket, hctl), keyPtr, keysize) == 0)
 			break;
 		prevBucketPtr = &(currBucket->link);
 		currBucket = *prevBucketPtr;
@@ -995,6 +1022,17 @@ hash_search_with_hash_value(HTAB *hashp,
 	if (foundPtr)
 		*foundPtr = (bool) (currBucket != NULL);
 
+	/* Record the access for pruning; every action but removal is a use */
+	if (hctl->prunable && currBucket != NULL && action != HASH_REMOVE)
+	{
+		PRUNABLE_HASHELEMENT *prunable_elm =
+			(PRUNABLE_HASHELEMENT *) currBucket;
+
+		if (prunable_elm->naccess < 2)
+			prunable_elm->naccess++;
+		prunable_elm->last_access = GetCatCacheClock();
+	}
+
 	/*
 	 * OK, now what?
 	 */
@@ -1002,7 +1040,8 @@ hash_search_with_hash_value(HTAB *hashp,
 	{
 		case HASH_FIND:
 			if (currBucket != NULL)
-				return (void *) ELEMENTKEY(currBucket);
+				return (void *) ELEMENTKEY(currBucket, hctl);
+
 			return NULL;
 
 		case HASH_REMOVE:
@@ -1031,7 +1070,7 @@ hash_search_with_hash_value(HTAB *hashp,
 				 * element, because someone else is going to reuse it the next
 				 * time something is added to the table
 				 */
-				return (void *) ELEMENTKEY(currBucket);
+				return (void *) ELEMENTKEY(currBucket, hctl);
 			}
 			return NULL;
 
@@ -1043,7 +1082,7 @@ hash_search_with_hash_value(HTAB *hashp,
 		case HASH_ENTER:
 			/* Return existing element if found, else create one */
 			if (currBucket != NULL)
-				return (void *) ELEMENTKEY(currBucket);
+				return (void *) ELEMENTKEY(currBucket, hctl);
 
 			/* disallow inserts if frozen */
 			if (hashp->frozen)
@@ -1073,8 +1112,18 @@ hash_search_with_hash_value(HTAB *hashp,
 
 			/* copy key into record */
 			currBucket->hashvalue = hashvalue;
-			hashp->keycopy(ELEMENTKEY(currBucket), keyPtr, keysize);
+			hashp->keycopy(ELEMENTKEY(currBucket, hctl), keyPtr, keysize);
 
+			/* naccess is never initialized for a new element: set, don't bump */
+			if (hctl->prunable)
+			{
+				PRUNABLE_HASHELEMENT *prunable_elm =
+					(PRUNABLE_HASHELEMENT *) currBucket;
+
+				prunable_elm->naccess = 1;	/* the insertion is one access */
+				prunable_elm->last_access = GetCatCacheClock();
+			}
+
 			/*
 			 * Caller is expected to fill the data field on return.  DO NOT
 			 * insert any code that could possibly throw error here, as doing
@@ -1082,7 +1131,7 @@ hash_search_with_hash_value(HTAB *hashp,
 			 * caller's data structure.
 			 */
 
-			return (void *) ELEMENTKEY(currBucket);
+			return (void *) ELEMENTKEY(currBucket, hctl);
 	}
 
 	elog(ERROR, "unrecognized hash action code: %d", (int) action);
@@ -1114,7 +1163,7 @@ hash_update_hash_key(HTAB *hashp,
 					 void *existingEntry,
 					 const void *newKeyPtr)
 {
-	HASHELEMENT *existingElement = ELEMENT_FROM_KEY(existingEntry);
+	HASHELEMENT *existingElement = ELEMENT_FROM_KEY(existingEntry, hashp->hctl);
 	HASHHDR    *hctl = hashp->hctl;
 	uint32		newhashvalue;
 	Size		keysize;
@@ -1198,7 +1247,7 @@ hash_update_hash_key(HTAB *hashp,
 	while (currBucket != NULL)
 	{
 		if (currBucket->hashvalue == newhashvalue &&
-			match(ELEMENTKEY(currBucket), newKeyPtr, keysize) == 0)
+			match(ELEMENTKEY(currBucket, hctl), newKeyPtr, keysize) == 0)
 			break;
 		prevBucketPtr = &(currBucket->link);
 		currBucket = *prevBucketPtr;
@@ -1232,7 +1281,7 @@ hash_update_hash_key(HTAB *hashp,
 
 	/* copy new key into record */
 	currBucket->hashvalue = newhashvalue;
-	hashp->keycopy(ELEMENTKEY(currBucket), newKeyPtr, keysize);
+	hashp->keycopy(ELEMENTKEY(currBucket, hctl), newKeyPtr, keysize);
 
 	/* rest of record is untouched */
 
@@ -1386,8 +1435,8 @@ hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp)
 void *
 hash_seq_search(HASH_SEQ_STATUS *status)
 {
-	HTAB	   *hashp;
-	HASHHDR    *hctl;
+	HTAB	   *hashp = status->hashp;
+	HASHHDR    *hctl = hashp->hctl;
 	uint32		max_bucket;
 	long		ssize;
 	long		segment_num;
@@ -1402,15 +1451,13 @@ hash_seq_search(HASH_SEQ_STATUS *status)
 		status->curEntry = curElem->link;
 		if (status->curEntry == NULL)	/* end of this bucket */
 			++status->curBucket;
-		return (void *) ELEMENTKEY(curElem);
+		return (void *) ELEMENTKEY(curElem, hctl);
 	}
 
 	/*
 	 * Search for next nonempty bucket starting at curBucket.
 	 */
 	curBucket = status->curBucket;
-	hashp = status->hashp;
-	hctl = hashp->hctl;
 	ssize = hashp->ssize;
 	max_bucket = hctl->max_bucket;
 
@@ -1456,7 +1503,7 @@ hash_seq_search(HASH_SEQ_STATUS *status)
 	if (status->curEntry == NULL)	/* end of this bucket */
 		++curBucket;
 	status->curBucket = curBucket;
-	return (void *) ELEMENTKEY(curElem);
+	return (void *) ELEMENTKEY(curElem, hctl);
 }
 
 void
@@ -1550,6 +1597,10 @@ expand_table(HTAB *hashp)
 	 */
 	if ((uint32) new_bucket > hctl->high_mask)
 	{
+		/* Prune instead of expanding. NOTE(review): max_bucket already bumped? */
+		if (hctl->prunable && prune_entries(hashp))
+			return true;
+
 		hctl->low_mask = hctl->high_mask;
 		hctl->high_mask = (uint32) new_bucket | hctl->low_mask;
 	}
@@ -1592,6 +1643,86 @@ expand_table(HTAB *hashp)
 	return true;
 }
 
+/*
+ * prune_entries -- remove long-unused entries from a prunable hash table
+ *
+ * Entries not accessed for more than *prune_min_age are first aged by
+ * decrementing naccess; once naccess is zero they are passed to prune_cb,
+ * or removed directly when no callback is set.
+ *
+ * Returns true if at least one entry was removed.
+ */
+static bool
+prune_entries(HTAB *hashp)
+{
+	HASHHDR    *hctl = hashp->hctl;
+	HASH_SEQ_STATUS status;
+	void	   *elm;
+	TimestampTz currclock = GetCatCacheClock();
+	int			nall = 0,
+				nremoved = 0;
+
+	Assert(hctl->prunable);
+
+	/* never called for partitioned or frozen tables, nor under a seqscan */
+	Assert(!IS_PARTITIONED(hctl) && !hashp->frozen && !has_seq_scans(hashp));
+
+	/* This setting prevents pruning */
+	if (*hctl->prune_min_age < 0)
+		return false;
+
+	/*
+	 * Return false immediately if this hash is small enough.  Only the bucket
+	 * array is considered; the memory target is in kilobytes.
+	 */
+	if (hctl->dsize * sizeof(HASHBUCKET) * hashp->ssize <
+		(Size) *hctl->memory_target * 1024L)
+		return false;
+
+	/* OK, this hash can be pruned; scan it through the public API */
+	hash_seq_init(&status, hashp);
+	while ((elm = hash_seq_search(&status)) != NULL)
+	{
+		PRUNABLE_HASHELEMENT *helm =
+			(PRUNABLE_HASHELEMENT *) ELEMENT_FROM_KEY(elm, hctl);
+		long		entry_age;
+		int			us;
+
+		nall++;
+		TimestampDifference(helm->last_access, currclock, &entry_age, &us);
+
+		/* entries accessed recently enough are left alone */
+		if (entry_age <= *hctl->prune_min_age)
+			continue;
+
+		/* Give the entry another chance if it has been used; just age it */
+		if (helm->naccess > 0)
+		{
+			helm->naccess--;
+			continue;
+		}
+
+		/* just call it if callback is provided, remove otherwise */
+		if (hctl->prune_cb)
+		{
+			if (hctl->prune_cb(hashp, elm))
+				nremoved++;
+		}
+		else
+		{
+			bool		found;
+
+			hash_search(hashp, elm, HASH_REMOVE, &found);
+			Assert(found);
+			nremoved++;
+		}
+	}
+
+	elog(DEBUG1, "removed %d/%d entries from hash \"%s\"",
+		 nremoved, nall, hashp->tabname);
+
+	return nremoved > 0;
+}
 
 static bool
 dir_realloc(HTAB *hashp)
@@ -1665,7 +1796,7 @@ element_alloc(HTAB *hashp, int nelem, int freelist_idx)
 		return false;
 
 	/* Each element has a HASHELEMENT header plus user data. */
-	elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(hctl->entrysize);
+	elementSize = HASHELEMENT_SIZE(hctl) + MAXALIGN(hctl->entrysize);
 
 	CurrentDynaHashCxt = hashp->hcxt;
 	firstElement = (HASHELEMENT *) hashp->alloc(nelem * elementSize);
diff --git a/src/include/utils/catcache.h b/src/include/utils/catcache.h
index c3c4d65998..fcc680bb82 100644
--- a/src/include/utils/catcache.h
+++ b/src/include/utils/catcache.h
@@ -208,6 +208,18 @@ SetCatCacheClock(TimestampTz ts)
 	catcacheclock = ts;
 }
 
+/*
+ * GetCatCacheClock - get timestamp for catcache access record
+ *
+ * Returns the value last stored by SetCatCacheClock().  The clock exists for
+ * catcache, but dynahash's similar pruning mechanism uses it as well.
+ */
+static inline TimestampTz
+GetCatCacheClock(void)
+{
+	return catcacheclock;
+}
+
 extern void CreateCacheMemoryContext(void);
 
 extern CatCache *InitCatCache(int id, Oid reloid, Oid indexoid,
diff --git a/src/include/utils/hsearch.h b/src/include/utils/hsearch.h
index 8357faac5a..7ea3c75423 100644
--- a/src/include/utils/hsearch.h
+++ b/src/include/utils/hsearch.h
@@ -13,7 +13,7 @@
  */
 #ifndef HSEARCH_H
 #define HSEARCH_H
-
+#include "datatype/timestamp.h"
 
 /*
  * Hash functions must have this signature.
@@ -47,6 +47,7 @@ typedef void *(*HashAllocFunc) (Size request);
  * HASHELEMENT is the private part of a hashtable entry.  The caller's data
  * follows the HASHELEMENT structure (on a MAXALIGN'd boundary).  The hash key
  * is expected to be at the start of the caller's hash entry data structure.
+ * If this hash is prunable, PRUNABLE_HASHELEMENT is used instead.
  */
 typedef struct HASHELEMENT
 {
@@ -54,12 +55,26 @@ typedef struct HASHELEMENT
 	uint32		hashvalue;		/* hash function result for this entry */
 } HASHELEMENT;
 
+typedef struct PRUNABLE_HASHELEMENT
+{
+	struct HASHELEMENT *link;	/* next entry in bucket; mirrors HASHELEMENT */
+	uint32		hashvalue;		/* hash function result; mirrors HASHELEMENT */
+	TimestampTz	last_access;	/* clock value at the last access */
+	int			naccess;		/* 0 to 2; bumped on access, aged by pruning */
+} PRUNABLE_HASHELEMENT;
+
 /* Hash table header struct is an opaque type known only within dynahash.c */
 typedef struct HASHHDR HASHHDR;
 
 /* Hash table control struct is an opaque type known only within dynahash.c */
 typedef struct HTAB HTAB;
 
+/*
+ * Hash pruning callback.  Called for each entry that pruning is about to
+ * evict; return true if the entry was removed so that it gets counted.
+ */
+typedef bool (*HASH_PRUNE_CB)(HTAB *hashp, void *ent);
+
 /* Parameter data structure for hash_create */
 /* Only those fields indicated by hash_flags need be set */
 typedef struct HASHCTL
@@ -77,6 +92,9 @@ typedef struct HASHCTL
 	HashAllocFunc alloc;		/* memory allocator */
 	MemoryContext hcxt;			/* memory context to use for allocations */
 	HASHHDR    *hctl;			/* location of header in shared mem */
+	HASH_PRUNE_CB	prune_cb;	/* pruning callback, or NULL; see above */
+	int		   *memory_target;	/* memory target; NULL = syscache's */
+	int		   *prune_min_age;	/* prune minimum age; NULL = syscache's */
 } HASHCTL;
 
 /* Flags to indicate which parameters are supplied */
@@ -94,6 +112,7 @@ typedef struct HASHCTL
 #define HASH_SHARED_MEM 0x0800	/* Hashtable is in shared memory */
 #define HASH_ATTACH		0x1000	/* Do not initialize hctl */
 #define HASH_FIXED_SIZE 0x2000	/* Initial size is a hard limit */
+#define HASH_PRUNABLE	0x4000	/* Table is prunable; pruning fields are read */
 
 
 /* max_dsize value to indicate expansible directory */
-- 
2.16.2

