From f2c2a04a34ffde2942cc5b75d66eaa6b524c12bc Mon Sep 17 00:00:00 2001 From: David Rowley Date: Thu, 14 Jul 2022 16:56:10 +1200 Subject: [PATCH v11 2/3] Do partition caching another way --- src/backend/executor/execPartition.c | 357 +++++++++++---------------- 1 file changed, 151 insertions(+), 206 deletions(-) diff --git a/src/backend/executor/execPartition.c b/src/backend/executor/execPartition.c index aca08791e9..7bdf78af99 100644 --- a/src/backend/executor/execPartition.c +++ b/src/backend/executor/execPartition.c @@ -176,8 +176,6 @@ static void FormPartitionKeyDatum(PartitionDispatch pd, EState *estate, Datum *values, bool *isnull); -static int get_partition_for_tuple_using_cache(PartitionDispatch pd, - Datum *values, bool *isnull); static int get_partition_for_tuple(PartitionDispatch pd, Datum *values, bool *isnull); static char *ExecBuildSlotPartitionKeyDescription(Relation rel, @@ -320,7 +318,7 @@ ExecFindPartition(ModifyTableState *mtstate, * these values, error out. */ if (partdesc->nparts == 0 || - (partidx = get_partition_for_tuple_using_cache(dispatch, values, isnull)) < 0) + (partidx = get_partition_for_tuple(dispatch, values, isnull)) < 0) { char *val_desc; @@ -1334,195 +1332,42 @@ FormPartitionKeyDatum(PartitionDispatch pd, elog(ERROR, "wrong number of partition key expressions"); } -/* - * find_last_partition_for_tuple - * Checks if 'values' and 'isnull' matches the last found partition and - * returns the partition index of that partition or -1 if the given - * values don't belong to the last found partition. - * - * Note: If calculating the correct partition is just as cheap as checking if - * these values belong to the last partition, here we just calculate the - * correct partition for the given values. This is the case for HASH - * partitioning and for LIST partitioning with a NULL value. 
- */ -static inline int -find_last_partition_for_tuple(PartitionDispatch pd, PartitionDesc partdesc, - Datum *values, bool *isnull) -{ - PartitionKey key; - PartitionBoundInfo boundinfo; - - /* No last partition? No match then. */ - if (partdesc->last_found_part_index == -1) - return -1; - - key = pd->key; - boundinfo = partdesc->boundinfo; - - switch (key->strategy) - { - case PARTITION_STRATEGY_HASH: - { - uint64 rowHash; - - rowHash = compute_partition_hash_value(key->partnatts, - key->partsupfunc, - key->partcollation, - values, isnull); - - /* Just calculate the correct partition and return it */ - return boundinfo->indexes[rowHash % boundinfo->nindexes]; - } - - case PARTITION_STRATEGY_LIST: - if (isnull[0]) - { - /* Just return the NULL partition, if there is one */ - return boundinfo->null_index; - } - else - { - int last_datum_offset = partdesc->last_found_datum_index; - Datum lastDatum = boundinfo->datums[last_datum_offset][0]; - int32 cmpval; - - cmpval = DatumGetInt32(FunctionCall2Coll(&key->partsupfunc[0], - key->partcollation[0], - lastDatum, - values[0])); - - if (cmpval == 0) - return boundinfo->indexes[last_datum_offset]; - break; - } - - case PARTITION_STRATEGY_RANGE: - { - int last_datum_offset = partdesc->last_found_datum_index; - Datum *lastDatums = boundinfo->datums[last_datum_offset]; - PartitionRangeDatumKind *kind = boundinfo->kind[last_datum_offset]; - int32 cmpval; - - /* Check for NULLs and abort the cache check if we find any */ - for (int i = 0; i < key->partnatts; i++) - { - if (isnull[i]) - return -1; - } - - /* Check if the value is equal to the lower bound */ - cmpval = partition_rbound_datum_cmp(key->partsupfunc, - key->partcollation, - lastDatums, - kind, - values, - key->partnatts); - - if (cmpval == 0) - return boundinfo->indexes[last_datum_offset + 1]; - - else if (cmpval < 0 && last_datum_offset + 1 < boundinfo->ndatums) - { - /* Check if the value is below the upper bound */ - lastDatums = boundinfo->datums[last_datum_offset 
+ 1]; - kind = boundinfo->kind[last_datum_offset + 1]; - cmpval = partition_rbound_datum_cmp(key->partsupfunc, - key->partcollation, - lastDatums, - kind, - values, - key->partnatts); - - if (cmpval > 0) - return boundinfo->indexes[last_datum_offset + 1]; - } - break; - } - - default: - elog(ERROR, "unexpected partition strategy: %d", - (int) key->strategy); - } - - return -1; -} - /* * The number of times the same partition must be found in a row before we * switch from a search for the given values to just checking if the values - * belong to the last found partition. - */ -#define PARTITION_CACHED_FIND_THRESHOLD 16 - -/* - * get_partition_for_tuple_using_cache - * As get_partition_for_tuple, but use caching logic and check if the - * given 'values' and 'isnull' array also belong to the last found - * partition. If it does then this can save an expensive binary search - * for LIST and RANGE partitioning. + * belong to the last found partition. This must be above 0. */ -static int -get_partition_for_tuple_using_cache(PartitionDispatch pd, Datum *values, - bool *isnull) -{ - PartitionDesc partdesc = pd->partdesc; - int lastpart; - - /* - * When we've found that the same partition matches - * PARTITION_CACHED_FIND_THRESHOLD times in a row, instead of doing a - * partition search, we just check if the last partition found will also - * accept these values. If it does then that'll save us from searching - * for the correct partition. - */ +#define PARTITION_CACHED_FIND_THRESHOLD 16 - /* Have we found the same partition enough times to use the cache? */ - if (partdesc->last_found_count >= PARTITION_CACHED_FIND_THRESHOLD) - { - /* check if these values also belong to the last found partition */ - lastpart = find_last_partition_for_tuple(pd, partdesc, values, isnull); - - if (lastpart == -1) - { - /* - * The last partition did not match. We must fall back on a - * search for the correct partition without the cache. 
- */ - lastpart = get_partition_for_tuple(pd, values, isnull); - partdesc->last_found_count = 1; - return lastpart; - } - else - { - /* no point in advancing last_found_count any further */ - return lastpart; - } - } - else - { - int thispart; - - /* - * We've not met the threshold for caching yet. Just perform a search. - * get_partition_for_tuple will stash the last_found_part_index. - */ - lastpart = partdesc->last_found_part_index; - thispart = get_partition_for_tuple(pd, values, isnull); - - /* adjust the count accordingly if the partition matched or not */ - if (thispart == lastpart) - partdesc->last_found_count++; - else - partdesc->last_found_count = 1; - - return thispart; - } -} - -/* + /* * get_partition_for_tuple * Finds partition of relation which accepts the partition key specified - * in values and isnull + * in values and isnull. + * + * Calling this function can be quite expensive for LIST and RANGE partitioned + * tables with many partitions. This is due to the binary search that's done + * to find the correct partition. Many of the use cases for LIST and RANGE + * partitioned tables mean that the same partition is likely to be found in + * subsequent ExecFindPartition() calls. This is especially true for cases + * such as RANGE partitioned tables on a TIMESTAMP column where the partition + * key is the current time. When asked to find a partition for a RANGE or + * LIST partitioned table, we record the partition index we've found in the + * PartitionDesc (which is stored in the relcache), and if we keep finding the + * same partition PARTITION_CACHED_FIND_THRESHOLD times, then we'll enable + * caching logic and instead of performing a binary search, we'll double check + * that the values still belong to the last found partition, and if so, we'll + * return that partition index without performing the binary search. If we + * fail to match the last partition when double checking, then we fall back on + * doing a normal search. 
In this case, we'll set the number of times we've + * hit the partition back to 1 again so that we don't attempt to use the cache + * again. For cases where the partition changes on each lookup, the amount + * of additional work required just amounts to recording the last found + * partition and setting the found counter back to 1 again. + * + * No caching of partitions is done when the last found partition is the + * DEFAULT partition. In this case, we don't have sufficient information about + * the last found partition to confirm the Datum being looked up belongs to + * the DEFAULT partition. * * Return value is index of the partition (>= 0 and < partdesc->nparts) if one * found or -1 if none found. @@ -1536,6 +1381,18 @@ get_partition_for_tuple(PartitionDispatch pd, Datum *values, bool *isnull) PartitionDesc partdesc = pd->partdesc; PartitionBoundInfo boundinfo = partdesc->boundinfo; + /* + * In the switch statement below, when we perform a cached lookup for + * RANGE and LIST partitioned tables, if we find that the last found + * partition matches the 'values', we return the partition index right + * away. We do this instead of breaking out of the switch as we don't + * want to execute the code about the default partition or do any updates + * for any of the cache-related fields. That would be a waste of effort + * as we already know it's not the DEFAULT partition and have no need + * to increment the number of times we found the same partition any + * higher than PARTITION_CACHED_FIND_THRESHOLD. + */ + /* Route as appropriate based on partitioning strategy. 
*/ switch (key->strategy) { @@ -1543,24 +1400,56 @@ get_partition_for_tuple(PartitionDispatch pd, Datum *values, bool *isnull) { uint64 rowHash; + /* hash partitioning is too cheap to bother caching */ rowHash = compute_partition_hash_value(key->partnatts, key->partsupfunc, key->partcollation, values, isnull); - part_index = boundinfo->indexes[rowHash % boundinfo->nindexes]; + /* + * HASH partitions can't have a DEFAULT partition and we don't + * do any caching work for them, so just return the part index + */ + return boundinfo->indexes[rowHash % boundinfo->nindexes]; } - break; case PARTITION_STRATEGY_LIST: if (isnull[0]) { + /* this is far too cheap to bother doing any caching */ if (partition_bound_accepts_nulls(boundinfo)) part_index = boundinfo->null_index; } else { - bool equal = false; + bool equal; + + if (partdesc->last_found_count >= PARTITION_CACHED_FIND_THRESHOLD) + { + int last_datum_offset = partdesc->last_found_datum_index; + Datum lastDatum = boundinfo->datums[last_datum_offset][0]; + int32 cmpval; + + /* + * Check if the last found datum index is the same as this + * Datum. + */ + cmpval = DatumGetInt32(FunctionCall2Coll(&key->partsupfunc[0], + key->partcollation[0], + lastDatum, + values[0])); + + if (cmpval == 0) + return boundinfo->indexes[last_datum_offset]; + + /* + * The Datum has changed. Zero the number of times we've + * found last_found_datum_index in a row. 
+ */ + partdesc->last_found_count = 0; + + /* fall-through and do a manual lookup */ + } bound_offset = partition_list_bsearch(key->partsupfunc, key->partcollation, @@ -1593,24 +1482,65 @@ get_partition_for_tuple(PartitionDispatch pd, Datum *values, bool *isnull) } } - if (!range_partkey_has_null) + if (range_partkey_has_null) { - bound_offset = partition_range_datum_bsearch(key->partsupfunc, - key->partcollation, - boundinfo, - key->partnatts, - values, - &equal); + /* Zero the "winning streak" on the cache hit count */ + partdesc->last_found_count = 0; + break; + } - /* - * The bound at bound_offset is less than or equal to the - * tuple value, so the bound at offset+1 is the upper - * bound of the partition we're looking for, if there - * actually exists one. - */ - part_index = boundinfo->indexes[bound_offset + 1]; - partdesc->last_found_datum_index = bound_offset; + if (partdesc->last_found_count >= PARTITION_CACHED_FIND_THRESHOLD) + { + int last_datum_offset = partdesc->last_found_datum_index; + Datum *lastDatums = boundinfo->datums[last_datum_offset]; + PartitionRangeDatumKind *kind = boundinfo->kind[last_datum_offset]; + int32 cmpval; + + /* Check if the value is equal to the lower bound */ + cmpval = partition_rbound_datum_cmp(key->partsupfunc, + key->partcollation, + lastDatums, + kind, + values, + key->partnatts); + + if (cmpval == 0) + return boundinfo->indexes[last_datum_offset + 1]; + + else if (cmpval < 0 && last_datum_offset + 1 < boundinfo->ndatums) + { + /* Check if the value is below the upper bound */ + lastDatums = boundinfo->datums[last_datum_offset + 1]; + kind = boundinfo->kind[last_datum_offset + 1]; + cmpval = partition_rbound_datum_cmp(key->partsupfunc, + key->partcollation, + lastDatums, + kind, + values, + key->partnatts); + + if (cmpval > 0) + return boundinfo->indexes[last_datum_offset + 1]; + } + + /* fall-through and do a manual lookup */ } + + bound_offset = partition_range_datum_bsearch(key->partsupfunc, + key->partcollation, + 
boundinfo, + key->partnatts, + values, + &equal); + + /* + * The bound at bound_offset is less than or equal to the + * tuple value, so the bound at offset+1 is the upper + * bound of the partition we're looking for, if there + * actually exists one. + */ + part_index = boundinfo->indexes[bound_offset + 1]; + partdesc->last_found_datum_index = bound_offset; } break; @@ -1625,9 +1555,24 @@ get_partition_for_tuple(PartitionDispatch pd, Datum *values, bool *isnull) */ if (part_index < 0) part_index = boundinfo->default_index; - - partdesc->last_found_part_index = part_index; - + else + { + /* + * Attend to the cache fields. If this partition is the same as the + * last partition found, then bump the count by one. If all goes well + * we'll eventually reach PARTITION_CACHED_FIND_THRESHOLD and we'll + * try the cache path next time around. If the part_index is not the + * same as last time then we'll reset the cache count back to 1 and + * record this partition to say we've found this one once. + */ + if (part_index == partdesc->last_found_part_index) + partdesc->last_found_count++; + else + { + partdesc->last_found_count = 1; + partdesc->last_found_part_index = part_index; + } + } return part_index; } -- 2.35.1.windows.2