diff --git a/src/backend/executor/execPartition.c b/src/backend/executor/execPartition.c index e03ea27299..6a323436d5 100644 --- a/src/backend/executor/execPartition.c +++ b/src/backend/executor/execPartition.c @@ -1332,10 +1332,48 @@ FormPartitionKeyDatum(PartitionDispatch pd, elog(ERROR, "wrong number of partition key expressions"); } +/* + * The number of times the same partition must be found in a row before we + * switch from a search for the given values to just checking if the values + * belong to the last found partition. This must be above 0. + */ +#define PARTITION_CACHED_FIND_THRESHOLD 16 + /* * get_partition_for_tuple * Finds partition of relation which accepts the partition key specified - * in values and isnull + * in values and isnull. + * + * Calling this function can be quite expensive when LIST and RANGE + * partitioned tables have many partitions. This is due to the binary search + * that's done to find the correct partition. Many of the use cases for LIST + * and RANGE partitioned tables make it likely that the same partition is + * found in subsequent ExecFindPartition() calls. This is especially true for + * cases such as RANGE partitioned tables on a TIMESTAMP column where the + * partition key is the current time. When asked to find a partition for a + * RANGE or LIST partitioned table, we record the partition index and datum + * offset we've found for the given 'values' in the PartitionDesc (which is + * stored in relcache), and if we keep finding the same partition + * PARTITION_CACHED_FIND_THRESHOLD times in a row, then we'll enable caching + * logic and instead of performing a binary search to find the correct + * partition, we'll just double-check that 'values' still belong to the last + * found partition, and if so, we'll return that partition index, thus + * skipping the need for the binary search. If we fail to match the last + * partition when double checking, then we fall back on doing a binary search. + * In this case, we'll reset the number of times we've hit the same partition + * so that we don't attempt to use the cache again until we've found that + * partition at least PARTITION_CACHED_FIND_THRESHOLD times in a row. + * + * For cases where the partition changes on each lookup, the amount of + * additional work required just amounts to recording the last found partition + * and bound offset then resetting the found counter. This is cheap and does + * not appear to cause any meaningful slowdowns for such cases. + * + * No caching of partitions is done when the last found partition is the + * DEFAULT or NULL partition. For the case of the DEFAULT partition, there + * is no bound offset storing the matching datum, so we cannot confirm the + * indexes match. For the NULL partition, this is just so cheap, there's no + * sense in caching. * * Return value is index of the partition (>= 0 and < partdesc->nparts) if one * found or -1 if none found. @@ -1343,12 +1381,24 @@ FormPartitionKeyDatum(PartitionDispatch pd, static int get_partition_for_tuple(PartitionDispatch pd, Datum *values, bool *isnull) { - int bound_offset; + int bound_offset = -1; int part_index = -1; PartitionKey key = pd->key; PartitionDesc partdesc = pd->partdesc; PartitionBoundInfo boundinfo = partdesc->boundinfo; + /* + * In the switch statement below, when we perform a cached lookup for + * RANGE and LIST partitioned tables, if we find that the last found + * partition matches the 'values', we return the partition index right + * away. We do this instead of breaking out of the switch as we don't + * want to execute the code about the default partition or do any updates + * for any of the cache-related fields. That would be a waste of effort + * as we already know it's not the DEFAULT partition and have no need to + * increment the number of times we found the same partition any higher + * than PARTITION_CACHED_FIND_THRESHOLD. + */ + /* Route as appropriate based on partitioning strategy. */ switch (key->strategy) { @@ -1356,24 +1406,62 @@ get_partition_for_tuple(PartitionDispatch pd, Datum *values, bool *isnull) { uint64 rowHash; + /* hash partitioning is too cheap to bother caching */ rowHash = compute_partition_hash_value(key->partnatts, key->partsupfunc, key->partcollation, values, isnull); - part_index = boundinfo->indexes[rowHash % boundinfo->nindexes]; + /* + * HASH partitions can't have a DEFAULT partition and we don't + * do any caching work for them, so just return the part index + */ + return boundinfo->indexes[rowHash % boundinfo->nindexes]; } - break; case PARTITION_STRATEGY_LIST: if (isnull[0]) { + /* this is far too cheap to bother doing any caching */ if (partition_bound_accepts_nulls(boundinfo)) - part_index = boundinfo->null_index; + { + /* + * When there is a NULL partition we just return that + * directly. We don't have a bound_offset so it's not + * valid to drop into the code after the switch which + * checks and updates the cache fields. We perhaps should + * be invalidating the details of the last cached + * partition but there's no real need to. Keeping those + * fields set gives a chance at a matching to the cached + * partition on the next lookup. + */ + return boundinfo->null_index; + } } else { - bool equal = false; + bool equal; + + if (partdesc->last_found_count >= PARTITION_CACHED_FIND_THRESHOLD) + { + int last_datum_offset = partdesc->last_found_datum_index; + Datum lastDatum = boundinfo->datums[last_datum_offset][0]; + int32 cmpval; + + /* + * Check if the last found datum index is the same as this + * Datum. + */ + cmpval = DatumGetInt32(FunctionCall2Coll(&key->partsupfunc[0], + key->partcollation[0], + lastDatum, + values[0])); + + if (cmpval == 0) + return boundinfo->indexes[last_datum_offset]; + + /* fall-through and do a manual lookup */ + } bound_offset = partition_list_bsearch(key->partsupfunc, key->partcollation, @@ -1403,23 +1491,63 @@ get_partition_for_tuple(PartitionDispatch pd, Datum *values, bool *isnull) } } - if (!range_partkey_has_null) + if (range_partkey_has_null) + break; + + if (partdesc->last_found_count >= PARTITION_CACHED_FIND_THRESHOLD) { - bound_offset = partition_range_datum_bsearch(key->partsupfunc, - key->partcollation, - boundinfo, - key->partnatts, - values, - &equal); + int last_datum_offset = partdesc->last_found_datum_index; + Datum *lastDatums = boundinfo->datums[last_datum_offset]; + PartitionRangeDatumKind *kind = boundinfo->kind[last_datum_offset]; + int32 cmpval; + + /* Check if the value is >= to the lower bound */ + cmpval = partition_rbound_datum_cmp(key->partsupfunc, + key->partcollation, + lastDatums, + kind, + values, + key->partnatts); /* - * The bound at bound_offset is less than or equal to the - * tuple value, so the bound at offset+1 is the upper - * bound of the partition we're looking for, if there - * actually exists one. + * If it's equal to the lower bound then no need to check + * the upper bound. */ - part_index = boundinfo->indexes[bound_offset + 1]; + if (cmpval == 0) + return boundinfo->indexes[last_datum_offset + 1]; + + else if (cmpval < 0 && last_datum_offset + 1 < boundinfo->ndatums) + { + /* Check if the value is below the upper bound */ + lastDatums = boundinfo->datums[last_datum_offset + 1]; + kind = boundinfo->kind[last_datum_offset + 1]; + cmpval = partition_rbound_datum_cmp(key->partsupfunc, + key->partcollation, + lastDatums, + kind, + values, + key->partnatts); + + if (cmpval > 0) + return boundinfo->indexes[last_datum_offset + 1]; + } + /* fall-through and do a manual lookup */ } + + bound_offset = partition_range_datum_bsearch(key->partsupfunc, + key->partcollation, + boundinfo, + key->partnatts, + values, + &equal); + + /* + * The bound at bound_offset is less than or equal to the + * tuple value, so the bound at offset+1 is the upper bound of + * the partition we're looking for, if there actually exists + * one. + */ + part_index = boundinfo->indexes[bound_offset + 1]; } break; @@ -1433,7 +1561,39 @@ get_partition_for_tuple(PartitionDispatch pd, Datum *values, bool *isnull) * the default partition, if there is one. */ if (part_index < 0) - part_index = boundinfo->default_index; + { + /* + * Since we don't do caching for the default partition or failed + * lookups, we'll just wipe the cache fields back to their initial + * values. The count becomes 0 rather than 1 as 1 means it's the + * first time we've found a partition we're recording for the cache. + */ + partdesc->last_found_datum_index = -1; + partdesc->last_found_part_index = -1; + partdesc->last_found_count = 0; + + return boundinfo->default_index; + } + + /* We should only make it here when the code above set bound_offset */ + Assert(bound_offset >= 0); + + /* + * Attend to the cache fields. If the bound_offset matches the last + * cached bound offset then we've found the same partition as last time, + * so bump the count by one. If all goes well we'll eventually reach + * PARTITION_CACHED_FIND_THRESHOLD and we'll try the cache path next time + * around. Otherwise, we'll reset the cache count back to 1 to mark that + * we've found this partition for the first time. + */ + if (bound_offset == partdesc->last_found_datum_index) + partdesc->last_found_count++; + else + { + partdesc->last_found_count = 1; + partdesc->last_found_part_index = part_index; + partdesc->last_found_datum_index = bound_offset; + } return part_index; } diff --git a/src/backend/partitioning/partdesc.c b/src/backend/partitioning/partdesc.c index 8b6e0bd595..737f0edd89 100644 --- a/src/backend/partitioning/partdesc.c +++ b/src/backend/partitioning/partdesc.c @@ -290,6 +290,12 @@ RelationBuildPartitionDesc(Relation rel, bool omit_detached) { oldcxt = MemoryContextSwitchTo(new_pdcxt); partdesc->boundinfo = partition_bounds_copy(boundinfo, key); + + /* Initialize caching fields for speeding up ExecFindPartition */ + partdesc->last_found_datum_index = -1; + partdesc->last_found_part_index = -1; + partdesc->last_found_count = 0; + partdesc->oids = (Oid *) palloc(nparts * sizeof(Oid)); partdesc->is_leaf = (bool *) palloc(nparts * sizeof(bool)); diff --git a/src/include/partitioning/partdesc.h b/src/include/partitioning/partdesc.h index ae1afe3d78..4659fe0e64 100644 --- a/src/include/partitioning/partdesc.h +++ b/src/include/partitioning/partdesc.h @@ -36,6 +36,32 @@ typedef struct PartitionDescData * the corresponding 'oids' element belongs to * a leaf partition or not */ PartitionBoundInfo boundinfo; /* collection of partition bounds */ + + /* Caching fields to cache lookups in get_partition_for_tuple() */ + + /* + * Index into the PartitionBoundInfo's datum array for the last found + * partition or -1 if none. + */ + int last_found_datum_index; + + /* + * Partition index of the last found partition or -1 if none has been + * found yet, the last found was the DEFAULT partition, or there was no + * valid partition for the last looked up values. + */ + int last_found_part_index; + + /* + * For LIST partitioning, this is the number of times in a row that the + * the datum we're looking for a partition for matches the datum in the + * last_found_datum_index index of the boundinfo->datums array. For RANGE + * partitioning, this is the number of times in a row we've found that the + * datum we're looking for a partition for falls into the range of the + * partition corresponding to the last_found_datum_index index of the + * boundinfo->datums array. + */ + int last_found_count; } PartitionDescData;