From eb5682bde2b070bac09f69b817b2be4dc73e3332 Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Wed, 14 Nov 2018 12:15:44 -0500 Subject: [PATCH 2/2] Ensure that RelationBuildPartitionDesc sees a consistent view. If partitions are added or removed concurrently, make sure that we nevertheless get a view of the partition list and the partition descriptor for each partition which is consistent with the system state at some single point in the commit history. To do this, reuse an idea first invented by Noah Misch back in commit 4240e429d0c2d889d0cda23c618f94e12c13ade7. --- src/backend/utils/cache/partcache.c | 135 ++++++++++++++++++++++++++---------- 1 file changed, 100 insertions(+), 35 deletions(-) diff --git a/src/backend/utils/cache/partcache.c b/src/backend/utils/cache/partcache.c index 0d732b4b84..623b156dd4 100644 --- a/src/backend/utils/cache/partcache.c +++ b/src/backend/utils/cache/partcache.c @@ -28,8 +28,10 @@ #include "optimizer/clauses.h" #include "optimizer/planner.h" #include "partitioning/partbounds.h" +#include "storage/sinval.h" #include "utils/builtins.h" #include "utils/datum.h" +#include "utils/inval.h" #include "utils/lsyscache.h" #include "utils/memutils.h" #include "utils/partcache.h" @@ -266,45 +268,113 @@ RelationBuildPartitionDesc(Relation rel) MemoryContext oldcxt; int *mapping; - /* Get partition oids from pg_inherits */ - inhoids = find_inheritance_children(RelationGetRelid(rel), NoLock); - nparts = list_length(inhoids); - - if (nparts > 0) + /* + * Fetch catalog information. Since we want to allow partitions to be + * added and removed without holding AccessExclusiveLock on the parent + * table, it's possible that the catalog contents could be changing under + * us. That means that by by the time we fetch the partition bound for a + * partition returned by find_inheritance_children, it might no longer be + * a partition or might even be a partition of some other table. + * + * To ensure that we get a consistent view of the catalog data, we first + * fetch everything we need and then call AcceptInvalidationMessages. If + * SharedInvalidMessageCounter advances between the time we start fetching + * information and the time AcceptInvalidationMessages() completes, that + * means something may have changed under us, so we start over and do it + * all again. + */ + for (;;) { - oids = palloc(nparts * sizeof(Oid)); - boundspecs = palloc(nparts * sizeof(PartitionBoundSpec *)); + uint64 inval_count = SharedInvalidMessageCounter; + + /* Get partition oids from pg_inherits */ + inhoids = find_inheritance_children(RelationGetRelid(rel), NoLock); + nparts = list_length(inhoids); + + if (nparts > 0) + { + oids = palloc(nparts * sizeof(Oid)); + boundspecs = palloc(nparts * sizeof(PartitionBoundSpec *)); + } + + /* Collect bound spec nodes for each partition */ + i = 0; + foreach(cell, inhoids) + { + Oid inhrelid = lfirst_oid(cell); + HeapTuple tuple; + PartitionBoundSpec *boundspec = NULL; + + /* + * Don't put any sanity checks here that might fail as a result of + * concurrent DDL, such as a check that relpartbound is not NULL. + * We could transiently see such states as a result of concurrent + * DDL. Such checks can be performed only after we're sure we got + * a consistent view of the underlying data. + */ + tuple = SearchSysCache1(RELOID, inhrelid); + if (HeapTupleIsValid(tuple)) + { + Datum datum; + bool isnull; + + datum = SysCacheGetAttr(RELOID, tuple, + Anum_pg_class_relpartbound, + &isnull); + if (!isnull) + boundspec = stringToNode(TextDatumGetCString(datum)); + ReleaseSysCache(tuple); + } + + oids[i] = inhrelid; + boundspecs[i] = boundspec; + ++i; + } + + /* + * If no relevant catalog changes have occurred (see comments at the + * top of this loop, then we got a consistent view of our partition + * list and can stop now. + */ + AcceptInvalidationMessages(); + if (inval_count == SharedInvalidMessageCounter) + break; + + /* Something changed, so retry from the top. */ + if (oids != NULL) + { + pfree(oids); + oids = NULL; + } + if (boundspecs != NULL) + { + pfree(boundspecs); + boundspecs = NULL; + } + if (inhoids != NIL) + list_free(inhoids); } - /* Collect bound spec nodes for each partition */ - i = 0; - foreach(cell, inhoids) + /* + * At this point, we should have a consistent view of the data we got from + * pg_inherits and pg_class, so it's safe to perform some sanity checks. + */ + for (i = 0; i < nparts; ++i) { - Oid inhrelid = lfirst_oid(cell); - HeapTuple tuple; - Datum datum; - bool isnull; - PartitionBoundSpec *boundspec; + Oid inhrelid = oids[i]; + PartitionBoundSpec *spec = boundspecs[i]; - tuple = SearchSysCache1(RELOID, inhrelid); - if (!HeapTupleIsValid(tuple)) - elog(ERROR, "cache lookup failed for relation %u", inhrelid); - - datum = SysCacheGetAttr(RELOID, tuple, - Anum_pg_class_relpartbound, - &isnull); - if (isnull) - elog(ERROR, "null relpartbound for relation %u", inhrelid); - boundspec = stringToNode(TextDatumGetCString(datum)); - if (!IsA(boundspec, PartitionBoundSpec)) + if (!spec) + elog(ERROR, "missing relpartbound for relation %u", inhrelid); + if (!IsA(spec, PartitionBoundSpec)) elog(ERROR, "invalid relpartbound for relation %u", inhrelid); /* - * Sanity check: If the PartitionBoundSpec says this is the default - * partition, its OID should correspond to whatever's stored in - * pg_partitioned_table.partdefid; if not, the catalog is corrupt. + * If the PartitionBoundSpec says this is the default partition, its + * OID should match pg_partitioned_table.partdefid; if not, the + * catalog is corrupt. */ - if (boundspec->is_default) + if (spec->is_default) { Oid partdefid; @@ -313,11 +383,6 @@ RelationBuildPartitionDesc(Relation rel) elog(ERROR, "expected partdefid %u, but got %u", inhrelid, partdefid); } - - oids[i] = inhrelid; - boundspecs[i] = boundspec; - ++i; - ReleaseSysCache(tuple); } /* Now build the actual relcache partition descriptor */ -- 2.11.0