From 612308a5a663d7398cd4666ec6ddbf2cee0376d1 Mon Sep 17 00:00:00 2001 From: Tomas Vondra Date: Thu, 19 Mar 2020 15:25:55 +0100 Subject: [PATCH 2/4] Implement incremental sort Incremental sort is an optimized variant of multikey sort for cases when the input is already sorted by a prefix of the sort keys. For example when a sort by (key1, key2 ... keyN) is requested, and the input is already sorted by (key1, key2 ... keyM), M < N, we can divide the input into groups where keys (key1, ... keyM) are equal, and only sort on the remaining columns. The implemented algorithm operates in two different modes: - Fetching a minimum number of tuples without checking prefix key group membership and sorting on all columns when safe. - Fetching all tuples for a single prefix key group and sorting on solely the unsorted columns. We always begin in the first mode, and employ a heuristic to switch into the second mode if we believe it's beneficial. Sorting incrementally can potentially use less memory (and possibly avoid spilling to disk), avoid fetching and sorting all tuples in the dataset (particularly useful when a LIMIT clause has been specified), and begin returning tuples before the entire result set is available. Small datasets which fit entirely in memory and must be fully realized and sorted may be slightly slower, which we reflect in the costing implementation. The hybrid mode approach allows us to optimize for both very small groups (where the overhead of a new tuplesort is high) and very large groups (where we can lower cost by not having to sort on already sorted columns), albeit at some extra cost while switching between modes. Co-authored-by: Alexander Korotkov --- doc/src/sgml/config.sgml | 22 + src/backend/commands/explain.c | 223 ++- src/backend/executor/Makefile | 1 + src/backend/executor/execAmi.c | 14 + src/backend/executor/execParallel.c | 18 + src/backend/executor/execProcnode.c | 34 + src/backend/executor/nodeIncrementalSort.c | 1267 +++++++++++++++ src/backend/executor/nodeSort.c | 3 +- src/backend/nodes/copyfuncs.c | 49 +- src/backend/nodes/outfuncs.c | 25 +- src/backend/nodes/readfuncs.c | 37 +- src/backend/optimizer/path/allpaths.c | 4 + src/backend/optimizer/path/costsize.c | 198 ++- src/backend/optimizer/path/pathkeys.c | 61 +- src/backend/optimizer/plan/createplan.c | 143 +- src/backend/optimizer/plan/planner.c | 74 +- src/backend/optimizer/plan/setrefs.c | 1 + src/backend/optimizer/plan/subselect.c | 1 + src/backend/optimizer/util/pathnode.c | 51 + src/backend/utils/misc/guc.c | 9 + src/backend/utils/misc/postgresql.conf.sample | 1 + src/backend/utils/sort/tuplesort.c | 306 +++- src/include/executor/execdebug.h | 2 + src/include/executor/nodeIncrementalSort.h | 28 + src/include/nodes/execnodes.h | 80 + src/include/nodes/nodes.h | 3 + src/include/nodes/pathnodes.h | 9 + src/include/nodes/plannodes.h | 10 + src/include/optimizer/cost.h | 10 + src/include/optimizer/pathnode.h | 6 + src/include/optimizer/paths.h | 2 + src/include/utils/tuplesort.h | 3 + .../expected/drop-index-concurrently-1.out | 2 +- .../regress/expected/incremental_sort.out | 1400 +++++++++++++++++ .../regress/expected/partition_aggregate.out | 2 + src/test/regress/expected/sysviews.out | 3 +- src/test/regress/parallel_schedule | 2 +- src/test/regress/serial_schedule | 1 + src/test/regress/sql/incremental_sort.sql | 194 +++ src/test/regress/sql/partition_aggregate.sql | 2 + 40 files changed, 4141 insertions(+), 160 deletions(-) create mode 100644 src/backend/executor/nodeIncrementalSort.c create mode 100644 
src/include/executor/nodeIncrementalSort.h create mode 100644 src/test/regress/expected/incremental_sort.out create mode 100644 src/test/regress/sql/incremental_sort.sql diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index 2de21903a1..2f2e19dc64 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -4554,6 +4554,28 @@ ANY num_sync ( + enable_incrementalsort (boolean) + + enable_incrementalsort configuration parameter + + + + + Enables or disables the query planner's use of incremental sort, which + allows the planner to take advantage of data presorted on columns + 1..m when an ordering on columns 1..n + (where m < n) is required. Compared to regular sorts, + incremental sort allows returning tuples before the entire result set + has been sorted, particularly enabling optimizations with + LIMIT queries. It may also reduce memory usage and + the likelihood of spilling sorts to disk, but comes at the cost of + increased overhead splitting the result set into multiple sorting + batches. The default is on. + + + + enable_indexscan (boolean) diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index ff2f45cfb2..85d7bcb78f 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -82,6 +82,8 @@ static void show_upper_qual(List *qual, const char *qlabel, ExplainState *es); static void show_sort_keys(SortState *sortstate, List *ancestors, ExplainState *es); +static void show_incremental_sort_keys(IncrementalSortState *incrsortstate, + List *ancestors, ExplainState *es); static void show_merge_append_keys(MergeAppendState *mstate, List *ancestors, ExplainState *es); static void show_agg_keys(AggState *astate, List *ancestors, @@ -95,7 +97,7 @@ static void show_grouping_set_keys(PlanState *planstate, static void show_group_keys(GroupState *gstate, List *ancestors, ExplainState *es); static void show_sort_group_keys(PlanState *planstate, const char *qlabel, - int nkeys, AttrNumber *keycols, + int nkeys, int nPresortedKeys, AttrNumber *keycols, Oid *sortOperators, Oid *collations, bool *nullsFirst, List *ancestors, ExplainState *es); static void show_sortorder_options(StringInfo buf, Node *sortexpr, @@ -103,6 +105,8 @@ static void show_sortorder_options(StringInfo buf, Node *sortexpr, static void show_tablesample(TableSampleClause *tsc, PlanState *planstate, List *ancestors, ExplainState *es); static void show_sort_info(SortState *sortstate, ExplainState *es); +static void show_incremental_sort_info(IncrementalSortState *incrsortstate, + ExplainState *es); static void show_hash_info(HashState *hashstate, ExplainState *es); static void show_hashagg_info(AggState *hashstate, ExplainState *es); static void show_tidbitmap_info(BitmapHeapScanState *planstate, @@ -1240,6 +1244,9 @@ ExplainNode(PlanState *planstate, List *ancestors, case T_Sort: pname = sname = "Sort"; break; + case T_IncrementalSort: + pname = sname = "Incremental Sort"; + break; case T_Group: pname = sname = "Group"; break; @@ -1899,6 +1906,12 @@ ExplainNode(PlanState *planstate, List *ancestors, show_sort_keys(castNode(SortState, planstate), ancestors, es); show_sort_info(castNode(SortState, planstate), es); break; + case T_IncrementalSort: + show_incremental_sort_keys(castNode(IncrementalSortState, planstate), + ancestors, es); + show_incremental_sort_info(castNode(IncrementalSortState, planstate), + es); + break; case T_MergeAppend: show_merge_append_keys(castNode(MergeAppendState, planstate), ancestors, es); @@ -2227,12 +2240,29 @@ 
show_sort_keys(SortState *sortstate, List *ancestors, ExplainState *es) Sort *plan = (Sort *) sortstate->ss.ps.plan; show_sort_group_keys((PlanState *) sortstate, "Sort Key", - plan->numCols, plan->sortColIdx, + plan->numCols, 0, plan->sortColIdx, plan->sortOperators, plan->collations, plan->nullsFirst, ancestors, es); } +/* + * Show the sort keys for an IncrementalSort node. + */ +static void +show_incremental_sort_keys(IncrementalSortState *incrsortstate, + List *ancestors, ExplainState *es) +{ + IncrementalSort *plan = (IncrementalSort *) incrsortstate->ss.ps.plan; + + show_sort_group_keys((PlanState *) incrsortstate, "Sort Key", + plan->sort.numCols, plan->presortedCols, + plan->sort.sortColIdx, + plan->sort.sortOperators, plan->sort.collations, + plan->sort.nullsFirst, + ancestors, es); +} + /* * Likewise, for a MergeAppend node. */ @@ -2243,7 +2273,7 @@ show_merge_append_keys(MergeAppendState *mstate, List *ancestors, MergeAppend *plan = (MergeAppend *) mstate->ps.plan; show_sort_group_keys((PlanState *) mstate, "Sort Key", - plan->numCols, plan->sortColIdx, + plan->numCols, 0, plan->sortColIdx, plan->sortOperators, plan->collations, plan->nullsFirst, ancestors, es); @@ -2267,7 +2297,7 @@ show_agg_keys(AggState *astate, List *ancestors, show_grouping_sets(outerPlanState(astate), plan, ancestors, es); else show_sort_group_keys(outerPlanState(astate), "Group Key", - plan->numCols, plan->grpColIdx, + plan->numCols, 0, plan->grpColIdx, NULL, NULL, NULL, ancestors, es); @@ -2336,7 +2366,7 @@ show_grouping_set_keys(PlanState *planstate, if (sortnode) { show_sort_group_keys(planstate, "Sort Key", - sortnode->numCols, sortnode->sortColIdx, + sortnode->numCols, 0, sortnode->sortColIdx, sortnode->sortOperators, sortnode->collations, sortnode->nullsFirst, ancestors, es); @@ -2393,7 +2423,7 @@ show_group_keys(GroupState *gstate, List *ancestors, /* The key columns refer to the tlist of the child plan */ ancestors = lcons(plan, ancestors); show_sort_group_keys(outerPlanState(gstate), "Group Key", - plan->numCols, plan->grpColIdx, + plan->numCols, 0, plan->grpColIdx, NULL, NULL, NULL, ancestors, es); ancestors = list_delete_first(ancestors); @@ -2406,13 +2436,14 @@ show_group_keys(GroupState *gstate, List *ancestors, */ static void show_sort_group_keys(PlanState *planstate, const char *qlabel, - int nkeys, AttrNumber *keycols, + int nkeys, int nPresortedKeys, AttrNumber *keycols, Oid *sortOperators, Oid *collations, bool *nullsFirst, List *ancestors, ExplainState *es) { Plan *plan = planstate->plan; List *context; List *result = NIL; + List *resultPresorted = NIL; StringInfoData sortkeybuf; bool useprefix; int keyno; @@ -2452,9 +2483,13 @@ show_sort_group_keys(PlanState *planstate, const char *qlabel, nullsFirst[keyno]); /* Emit one property-list item per sort key */ result = lappend(result, pstrdup(sortkeybuf.data)); + if (keyno < nPresortedKeys) + resultPresorted = lappend(resultPresorted, exprstr); } ExplainPropertyList(qlabel, result, es); + if (nPresortedKeys > 0) + ExplainPropertyList("Presorted Key", resultPresorted, es); } /* @@ -2668,6 +2703,180 @@ show_sort_info(SortState *sortstate, ExplainState *es) } } +/* + * Incremental sort nodes sort in (a potentially very large number of) batches, + * so EXPLAIN ANALYZE needs to roll up the tuplesort stats from each batch into + * an intelligible summary. + * + * This function is used for both a non-parallel node and each worker in a + * parallel incremental sort node.
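+ * + * As a rough illustration (with hypothetical values), the rolled-up text-format line built below looks like: + * + * Full-sort Groups: 4 (Methods: quicksort) Memory: 26kB (avg), 27kB (max)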
+ */ +static void +show_incremental_sort_group_info(IncrementalSortGroupInfo *groupInfo, + const char *groupLabel, ExplainState *es) +{ + ListCell *methodCell; + int methodCount = list_length(groupInfo->sortMethods); + + if (es->format == EXPLAIN_FORMAT_TEXT) + { + appendStringInfoSpaces(es->str, es->indent * 2); + appendStringInfo(es->str, "%s Groups: %ld (Methods: ", groupLabel, + groupInfo->groupCount); + foreach(methodCell, groupInfo->sortMethods) + { + const char *sortMethodName; + + sortMethodName = tuplesort_method_name(methodCell->int_value); + appendStringInfo(es->str, "%s", sortMethodName); + if (foreach_current_index(methodCell) < methodCount - 1) + appendStringInfo(es->str, ", "); + } + appendStringInfo(es->str, ")"); + + if (groupInfo->maxMemorySpaceUsed > 0) + { + long avgSpace = groupInfo->totalMemorySpaceUsed / groupInfo->groupCount; + const char *spaceTypeName; + + spaceTypeName = tuplesort_space_type_name(SORT_SPACE_TYPE_MEMORY); + appendStringInfo(es->str, " %s: %ldkB (avg), %ldkB (max)", + spaceTypeName, avgSpace, + groupInfo->maxMemorySpaceUsed); + } + + if (groupInfo->maxDiskSpaceUsed > 0) + { + long avgSpace = groupInfo->totalDiskSpaceUsed / groupInfo->groupCount; + + const char *spaceTypeName; + + spaceTypeName = tuplesort_space_type_name(SORT_SPACE_TYPE_DISK); + /* Add a semicolon separator only if memory stats were printed. */ + if (groupInfo->maxMemorySpaceUsed > 0) + appendStringInfo(es->str, ";"); + appendStringInfo(es->str, " %s: %ldkB (avg), %ldkB (max)", + spaceTypeName, avgSpace, + groupInfo->maxDiskSpaceUsed); + } + + appendStringInfo(es->str, "\n"); + } + else + { + List *methodNames = NIL; + StringInfoData groupName; + + initStringInfo(&groupName); + appendStringInfo(&groupName, "%s Groups", groupLabel); + ExplainOpenGroup("Incremental Sort Groups", groupName.data, true, es); + ExplainPropertyInteger("Group Count", NULL, groupInfo->groupCount, es); + + foreach(methodCell, groupInfo->sortMethods) + { + const char *sortMethodName = tuplesort_method_name(methodCell->int_value); + + methodNames = lappend(methodNames, unconstify(char *, sortMethodName)); + } + ExplainPropertyList("Sort Methods Used", methodNames, es); + + if (groupInfo->maxMemorySpaceUsed > 0) + { + long avgSpace = groupInfo->totalMemorySpaceUsed / groupInfo->groupCount; + const char *spaceTypeName; + StringInfoData memoryName; + + spaceTypeName = tuplesort_space_type_name(SORT_SPACE_TYPE_MEMORY); + initStringInfo(&memoryName); + appendStringInfo(&memoryName, "Sort Space %s", spaceTypeName); + ExplainOpenGroup("Sort Space", memoryName.data, true, es); + + ExplainPropertyInteger("Average Sort Space Used", "kB", avgSpace, es); + ExplainPropertyInteger("Maximum Sort Space Used", "kB", + groupInfo->maxMemorySpaceUsed, es); + + ExplainCloseGroup("Sort Space", memoryName.data, true, es); + } + if (groupInfo->maxDiskSpaceUsed > 0) + { + long avgSpace = groupInfo->totalDiskSpaceUsed / groupInfo->groupCount; + const char *spaceTypeName; + StringInfoData diskName; + + spaceTypeName = tuplesort_space_type_name(SORT_SPACE_TYPE_DISK); + initStringInfo(&diskName); + appendStringInfo(&diskName, "Sort Space %s", spaceTypeName); + ExplainOpenGroup("Sort Space", diskName.data, true, es); + + ExplainPropertyInteger("Average Sort Space Used", "kB", avgSpace, es); + ExplainPropertyInteger("Maximum Sort Space Used", "kB", + groupInfo->maxDiskSpaceUsed, es); + + ExplainCloseGroup("Sort Space", diskName.data, true, es); + } + + ExplainCloseGroup("Incremental Sort Groups", groupName.data, true, es); + } +} + +/* +
* If it's EXPLAIN ANALYZE, show tuplesort stats for an incremental sort node + */ +static void +show_incremental_sort_info(IncrementalSortState *incrsortstate, + ExplainState *es) +{ + IncrementalSortGroupInfo *fullsortGroupInfo; + IncrementalSortGroupInfo *prefixsortGroupInfo; + + fullsortGroupInfo = &incrsortstate->incsort_info.fullsortGroupInfo; + + if (!(es->analyze && fullsortGroupInfo->groupCount > 0)) + return; + + show_incremental_sort_group_info(fullsortGroupInfo, "Full-sort", es); + prefixsortGroupInfo = &incrsortstate->incsort_info.prefixsortGroupInfo; + if (prefixsortGroupInfo->groupCount > 0) + show_incremental_sort_group_info(prefixsortGroupInfo, "Presorted", es); + + if (incrsortstate->shared_info != NULL) + { + int n; + bool opened_group = false; + + for (n = 0; n < incrsortstate->shared_info->num_workers; n++) + { + IncrementalSortInfo *incsort_info = + &incrsortstate->shared_info->sinfo[n]; + + /* + * If a worker hasn't processed any sort groups at all, then exclude + * it from output since it either didn't launch or didn't + * contribute anything meaningful. + */ + fullsortGroupInfo = &incsort_info->fullsortGroupInfo; + prefixsortGroupInfo = &incsort_info->prefixsortGroupInfo; + if (fullsortGroupInfo->groupCount == 0 && + prefixsortGroupInfo->groupCount == 0) + continue; + + if (!opened_group) + { + ExplainOpenGroup("Workers", "Workers", false, es); + opened_group = true; + } + + if (fullsortGroupInfo->groupCount > 0) + show_incremental_sort_group_info(fullsortGroupInfo, "Full-sort", es); + if (prefixsortGroupInfo->groupCount > 0) + show_incremental_sort_group_info(prefixsortGroupInfo, "Presorted", es); + } + + if (opened_group) + ExplainCloseGroup("Workers", "Workers", false, es); + } +} + /* * Show information on hash buckets/batches. */ diff --git a/src/backend/executor/Makefile b/src/backend/executor/Makefile index a983800e4b..f990c6473a 100644 --- a/src/backend/executor/Makefile +++ b/src/backend/executor/Makefile @@ -46,6 +46,7 @@ OBJS = \ nodeGroup.o \ nodeHash.o \ nodeHashjoin.o \ + nodeIncrementalSort.o \ nodeIndexonlyscan.o \ nodeIndexscan.o \ nodeLimit.o \ diff --git a/src/backend/executor/execAmi.c b/src/backend/executor/execAmi.c index b12aeb3334..e2154ba86a 100644 --- a/src/backend/executor/execAmi.c +++ b/src/backend/executor/execAmi.c @@ -30,6 +30,7 @@ #include "executor/nodeGroup.h" #include "executor/nodeHash.h" #include "executor/nodeHashjoin.h" +#include "executor/nodeIncrementalSort.h" #include "executor/nodeIndexonlyscan.h" #include "executor/nodeIndexscan.h" #include "executor/nodeLimit.h" @@ -252,6 +253,10 @@ ExecReScan(PlanState *node) ExecReScanSort((SortState *) node); break; + case T_IncrementalSortState: + ExecReScanIncrementalSort((IncrementalSortState *) node); + break; + case T_GroupState: ExecReScanGroup((GroupState *) node); break; @@ -557,8 +562,17 @@ ExecSupportsBackwardScan(Plan *node) case T_CteScan: case T_Material: case T_Sort: + /* these don't evaluate tlist */ return true; + case T_IncrementalSort: + + /* + * Unlike full sort, incremental sort keeps only a single group of + * tuples in memory, so it can't scan backwards.
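+ * For example (illustrative), a scrollable cursor issuing FETCH BACKWARD + * would need tuples from an earlier batch that has already been discarded, + * so such plans must materialize our output instead.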
+ */ + return false; + case T_LockRows: case T_Limit: return ExecSupportsBackwardScan(outerPlan(node)); diff --git a/src/backend/executor/execParallel.c b/src/backend/executor/execParallel.c index a753d6efa0..333d4ba1fb 100644 --- a/src/backend/executor/execParallel.c +++ b/src/backend/executor/execParallel.c @@ -31,6 +31,7 @@ #include "executor/nodeForeignscan.h" #include "executor/nodeHash.h" #include "executor/nodeHashjoin.h" +#include "executor/nodeIncrementalSort.h" #include "executor/nodeIndexonlyscan.h" #include "executor/nodeIndexscan.h" #include "executor/nodeSeqscan.h" @@ -282,6 +283,10 @@ ExecParallelEstimate(PlanState *planstate, ExecParallelEstimateContext *e) /* even when not parallel-aware, for EXPLAIN ANALYZE */ ExecSortEstimate((SortState *) planstate, e->pcxt); break; + case T_IncrementalSortState: + /* even when not parallel-aware, for EXPLAIN ANALYZE */ + ExecIncrementalSortEstimate((IncrementalSortState *) planstate, e->pcxt); + break; default: break; @@ -495,6 +500,10 @@ ExecParallelInitializeDSM(PlanState *planstate, /* even when not parallel-aware, for EXPLAIN ANALYZE */ ExecSortInitializeDSM((SortState *) planstate, d->pcxt); break; + case T_IncrementalSortState: + /* even when not parallel-aware, for EXPLAIN ANALYZE */ + ExecIncrementalSortInitializeDSM((IncrementalSortState *) planstate, d->pcxt); + break; default: break; @@ -957,6 +966,7 @@ ExecParallelReInitializeDSM(PlanState *planstate, break; case T_HashState: case T_SortState: + case T_IncrementalSortState: /* these nodes have DSM state, but no reinitialization is required */ break; @@ -1017,6 +1027,9 @@ ExecParallelRetrieveInstrumentation(PlanState *planstate, case T_SortState: ExecSortRetrieveInstrumentation((SortState *) planstate); break; + case T_IncrementalSortState: + ExecIncrementalSortRetrieveInstrumentation((IncrementalSortState *) planstate); + break; case T_HashState: ExecHashRetrieveInstrumentation((HashState *) planstate); break; @@ -1303,6 +1316,11 @@ ExecParallelInitializeWorker(PlanState *planstate, ParallelWorkerContext *pwcxt) /* even when not parallel-aware, for EXPLAIN ANALYZE */ ExecSortInitializeWorker((SortState *) planstate, pwcxt); break; + case T_IncrementalSortState: + /* even when not parallel-aware, for EXPLAIN ANALYZE */ + ExecIncrementalSortInitializeWorker((IncrementalSortState *) planstate, + pwcxt); + break; default: break; diff --git a/src/backend/executor/execProcnode.c b/src/backend/executor/execProcnode.c index 7b2e84f402..5662e7d742 100644 --- a/src/backend/executor/execProcnode.c +++ b/src/backend/executor/execProcnode.c @@ -88,6 +88,7 @@ #include "executor/nodeGroup.h" #include "executor/nodeHash.h" #include "executor/nodeHashjoin.h" +#include "executor/nodeIncrementalSort.h" #include "executor/nodeIndexonlyscan.h" #include "executor/nodeIndexscan.h" #include "executor/nodeLimit.h" @@ -313,6 +314,11 @@ ExecInitNode(Plan *node, EState *estate, int eflags) estate, eflags); break; + case T_IncrementalSort: + result = (PlanState *) ExecInitIncrementalSort((IncrementalSort *) node, + estate, eflags); + break; + case T_Group: result = (PlanState *) ExecInitGroup((Group *) node, estate, eflags); @@ -693,6 +699,10 @@ ExecEndNode(PlanState *node) ExecEndSort((SortState *) node); break; + case T_IncrementalSortState: + ExecEndIncrementalSort((IncrementalSortState *) node); + break; + case T_GroupState: ExecEndGroup((GroupState *) node); break; @@ -839,6 +849,30 @@ ExecSetTupleBound(int64 tuples_needed, PlanState *child_node) sortState->bound = tuples_needed; } } + else if 
(IsA(child_node, IncrementalSortState)) + { + /* + * If it is an IncrementalSort node, notify it that it can use bounded + * sort. + * + * Note: it is the responsibility of nodeIncrementalSort.c to react + * properly to changes of these parameters. If we ever redesign this, + * it'd be a good idea to integrate this signaling with the + * parameter-change mechanism. + */ + IncrementalSortState *sortState = (IncrementalSortState *) child_node; + + if (tuples_needed < 0) + { + /* make sure flag gets reset if needed upon rescan */ + sortState->bounded = false; + } + else + { + sortState->bounded = true; + sortState->bound = tuples_needed; + } + } else if (IsA(child_node, AppendState)) { /* diff --git a/src/backend/executor/nodeIncrementalSort.c b/src/backend/executor/nodeIncrementalSort.c new file mode 100644 index 0000000000..9fe93d5979 --- /dev/null +++ b/src/backend/executor/nodeIncrementalSort.c @@ -0,0 +1,1267 @@ +/*------------------------------------------------------------------------- + * + * nodeIncrementalSort.c + * Routines to handle incremental sorting of relations. + * + * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/executor/nodeIncrementalSort.c + * + * DESCRIPTION + * + * Incremental sort is an optimized variant of multikey sort for cases + * when the input is already sorted by a prefix of the sort keys. For + * example when a sort by (key1, key2 ... keyN) is requested, and the + * input is already sorted by (key1, key2 ... keyM), M < N, we can + * divide the input into groups where keys (key1, ... keyM) are equal, + * and only sort on the remaining columns. + * + * Consider the following example. We have input tuples consisting of + * two integers (X, Y) already presorted by X, while it's required to + * sort them by both X and Y. Let the input tuples be the following. + * + * (1, 5) + * (1, 2) + * (2, 9) + * (2, 1) + * (2, 5) + * (3, 3) + * (3, 7) + * + * An incremental sort algorithm would split the input into the following + * groups, which have equal X, and then sort them by Y individually: + * + * (1, 5) (1, 2) + * (2, 9) (2, 1) (2, 5) + * (3, 3) (3, 7) + * + * After sorting these groups and putting them all together, we would get + * the following result which is sorted by X and Y, as requested: + * + * (1, 2) + * (1, 5) + * (2, 1) + * (2, 5) + * (2, 9) + * (3, 3) + * (3, 7) + * + * Incremental sort may be more efficient than plain sort, particularly + * on large datasets, as it reduces the amount of data to sort at once, + * making it more likely it fits into work_mem (eliminating the need to + * spill to disk). But the main advantage of incremental sort is that + * it can start producing rows early, before sorting the whole dataset, + * which is a significant benefit especially for queries with LIMIT. + * + * The algorithm we've implemented here is modified from the theoretical + * base described above by operating in two different modes: + * - Fetching a minimum number of tuples without checking prefix key + * group membership and sorting on all columns when safe. + * - Fetching all tuples for a single prefix key group and sorting on + * solely the unsorted columns. + * We always begin in the first mode, and employ a heuristic to switch + * into the second mode if we believe it's beneficial.
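+ * + * In outline, the hybrid algorithm behaves roughly as follows (a simplified sketch; the precise control flow lives in ExecIncrementalSort() below): + * + * while the outer node returns tuples: + * load tuples into the full sort state up to a minimum group size, + * remembering the last one read as the group pivot + * keep loading while each tuple's prefix keys match the pivot + * if the prefix keys change: sort on all columns and emit the batch + * if the group grows past a heuristic cutoff: move its tuples to a + * second tuplesort that sorts only the remaining (suffix) columns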
+ * + * Sorting incrementally can potentially use less memory, avoid fetching + * and sorting all tuples in the dataset, and begin returning tuples + * before the entire result set is available. + * + * The hybrid mode approach allows us to optimize for both very small + * groups (where the overhead of a new tuplesort is high) and very large + * groups (where we can lower cost by not having to sort on already sorted + * columns), albeit at some extra cost while switching between modes. + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/htup_details.h" +#include "executor/execdebug.h" +#include "executor/nodeIncrementalSort.h" +#include "miscadmin.h" +#include "utils/lsyscache.h" +#include "utils/tuplesort.h" + +/* ---------------------------------------------------------------- + * instrumentSortedGroup + * + * Because incremental sort processes (potentially many) sort batches, we need + * to capture tuplesort stats each time we finalize a sort state. This summary + * data is later used for EXPLAIN ANALYZE output. + * ---------------------------------------------------------------- + */ +static void +instrumentSortedGroup(PlanState *pstate, IncrementalSortGroupInfo *groupInfo, + Tuplesortstate *sortState) +{ + IncrementalSortState *node = castNode(IncrementalSortState, pstate); + TuplesortInstrumentation sort_instr; + + groupInfo->groupCount++; + + tuplesort_get_stats(sortState, &sort_instr); + + /* Calculate total and maximum memory and disk space used. */ + switch (sort_instr.spaceType) + { + case SORT_SPACE_TYPE_DISK: + groupInfo->totalDiskSpaceUsed += sort_instr.spaceUsed; + if (sort_instr.spaceUsed > groupInfo->maxDiskSpaceUsed) + groupInfo->maxDiskSpaceUsed = sort_instr.spaceUsed; + + break; + case SORT_SPACE_TYPE_MEMORY: + groupInfo->totalMemorySpaceUsed += sort_instr.spaceUsed; + if (sort_instr.spaceUsed > groupInfo->maxMemorySpaceUsed) + groupInfo->maxMemorySpaceUsed = sort_instr.spaceUsed; + + break; + } + + /* Track each sort method we've used. */ + if (!list_member_int(groupInfo->sortMethods, sort_instr.sortMethod)) + groupInfo->sortMethods = lappend_int(groupInfo->sortMethods, + sort_instr.sortMethod); + + /* Record shared stats if we're a parallel worker. */ + if (node->shared_info && node->am_worker) + { + Assert(IsParallelWorker()); + Assert(ParallelWorkerNumber <= node->shared_info->num_workers); + + memcpy(&node->shared_info->sinfo[ParallelWorkerNumber], + &node->incsort_info, sizeof(IncrementalSortInfo)); + } +} + +/* ---------------------------------------------------------------- + * preparePresortedCols + * + * Prepare information for presorted_keys comparisons. + * ---------------------------------------------------------------- + */ +static void +preparePresortedCols(IncrementalSortState *node) +{ + IncrementalSort *plannode = castNode(IncrementalSort, node->ss.ps.plan); + + node->presorted_keys = + (PresortedKeyData *) palloc(plannode->presortedCols * + sizeof(PresortedKeyData)); + + /* Pre-cache comparison functions for each pre-sorted key.
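+ * For example (illustrative), for a key ordered by the btree "<" operator + * on int4 this finds the corresponding "=" operator and caches its + * underlying function for the group-membership checks in isCurrentGroup().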
*/ + for (int i = 0; i < plannode->presortedCols; i++) + { + Oid equalityOp, + equalityFunc; + PresortedKeyData *key; + + key = &node->presorted_keys[i]; + key->attno = plannode->sort.sortColIdx[i]; + + equalityOp = get_equality_op_for_ordering_op(plannode->sort.sortOperators[i], + NULL); + if (!OidIsValid(equalityOp)) + elog(ERROR, "missing equality operator for ordering operator %u", + plannode->sort.sortOperators[i]); + + equalityFunc = get_opcode(equalityOp); + if (!OidIsValid(equalityFunc)) + elog(ERROR, "missing function for operator %u", equalityOp); + + /* Lookup the comparison function */ + fmgr_info_cxt(equalityFunc, &key->flinfo, CurrentMemoryContext); + + /* We can initialize the callinfo just once and re-use it */ + key->fcinfo = palloc0(SizeForFunctionCallInfo(2)); + InitFunctionCallInfoData(*key->fcinfo, &key->flinfo, 2, + plannode->sort.collations[i], NULL, NULL); + key->fcinfo->args[0].isnull = false; + key->fcinfo->args[1].isnull = false; + } +} + +/* ---------------------------------------------------------------- + * isCurrentGroup + * + * Check whether a given tuple belongs to the current sort group by comparing + * the presorted column values to the pivot tuple of the current group. + * ---------------------------------------------------------------- + */ +static bool +isCurrentGroup(IncrementalSortState *node, TupleTableSlot *pivot, TupleTableSlot *tuple) +{ + int presortedCols; + + presortedCols = castNode(IncrementalSort, node->ss.ps.plan)->presortedCols; + + /* + * That the input is sorted by keys (0, ... n) implies that the tail + * keys are more likely to change. Therefore we do our comparison starting + * from the last pre-sorted column to optimize for early detection of + * inequality and minimize the number of function calls. + */ + for (int i = presortedCols - 1; i >= 0; i--) + { + Datum datumA, + datumB, + result; + bool isnullA, + isnullB; + AttrNumber attno = node->presorted_keys[i].attno; + PresortedKeyData *key; + + datumA = slot_getattr(pivot, attno, &isnullA); + datumB = slot_getattr(tuple, attno, &isnullB); + + /* Special case for NULL-vs-NULL, else use standard comparison */ + if (isnullA || isnullB) + { + if (isnullA == isnullB) + continue; + else + return false; + } + + key = &node->presorted_keys[i]; + + key->fcinfo->args[0].value = datumA; + key->fcinfo->args[1].value = datumB; + + /* just for paranoia's sake, we reset isnull each time */ + key->fcinfo->isnull = false; + + result = FunctionCallInvoke(key->fcinfo); + + /* Check for null result, since caller is clearly not expecting one */ + if (key->fcinfo->isnull) + elog(ERROR, "function %u returned NULL", key->flinfo.fn_oid); + + if (!DatumGetBool(result)) + return false; + } + return true; +} + +/* ---------------------------------------------------------------- + * switchToPresortedPrefixMode + * + * When we determine that we've likely encountered a large batch of tuples all + * having the same presorted prefix values, we want to optimize tuplesort by + * only sorting on unsorted suffix keys. + * + * The problem is that we've already accumulated several tuples in another + * tuplesort configured to sort by all columns (assuming that there may be + * more than one prefix key group). So to switch to presorted prefix mode we + * have to go back and look at all the tuples we've already accumulated to + * verify they're all part of the same prefix key group before sorting them + * solely by unsorted suffix keys.
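+ * + * For example (a hypothetical batch), if the full sort state holds tuples + * with prefix values (1), (2), (2), (2), then the first transfer pulls out + * the lone (1) group to be sorted and returned on its own, and a later call + * back into this function moves the (2) group into the prefix sort state.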
+ * + * While it's likely that the already-fetched tuples are all part of a single + * prefix group, we also have to handle the possibility that there is at least + * one different prefix key group before the large prefix key group. + * ---------------------------------------------------------------- + */ +static void +switchToPresortedPrefixMode(PlanState *pstate) +{ + IncrementalSortState *node = castNode(IncrementalSortState, pstate); + ScanDirection dir; + int64 nTuples = 0; + bool lastTuple = false; + bool firstTuple = true; + TupleDesc tupDesc; + PlanState *outerNode; + IncrementalSort *plannode = castNode(IncrementalSort, node->ss.ps.plan); + + dir = node->ss.ps.state->es_direction; + outerNode = outerPlanState(node); + tupDesc = ExecGetResultType(outerNode); + + /* Configure the prefix sort state the first time around. */ + if (node->prefixsort_state == NULL) + { + Tuplesortstate *prefixsort_state; + int presortedCols = plannode->presortedCols; + + /* + * Optimize the sort by assuming the prefix columns are all equal and + * thus we only need to sort by any remaining columns. + */ + prefixsort_state = tuplesort_begin_heap(tupDesc, + plannode->sort.numCols - presortedCols, + &(plannode->sort.sortColIdx[presortedCols]), + &(plannode->sort.sortOperators[presortedCols]), + &(plannode->sort.collations[presortedCols]), + &(plannode->sort.nullsFirst[presortedCols]), + work_mem, + NULL, + false); + node->prefixsort_state = prefixsort_state; + } + else + { + /* Next group of presorted data */ + tuplesort_reset(node->prefixsort_state); + } + + /* + * If the current node has a bound, then it's reasonably likely that a + * large prefix key group will benefit from bounded sort, so configure the + * tuplesort to allow for that optimization. + */ + if (node->bounded) + { + SO1_printf("Setting bound on presorted prefix tuplesort to: %ld\n", + node->bound - node->bound_Done); + tuplesort_set_bound(node->prefixsort_state, + node->bound - node->bound_Done); + } + + /* + * Copy as many tuples as we can (i.e., in the same prefix key group) from + * the full sort state to the prefix sort state. + */ + for (;;) + { + lastTuple = node->n_fullsort_remaining - nTuples == 1; + + /* + * When we encounter multiple prefix key groups inside the full sort + * tuplesort we have to carry over the last read tuple into the next + * batch. + */ + if (firstTuple && !TupIsNull(node->transfer_tuple)) + { + tuplesort_puttupleslot(node->prefixsort_state, node->transfer_tuple); + nTuples++; + + /* The carried over tuple is our new group pivot tuple. */ + ExecCopySlot(node->group_pivot, node->transfer_tuple); + } + else + { + tuplesort_gettupleslot(node->fullsort_state, + ScanDirectionIsForward(dir), + false, node->transfer_tuple, NULL); + + /* + * If this is our first time through the loop, then we need to + * save the first tuple we get as our new group pivot. + */ + if (TupIsNull(node->group_pivot)) + ExecCopySlot(node->group_pivot, node->transfer_tuple); + + if (isCurrentGroup(node, node->group_pivot, node->transfer_tuple)) + { + tuplesort_puttupleslot(node->prefixsort_state, node->transfer_tuple); + nTuples++; + } + else + { + /* + * The tuple isn't part of the current batch so we need to + * carry it over into the next batch of tuples we transfer out + * of the full sort tuplesort into the presorted prefix + * tuplesort.
We don't actually have to do anything special to + * save the tuple since we've already loaded it into the + * node->transfer_tuple slot, and, even though that slot + * points to memory inside the full sort tuplesort, we can't + * reset that tuplesort anyway until we've fully transferred + * out its tuples, so this reference is safe. We do need to + * reset the group pivot tuple though since we've finished the + * current prefix key group. + */ + ExecClearTuple(node->group_pivot); + break; + } + } + + firstTuple = false; + + /* + * If we've copied all of the tuples from the full sort state into the + * prefix sort state, then we don't actually know that we've yet found + * the last tuple in that prefix key group until we check the next + * tuple from the outer plan node, so we retain the current group + * pivot tuple for the next prefix key group comparison. + */ + if (lastTuple) + break; + } + + /* + * Track how many tuples remain in the full sort batch so that we know if + * we need to sort multiple prefix key groups before processing tuples + * remaining in the large single prefix key group we think we've + * encountered. + */ + SO1_printf("Moving %ld tuples to presorted prefix tuplesort\n", nTuples); + node->n_fullsort_remaining -= nTuples; + SO1_printf("Setting n_fullsort_remaining to %ld\n", node->n_fullsort_remaining); + + if (lastTuple) + { + /* + * We've confirmed that all tuples remaining in the full sort batch + * are in the same prefix key group and moved all of those tuples into + * the presorted prefix tuplesort. Now we can save our pivot comparison + * tuple and continue fetching tuples from the outer execution node to + * load into the presorted prefix tuplesort. + */ + ExecCopySlot(node->group_pivot, node->transfer_tuple); + SO_printf("Setting execution_status to INCSORT_LOADPREFIXSORT (switchToPresortedPrefixMode)\n"); + node->execution_status = INCSORT_LOADPREFIXSORT; + + /* + * Make sure we clear the transfer tuple slot so that next time we + * encounter a large prefix key group we don't incorrectly assume we + * have a tuple carried over from the previous group. + */ + ExecClearTuple(node->transfer_tuple); + } + else + { + /* + * We finished a group but didn't consume all of the tuples from the + * full sort state, so we'll sort this batch, let the parent node read + * out all of those tuples, and then come back around to find another + * batch. + */ + SO1_printf("Sorting presorted prefix tuplesort with %ld tuples\n", nTuples); + tuplesort_performsort(node->prefixsort_state); + + if (pstate->instrument != NULL) + instrumentSortedGroup(pstate, + &node->incsort_info.prefixsortGroupInfo, + node->prefixsort_state); + + if (node->bounded) + { + /* + * If the current node has a bound and we've already sorted n + * tuples, then the functional bound remaining is (original bound + * - n), so store the current number of processed tuples for use + * in configuring sorting bound. + */ + SO2_printf("Changing bound_Done from %ld to %ld\n", + node->bound_Done, + Min(node->bound, node->bound_Done + nTuples)); + node->bound_Done = Min(node->bound, node->bound_Done + nTuples); + } + + SO_printf("Setting execution_status to INCSORT_READPREFIXSORT (switchToPresortedPrefixMode)\n"); + node->execution_status = INCSORT_READPREFIXSORT; + } +} + +/* + * Sorting many small groups with tuplesort is inefficient.
In order to + * cope with this problem we don't start a new group until the current one + * contains at least DEFAULT_MIN_GROUP_SIZE tuples (unfortunately this also + * means we can't assume small groups of tuples all have the same prefix keys.) + * When we have a bound that's less than DEFAULT_MIN_GROUP_SIZE we start looking + * for the new group as soon as we've met our bound to avoid fetching more + * tuples than we absolutely have to fetch. + */ +#define DEFAULT_MIN_GROUP_SIZE 32 + +/* + * While we've optimized for small prefix key groups by not starting our prefix + * key comparisons until we've reached a minimum number of tuples, we don't want + * that optimization to cause us to lose out on the benefits of being able to + * assume a large group of tuples is fully presorted by its prefix keys. + * Therefore we use the DEFAULT_MAX_FULL_SORT_GROUP_SIZE cutoff as a heuristic + * for determining when we believe we've encountered a large group, and, if we + * get to that point without finding a new prefix key group we transition to + * presorted prefix key mode. + */ +#define DEFAULT_MAX_FULL_SORT_GROUP_SIZE (2 * DEFAULT_MIN_GROUP_SIZE) + +/* ---------------------------------------------------------------- + * ExecIncrementalSort + * + * Assuming that outer subtree returns tuples presorted by some prefix + * of target sort columns, performs incremental sort. + * + * Conditions: + * -- none. + * + * Initial States: + * -- the outer child is prepared to return the first tuple. + * ---------------------------------------------------------------- + */ +static TupleTableSlot * +ExecIncrementalSort(PlanState *pstate) +{ + IncrementalSortState *node = castNode(IncrementalSortState, pstate); + EState *estate; + ScanDirection dir; + Tuplesortstate *read_sortstate; + Tuplesortstate *fullsort_state; + TupleTableSlot *slot; + IncrementalSort *plannode = (IncrementalSort *) node->ss.ps.plan; + PlanState *outerNode; + TupleDesc tupDesc; + int64 nTuples = 0; + int64 minGroupSize; + + CHECK_FOR_INTERRUPTS(); + + estate = node->ss.ps.state; + dir = estate->es_direction; + fullsort_state = node->fullsort_state; + + /* + * If a previous iteration has sorted a batch, then we need to check to + * see if there are any remaining tuples in that batch that we can return + * before moving on to other execution states. + */ + if (node->execution_status == INCSORT_READFULLSORT + || node->execution_status == INCSORT_READPREFIXSORT) + { + /* + * Return next tuple from the current sorted group set if available. + */ + read_sortstate = node->execution_status == INCSORT_READFULLSORT ? + fullsort_state : node->prefixsort_state; + slot = node->ss.ps.ps_ResultTupleSlot; + + /* + * We have to populate the slot from the tuplesort before checking + * outerNodeDone because it will set the slot to NULL if no more + * tuples remain. If the tuplesort is empty, but we don't have any + * more tuples available for sort from the outer node, then + * outerNodeDone will have been set so we'll return that now-empty + * slot to the caller. + */ + if (tuplesort_gettupleslot(read_sortstate, ScanDirectionIsForward(dir), + false, slot, NULL) || node->outerNodeDone) + + /* + * Note: there isn't a good test case for the node->outerNodeDone + * check directly, but we need it for any plan where the outer + * node will fail when trying to fetch too many tuples.
+ */ + return slot; + else if (node->n_fullsort_remaining > 0) + { + /* + * When we transition to presorted prefix mode, we might have + * accumulated at least one additional prefix key group in the + * full sort tuplesort. The first call to + * switchToPresortedPrefixMode() will have pulled the first one of + * those groups out, and we've returned those tuples to the parent + * node, but if at this point we still have tuples remaining in + * the full sort state (i.e., n_fullsort_remaining > 0), then we + * need to re-execute the prefix mode transition function to pull + * out the next prefix key group. + */ + SO1_printf("Re-calling switchToPresortedPrefixMode() because n_fullsort_remaining is > 0 (%ld)\n", + node->n_fullsort_remaining); + switchToPresortedPrefixMode(pstate); + } + else + { + /* + * If we don't have any sorted tuples to read and we're not + * currently transitioning into presorted prefix sort mode, then + * it's time to start the process all over again by building a new + * group in the full sort state. + */ + SO_printf("Setting execution_status to INCSORT_LOADFULLSORT (n_fullsort_remaining <= 0)\n"); + node->execution_status = INCSORT_LOADFULLSORT; + } + } + + /* + * Scan the subplan in the forward direction while creating the sorted + * data. + */ + estate->es_direction = ForwardScanDirection; + + outerNode = outerPlanState(node); + tupDesc = ExecGetResultType(outerNode); + + /* Load tuples into the full sort state. */ + if (node->execution_status == INCSORT_LOADFULLSORT) + { + /* + * Initialize sorting structures. + */ + if (fullsort_state == NULL) + { + /* + * Initialize presorted column support structures for + * isCurrentGroup(). It's correct to do this along with the + * initial initialization for the full sort state (and not for the + * prefix sort state) since we always load the full sort state + * first. + */ + preparePresortedCols(node); + + /* + * Since we optimize small prefix key groups by accumulating a + * minimum number of tuples before sorting, we can't assume that a + * group of tuples all have the same prefix key values. Hence we + * set up the full sort tuplesort to sort by all requested sort + * keys. + */ + fullsort_state = tuplesort_begin_heap(tupDesc, + plannode->sort.numCols, + plannode->sort.sortColIdx, + plannode->sort.sortOperators, + plannode->sort.collations, + plannode->sort.nullsFirst, + work_mem, + NULL, + false); + node->fullsort_state = fullsort_state; + } + else + { + /* Reset sort for the next batch. */ + tuplesort_reset(fullsort_state); + } + + /* + * Calculate the remaining tuples left if bounded and configure both + * bounded sort and the minimum group size accordingly. + */ + if (node->bounded) + { + int64 currentBound = node->bound - node->bound_Done; + + /* + * Bounded sort isn't likely to be a useful optimization for full + * sort mode since we limit full sort mode to a relatively small + * number of tuples and tuplesort doesn't switch over to top-n + * heap sort anyway unless it hits (2 * bound) tuples. + */ + if (currentBound < DEFAULT_MIN_GROUP_SIZE) + tuplesort_set_bound(fullsort_state, currentBound); + + minGroupSize = Min(DEFAULT_MIN_GROUP_SIZE, currentBound); + } + else + minGroupSize = DEFAULT_MIN_GROUP_SIZE; + + /* + * Because we have to read the next tuple to find out that we've + * encountered a new prefix key group, on subsequent groups we have to + * carry over that extra tuple and add it to the new group's sort here + * before we read any new tuples from the outer node.
+ */ + if (!TupIsNull(node->group_pivot)) + { + tuplesort_puttupleslot(fullsort_state, node->group_pivot); + nTuples++; + + /* + * We're in full sort mode accumulating a minimum number of tuples + * and not checking for prefix key equality yet, so we can't + * assume the group pivot tuple will remain the same -- unless + * we're using a minimum group size of 1, in which case the pivot + * is obviously still the pivot. + */ + if (nTuples != minGroupSize) + ExecClearTuple(node->group_pivot); + } + + + /* + * Pull as many tuples from the outer node as possible given our + * current operating mode. + */ + for (;;) + { + slot = ExecProcNode(outerNode); + + /* + * If the outer node can't provide us any more tuples, then we can + * sort the current group and return those tuples. + */ + if (TupIsNull(slot)) + { + /* + * We need to know later if the outer node has completed to be + * able to distinguish between being done with a batch and + * being done with the whole node. + */ + node->outerNodeDone = true; + + SO1_printf("Sorting fullsort with %ld tuples\n", nTuples); + tuplesort_performsort(fullsort_state); + + if (pstate->instrument != NULL) + instrumentSortedGroup(pstate, + &node->incsort_info.fullsortGroupInfo, + fullsort_state); + + SO_printf("Setting execution_status to INCSORT_READFULLSORT (final tuple)\n"); + node->execution_status = INCSORT_READFULLSORT; + break; + } + + /* Accumulate the next group of presorted tuples. */ + if (nTuples < minGroupSize) + { + /* + * If we haven't yet hit our target minimum group size, then + * we don't need to bother checking for inclusion in the + * current prefix group since at this point we'll assume that + * we'll full sort this batch to avoid a large number of very + * tiny (and thus inefficient) sorts. + */ + tuplesort_puttupleslot(fullsort_state, slot); + nTuples++; + + /* + * If we've reached our minimum group size, then we need to + * store the most recent tuple as a pivot. + */ + if (nTuples == minGroupSize) + ExecCopySlot(node->group_pivot, slot); + } + else + { + /* + * If we've already accumulated enough tuples to reach our + * minimum group size, then we need to compare any additional + * tuples to our pivot tuple to see if we reach the end of + * that prefix key group. Only after we find changed prefix + * keys can we guarantee sort stability of the tuples we've + * already accumulated. + */ + if (isCurrentGroup(node, node->group_pivot, slot)) + { + /* + * As long as the prefix keys match the pivot tuple then + * load the tuple into the tuplesort. + */ + tuplesort_puttupleslot(fullsort_state, slot); + nTuples++; + } + else + { + /* + * Since the tuple we fetched isn't part of the current + * prefix key group we don't want to sort it as part of + * the current batch. Instead we use the group_pivot slot + * to carry it over to the next batch (even though we + * won't actually treat it as a group pivot). + */ + ExecCopySlot(node->group_pivot, slot); + + if (node->bounded) + { + /* + * If the current node has a bound, and we've already + * sorted n tuples, then the functional bound + * remaining is (original bound - n), so store the + * current number of processed tuples for later use + * configuring the sort state's bound.
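+ * + * For instance (hypothetical numbers), with an original + * bound of 100 and 40 tuples already sorted, the bound + * configured for the next batch would be 100 - 40 = 60.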
+ */ + SO2_printf("Changing bound_Done from %ld to %ld\n", + node->bound_Done, + Min(node->bound, node->bound_Done + nTuples)); + node->bound_Done = Min(node->bound, node->bound_Done + nTuples); + } + + /* + * Once we find changed prefix keys we can complete the + * sort and transition modes to reading out the sorted + * tuples. + */ + SO1_printf("Sorting fullsort tuplesort with %ld tuples\n", + nTuples); + tuplesort_performsort(fullsort_state); + + if (pstate->instrument != NULL) + instrumentSortedGroup(pstate, + &node->incsort_info.fullsortGroupInfo, + fullsort_state); + + SO_printf("Setting execution_status to INCSORT_READFULLSORT (found end of group)\n"); + node->execution_status = INCSORT_READFULLSORT; + break; + } + } + + /* + * Unless we've already transitioned modes to reading from the full + * sort state, then we assume that having read at least + * DEFAULT_MAX_FULL_SORT_GROUP_SIZE tuples means it's likely we're + * processing a large group of tuples all having equal prefix keys + * (but haven't yet found the final tuple in that prefix key + * group), so we need to transition into presorted prefix mode. + */ + if (nTuples > DEFAULT_MAX_FULL_SORT_GROUP_SIZE && + node->execution_status != INCSORT_READFULLSORT) + { + /* + * The group pivot we have stored has already been put into + * the tuplesort; we don't want to carry it over. Since we + * haven't yet found the end of the prefix key group, it might + * seem like we should keep this, but we don't actually know + * how many prefix key groups might be represented in the full + * sort state, so we'll let the mode transition function + * manage this state for us. + */ + ExecClearTuple(node->group_pivot); + + /* + * Unfortunately the tuplesort API doesn't include a way to + * retrieve tuples unless a sort has been performed, so we + * perform the sort even though we could just as easily rely + * on FIFO retrieval semantics when transferring them to the + * presorted prefix tuplesort. + */ + SO1_printf("Sorting fullsort tuplesort with %ld tuples\n", nTuples); + tuplesort_performsort(fullsort_state); + if (pstate->instrument != NULL) + instrumentSortedGroup(pstate, + &node->incsort_info.fullsortGroupInfo, + fullsort_state); + + /* + * If the full sort tuplesort happened to switch into top-n + * heapsort mode then we will only be able to retrieve + * currentBound tuples (since the tuplesort will have only + * retained the top-n tuples). This is safe even though we + * haven't yet completed fetching the current prefix key group + * because the tuples we've "lost" already sorted "below" the + * retained ones, and we're already contractually guaranteed + * to not need any more than the currentBound tuples. + */ + if (tuplesort_used_bound(node->fullsort_state)) + { + int64 currentBound = node->bound - node->bound_Done; + + SO2_printf("Read %ld tuples, but setting to %ld because we used bounded sort\n", + nTuples, Min(currentBound, nTuples)); + nTuples = Min(currentBound, nTuples); + } + + SO1_printf("Setting n_fullsort_remaining to %ld and calling switchToPresortedPrefixMode()\n", + nTuples); + + /* + * We might have multiple prefix key groups in the full sort + * state, so the mode transition function needs to know that it + * needs to move from the fullsort to presorted prefix sort. + */ + node->n_fullsort_remaining = nTuples; + + /* Transition the tuples to the presorted prefix tuplesort.
*/ + switchToPresortedPrefixMode(pstate); + + /* + * Since we know we had tuples to move to the presorted prefix + * tuplesort, we know that unless that transition has verified + * that all tuples belonged to the same prefix key group (in + * which case we can go straight to continuing to load tuples + * into that tuplesort), we should have a tuple to return + * here. + * + * Either way, the appropriate execution status should have + * been set by switchToPresortedPrefixMode(), so we can drop + * out of the loop here and let the appropriate path kick in. + */ + break; + } + } + } + + if (node->execution_status == INCSORT_LOADPREFIXSORT) + { + /* + * We only enter this state after the mode transition function has + * confirmed all remaining tuples from the full sort state have the + * same prefix and moved those tuples to the prefix sort state. That + * function has also set a group pivot tuple (which doesn't need to be + * carried over; it's already been put into the prefix sort state). + */ + Assert(!TupIsNull(node->group_pivot)); + + /* + * Read tuples from the outer node and load them into the prefix sort + * state until we encounter a tuple whose prefix keys don't match the + * current group_pivot tuple, since we can't guarantee sort stability + * until we have all tuples matching those prefix keys. + */ + for (;;) + { + slot = ExecProcNode(outerNode); + + /* + * If we've exhausted tuples from the outer node we're done + * loading the prefix sort state. + */ + if (TupIsNull(slot)) + { + /* + * We need to know later if the outer node has completed to be + * able to distinguish between being done with a batch and + * being done with the whole node. + */ + node->outerNodeDone = true; + break; + } + + /* + * If the tuple's prefix keys match our pivot tuple, we're not + * done yet and can load it into the prefix sort state. If not, we + * don't want to sort it as part of the current batch. Instead we + * use the group_pivot slot to carry it over to the next batch + * (even though we won't actually treat it as a group pivot). + */ + if (isCurrentGroup(node, node->group_pivot, slot)) + { + tuplesort_puttupleslot(node->prefixsort_state, slot); + nTuples++; + } + else + { + ExecCopySlot(node->group_pivot, slot); + break; + } + } + + /* + * Perform the sort and begin returning the tuples to the parent plan + * node. + */ + SO1_printf("Sorting presorted prefix tuplesort with >= %ld tuples\n", nTuples); + tuplesort_performsort(node->prefixsort_state); + + if (pstate->instrument != NULL) + instrumentSortedGroup(pstate, + &node->incsort_info.prefixsortGroupInfo, + node->prefixsort_state); + + SO_printf("Setting execution_status to INCSORT_READPREFIXSORT (found end of group)\n"); + node->execution_status = INCSORT_READPREFIXSORT; + + if (node->bounded) + { + /* + * If the current node has a bound, and we've already sorted n + * tuples, then the functional bound remaining is (original bound + * - n), so store the current number of processed tuples for use + * in configuring sorting bound. + */ + SO2_printf("Changing bound_Done from %ld to %ld\n", + node->bound_Done, + Min(node->bound, node->bound_Done + nTuples)); + node->bound_Done = Min(node->bound, node->bound_Done + nTuples); + } + } + + /* Restore to user specified direction. */ + estate->es_direction = dir; + + /* + * Get the first or next tuple from tuplesort. Returns NULL if no more + * tuples. + */ + read_sortstate = node->execution_status == INCSORT_READFULLSORT ? 
+ fullsort_state : node->prefixsort_state; + slot = node->ss.ps.ps_ResultTupleSlot; + (void) tuplesort_gettupleslot(read_sortstate, ScanDirectionIsForward(dir), + false, slot, NULL); + return slot; +} + +/* ---------------------------------------------------------------- + * ExecInitIncrementalSort + * + * Creates the run-time state information for the sort node + * produced by the planner and initializes its outer subtree. + * ---------------------------------------------------------------- + */ +IncrementalSortState * +ExecInitIncrementalSort(IncrementalSort *node, EState *estate, int eflags) +{ + IncrementalSortState *incrsortstate; + + SO_printf("ExecInitIncrementalSort: initializing sort node\n"); + + /* + * Incremental sort can't be used with either EXEC_FLAG_REWIND, + * EXEC_FLAG_BACKWARD or EXEC_FLAG_MARK, because we keep only one of many + * sort batches in the current sort state. + */ + Assert((eflags & (EXEC_FLAG_BACKWARD | + EXEC_FLAG_MARK)) == 0); + + /* Initialize state structure. */ + incrsortstate = makeNode(IncrementalSortState); + incrsortstate->ss.ps.plan = (Plan *) node; + incrsortstate->ss.ps.state = estate; + incrsortstate->ss.ps.ExecProcNode = ExecIncrementalSort; + + incrsortstate->execution_status = INCSORT_LOADFULLSORT; + incrsortstate->bounded = false; + incrsortstate->outerNodeDone = false; + incrsortstate->bound_Done = 0; + incrsortstate->fullsort_state = NULL; + incrsortstate->prefixsort_state = NULL; + incrsortstate->group_pivot = NULL; + incrsortstate->transfer_tuple = NULL; + incrsortstate->n_fullsort_remaining = 0; + incrsortstate->presorted_keys = NULL; + + if (incrsortstate->ss.ps.instrument != NULL) + { + IncrementalSortGroupInfo *fullsortGroupInfo = + &incrsortstate->incsort_info.fullsortGroupInfo; + IncrementalSortGroupInfo *prefixsortGroupInfo = + &incrsortstate->incsort_info.prefixsortGroupInfo; + + fullsortGroupInfo->groupCount = 0; + fullsortGroupInfo->maxDiskSpaceUsed = 0; + fullsortGroupInfo->totalDiskSpaceUsed = 0; + fullsortGroupInfo->maxMemorySpaceUsed = 0; + fullsortGroupInfo->totalMemorySpaceUsed = 0; + fullsortGroupInfo->sortMethods = NIL; + prefixsortGroupInfo->groupCount = 0; + prefixsortGroupInfo->maxDiskSpaceUsed = 0; + prefixsortGroupInfo->totalDiskSpaceUsed = 0; + prefixsortGroupInfo->maxMemorySpaceUsed = 0; + prefixsortGroupInfo->totalMemorySpaceUsed = 0; + prefixsortGroupInfo->sortMethods = NIL; + } + + /* + * Miscellaneous initialization + * + * Sort nodes don't initialize their ExprContexts because they never call + * ExecQual or ExecProject. + */ + + /* + * Initialize child nodes. + * + * We shield the child node from the need to support REWIND, BACKWARD, or + * MARK/RESTORE. + */ + eflags &= ~(EXEC_FLAG_REWIND | EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK); + + outerPlanState(incrsortstate) = ExecInitNode(outerPlan(node), estate, eflags); + + /* + * Initialize scan slot and type. + */ + ExecCreateScanSlotFromOuterPlan(estate, &incrsortstate->ss, &TTSOpsMinimalTuple); + + /* + * Initialize return slot and type. No need to initialize projection info + * because we don't do any projections. + */ + ExecInitResultTupleSlotTL(&incrsortstate->ss.ps, &TTSOpsMinimalTuple); + incrsortstate->ss.ps.ps_ProjInfo = NULL; + + /* + * Initialize standalone slots to store a tuple for pivot prefix keys and + * for carrying over a tuple from one batch to the next.
+ */
+	incrsortstate->group_pivot =
+		MakeSingleTupleTableSlot(ExecGetResultType(outerPlanState(incrsortstate)),
+								 &TTSOpsMinimalTuple);
+	incrsortstate->transfer_tuple =
+		MakeSingleTupleTableSlot(ExecGetResultType(outerPlanState(incrsortstate)),
+								 &TTSOpsMinimalTuple);
+
+	SO_printf("ExecInitIncrementalSort: sort node initialized\n");
+
+	return incrsortstate;
+}
+
+/* ----------------------------------------------------------------
+ *		ExecEndIncrementalSort(node)
+ * ----------------------------------------------------------------
+ */
+void
+ExecEndIncrementalSort(IncrementalSortState *node)
+{
+	SO_printf("ExecEndIncrementalSort: shutting down sort node\n");
+
+	/* clean out the scan tuple */
+	ExecClearTuple(node->ss.ss_ScanTupleSlot);
+	/* must drop pointer to sort result tuple */
+	ExecClearTuple(node->ss.ps.ps_ResultTupleSlot);
+	/* must drop standalone tuple slots from outer node */
+	ExecDropSingleTupleTableSlot(node->group_pivot);
+	ExecDropSingleTupleTableSlot(node->transfer_tuple);
+
+	/*
+	 * Release tuplesort resources.
+	 */
+	if (node->fullsort_state != NULL)
+	{
+		tuplesort_end(node->fullsort_state);
+		node->fullsort_state = NULL;
+	}
+	if (node->prefixsort_state != NULL)
+	{
+		tuplesort_end(node->prefixsort_state);
+		node->prefixsort_state = NULL;
+	}
+
+	/*
+	 * Shut down the subplan.
+	 */
+	ExecEndNode(outerPlanState(node));
+
+	SO_printf("ExecEndIncrementalSort: sort node shutdown\n");
+}
+
+void
+ExecReScanIncrementalSort(IncrementalSortState *node)
+{
+	PlanState  *outerPlan = outerPlanState(node);
+
+	/*
+	 * Incremental sort doesn't support efficient rescan even when parameters
+	 * haven't changed (e.g., rewind) because unlike regular sort we don't
+	 * store all tuples at once for the full sort.
+	 *
+	 * So even if EXEC_FLAG_REWIND is set we just reset all of our state and
+	 * re-execute the sort along with the child node below us.
+	 *
+	 * In theory if we've only filled the full sort with one batch (and
+	 * haven't reset it for a new batch yet) then we could efficiently rewind,
+	 * but that seems a narrow enough case that it's not worth handling
+	 * specially at this time.
+	 */
+
+	/* must drop pointer to sort result tuple */
+	ExecClearTuple(node->ss.ps.ps_ResultTupleSlot);
+
+	if (node->group_pivot != NULL)
+		ExecClearTuple(node->group_pivot);
+	if (node->transfer_tuple != NULL)
+		ExecClearTuple(node->transfer_tuple);
+
+	node->bounded = false;
+	node->outerNodeDone = false;
+	node->n_fullsort_remaining = 0;
+	node->bound_Done = 0;
+	node->presorted_keys = NULL;
+
+	node->execution_status = INCSORT_LOADFULLSORT;
+
+	/*
+	 * If we've already set up either of the sort states, we need to reset
+	 * them.  We could end them and null out the pointers, but there's no
+	 * reason to repay the setup cost, and because the guard that sets up the
+	 * pivot comparator state behaves the same way, doing so might actually
+	 * cause a leak.
+	 */
+	if (node->fullsort_state != NULL)
+	{
+		tuplesort_reset(node->fullsort_state);
+		node->fullsort_state = NULL;
+	}
+	if (node->prefixsort_state != NULL)
+	{
+		tuplesort_reset(node->prefixsort_state);
+		node->prefixsort_state = NULL;
+	}
+
+	/*
+	 * If chgParam of subnode is not null, then the plan will be re-scanned
+	 * by the first ExecProcNode.
+ */
+	if (outerPlan->chgParam == NULL)
+		ExecReScan(outerPlan);
+}
+
+/* ----------------------------------------------------------------
+ *						Parallel Query Support
+ * ----------------------------------------------------------------
+ */
+
+/* ----------------------------------------------------------------
+ *		ExecIncrementalSortEstimate
+ *
+ *		Estimate space required to propagate sort statistics.
+ * ----------------------------------------------------------------
+ */
+void
+ExecIncrementalSortEstimate(IncrementalSortState *node, ParallelContext *pcxt)
+{
+	Size		size;
+
+	/* don't need this if not instrumenting or no workers */
+	if (!node->ss.ps.instrument || pcxt->nworkers == 0)
+		return;
+
+	size = mul_size(pcxt->nworkers, sizeof(IncrementalSortInfo));
+	size = add_size(size, offsetof(SharedIncrementalSortInfo, sinfo));
+	shm_toc_estimate_chunk(&pcxt->estimator, size);
+	shm_toc_estimate_keys(&pcxt->estimator, 1);
+}
+
+/* ----------------------------------------------------------------
+ *		ExecIncrementalSortInitializeDSM
+ *
+ *		Initialize DSM space for sort statistics.
+ * ----------------------------------------------------------------
+ */
+void
+ExecIncrementalSortInitializeDSM(IncrementalSortState *node, ParallelContext *pcxt)
+{
+	Size		size;
+
+	/* don't need this if not instrumenting or no workers */
+	if (!node->ss.ps.instrument || pcxt->nworkers == 0)
+		return;
+
+	size = offsetof(SharedIncrementalSortInfo, sinfo)
+		+ pcxt->nworkers * sizeof(IncrementalSortInfo);
+	node->shared_info = shm_toc_allocate(pcxt->toc, size);
+	/* ensure any unfilled slots will contain zeroes */
+	memset(node->shared_info, 0, size);
+	node->shared_info->num_workers = pcxt->nworkers;
+	shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id,
+				   node->shared_info);
+}
+
+/* ----------------------------------------------------------------
+ *		ExecIncrementalSortInitializeWorker
+ *
+ *		Attach worker to DSM space for sort statistics.
+ * ----------------------------------------------------------------
+ */
+void
+ExecIncrementalSortInitializeWorker(IncrementalSortState *node, ParallelWorkerContext *pwcxt)
+{
+	node->shared_info =
+		shm_toc_lookup(pwcxt->toc, node->ss.ps.plan->plan_node_id, true);
+	node->am_worker = true;
+}
+
+/* ----------------------------------------------------------------
+ *		ExecIncrementalSortRetrieveInstrumentation
+ *
+ *		Transfer sort statistics from DSM to private memory.
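
The estimate/initialize pair above sizes the shared area with the usual header-plus-flexible-array idiom. A cut-down illustration follows; the struct names are stand-ins for this sketch, not the patch's types, and the plain malloc stands in for shm_toc_allocate:

    #include <stddef.h>
    #include <stdlib.h>
    #include <string.h>

    typedef struct WorkerStats { long groups; long tuples; } WorkerStats;

    typedef struct SharedStats
    {
        int         num_workers;
        WorkerStats sinfo[];        /* flexible array member */
    } SharedStats;

    static SharedStats *
    alloc_shared_stats(int nworkers)
    {
        /* header size via offsetof, then one entry per worker */
        size_t size = offsetof(SharedStats, sinfo) +
            (size_t) nworkers * sizeof(WorkerStats);
        SharedStats *ss = malloc(size);    /* no NULL check in this sketch */

        /* ensure any unfilled slots will contain zeroes */
        memset(ss, 0, size);
        ss->num_workers = nworkers;
        return ss;
    }

Using offsetof rather than sizeof for the header avoids over-counting any tail padding before the flexible array, which is why the patch computes the DSM chunk the same way.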
+ * ---------------------------------------------------------------- + */ +void +ExecIncrementalSortRetrieveInstrumentation(IncrementalSortState *node) +{ + Size size; + SharedIncrementalSortInfo *si; + + if (node->shared_info == NULL) + return; + + size = offsetof(SharedIncrementalSortInfo, sinfo) + + node->shared_info->num_workers * sizeof(IncrementalSortInfo); + si = palloc(size); + memcpy(si, node->shared_info, size); + node->shared_info = si; +} diff --git a/src/backend/executor/nodeSort.c b/src/backend/executor/nodeSort.c index 5d1debc196..9d2bfd7ed6 100644 --- a/src/backend/executor/nodeSort.c +++ b/src/backend/executor/nodeSort.c @@ -93,7 +93,8 @@ ExecSort(PlanState *pstate) plannode->collations, plannode->nullsFirst, work_mem, - NULL, node->randomAccess); + NULL, + node->randomAccess); if (node->bounded) tuplesort_set_bound(tuplesortstate, node->bound); node->tuplesortstate = (void *) tuplesortstate; diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c index eaab97f753..e21f48327d 100644 --- a/src/backend/nodes/copyfuncs.c +++ b/src/backend/nodes/copyfuncs.c @@ -927,6 +927,24 @@ _copyMaterial(const Material *from) } +/* + * CopySortFields + * + * This function copies the fields of the Sort node. It is used by + * all the copy functions for classes which inherit from Sort. + */ +static void +CopySortFields(const Sort *from, Sort *newnode) +{ + CopyPlanFields((const Plan *) from, (Plan *) newnode); + + COPY_SCALAR_FIELD(numCols); + COPY_POINTER_FIELD(sortColIdx, from->numCols * sizeof(AttrNumber)); + COPY_POINTER_FIELD(sortOperators, from->numCols * sizeof(Oid)); + COPY_POINTER_FIELD(collations, from->numCols * sizeof(Oid)); + COPY_POINTER_FIELD(nullsFirst, from->numCols * sizeof(bool)); +} + /* * _copySort */ @@ -938,13 +956,29 @@ _copySort(const Sort *from) /* * copy node superclass fields */ - CopyPlanFields((const Plan *) from, (Plan *) newnode); + CopySortFields(from, newnode); - COPY_SCALAR_FIELD(numCols); - COPY_POINTER_FIELD(sortColIdx, from->numCols * sizeof(AttrNumber)); - COPY_POINTER_FIELD(sortOperators, from->numCols * sizeof(Oid)); - COPY_POINTER_FIELD(collations, from->numCols * sizeof(Oid)); - COPY_POINTER_FIELD(nullsFirst, from->numCols * sizeof(bool)); + return newnode; +} + + +/* + * _copyIncrementalSort + */ +static IncrementalSort * +_copyIncrementalSort(const IncrementalSort *from) +{ + IncrementalSort *newnode = makeNode(IncrementalSort); + + /* + * copy node superclass fields + */ + CopySortFields((const Sort *) from, (Sort *) newnode); + + /* + * copy remainder of node + */ + COPY_SCALAR_FIELD(presortedCols); return newnode; } @@ -4895,6 +4929,9 @@ copyObjectImpl(const void *from) case T_Sort: retval = _copySort(from); break; + case T_IncrementalSort: + retval = _copyIncrementalSort(from); + break; case T_Group: retval = _copyGroup(from); break; diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c index e084c3f069..6c83372c9f 100644 --- a/src/backend/nodes/outfuncs.c +++ b/src/backend/nodes/outfuncs.c @@ -837,10 +837,8 @@ _outMaterial(StringInfo str, const Material *node) } static void -_outSort(StringInfo str, const Sort *node) +_outSortInfo(StringInfo str, const Sort *node) { - WRITE_NODE_TYPE("SORT"); - _outPlanInfo(str, (const Plan *) node); WRITE_INT_FIELD(numCols); @@ -850,6 +848,24 @@ _outSort(StringInfo str, const Sort *node) WRITE_BOOL_ARRAY(nullsFirst, node->numCols); } +static void +_outSort(StringInfo str, const Sort *node) +{ + WRITE_NODE_TYPE("SORT"); + + _outSortInfo(str, node); +} + +static void 
+_outIncrementalSort(StringInfo str, const IncrementalSort *node) +{ + WRITE_NODE_TYPE("INCREMENTALSORT"); + + _outSortInfo(str, (const Sort *) node); + + WRITE_INT_FIELD(presortedCols); +} + static void _outUnique(StringInfo str, const Unique *node) { @@ -3783,6 +3799,9 @@ outNode(StringInfo str, const void *obj) case T_Sort: _outSort(str, obj); break; + case T_IncrementalSort: + _outIncrementalSort(str, obj); + break; case T_Unique: _outUnique(str, obj); break; diff --git a/src/backend/nodes/readfuncs.c b/src/backend/nodes/readfuncs.c index d5b23a3479..c5bbbf459e 100644 --- a/src/backend/nodes/readfuncs.c +++ b/src/backend/nodes/readfuncs.c @@ -2150,12 +2150,13 @@ _readMaterial(void) } /* - * _readSort + * ReadCommonSort + * Assign the basic stuff of all nodes that inherit from Sort */ -static Sort * -_readSort(void) +static void +ReadCommonSort(Sort *local_node) { - READ_LOCALS(Sort); + READ_TEMP_LOCALS(); ReadCommonPlan(&local_node->plan); @@ -2164,6 +2165,32 @@ _readSort(void) READ_OID_ARRAY(sortOperators, local_node->numCols); READ_OID_ARRAY(collations, local_node->numCols); READ_BOOL_ARRAY(nullsFirst, local_node->numCols); +} + +/* + * _readSort + */ +static Sort * +_readSort(void) +{ + READ_LOCALS_NO_FIELDS(Sort); + + ReadCommonSort(local_node); + + READ_DONE(); +} + +/* + * _readIncrementalSort + */ +static IncrementalSort * +_readIncrementalSort(void) +{ + READ_LOCALS(IncrementalSort); + + ReadCommonSort(&local_node->sort); + + READ_INT_FIELD(presortedCols); READ_DONE(); } @@ -2801,6 +2828,8 @@ parseNodeString(void) return_value = _readMaterial(); else if (MATCH("SORT", 4)) return_value = _readSort(); + else if (MATCH("INCREMENTALSORT", 15)) + return_value = _readIncrementalSort(); else if (MATCH("GROUP", 5)) return_value = _readGroup(); else if (MATCH("AGG", 3)) diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c index 905bbe77d8..ccf46dd0aa 100644 --- a/src/backend/optimizer/path/allpaths.c +++ b/src/backend/optimizer/path/allpaths.c @@ -3881,6 +3881,10 @@ print_path(PlannerInfo *root, Path *path, int indent) ptype = "Sort"; subpath = ((SortPath *) path)->subpath; break; + case T_IncrementalSortPath: + ptype = "IncrementalSort"; + subpath = ((SortPath *) path)->subpath; + break; case T_GroupPath: ptype = "Group"; subpath = ((GroupPath *) path)->subpath; diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index 9e7e57f118..8a52271692 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -128,6 +128,7 @@ bool enable_indexonlyscan = true; bool enable_bitmapscan = true; bool enable_tidscan = true; bool enable_sort = true; +bool enable_incrementalsort = true; bool enable_hashagg = true; bool enable_hashagg_disk = true; bool enable_groupingsets_hash_disk = false; @@ -1648,9 +1649,9 @@ cost_recursive_union(Path *runion, Path *nrterm, Path *rterm) } /* - * cost_sort - * Determines and returns the cost of sorting a relation, including - * the cost of reading the input data. + * cost_tuplesort + * Determines and returns the cost of sorting a relation using tuplesort, + * not including the cost of reading the input data. 
* * If the total volume of data to sort is less than sort_mem, we will do * an in-memory sort, which requires no I/O and about t*log2(t) tuple @@ -1677,39 +1678,23 @@ cost_recursive_union(Path *runion, Path *nrterm, Path *rterm) * specifying nonzero comparison_cost; typically that's used for any extra * work that has to be done to prepare the inputs to the comparison operators. * - * 'pathkeys' is a list of sort keys - * 'input_cost' is the total cost for reading the input data * 'tuples' is the number of tuples in the relation * 'width' is the average tuple width in bytes * 'comparison_cost' is the extra cost per comparison, if any * 'sort_mem' is the number of kilobytes of work memory allowed for the sort * 'limit_tuples' is the bound on the number of output tuples; -1 if no bound - * - * NOTE: some callers currently pass NIL for pathkeys because they - * can't conveniently supply the sort keys. Since this routine doesn't - * currently do anything with pathkeys anyway, that doesn't matter... - * but if it ever does, it should react gracefully to lack of key data. - * (Actually, the thing we'd most likely be interested in is just the number - * of sort keys, which all callers *could* supply.) */ -void -cost_sort(Path *path, PlannerInfo *root, - List *pathkeys, Cost input_cost, double tuples, int width, - Cost comparison_cost, int sort_mem, - double limit_tuples) +static void +cost_tuplesort(Cost *startup_cost, Cost *run_cost, + double tuples, int width, + Cost comparison_cost, int sort_mem, + double limit_tuples) { - Cost startup_cost = input_cost; - Cost run_cost = 0; double input_bytes = relation_byte_size(tuples, width); double output_bytes; double output_tuples; long sort_mem_bytes = sort_mem * 1024L; - if (!enable_sort) - startup_cost += disable_cost; - - path->rows = tuples; - /* * We want to be sure the cost of a sort is never estimated as zero, even * if passed-in tuple count is zero. Besides, mustn't do log(0)... @@ -1748,7 +1733,7 @@ cost_sort(Path *path, PlannerInfo *root, * * Assume about N log2 N comparisons */ - startup_cost += comparison_cost * tuples * LOG2(tuples); + *startup_cost = comparison_cost * tuples * LOG2(tuples); /* Disk costs */ @@ -1759,7 +1744,7 @@ cost_sort(Path *path, PlannerInfo *root, log_runs = 1.0; npageaccesses = 2.0 * npages * log_runs; /* Assume 3/4ths of accesses are sequential, 1/4th are not */ - startup_cost += npageaccesses * + *startup_cost += npageaccesses * (seq_page_cost * 0.75 + random_page_cost * 0.25); } else if (tuples > 2 * output_tuples || input_bytes > sort_mem_bytes) @@ -1770,12 +1755,12 @@ cost_sort(Path *path, PlannerInfo *root, * factor is a bit higher than for quicksort. Tweak it so that the * cost curve is continuous at the crossover point. */ - startup_cost += comparison_cost * tuples * LOG2(2.0 * output_tuples); + *startup_cost = comparison_cost * tuples * LOG2(2.0 * output_tuples); } else { /* We'll use plain quicksort on all the input tuples */ - startup_cost += comparison_cost * tuples * LOG2(tuples); + *startup_cost = comparison_cost * tuples * LOG2(tuples); } /* @@ -1786,8 +1771,163 @@ cost_sort(Path *path, PlannerInfo *root, * here --- the upper LIMIT will pro-rate the run cost so we'd be double * counting the LIMIT otherwise. */ - run_cost += cpu_operator_cost * tuples; + *run_cost = cpu_operator_cost * tuples; +} + +/* + * cost_full_sort + * Determines and returns the cost of sorting a relation, including the + * cost of reading the input data. 
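
As a sanity check on the comparison-cost component factored out into cost_tuplesort() above, the formula is easy to reproduce outside the planner. The toy program below (invented parameter values, disk-spill term omitted, zero meaning "no bound"; compile with -lm) shows how a bounded top-N sort undercuts a full sort:

    #include <math.h>
    #include <stdio.h>

    #define LOG2(x) (log(x) / 0.693147180559945)

    static double
    sort_startup_cost(double tuples, double output_tuples,
                      double comparison_cost)
    {
        if (tuples < 2.0)
            tuples = 2.0;            /* avoid log(0), as in the patch */

        if (output_tuples > 0 && tuples > 2 * output_tuples)
            /* bounded (top-N) heapsort branch */
            return comparison_cost * tuples * LOG2(2.0 * output_tuples);

        /* plain quicksort over all the input tuples */
        return comparison_cost * tuples * LOG2(tuples);
    }

    int main(void)
    {
        double cmp = 2.0 * 0.0025;   /* 2 * cpu_operator_cost default */

        printf("full sort of 1e6 rows: %.1f\n",
               sort_startup_cost(1e6, 0, cmp));
        printf("top-100 of 1e6 rows:   %.1f\n",
               sort_startup_cost(1e6, 100, cmp));
        return 0;
    }

The 2 * cpu_operator_cost default for the per-comparison charge matches what cost_sort's callers conventionally pass; everything else here is illustrative.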
+ * + * For the precise description of how the cost is calculated, see the comment + * for cost_tuplesort(). + */ +void +cost_full_sort(Cost *startup_cost, Cost *run_cost, + Cost input_total_cost, double tuples, int width, + Cost comparison_cost, int sort_mem, + double limit_tuples) +{ + cost_tuplesort(startup_cost, run_cost, + tuples, width, + comparison_cost, sort_mem, + limit_tuples); + + if (!enable_sort) + *startup_cost += disable_cost; + + *startup_cost += input_total_cost; +} + +/* + * cost_incremental_sort + * Determines and returns the cost of sorting a relation incrementally, when + * the input path is presorted by a prefix of the pathkeys. + * + * 'presorted_keys' is the number of leading pathkeys by which the input path + * is sorted. + * + * We estimate the number of groups into which the relation is divided by the + * leading pathkeys, and then calculate the cost of sorting a single group + * with tuplesort using cost_tuplesort(). + */ +void +cost_incremental_sort(Path *path, + PlannerInfo *root, List *pathkeys, int presorted_keys, + Cost input_startup_cost, Cost input_total_cost, + double input_tuples, int width, Cost comparison_cost, int sort_mem, + double limit_tuples) +{ + Cost startup_cost = 0, + run_cost = 0, + input_run_cost = input_total_cost - input_startup_cost; + double group_tuples, + input_groups; + Cost group_startup_cost, + group_run_cost, + group_input_run_cost; + List *presortedExprs = NIL; + ListCell *l; + int i = 0; + + Assert(presorted_keys != 0); + + /* + * We want to be sure the cost of a sort is never estimated as zero, even + * if passed-in tuple count is zero. Besides, mustn't do log(0)... + */ + if (input_tuples < 2.0) + input_tuples = 2.0; + /* Extract presorted keys as list of expressions */ + foreach(l, pathkeys) + { + PathKey *key = (PathKey *) lfirst(l); + EquivalenceMember *member = (EquivalenceMember *) + linitial(key->pk_eclass->ec_members); + + presortedExprs = lappend(presortedExprs, member->em_expr); + + i++; + if (i >= presorted_keys) + break; + } + + /* Estimate number of groups with equal presorted keys */ + input_groups = estimate_num_groups(root, presortedExprs, input_tuples, NULL); + group_tuples = input_tuples / input_groups; + group_input_run_cost = input_run_cost / input_groups; + + /* + * Estimate average cost of sorting of one group where presorted keys are + * equal. Incremental sort is sensitive to distribution of tuples to the + * groups, where we're relying on quite rough assumptions. Thus, we're + * pessimistic about incremental sort performance and increase its average + * group size by half. + */ + cost_tuplesort(&group_startup_cost, &group_run_cost, + 1.5 * group_tuples, width, comparison_cost, sort_mem, + limit_tuples); + + /* + * Startup cost of incremental sort is the startup cost of its first group + * plus the cost of its input. + */ + startup_cost += group_startup_cost + + input_startup_cost + group_input_run_cost; + + /* + * After we started producing tuples from the first group, the cost of + * producing all the tuples is given by the cost to finish processing this + * group, plus the total cost to process the remaining groups, plus the + * remaining cost of input. + */ + run_cost += group_run_cost + + (group_run_cost + group_startup_cost) * (input_groups - 1) + + group_input_run_cost * (input_groups - 1); + + /* + * Incremental sort adds some overhead by itself. Firstly, it has to + * detect the sort groups. This is roughly equal to one extra copy and + * comparison per tuple. 
Secondly, it has to reset the tuplesort context
+	 * for every group.
+	 */
+	run_cost += (cpu_tuple_cost + comparison_cost) * input_tuples;
+	run_cost += 2.0 * cpu_tuple_cost * input_groups;
+
+	path->rows = input_tuples;
+	path->startup_cost = startup_cost;
+	path->total_cost = startup_cost + run_cost;
+}
+
+/*
+ * cost_sort
+ *	  Determines and returns the cost of sorting a relation, including
+ *	  the cost of reading the input data.
+ *
+ * NOTE: some callers currently pass NIL for pathkeys because they
+ * can't conveniently supply the sort keys. Since this routine doesn't
+ * currently do anything with pathkeys anyway, that doesn't matter...
+ * but if it ever does, it should react gracefully to lack of key data.
+ * (Actually, the thing we'd most likely be interested in is just the number
+ * of sort keys, which all callers *could* supply.)
+ */
+void
+cost_sort(Path *path, PlannerInfo *root,
+		  List *pathkeys, Cost input_cost, double tuples, int width,
+		  Cost comparison_cost, int sort_mem,
+		  double limit_tuples)
+{
+	Cost		startup_cost;
+	Cost		run_cost;
+
+	cost_full_sort(&startup_cost, &run_cost,
+				   input_cost,
+				   tuples, width, comparison_cost, sort_mem,
+				   limit_tuples);
+
+	path->rows = tuples;
 	path->startup_cost = startup_cost;
 	path->total_cost = startup_cost + run_cost;
 }
diff --git a/src/backend/optimizer/path/pathkeys.c b/src/backend/optimizer/path/pathkeys.c
index 71b9d42c99..3b84feaf7b 100644
--- a/src/backend/optimizer/path/pathkeys.c
+++ b/src/backend/optimizer/path/pathkeys.c
@@ -334,6 +334,49 @@ pathkeys_contained_in(List *keys1, List *keys2)
 	return false;
 }
 
+/*
+ * pathkeys_common_contained_in
+ *	  Same as pathkeys_contained_in, but also sets length of longest
+ *	  common prefix of keys1 and keys2.
+ */
+bool
+pathkeys_common_contained_in(List *keys1, List *keys2, int *n_common)
+{
+	int			n = 0;
+	ListCell   *key1,
+			   *key2;
+
+	forboth(key1, keys1, key2, keys2)
+	{
+		PathKey    *pathkey1 = (PathKey *) lfirst(key1);
+		PathKey    *pathkey2 = (PathKey *) lfirst(key2);
+
+		if (pathkey1 != pathkey2)
+		{
+			*n_common = n;
+			return false;
+		}
+		n++;
+	}
+
+	*n_common = n;
+	return (key1 == NULL);
+}
+
+
+/*
+ * pathkeys_common
+ *	  Returns length of longest common prefix of keys1 and keys2.
+ */
+int
+pathkeys_common(List *keys1, List *keys2)
+{
+	int			n;
+
+	(void) pathkeys_common_contained_in(keys1, keys2, &n);
+	return n;
+}
+
 /*
  * get_cheapest_path_for_pathkeys
  *	  Find the cheapest path (according to the specified criterion) that
@@ -1786,26 +1829,26 @@ right_merge_direction(PlannerInfo *root, PathKey *pathkey)
  *		Count the number of pathkeys that are useful for meeting the
  *		query's requested output ordering.
  *
- * Unlike merge pathkeys, this is an all-or-nothing affair: it does us
- * no good to order by just the first key(s) of the requested ordering.
- * So the result is always either 0 or list_length(root->query_pathkeys).
+ * Because we have the possibility of incremental sort, a prefix list of
+ * keys is potentially useful for improving the performance of the requested
+ * ordering. Thus we return 0 if no valuable keys are found, or the number
+ * of leading keys shared by the list and the requested ordering.
  */
 static int
 pathkeys_useful_for_ordering(PlannerInfo *root, List *pathkeys)
 {
+	int			n_common_pathkeys;
+
 	if (root->query_pathkeys == NIL)
 		return 0;				/* no special ordering requested */
 
 	if (pathkeys == NIL)
 		return 0;				/* unordered path */
 
-	if (pathkeys_contained_in(root->query_pathkeys, pathkeys))
-	{
-		/* It's useful
or at least the first N keys are */ - return list_length(root->query_pathkeys); - } + (void) pathkeys_common_contained_in(root->query_pathkeys, pathkeys, + &n_common_pathkeys); - return 0; /* path ordering not useful */ + return n_common_pathkeys; } /* diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index fc25908dc6..1d7d4eb3e7 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -98,6 +98,8 @@ static Plan *create_projection_plan(PlannerInfo *root, int flags); static Plan *inject_projection_plan(Plan *subplan, List *tlist, bool parallel_safe); static Sort *create_sort_plan(PlannerInfo *root, SortPath *best_path, int flags); +static IncrementalSort *create_incrementalsort_plan(PlannerInfo *root, + IncrementalSortPath *best_path, int flags); static Group *create_group_plan(PlannerInfo *root, GroupPath *best_path); static Unique *create_upper_unique_plan(PlannerInfo *root, UpperUniquePath *best_path, int flags); @@ -244,6 +246,10 @@ static MergeJoin *make_mergejoin(List *tlist, static Sort *make_sort(Plan *lefttree, int numCols, AttrNumber *sortColIdx, Oid *sortOperators, Oid *collations, bool *nullsFirst); +static IncrementalSort *make_incrementalsort(Plan *lefttree, + int numCols, int presortedCols, + AttrNumber *sortColIdx, Oid *sortOperators, + Oid *collations, bool *nullsFirst); static Plan *prepare_sort_from_pathkeys(Plan *lefttree, List *pathkeys, Relids relids, const AttrNumber *reqColIdx, @@ -258,6 +264,8 @@ static EquivalenceMember *find_ec_member_for_tle(EquivalenceClass *ec, Relids relids); static Sort *make_sort_from_pathkeys(Plan *lefttree, List *pathkeys, Relids relids); +static IncrementalSort *make_incrementalsort_from_pathkeys(Plan *lefttree, + List *pathkeys, Relids relids, int presortedCols); static Sort *make_sort_from_groupcols(List *groupcls, AttrNumber *grpColIdx, Plan *lefttree); @@ -460,6 +468,11 @@ create_plan_recurse(PlannerInfo *root, Path *best_path, int flags) (SortPath *) best_path, flags); break; + case T_IncrementalSort: + plan = (Plan *) create_incrementalsort_plan(root, + (IncrementalSortPath *) best_path, + flags); + break; case T_Group: plan = (Plan *) create_group_plan(root, (GroupPath *) best_path); @@ -1994,6 +2007,32 @@ create_sort_plan(PlannerInfo *root, SortPath *best_path, int flags) return plan; } +/* + * create_incrementalsort_plan + * + * Do the same as create_sort_plan, but create IncrementalSort plan. + */ +static IncrementalSort * +create_incrementalsort_plan(PlannerInfo *root, IncrementalSortPath *best_path, + int flags) +{ + IncrementalSort *plan; + Plan *subplan; + + /* See comments in create_sort_plan() above */ + subplan = create_plan_recurse(root, best_path->spath.subpath, + flags | CP_SMALL_TLIST); + plan = make_incrementalsort_from_pathkeys(subplan, + best_path->spath.path.pathkeys, + IS_OTHER_REL(best_path->spath.subpath->parent) ? 
+ best_path->spath.path.parent->relids : NULL, + best_path->presortedCols); + + copy_generic_path_info(&plan->sort.plan, (Path *) best_path); + + return plan; +} + /* * create_group_plan * @@ -5088,17 +5127,24 @@ static void label_sort_with_costsize(PlannerInfo *root, Sort *plan, double limit_tuples) { Plan *lefttree = plan->plan.lefttree; - Path sort_path; /* dummy for result of cost_sort */ - - cost_sort(&sort_path, root, NIL, - lefttree->total_cost, - lefttree->plan_rows, - lefttree->plan_width, - 0.0, - work_mem, - limit_tuples); - plan->plan.startup_cost = sort_path.startup_cost; - plan->plan.total_cost = sort_path.total_cost; + Cost startup_cost, + run_cost; + + /* + * This function shouldn't have to deal with IncrementalSort plans because + * they are only created from corresponding Path nodes. + */ + Assert(IsA(plan, Sort)); + + cost_full_sort(&startup_cost, &run_cost, + lefttree->total_cost, + lefttree->plan_rows, + lefttree->plan_width, + 0.0, + work_mem, + limit_tuples); + plan->plan.startup_cost = startup_cost; + plan->plan.total_cost = startup_cost + run_cost; plan->plan.plan_rows = lefttree->plan_rows; plan->plan.plan_width = lefttree->plan_width; plan->plan.parallel_aware = false; @@ -5677,9 +5723,12 @@ make_sort(Plan *lefttree, int numCols, AttrNumber *sortColIdx, Oid *sortOperators, Oid *collations, bool *nullsFirst) { - Sort *node = makeNode(Sort); - Plan *plan = &node->plan; + Sort *node; + Plan *plan; + node = makeNode(Sort); + + plan = &node->plan; plan->targetlist = lefttree->targetlist; plan->qual = NIL; plan->lefttree = lefttree; @@ -5693,6 +5742,37 @@ make_sort(Plan *lefttree, int numCols, return node; } +/* + * make_incrementalsort --- basic routine to build an IncrementalSort plan node + * + * Caller must have built the sortColIdx, sortOperators, collations, and + * nullsFirst arrays already. 
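
Stepping back to cost_incremental_sort() from the costsize.c hunk above: the split it computes is that startup pays only for sorting the first group, while the run cost amortizes the remaining groups plus the per-tuple group detection and per-group reset overhead. A toy calculation with entirely invented inputs (the per_group_* values stand in for cost_tuplesort() output on 1.5x the average group size) makes the shape visible:

    #include <stdio.h>

    int main(void)
    {
        double input_tuples = 1e6;
        double input_groups = 1e4;       /* estimate_num_groups() stand-in */
        double group_tuples = input_tuples / input_groups;
        double per_group_startup = 5.0;  /* invented cost_tuplesort output */
        double per_group_run = 0.375;
        double cpu_tuple_cost = 0.01;
        double comparison_cost = 0.005;

        double startup = per_group_startup;
        double run = per_group_run
            + (per_group_startup + per_group_run) * (input_groups - 1)
            + (cpu_tuple_cost + comparison_cost) * input_tuples /* detection */
            + 2.0 * cpu_tuple_cost * input_groups;              /* resets */

        printf("avg group size %.0f\n", group_tuples);
        printf("startup %.1f, total %.1f\n", startup, startup + run);
        return 0;
    }

Input-path costs are omitted here for brevity; the real function also folds in the input's startup cost and pro-rates its run cost across groups. The tiny startup relative to total is exactly what lets LIMIT queries profit from the incremental path.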
+ */ +static IncrementalSort * +make_incrementalsort(Plan *lefttree, int numCols, int presortedCols, + AttrNumber *sortColIdx, Oid *sortOperators, + Oid *collations, bool *nullsFirst) +{ + IncrementalSort *node; + Plan *plan; + + node = makeNode(IncrementalSort); + + plan = &node->sort.plan; + plan->targetlist = lefttree->targetlist; + plan->qual = NIL; + plan->lefttree = lefttree; + plan->righttree = NULL; + node->presortedCols = presortedCols; + node->sort.numCols = numCols; + node->sort.sortColIdx = sortColIdx; + node->sort.sortOperators = sortOperators; + node->sort.collations = collations; + node->sort.nullsFirst = nullsFirst; + + return node; +} + /* * prepare_sort_from_pathkeys * Prepare to sort according to given pathkeys @@ -6039,6 +6119,42 @@ make_sort_from_pathkeys(Plan *lefttree, List *pathkeys, Relids relids) collations, nullsFirst); } +/* + * make_incrementalsort_from_pathkeys + * Create sort plan to sort according to given pathkeys + * + * 'lefttree' is the node which yields input tuples + * 'pathkeys' is the list of pathkeys by which the result is to be sorted + * 'relids' is the set of relations required by prepare_sort_from_pathkeys() + * 'presortedCols' is the number of presorted columns in input tuples + */ +static IncrementalSort * +make_incrementalsort_from_pathkeys(Plan *lefttree, List *pathkeys, + Relids relids, int presortedCols) +{ + int numsortkeys; + AttrNumber *sortColIdx; + Oid *sortOperators; + Oid *collations; + bool *nullsFirst; + + /* Compute sort column info, and adjust lefttree as needed */ + lefttree = prepare_sort_from_pathkeys(lefttree, pathkeys, + relids, + NULL, + false, + &numsortkeys, + &sortColIdx, + &sortOperators, + &collations, + &nullsFirst); + + /* Now build the Sort node */ + return make_incrementalsort(lefttree, numsortkeys, presortedCols, + sortColIdx, sortOperators, + collations, nullsFirst); +} + /* * make_sort_from_sortclauses * Create sort plan to sort according to given sortclauses @@ -6774,6 +6890,7 @@ is_projection_capable_path(Path *path) case T_Hash: case T_Material: case T_Sort: + case T_IncrementalSort: case T_Unique: case T_SetOp: case T_LockRows: diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index b65abf6046..753e23676b 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -4922,13 +4922,16 @@ create_distinct_paths(PlannerInfo *root, * Build a new upperrel containing Paths for ORDER BY evaluation. * * All paths in the result must satisfy the ORDER BY ordering. - * The only new path we need consider is an explicit sort on the - * cheapest-total existing path. + * The only new paths we need consider are an explicit full sort + * and incremental sort on the cheapest-total existing path. * * input_rel: contains the source-data Paths * target: the output tlist the result Paths must emit * limit_tuples: estimated bound on the number of output tuples, * or -1 if no LIMIT or couldn't estimate + * + * XXX This only looks at sort_pathkeys. I wonder if it needs to look at the + * other pathkeys (grouping, ...) like generate_useful_gather_paths. 
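
The planner changes below hinge on pathkeys_common_contained_in() from the pathkeys.c hunk above. Its contract is easy to mimic on plain integer arrays; this is a hypothetical helper for illustration only, relying on value equality where the planner relies on canonical PathKey pointer equality:

    #include <stdbool.h>
    #include <stdio.h>

    static bool
    common_contained_in(const int *keys1, int n1,
                        const int *keys2, int n2, int *n_common)
    {
        int n = 0;

        while (n < n1 && n < n2 && keys1[n] == keys2[n])
            n++;

        *n_common = n;
        return n == n1;         /* keys1 exhausted => fully contained */
    }

    int main(void)
    {
        int want[] = {1, 2, 3};     /* the required sort_pathkeys */
        int have[] = {1, 2};        /* the input path's pathkeys */
        int n_common;
        bool is_sorted = common_contained_in(want, 3, have, 2, &n_common);

        /* prints: sorted=0 presorted_keys=2 -> incremental sort applies */
        printf("sorted=%d presorted_keys=%d\n", is_sorted, n_common);
        return 0;
    }

When is_sorted is false but n_common is nonzero, create_ordered_paths() below can build an incremental sort path over just the trailing keys instead of a full sort.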
*/ static RelOptInfo * create_ordered_paths(PlannerInfo *root, @@ -4962,29 +4965,66 @@ create_ordered_paths(PlannerInfo *root, foreach(lc, input_rel->pathlist) { - Path *path = (Path *) lfirst(lc); + Path *input_path = (Path *) lfirst(lc); + Path *sorted_path = input_path; bool is_sorted; + int presorted_keys; + + is_sorted = pathkeys_common_contained_in(root->sort_pathkeys, + input_path->pathkeys, &presorted_keys); - is_sorted = pathkeys_contained_in(root->sort_pathkeys, - path->pathkeys); - if (path == cheapest_input_path || is_sorted) + if (is_sorted) { - if (!is_sorted) + /* Use the input path as is, but add a projection step if needed */ + if (sorted_path->pathtarget != target) + sorted_path = apply_projection_to_path(root, ordered_rel, + sorted_path, target); + + add_path(ordered_rel, sorted_path); + } + else + { + if (input_path == cheapest_input_path) { - /* An explicit sort here can take advantage of LIMIT */ - path = (Path *) create_sort_path(root, - ordered_rel, - path, - root->sort_pathkeys, - limit_tuples); + /* + * Sort the cheapest input path. An explicit sort here can + * take advantage of LIMIT. + */ + sorted_path = (Path *) create_sort_path(root, + ordered_rel, + input_path, + root->sort_pathkeys, + limit_tuples); + /* Add projection step if needed */ + if (sorted_path->pathtarget != target) + sorted_path = apply_projection_to_path(root, ordered_rel, + sorted_path, target); + + add_path(ordered_rel, sorted_path); } + /* With incremental sort disabled, don't build those paths. */ + if (!enable_incrementalsort) + continue; + + /* Likewise, if the path can't be used for incremental sort. */ + if (!presorted_keys) + continue; + + /* Also consider incremental sort. */ + sorted_path = (Path *) create_incremental_sort_path(root, + ordered_rel, + input_path, + root->sort_pathkeys, + presorted_keys, + limit_tuples); + /* Add projection step if needed */ - if (path->pathtarget != target) - path = apply_projection_to_path(root, ordered_rel, - path, target); + if (sorted_path->pathtarget != target) + sorted_path = apply_projection_to_path(root, ordered_rel, + sorted_path, target); - add_path(ordered_rel, path); + add_path(ordered_rel, sorted_path); } } diff --git a/src/backend/optimizer/plan/setrefs.c b/src/backend/optimizer/plan/setrefs.c index 3dcded506b..2b676bf406 100644 --- a/src/backend/optimizer/plan/setrefs.c +++ b/src/backend/optimizer/plan/setrefs.c @@ -678,6 +678,7 @@ set_plan_refs(PlannerInfo *root, Plan *plan, int rtoffset) case T_Material: case T_Sort: + case T_IncrementalSort: case T_Unique: case T_SetOp: diff --git a/src/backend/optimizer/plan/subselect.c b/src/backend/optimizer/plan/subselect.c index 3650e8329d..b02fcb9bfe 100644 --- a/src/backend/optimizer/plan/subselect.c +++ b/src/backend/optimizer/plan/subselect.c @@ -2688,6 +2688,7 @@ finalize_plan(PlannerInfo *root, Plan *plan, case T_Hash: case T_Material: case T_Sort: + case T_IncrementalSort: case T_Unique: case T_SetOp: case T_Group: diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index b570bfd3be..e20c055dea 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -2750,6 +2750,57 @@ create_set_projection_path(PlannerInfo *root, return pathnode; } +/* + * create_incremental_sort_path + * Creates a pathnode that represents performing an incremental sort. 
+ * + * 'rel' is the parent relation associated with the result + * 'subpath' is the path representing the source of data + * 'pathkeys' represents the desired sort order + * 'presorted_keys' is the number of keys by which the input path is + * already sorted + * 'limit_tuples' is the estimated bound on the number of output tuples, + * or -1 if no LIMIT or couldn't estimate + */ +SortPath * +create_incremental_sort_path(PlannerInfo *root, + RelOptInfo *rel, + Path *subpath, + List *pathkeys, + int presorted_keys, + double limit_tuples) +{ + IncrementalSortPath *sort = makeNode(IncrementalSortPath); + SortPath *pathnode = &sort->spath; + + pathnode->path.pathtype = T_IncrementalSort; + pathnode->path.parent = rel; + /* Sort doesn't project, so use source path's pathtarget */ + pathnode->path.pathtarget = subpath->pathtarget; + /* For now, assume we are above any joins, so no parameterization */ + pathnode->path.param_info = NULL; + pathnode->path.parallel_aware = false; + pathnode->path.parallel_safe = rel->consider_parallel && + subpath->parallel_safe; + pathnode->path.parallel_workers = subpath->parallel_workers; + pathnode->path.pathkeys = pathkeys; + + pathnode->subpath = subpath; + + cost_incremental_sort(&pathnode->path, + root, pathkeys, presorted_keys, + subpath->startup_cost, + subpath->total_cost, + subpath->rows, + subpath->pathtarget->width, + 0.0, /* XXX comparison_cost shouldn't be 0? */ + work_mem, limit_tuples); + + sort->presortedCols = presorted_keys; + + return pathnode; +} + /* * create_sort_path * Creates a pathnode that represents performing an explicit sort. diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 79bc7ac8ca..fe87d549d9 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -989,6 +989,15 @@ static struct config_bool ConfigureNamesBool[] = true, NULL, NULL, NULL }, + { + {"enable_incrementalsort", PGC_USERSET, QUERY_TUNING_METHOD, + gettext_noop("Enables the planner's use of incremental sort steps."), + NULL + }, + &enable_incrementalsort, + true, + NULL, NULL, NULL + }, { {"enable_hashagg", PGC_USERSET, QUERY_TUNING_METHOD, gettext_noop("Enables the planner's use of hashed aggregation plans."), diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index e9f8ca775d..427e5e967e 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -358,6 +358,7 @@ #enable_parallel_append = on #enable_seqscan = on #enable_sort = on +#enable_incrementalsort = on #enable_tidscan = on #enable_partitionwise_join = off #enable_partitionwise_aggregate = off diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c index d02e676aa3..cc33a85731 100644 --- a/src/backend/utils/sort/tuplesort.c +++ b/src/backend/utils/sort/tuplesort.c @@ -125,6 +125,16 @@ #define PARALLEL_SORT(state) ((state)->shared == NULL ? 0 : \ (state)->worker >= 0 ? 1 : 2) +/* + * Initial size of memtuples array. We're trying to select this size so that + * array doesn't exceed ALLOCSET_SEPARATE_THRESHOLD and so that the overhead of + * allocation might possibly be lowered. However, we don't consider array sizes + * less than 1024. 
+ * + */ +#define INITIAL_MEMTUPSIZE Max(1024, \ + ALLOCSET_SEPARATE_THRESHOLD / sizeof(SortTuple) + 1) + /* GUC variables */ #ifdef TRACE_SORT bool trace_sort = false; @@ -241,6 +251,14 @@ struct Tuplesortstate int64 allowedMem; /* total memory allowed, in bytes */ int maxTapes; /* number of tapes (Knuth's T) */ int tapeRange; /* maxTapes-1 (Knuth's P) */ + int64 maxSpace; /* maximum amount of space occupied among sort + * of groups, either in-memory or on-disk */ + bool isMaxSpaceDisk; /* true when maxSpace is value for on-disk + * space, false when it's value for in-memory + * space */ + TupSortStatus maxSpaceStatus; /* sort status when maxSpace was reached */ + MemoryContext maincontext; /* memory context for tuple sort metadata that + * persists across multiple batches */ MemoryContext sortcontext; /* memory context holding most sort data */ MemoryContext tuplecontext; /* sub-context of sortcontext for tuple data */ LogicalTapeSet *tapeset; /* logtape.c object for tapes in a temp file */ @@ -591,6 +609,7 @@ struct Sharedsort static Tuplesortstate *tuplesort_begin_common(int workMem, SortCoordinate coordinate, bool randomAccess); +static void tuplesort_begin_batch(Tuplesortstate *state); static void puttuple_common(Tuplesortstate *state, SortTuple *tuple); static bool consider_abort_common(Tuplesortstate *state); static void inittapes(Tuplesortstate *state, bool mergeruns); @@ -647,6 +666,8 @@ static void worker_freeze_result_tape(Tuplesortstate *state); static void worker_nomergeruns(Tuplesortstate *state); static void leader_takeover_tapes(Tuplesortstate *state); static void free_sort_tuple(Tuplesortstate *state, SortTuple *stup); +static void tuplesort_free(Tuplesortstate *state); +static void tuplesort_updatemax(Tuplesortstate *state); /* * Special versions of qsort just for SortTuple objects. qsort_tuple() sorts @@ -682,8 +703,8 @@ tuplesort_begin_common(int workMem, SortCoordinate coordinate, bool randomAccess) { Tuplesortstate *state; + MemoryContext maincontext; MemoryContext sortcontext; - MemoryContext tuplecontext; MemoryContext oldcontext; /* See leader_takeover_tapes() remarks on randomAccess support */ @@ -691,31 +712,31 @@ tuplesort_begin_common(int workMem, SortCoordinate coordinate, elog(ERROR, "random access disallowed under parallel sort"); /* - * Create a working memory context for this sort operation. All data - * needed by the sort will live inside this context. + * Memory context surviving tuplesort_reset. This memory context holds + * data which is useful to keep while sorting multiple similar batches. */ - sortcontext = AllocSetContextCreate(CurrentMemoryContext, + maincontext = AllocSetContextCreate(CurrentMemoryContext, "TupleSort main", ALLOCSET_DEFAULT_SIZES); /* - * Caller tuple (e.g. IndexTuple) memory context. - * - * A dedicated child context used exclusively for caller passed tuples - * eases memory management. Resetting at key points reduces - * fragmentation. Note that the memtuples array of SortTuples is allocated - * in the parent context, not this context, because there is no need to - * free memtuples early. + * Create a working memory context for one sort operation. The content of + * this context is deleted by tuplesort_reset. + */ + sortcontext = AllocSetContextCreate(maincontext, + "TupleSort sort", + ALLOCSET_DEFAULT_SIZES); + + /* + * Additionally a working memory context for tuples is setup in + * tuplesort_begin_batch. 
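
The context hierarchy being introduced here, a main context that survives resets with the per-sort context beneath it, is what makes tuplesort_reset() cheap. The same shape in miniature, using a toy arena instead of PostgreSQL memory contexts (hypothetical code, with no overflow or allocation-failure handling):

    #include <stdio.h>
    #include <stdlib.h>

    typedef struct Arena
    {
        char  *buf;
        size_t used;
        size_t cap;
    } Arena;

    static void arena_init(Arena *a, size_t cap)
    {
        a->buf = malloc(cap);
        a->used = 0;
        a->cap = cap;
    }

    static void *arena_alloc(Arena *a, size_t n)
    {
        void *p = a->buf + a->used;
        a->used += n;               /* no bounds check in this sketch */
        return p;
    }

    static void arena_reset(Arena *a) { a->used = 0; }

    int main(void)
    {
        Arena main_ctx, sort_ctx;

        arena_init(&main_ctx, 1024);    /* survives batch resets */
        arena_init(&sort_ctx, 1024);    /* cleared for every batch */

        long *meta = arena_alloc(&main_ctx, sizeof(long)); /* metadata */

        for (int batch = 0; batch < 3; batch++)
        {
            char *tuples = arena_alloc(&sort_ctx, 512);    /* batch data */
            (void) tuples;
            arena_reset(&sort_ctx);     /* analogous to tuplesort_reset */
        }
        (void) meta;

        free(main_ctx.buf);
        free(sort_ctx.buf);
        return 0;
    }

Per-batch data lives in the inner arena and vanishes on reset, while metadata allocated in the outer arena (like the Tuplesortstate itself and the memtuples array in the patch) is paid for once across all batches.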
 */
-	tuplecontext = AllocSetContextCreate(sortcontext,
-										 "Caller tuples",
-										 ALLOCSET_DEFAULT_SIZES);
 
 	/*
-	 * Make the Tuplesortstate within the per-sort context.  This way, we
+	 * Make the Tuplesortstate within the per-sortstate context.  This way, we
 	 * don't need a separate pfree() operation for it at shutdown.
 	 */
-	oldcontext = MemoryContextSwitchTo(sortcontext);
+	oldcontext = MemoryContextSwitchTo(maincontext);
 
 	state = (Tuplesortstate *) palloc0(sizeof(Tuplesortstate));
 
@@ -724,11 +745,8 @@ tuplesort_begin_common(int workMem, SortCoordinate coordinate,
 	pg_rusage_init(&state->ru_start);
 #endif
 
-	state->status = TSS_INITIAL;
 	state->randomAccess = randomAccess;
-	state->bounded = false;
 	state->tuples = true;
-	state->boundUsed = false;
 
 	/*
 	 * workMem is forced to be at least 64KB, the current minimum valid value
 	 * with very little memory.
 	 */
 	state->allowedMem = Max(workMem, 64) * (int64) 1024;
-	state->availMem = state->allowedMem;
 	state->sortcontext = sortcontext;
-	state->tuplecontext = tuplecontext;
-	state->tapeset = NULL;
-
-	state->memtupcount = 0;
+	state->maincontext = maincontext;
 
 	/*
 	 * Initial size of array must be more than ALLOCSET_SEPARATE_THRESHOLD;
 	 * see comments in grow_memtuples().
 	 */
-	state->memtupsize = Max(1024,
-							ALLOCSET_SEPARATE_THRESHOLD / sizeof(SortTuple) + 1);
-
-	state->growmemtuples = true;
-	state->slabAllocatorUsed = false;
-	state->memtuples = (SortTuple *) palloc(state->memtupsize * sizeof(SortTuple));
-
-	USEMEM(state, GetMemoryChunkSpace(state->memtuples));
-
-	/* workMem must be large enough for the minimal memtuples array */
-	if (LACKMEM(state))
-		elog(ERROR, "insufficient memory allowed for sort");
-
-	state->currentRun = 0;
+	state->memtupsize = INITIAL_MEMTUPSIZE;
+	state->memtuples = NULL;
 
 	/*
-	 * maxTapes, tapeRange, and Algorithm D variables will be initialized by
-	 * inittapes(), if needed
+	 * After all of the other non-parallel-related state, we set up all of the
+	 * state needed for each batch.
 	 */
-
-	state->result_tape = -1;	/* flag that result tape has not been formed */
+	tuplesort_begin_batch(state);
 
 	/*
 	 * Initialize parallel-related state based on coordination information
@@ -802,6 +803,77 @@ tuplesort_begin_common(int workMem, SortCoordinate coordinate,
 	return state;
 }
 
+/*
+ * tuplesort_begin_batch
+ *
+ * Setup, or reset, all state needed for processing a new set of tuples with
+ * this sort state. Called both from tuplesort_begin_common (the first time
+ * sorting with this sort state) and tuplesort_reset (for subsequent usages).
+ */
+static void
+tuplesort_begin_batch(Tuplesortstate *state)
+{
+	MemoryContext oldcontext;
+
+	oldcontext = MemoryContextSwitchTo(state->maincontext);
+
+	/*
+	 * Caller tuple (e.g. IndexTuple) memory context.
+	 *
+	 * A dedicated child context used exclusively for caller passed tuples
+	 * eases memory management. Resetting at key points reduces
+	 * fragmentation. Note that the memtuples array of SortTuples is allocated
+	 * in the parent context, not this context, because there is no need to
+	 * free memtuples early.
+	 */
+	state->tuplecontext = AllocSetContextCreate(state->sortcontext,
+												"Caller tuples",
+												ALLOCSET_DEFAULT_SIZES);
+
+	state->status = TSS_INITIAL;
+	state->bounded = false;
+	state->boundUsed = false;
+
+	state->availMem = state->allowedMem;
+
+	state->tapeset = NULL;
+
+	state->memtupcount = 0;
+
+	/*
+	 * Initial size of array must be more than ALLOCSET_SEPARATE_THRESHOLD;
+	 * see comments in grow_memtuples().
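
With the default ALLOCSET_SEPARATE_THRESHOLD of 8 kB and a SortTuple of a few tens of bytes, the INITIAL_MEMTUPSIZE computation defined earlier in this file lands on 1024 entries. A quick standalone check (the sizeof value is an assumption for a typical 64-bit build, not taken from the headers):

    #include <stdio.h>

    #define ALLOCSET_SEPARATE_THRESHOLD 8192   /* value from memutils.h */
    #define Max(x, y) ((x) > (y) ? (x) : (y))

    int main(void)
    {
        size_t sort_tuple_size = 24;   /* rough sizeof(SortTuple), 64-bit */
        long initial = Max(1024,
                           ALLOCSET_SEPARATE_THRESHOLD / sort_tuple_size + 1);

        /* 8192 / 24 + 1 = 342, so Max() picks 1024 */
        printf("initial memtuples entries: %ld\n", initial);
        return 0;
    }

The threshold term only wins for unusually small SortTuple sizes; its purpose is to guarantee the array starts above the allocator's separate-block threshold so grow_memtuples() behaves as its comments assume.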
+ */ + state->growmemtuples = true; + state->slabAllocatorUsed = false; + if (state->memtuples != NULL && state->memtupsize != INITIAL_MEMTUPSIZE) + { + pfree(state->memtuples); + state->memtuples = NULL; + state->memtupsize = INITIAL_MEMTUPSIZE; + } + if (state->memtuples == NULL) + { + state->memtuples = (SortTuple *) palloc(state->memtupsize * sizeof(SortTuple)); + USEMEM(state, GetMemoryChunkSpace(state->memtuples)); + } + + /* workMem must be large enough for the minimal memtuples array */ + if (LACKMEM(state)) + elog(ERROR, "insufficient memory allowed for sort"); + + state->currentRun = 0; + + /* + * maxTapes, tapeRange, and Algorithm D variables will be initialized by + * inittapes(), if needed + */ + + state->result_tape = -1; /* flag that result tape has not been formed */ + + MemoryContextSwitchTo(oldcontext); +} + Tuplesortstate * tuplesort_begin_heap(TupleDesc tupDesc, int nkeys, AttrNumber *attNums, @@ -814,7 +886,7 @@ tuplesort_begin_heap(TupleDesc tupDesc, MemoryContext oldcontext; int i; - oldcontext = MemoryContextSwitchTo(state->sortcontext); + oldcontext = MemoryContextSwitchTo(state->maincontext); AssertArg(nkeys > 0); @@ -890,7 +962,7 @@ tuplesort_begin_cluster(TupleDesc tupDesc, Assert(indexRel->rd_rel->relam == BTREE_AM_OID); - oldcontext = MemoryContextSwitchTo(state->sortcontext); + oldcontext = MemoryContextSwitchTo(state->maincontext); #ifdef TRACE_SORT if (trace_sort) @@ -985,7 +1057,7 @@ tuplesort_begin_index_btree(Relation heapRel, MemoryContext oldcontext; int i; - oldcontext = MemoryContextSwitchTo(state->sortcontext); + oldcontext = MemoryContextSwitchTo(state->maincontext); #ifdef TRACE_SORT if (trace_sort) @@ -1063,7 +1135,7 @@ tuplesort_begin_index_hash(Relation heapRel, randomAccess); MemoryContext oldcontext; - oldcontext = MemoryContextSwitchTo(state->sortcontext); + oldcontext = MemoryContextSwitchTo(state->maincontext); #ifdef TRACE_SORT if (trace_sort) @@ -1106,7 +1178,7 @@ tuplesort_begin_datum(Oid datumType, Oid sortOperator, Oid sortCollation, int16 typlen; bool typbyval; - oldcontext = MemoryContextSwitchTo(state->sortcontext); + oldcontext = MemoryContextSwitchTo(state->maincontext); #ifdef TRACE_SORT if (trace_sort) @@ -1224,16 +1296,23 @@ tuplesort_set_bound(Tuplesortstate *state, int64 bound) } /* - * tuplesort_end + * tuplesort_used_bound * - * Release resources and clean up. + * Allow callers to find out if the sort state was able to use a bound. + */ +bool +tuplesort_used_bound(Tuplesortstate *state) +{ + return state->boundUsed; +} + +/* + * tuplesort_free * - * NOTE: after calling this, any pointers returned by tuplesort_getXXX are - * pointing to garbage. Be careful not to attempt to use or free such - * pointers afterwards! + * Internal routine for freeing resources of tuplesort. */ -void -tuplesort_end(Tuplesortstate *state) +static void +tuplesort_free(Tuplesortstate *state) { /* context swap probably not needed, but let's be safe */ MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); @@ -1291,10 +1370,104 @@ tuplesort_end(Tuplesortstate *state) MemoryContextSwitchTo(oldcontext); /* - * Free the per-sort memory context, thereby releasing all working memory, - * including the Tuplesortstate struct itself. + * Free the per-sort memory context, thereby releasing all working memory. */ - MemoryContextDelete(state->sortcontext); + MemoryContextReset(state->sortcontext); +} + +/* + * tuplesort_end + * + * Release resources and clean up. 
+ * + * NOTE: after calling this, any pointers returned by tuplesort_getXXX are + * pointing to garbage. Be careful not to attempt to use or free such + * pointers afterwards! + */ +void +tuplesort_end(Tuplesortstate *state) +{ + tuplesort_free(state); + + /* + * Free the main memory context, including the Tuplesortstate struct + * itself. + */ + MemoryContextDelete(state->maincontext); +} + +/* + * tuplesort_updatemax + * + * Update maximum resource usage statistics. + */ +static void +tuplesort_updatemax(Tuplesortstate *state) +{ + int64 spaceUsed; + bool isSpaceDisk; + + /* + * Note: it might seem we should provide both memory and disk usage for a + * disk-based sort. However, the current code doesn't track memory space + * accurately once we have begun to return tuples to the caller (since we + * don't account for pfree's the caller is expected to do), so we cannot + * rely on availMem in a disk sort. This does not seem worth the overhead + * to fix. Is it worth creating an API for the memory context code to + * tell us how much is actually used in sortcontext? + */ + if (state->tapeset) + { + isSpaceDisk = true; + spaceUsed = LogicalTapeSetBlocks(state->tapeset) * BLCKSZ; + } + else + { + isSpaceDisk = false; + spaceUsed = state->allowedMem - state->availMem; + } + + /* + * Sort evicts data to the disk when it didn't manage to fit those data to + * the main memory. This is why we assume space used on the disk to be + * more important for tracking resource usage than space used in memory. + * Note that amount of space occupied by some tuple set on the disk might + * be less than amount of space occupied by the same tuple set in the + * memory due to more compact representation. + */ + if ((isSpaceDisk && !state->isMaxSpaceDisk) || + (isSpaceDisk == state->isMaxSpaceDisk && spaceUsed > state->maxSpace)) + { + state->maxSpace = spaceUsed; + state->isMaxSpaceDisk = isSpaceDisk; + state->maxSpaceStatus = state->status; + } +} + +/* + * tuplesort_reset + * + * Reset the tuplesort. Reset all the data in the tuplesort, but leave the + * meta-information in. After tuplesort_reset, tuplesort is ready to start + * a new sort. This allows avoiding recreation of tuple sort states (and + * save resources) when sorting multiple small batches. + */ +void +tuplesort_reset(Tuplesortstate *state) +{ + tuplesort_updatemax(state); + tuplesort_free(state); + + /* + * After we've freed up per-batch memory, re-setup all of the state common + * to both the first batch and any subsequent batch. + */ + tuplesort_begin_batch(state); + + state->lastReturnedTuple = NULL; + state->slabMemoryBegin = NULL; + state->slabMemoryEnd = NULL; + state->slabFreeHead = NULL; } /* @@ -2591,8 +2764,7 @@ mergeruns(Tuplesortstate *state) * Reset tuple memory. We've freed all the tuples that we previously * allocated. We will use the slab allocator from now on. */ - MemoryContextDelete(state->tuplecontext); - state->tuplecontext = NULL; + MemoryContextResetOnly(state->tuplecontext); /* * We no longer need a large memtuples array. (We will allocate a smaller @@ -2642,7 +2814,8 @@ mergeruns(Tuplesortstate *state) * from each input tape. */ state->memtupsize = numInputTapes; - state->memtuples = (SortTuple *) palloc(numInputTapes * sizeof(SortTuple)); + state->memtuples = (SortTuple *) MemoryContextAlloc(state->maincontext, + numInputTapes * sizeof(SortTuple)); USEMEM(state, GetMemoryChunkSpace(state->memtuples)); /* @@ -3138,18 +3311,15 @@ tuplesort_get_stats(Tuplesortstate *state, * to fix. 
Is it worth creating an API for the memory context code to * tell us how much is actually used in sortcontext? */ - if (state->tapeset) - { + tuplesort_updatemax(state); + + if (state->isMaxSpaceDisk) stats->spaceType = SORT_SPACE_TYPE_DISK; - stats->spaceUsed = LogicalTapeSetBlocks(state->tapeset) * (BLCKSZ / 1024); - } else - { stats->spaceType = SORT_SPACE_TYPE_MEMORY; - stats->spaceUsed = (state->allowedMem - state->availMem + 1023) / 1024; - } + stats->spaceUsed = (state->maxSpace + 1023) / 1024; - switch (state->status) + switch (state->maxSpaceStatus) { case TSS_SORTEDINMEM: if (state->boundUsed) diff --git a/src/include/executor/execdebug.h b/src/include/executor/execdebug.h index 2e9920111f..4af6e0013d 100644 --- a/src/include/executor/execdebug.h +++ b/src/include/executor/execdebug.h @@ -86,10 +86,12 @@ #define SO_nodeDisplay(l) nodeDisplay(l) #define SO_printf(s) printf(s) #define SO1_printf(s, p) printf(s, p) +#define SO2_printf(s, p1, p2) printf(s, p1, p2) #else #define SO_nodeDisplay(l) #define SO_printf(s) #define SO1_printf(s, p) +#define SO2_printf(s, p1, p2) #endif /* EXEC_SORTDEBUG */ /* ---------------- diff --git a/src/include/executor/nodeIncrementalSort.h b/src/include/executor/nodeIncrementalSort.h new file mode 100644 index 0000000000..e62c02a4f3 --- /dev/null +++ b/src/include/executor/nodeIncrementalSort.h @@ -0,0 +1,28 @@ +/*------------------------------------------------------------------------- + * + * nodeIncrementalSort.h + * + * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/executor/nodeIncrementalSort.h + * + *------------------------------------------------------------------------- + */ +#ifndef NODEINCREMENTALSORT_H +#define NODEINCREMENTALSORT_H + +#include "access/parallel.h" +#include "nodes/execnodes.h" + +extern IncrementalSortState *ExecInitIncrementalSort(IncrementalSort *node, EState *estate, int eflags); +extern void ExecEndIncrementalSort(IncrementalSortState *node); +extern void ExecReScanIncrementalSort(IncrementalSortState *node); + +/* parallel instrumentation support */ +extern void ExecIncrementalSortEstimate(IncrementalSortState *node, ParallelContext *pcxt); +extern void ExecIncrementalSortInitializeDSM(IncrementalSortState *node, ParallelContext *pcxt); +extern void ExecIncrementalSortInitializeWorker(IncrementalSortState *node, ParallelWorkerContext *pcxt); +extern void ExecIncrementalSortRetrieveInstrumentation(IncrementalSortState *node); + +#endif /* NODEINCREMENTALSORT_H */ diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 3d27d50f09..6127ab5912 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -1980,6 +1980,21 @@ typedef struct MaterialState Tuplestorestate *tuplestorestate; } MaterialState; + +/* ---------------- + * When performing sorting by multiple keys, it's possible that the input + * dataset is already sorted on a prefix of those keys. We call these + * "presorted keys". + * PresortedKeyData represents information about one such key. 
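
A PresortedKeyData (below) caches, per presorted column, everything needed to test prefix-group membership: the comparison function lookup happens once at node startup, and only the call itself happens per tuple. The shape of that per-key check can be sketched with plain function pointers standing in for FmgrInfo/FunctionCallInfo; the types and names here are hypothetical, for illustration only:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    typedef bool (*EqualFn) (intptr_t a, intptr_t b);

    typedef struct PresortedKey
    {
        EqualFn equal;      /* stands in for flinfo/fcinfo */
        int     attno;      /* column position within the tuple */
    } PresortedKey;

    static bool int_equal(intptr_t a, intptr_t b) { return a == b; }

    /* true when "tup" still belongs to the pivot's prefix-key group */
    static bool
    is_current_group(const PresortedKey *keys, int nkeys,
                     const intptr_t *pivot, const intptr_t *tup)
    {
        for (int i = 0; i < nkeys; i++)
            if (!keys[i].equal(pivot[keys[i].attno], tup[keys[i].attno]))
                return false;
        return true;
    }

    int main(void)
    {
        PresortedKey keys[] = {{int_equal, 0}};
        intptr_t pivot[] = {1, 9};
        intptr_t tup[] = {1, 3};

        printf("%d\n", is_current_group(keys, 1, pivot, tup)); /* 1 */
        return 0;
    }

In the executor the same loop runs against the group_pivot slot for every incoming tuple, which is why the patch's cost model charges one extra comparison per input tuple.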
+ * ----------------
+ */
+typedef struct PresortedKeyData
+{
+	FmgrInfo	flinfo;			/* comparison function info */
+	FunctionCallInfo fcinfo;	/* comparison function call info */
+	OffsetNumber attno;			/* attribute number in tuple */
+} PresortedKeyData;
+
 /* ----------------
  *	 Shared memory container for per-worker sort information
  * ----------------
@@ -2008,6 +2023,71 @@ typedef struct SortState
 	SharedSortInfo *shared_info;	/* one entry per worker */
 } SortState;
 
+/* ----------------
+ *	 Instrumentation information for IncrementalSort
+ * ----------------
+ */
+typedef struct IncrementalSortGroupInfo
+{
+	int64		groupCount;
+	long		maxDiskSpaceUsed;
+	long		totalDiskSpaceUsed;
+	long		maxMemorySpaceUsed;
+	long		totalMemorySpaceUsed;
+	List	   *sortMethods;
+} IncrementalSortGroupInfo;
+
+typedef struct IncrementalSortInfo
+{
+	IncrementalSortGroupInfo fullsortGroupInfo;
+	IncrementalSortGroupInfo prefixsortGroupInfo;
+} IncrementalSortInfo;
+
+/* ----------------
+ *	 Shared memory container for per-worker incremental sort information
+ * ----------------
+ */
+typedef struct SharedIncrementalSortInfo
+{
+	int			num_workers;
+	IncrementalSortInfo sinfo[FLEXIBLE_ARRAY_MEMBER];
+} SharedIncrementalSortInfo;
+
+/* ----------------
+ *	 IncrementalSortState information
+ * ----------------
+ */
+typedef enum
+{
+	INCSORT_LOADFULLSORT,
+	INCSORT_LOADPREFIXSORT,
+	INCSORT_READFULLSORT,
+	INCSORT_READPREFIXSORT
+} IncrementalSortExecutionStatus;
+
+typedef struct IncrementalSortState
+{
+	ScanState	ss;				/* its first field is NodeTag */
+	bool		bounded;		/* is the result set bounded? */
+	int64		bound;			/* if bounded, how many tuples are needed */
+	bool		outerNodeDone;	/* finished fetching tuples from outer node */
+	int64		bound_Done;		/* value of bound we did the sort with */
+	IncrementalSortExecutionStatus execution_status;
+	int64		n_fullsort_remaining;
+	Tuplesortstate *fullsort_state; /* private state of tuplesort.c */
+	Tuplesortstate *prefixsort_state;	/* private state of tuplesort.c */
+	/* the keys by which the input path is already sorted */
+	PresortedKeyData *presorted_keys;
+
+	IncrementalSortInfo incsort_info;
+
+	/* slot for pivot tuple defining values of presorted keys within group */
+	TupleTableSlot *group_pivot;
+	TupleTableSlot *transfer_tuple;
+	bool		am_worker;		/* are we a worker?
*/ + SharedIncrementalSortInfo *shared_info; /* one entry per worker */ +} IncrementalSortState; + /* --------------------- * GroupState information * --------------------- diff --git a/src/include/nodes/nodes.h b/src/include/nodes/nodes.h index 8a76afe8cc..50b1ba5186 100644 --- a/src/include/nodes/nodes.h +++ b/src/include/nodes/nodes.h @@ -74,6 +74,7 @@ typedef enum NodeTag T_HashJoin, T_Material, T_Sort, + T_IncrementalSort, T_Group, T_Agg, T_WindowAgg, @@ -130,6 +131,7 @@ typedef enum NodeTag T_HashJoinState, T_MaterialState, T_SortState, + T_IncrementalSortState, T_GroupState, T_AggState, T_WindowAggState, @@ -245,6 +247,7 @@ typedef enum NodeTag T_ProjectionPath, T_ProjectSetPath, T_SortPath, + T_IncrementalSortPath, T_GroupPath, T_UpperUniquePath, T_AggPath, diff --git a/src/include/nodes/pathnodes.h b/src/include/nodes/pathnodes.h index 0ceb809644..28d580dd3c 100644 --- a/src/include/nodes/pathnodes.h +++ b/src/include/nodes/pathnodes.h @@ -1620,6 +1620,15 @@ typedef struct SortPath Path *subpath; /* path representing input source */ } SortPath; +/* + * IncrementalSortPath + */ +typedef struct IncrementalSortPath +{ + SortPath spath; + int presortedCols; /* number of presorted columns */ +} IncrementalSortPath; + /* * GroupPath represents grouping (of presorted input) * diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h index 4869fe7b6d..136d794219 100644 --- a/src/include/nodes/plannodes.h +++ b/src/include/nodes/plannodes.h @@ -774,6 +774,16 @@ typedef struct Sort bool *nullsFirst; /* NULLS FIRST/LAST directions */ } Sort; +/* ---------------- + * incremental sort node + * ---------------- + */ +typedef struct IncrementalSort +{ + Sort sort; + int presortedCols; /* number of presorted columns */ +} IncrementalSort; + /* --------------- * group node - * Used for queries with GROUP BY (but no aggregates) specified. 
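
The reason the copy/out/read helpers earlier in the patch (CopySortFields, _outSortInfo, ReadCommonSort) can be shared between Sort and the new IncrementalSort node is PostgreSQL's node convention: "inheritance" is first-member struct embedding, so a pointer to the derived node is also a valid pointer to its base. Cut-down types for illustration only:

    #include <stdio.h>

    typedef struct Plan { int plan_node_id; } Plan;
    typedef struct Sort { Plan plan; int numCols; } Sort;
    typedef struct IncrementalSort
    {
        Sort sort;              /* base node comes first */
        int  presortedCols;     /* the one field IncrementalSort adds */
    } IncrementalSort;

    /* a helper written against the base type serves all derived nodes */
    static void out_sort_info(const Sort *node)
    {
        printf("numCols %d\n", node->numCols);
    }

    int main(void)
    {
        IncrementalSort node = {{{42}, 2}, 1};

        out_sort_info((const Sort *) &node);    /* upcast is just a cast */
        printf("presortedCols %d\n", node.presortedCols);
        return 0;
    }

This is why _copyIncrementalSort and _readIncrementalSort above only need to handle presortedCols themselves and delegate everything else to the shared Sort routines.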
diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h index 735ba09650..5725b4828e 100644 --- a/src/include/optimizer/cost.h +++ b/src/include/optimizer/cost.h @@ -53,6 +53,7 @@ extern PGDLLIMPORT bool enable_indexonlyscan; extern PGDLLIMPORT bool enable_bitmapscan; extern PGDLLIMPORT bool enable_tidscan; extern PGDLLIMPORT bool enable_sort; +extern PGDLLIMPORT bool enable_incrementalsort; extern PGDLLIMPORT bool enable_hashagg; extern PGDLLIMPORT bool enable_hashagg_disk; extern PGDLLIMPORT bool enable_groupingsets_hash_disk; @@ -103,6 +104,15 @@ extern void cost_sort(Path *path, PlannerInfo *root, List *pathkeys, Cost input_cost, double tuples, int width, Cost comparison_cost, int sort_mem, double limit_tuples); +extern void cost_full_sort(Cost *startup_cost, Cost *run_cost, + Cost input_total_cost, double tuples, int width, + Cost comparison_cost, int sort_mem, + double limit_tuples); +extern void cost_incremental_sort(Path *path, + PlannerInfo *root, List *pathkeys, int presorted_keys, + Cost input_startup_cost, Cost input_total_cost, + double input_tuples, int width, Cost comparison_cost, int sort_mem, + double limit_tuples); extern void cost_append(AppendPath *path); extern void cost_merge_append(Path *path, PlannerInfo *root, List *pathkeys, int n_streams, diff --git a/src/include/optimizer/pathnode.h b/src/include/optimizer/pathnode.h index e450fe112a..bcd08af753 100644 --- a/src/include/optimizer/pathnode.h +++ b/src/include/optimizer/pathnode.h @@ -184,6 +184,12 @@ extern ProjectSetPath *create_set_projection_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath, PathTarget *target); +extern SortPath *create_incremental_sort_path(PlannerInfo *root, + RelOptInfo *rel, + Path *subpath, + List *pathkeys, + int presorted_keys, + double limit_tuples); extern SortPath *create_sort_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath, diff --git a/src/include/optimizer/paths.h b/src/include/optimizer/paths.h index 9ab73bd20c..85f5fe37ea 100644 --- a/src/include/optimizer/paths.h +++ b/src/include/optimizer/paths.h @@ -188,6 +188,8 @@ typedef enum extern PathKeysComparison compare_pathkeys(List *keys1, List *keys2); extern bool pathkeys_contained_in(List *keys1, List *keys2); +extern bool pathkeys_common_contained_in(List *keys1, List *keys2, int *n_common); +extern int pathkeys_common(List *keys1, List *keys2); extern Path *get_cheapest_path_for_pathkeys(List *paths, List *pathkeys, Relids required_outer, CostSelector cost_criterion, diff --git a/src/include/utils/tuplesort.h b/src/include/utils/tuplesort.h index a2fdd3fcd3..0e9ab4e586 100644 --- a/src/include/utils/tuplesort.h +++ b/src/include/utils/tuplesort.h @@ -215,6 +215,7 @@ extern Tuplesortstate *tuplesort_begin_datum(Oid datumType, bool randomAccess); extern void tuplesort_set_bound(Tuplesortstate *state, int64 bound); +extern bool tuplesort_used_bound(Tuplesortstate *state); extern void tuplesort_puttupleslot(Tuplesortstate *state, TupleTableSlot *slot); @@ -239,6 +240,8 @@ extern bool tuplesort_skiptuples(Tuplesortstate *state, int64 ntuples, extern void tuplesort_end(Tuplesortstate *state); +extern void tuplesort_reset(Tuplesortstate *state); + extern void tuplesort_get_stats(Tuplesortstate *state, TuplesortInstrumentation *stats); extern const char *tuplesort_method_name(TuplesortMethod m); diff --git a/src/test/isolation/expected/drop-index-concurrently-1.out b/src/test/isolation/expected/drop-index-concurrently-1.out index 75dff56bc4..8e6adb66bb 100644 --- 
a/src/test/isolation/expected/drop-index-concurrently-1.out +++ b/src/test/isolation/expected/drop-index-concurrently-1.out @@ -21,7 +21,7 @@ QUERY PLAN Sort Sort Key: id, data - -> Seq Scan on test_dc + -> Index Scan using test_dc_pkey on test_dc Filter: ((data)::text = '34'::text) step select2: SELECT * FROM test_dc WHERE data=34 ORDER BY id,data; id data diff --git a/src/test/regress/expected/incremental_sort.out b/src/test/regress/expected/incremental_sort.out new file mode 100644 index 0000000000..ebb8412237 --- /dev/null +++ b/src/test/regress/expected/incremental_sort.out @@ -0,0 +1,1400 @@ +-- When we have to sort the entire table, incremental sort will +-- be slower than plain sort, so it should not be used. +explain (costs off) +select * from (select * from tenk1 order by four) t order by four, ten; + QUERY PLAN +----------------------------------- + Sort + Sort Key: tenk1.four, tenk1.ten + -> Sort + Sort Key: tenk1.four + -> Seq Scan on tenk1 +(5 rows) + +-- When there is a LIMIT clause, incremental sort is beneficial because +-- it only has to sort some of the groups, and not the entire table. +explain (costs off) +select * from (select * from tenk1 order by four) t order by four, ten +limit 1; + QUERY PLAN +----------------------------------------- + Limit + -> Incremental Sort + Sort Key: tenk1.four, tenk1.ten + Presorted Key: tenk1.four + -> Sort + Sort Key: tenk1.four + -> Seq Scan on tenk1 +(7 rows) + +-- When work_mem is not enough to sort the entire table, incremental sort +-- may be faster if individual groups still fit into work_mem. +set work_mem to '2MB'; +explain (costs off) +select * from (select * from tenk1 order by four) t order by four, ten; + QUERY PLAN +----------------------------------- + Incremental Sort + Sort Key: tenk1.four, tenk1.ten + Presorted Key: tenk1.four + -> Sort + Sort Key: tenk1.four + -> Seq Scan on tenk1 +(6 rows) + +reset work_mem; +create table t(a integer, b integer); +create or replace function explain_analyze_without_memory(query text) +returns table (out_line text) language plpgsql +as +$$ +declare + line text; +begin + for line in + execute 'explain (analyze, costs off, summary off, timing off) ' || query + loop + out_line := regexp_replace(line, '\d+kB', 'NNkB', 'g'); + return next; + end loop; +end; +$$; +create or replace function explain_analyze_inc_sort_nodes(query text) +returns jsonb language plpgsql +as +$$ +declare + elements jsonb; + element jsonb; + matching_nodes jsonb := '[]'::jsonb; +begin + execute 'explain (analyze, costs off, summary off, timing off, format ''json'') ' || query into strict elements; + while jsonb_array_length(elements) > 0 loop + element := elements->0; + elements := elements - 0; + case jsonb_typeof(element) + when 'array' then + if jsonb_array_length(element) > 0 then + elements := elements || element; + end if; + when 'object' then + if element ? 'Plan' then + elements := elements || jsonb_build_array(element->'Plan'); + element := element - 'Plan'; + else + if element ? 
'Plans' then + elements := elements || jsonb_build_array(element->'Plans'); + element := element - 'Plans'; + end if; + if (element->>'Node Type')::text = 'Incremental Sort' then + matching_nodes := matching_nodes || element; + end if; + end if; + end case; + end loop; + return matching_nodes; +end; +$$; +create or replace function explain_analyze_inc_sort_nodes_without_memory(query text) +returns jsonb language plpgsql +as +$$ +declare + nodes jsonb := '[]'::jsonb; + node jsonb; + group_key text; + space_key text; +begin + for node in select * from jsonb_array_elements(explain_analyze_inc_sort_nodes(query)) t loop + for group_key in select unnest(array['Full-sort Groups', 'Presorted Groups']::text[]) t loop + for space_key in select unnest(array['Sort Space Memory', 'Sort Space Disk']::text[]) t loop + node := jsonb_set(node, array[group_key, space_key, 'Average Sort Space Used'], '"NN"', false); + node := jsonb_set(node, array[group_key, space_key, 'Maximum Sort Space Used'], '"NN"', false); + end loop; + end loop; + nodes := nodes || node; + end loop; + return nodes; +end; +$$; +create or replace function explain_analyze_inc_sort_nodes_verify_invariants(query text) +returns bool language plpgsql +as +$$ +declare + node jsonb; + group_stats jsonb; + group_key text; + space_key text; +begin + for node in select * from jsonb_array_elements(explain_analyze_inc_sort_nodes(query)) t loop + for group_key in select unnest(array['Full-sort Groups', 'Presorted Groups']::text[]) t loop + group_stats := node->group_key; + for space_key in select unnest(array['Sort Space Memory', 'Sort Space Disk']::text[]) t loop + if (group_stats->space_key->'Maximum Sort Space Used')::bigint < (group_stats->space_key->'Average Sort Space Used')::bigint then + raise exception '% has invalid max space < average space', group_key; + end if; + end loop; + end loop; + end loop; + return true; +end; +$$; +-- A single large group tested around each mode transition point.
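The LIMIT values in the tests that follow (31/32/33 and 65/66) bracket the points where the node's hybrid strategy changes behavior: full-sort mode first accumulates a minimum batch of tuples (32 in this patch's heuristic) before deciding whether to keep sorting on all columns or to switch to loading one prefix-key group at a time and sorting only the suffix columns. A rough sketch of the single-group loading loop is below; it shows how the tuplesort_reset() function added by this patch lets one tuplesort be recycled per group. Names such as outerNode, nPresortedCols, and isCurrentGroup are illustrative, and bookkeeping like carrying the boundary tuple over via transfer_tuple is elided.

    /*
     * Sketch: prefix-sort mode loads exactly one prefix-key group, sorts
     * only the suffix columns, then recycles the tuplesort for the next
     * group instead of building a fresh one.
     */
    for (;;)
    {
        TupleTableSlot *slot = ExecProcNode(outerNode);

        if (TupIsNull(slot))
        {
            node->outerNodeDone = true;
            break;
        }

        /*
         * A tuple that no longer matches the pivot starts the next group;
         * the real code stashes it in transfer_tuple so it is not lost.
         */
        if (!isCurrentGroup(node, nPresortedCols, node->group_pivot, slot))
        {
            ExecCopySlot(node->group_pivot, slot);
            break;
        }

        tuplesort_puttupleslot(node->prefixsort_state, slot);
    }

    tuplesort_performsort(node->prefixsort_state);
    node->execution_status = INCSORT_READPREFIXSORT;
    /* ... caller emits the sorted group, then reuses the sort: ... */
    tuplesort_reset(node->prefixsort_state);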
+insert into t(a, b) select 1, i from generate_series(1, 100) n(i); +explain (costs off) select * from (select * from t order by a) s order by a, b limit 31; + QUERY PLAN +--------------------------------- + Limit + -> Incremental Sort + Sort Key: t.a, t.b + Presorted Key: t.a + -> Sort + Sort Key: t.a + -> Seq Scan on t +(7 rows) + +select * from (select * from t order by a) s order by a, b limit 31; + a | b +---+---- + 1 | 1 + 1 | 2 + 1 | 3 + 1 | 4 + 1 | 5 + 1 | 6 + 1 | 7 + 1 | 8 + 1 | 9 + 1 | 10 + 1 | 11 + 1 | 12 + 1 | 13 + 1 | 14 + 1 | 15 + 1 | 16 + 1 | 17 + 1 | 18 + 1 | 19 + 1 | 20 + 1 | 21 + 1 | 22 + 1 | 23 + 1 | 24 + 1 | 25 + 1 | 26 + 1 | 27 + 1 | 28 + 1 | 29 + 1 | 30 + 1 | 31 +(31 rows) + +explain (costs off) select * from (select * from t order by a) s order by a, b limit 32; + QUERY PLAN +--------------------------------- + Limit + -> Incremental Sort + Sort Key: t.a, t.b + Presorted Key: t.a + -> Sort + Sort Key: t.a + -> Seq Scan on t +(7 rows) + +select * from (select * from t order by a) s order by a, b limit 32; + a | b +---+---- + 1 | 1 + 1 | 2 + 1 | 3 + 1 | 4 + 1 | 5 + 1 | 6 + 1 | 7 + 1 | 8 + 1 | 9 + 1 | 10 + 1 | 11 + 1 | 12 + 1 | 13 + 1 | 14 + 1 | 15 + 1 | 16 + 1 | 17 + 1 | 18 + 1 | 19 + 1 | 20 + 1 | 21 + 1 | 22 + 1 | 23 + 1 | 24 + 1 | 25 + 1 | 26 + 1 | 27 + 1 | 28 + 1 | 29 + 1 | 30 + 1 | 31 + 1 | 32 +(32 rows) + +explain (costs off) select * from (select * from t order by a) s order by a, b limit 33; + QUERY PLAN +--------------------------------- + Limit + -> Incremental Sort + Sort Key: t.a, t.b + Presorted Key: t.a + -> Sort + Sort Key: t.a + -> Seq Scan on t +(7 rows) + +select * from (select * from t order by a) s order by a, b limit 33; + a | b +---+---- + 1 | 1 + 1 | 2 + 1 | 3 + 1 | 4 + 1 | 5 + 1 | 6 + 1 | 7 + 1 | 8 + 1 | 9 + 1 | 10 + 1 | 11 + 1 | 12 + 1 | 13 + 1 | 14 + 1 | 15 + 1 | 16 + 1 | 17 + 1 | 18 + 1 | 19 + 1 | 20 + 1 | 21 + 1 | 22 + 1 | 23 + 1 | 24 + 1 | 25 + 1 | 26 + 1 | 27 + 1 | 28 + 1 | 29 + 1 | 30 + 1 | 31 + 1 | 32 + 1 | 33 +(33 rows) + +explain (costs off) select * from (select * from t order by a) s order by a, b limit 65; + QUERY PLAN +--------------------------------- + Limit + -> Incremental Sort + Sort Key: t.a, t.b + Presorted Key: t.a + -> Sort + Sort Key: t.a + -> Seq Scan on t +(7 rows) + +select * from (select * from t order by a) s order by a, b limit 65; + a | b +---+---- + 1 | 1 + 1 | 2 + 1 | 3 + 1 | 4 + 1 | 5 + 1 | 6 + 1 | 7 + 1 | 8 + 1 | 9 + 1 | 10 + 1 | 11 + 1 | 12 + 1 | 13 + 1 | 14 + 1 | 15 + 1 | 16 + 1 | 17 + 1 | 18 + 1 | 19 + 1 | 20 + 1 | 21 + 1 | 22 + 1 | 23 + 1 | 24 + 1 | 25 + 1 | 26 + 1 | 27 + 1 | 28 + 1 | 29 + 1 | 30 + 1 | 31 + 1 | 32 + 1 | 33 + 1 | 34 + 1 | 35 + 1 | 36 + 1 | 37 + 1 | 38 + 1 | 39 + 1 | 40 + 1 | 41 + 1 | 42 + 1 | 43 + 1 | 44 + 1 | 45 + 1 | 46 + 1 | 47 + 1 | 48 + 1 | 49 + 1 | 50 + 1 | 51 + 1 | 52 + 1 | 53 + 1 | 54 + 1 | 55 + 1 | 56 + 1 | 57 + 1 | 58 + 1 | 59 + 1 | 60 + 1 | 61 + 1 | 62 + 1 | 63 + 1 | 64 + 1 | 65 +(65 rows) + +explain (costs off) select * from (select * from t order by a) s order by a, b limit 66; + QUERY PLAN +--------------------------------- + Limit + -> Incremental Sort + Sort Key: t.a, t.b + Presorted Key: t.a + -> Sort + Sort Key: t.a + -> Seq Scan on t +(7 rows) + +select * from (select * from t order by a) s order by a, b limit 66; + a | b +---+---- + 1 | 1 + 1 | 2 + 1 | 3 + 1 | 4 + 1 | 5 + 1 | 6 + 1 | 7 + 1 | 8 + 1 | 9 + 1 | 10 + 1 | 11 + 1 | 12 + 1 | 13 + 1 | 14 + 1 | 15 + 1 | 16 + 1 | 17 + 1 | 18 + 1 | 19 + 1 | 20 + 1 | 21 + 1 | 22 + 1 | 23 + 1 | 24 + 1 | 25 + 1 | 26 + 1 | 27 + 1 | 28 + 1 
| 29 + 1 | 30 + 1 | 31 + 1 | 32 + 1 | 33 + 1 | 34 + 1 | 35 + 1 | 36 + 1 | 37 + 1 | 38 + 1 | 39 + 1 | 40 + 1 | 41 + 1 | 42 + 1 | 43 + 1 | 44 + 1 | 45 + 1 | 46 + 1 | 47 + 1 | 48 + 1 | 49 + 1 | 50 + 1 | 51 + 1 | 52 + 1 | 53 + 1 | 54 + 1 | 55 + 1 | 56 + 1 | 57 + 1 | 58 + 1 | 59 + 1 | 60 + 1 | 61 + 1 | 62 + 1 | 63 + 1 | 64 + 1 | 65 + 1 | 66 +(66 rows) + +delete from t; +-- An initial large group followed by a small group. +insert into t(a, b) select (case when i < 50 then 1 else 2 end), i from generate_series(1, 100) n(i); +explain (costs off) select * from (select * from t order by a) s order by a, b limit 55; + QUERY PLAN +--------------------------------- + Limit + -> Incremental Sort + Sort Key: t.a, t.b + Presorted Key: t.a + -> Sort + Sort Key: t.a + -> Seq Scan on t +(7 rows) + +select * from (select * from t order by a) s order by a, b limit 55; + a | b +---+---- + 1 | 1 + 1 | 2 + 1 | 3 + 1 | 4 + 1 | 5 + 1 | 6 + 1 | 7 + 1 | 8 + 1 | 9 + 1 | 10 + 1 | 11 + 1 | 12 + 1 | 13 + 1 | 14 + 1 | 15 + 1 | 16 + 1 | 17 + 1 | 18 + 1 | 19 + 1 | 20 + 1 | 21 + 1 | 22 + 1 | 23 + 1 | 24 + 1 | 25 + 1 | 26 + 1 | 27 + 1 | 28 + 1 | 29 + 1 | 30 + 1 | 31 + 1 | 32 + 1 | 33 + 1 | 34 + 1 | 35 + 1 | 36 + 1 | 37 + 1 | 38 + 1 | 39 + 1 | 40 + 1 | 41 + 1 | 42 + 1 | 43 + 1 | 44 + 1 | 45 + 1 | 46 + 1 | 47 + 1 | 48 + 1 | 49 + 2 | 50 + 2 | 51 + 2 | 52 + 2 | 53 + 2 | 54 + 2 | 55 +(55 rows) + +-- Test EXPLAIN ANALYZE with only a fullsort group. +select explain_analyze_without_memory('select * from (select * from t order by a) s order by a, b limit 55'); + explain_analyze_without_memory +------------------------------------------------------------------------------------------------- + Limit (actual rows=55 loops=1) + -> Incremental Sort (actual rows=55 loops=1) + Sort Key: t.a, t.b + Presorted Key: t.a + Full-sort Groups: 2 (Methods: quicksort, top-N heapsort) Memory: NNkB (avg), NNkB (max) + -> Sort (actual rows=100 loops=1) + Sort Key: t.a + Sort Method: quicksort Memory: NNkB + -> Seq Scan on t (actual rows=100 loops=1) +(9 rows) + +select jsonb_pretty(explain_analyze_inc_sort_nodes_without_memory('select * from (select * from t order by a) s order by a, b limit 55')); + jsonb_pretty +-------------------------------------------------- + [ + + { + + "Sort Key": [ + + "t.a", + + "t.b" + + ], + + "Node Type": "Incremental Sort", + + "Actual Rows": 55, + + "Actual Loops": 1, + + "Presorted Key": [ + + "t.a" + + ], + + "Parallel Aware": false, + + "Full-sort Groups": { + + "Group Count": 2, + + "Sort Methods Used": [ + + "quicksort", + + "top-N heapsort" + + ], + + "Sort Space Memory": { + + "Average Sort Space Used": "NN",+ + "Maximum Sort Space Used": "NN" + + } + + }, + + "Parent Relationship": "Outer" + + } + + ] +(1 row) + +select explain_analyze_inc_sort_nodes_verify_invariants('select * from (select * from t order by a) s order by a, b limit 55'); + explain_analyze_inc_sort_nodes_verify_invariants +-------------------------------------------------- + t +(1 row) + +delete from t; +-- An initial small group followed by a large group. 
+insert into t(a, b) select (case when i < 5 then i else 9 end), i from generate_series(1, 100) n(i); +explain (costs off) select * from (select * from t order by a) s order by a, b limit 70; + QUERY PLAN +--------------------------------- + Limit + -> Incremental Sort + Sort Key: t.a, t.b + Presorted Key: t.a + -> Sort + Sort Key: t.a + -> Seq Scan on t +(7 rows) + +select * from (select * from t order by a) s order by a, b limit 70; + a | b +---+---- + 1 | 1 + 2 | 2 + 3 | 3 + 4 | 4 + 9 | 5 + 9 | 6 + 9 | 7 + 9 | 8 + 9 | 9 + 9 | 10 + 9 | 11 + 9 | 12 + 9 | 13 + 9 | 14 + 9 | 15 + 9 | 16 + 9 | 17 + 9 | 18 + 9 | 19 + 9 | 20 + 9 | 21 + 9 | 22 + 9 | 23 + 9 | 24 + 9 | 25 + 9 | 26 + 9 | 27 + 9 | 28 + 9 | 29 + 9 | 30 + 9 | 31 + 9 | 32 + 9 | 33 + 9 | 34 + 9 | 35 + 9 | 36 + 9 | 37 + 9 | 38 + 9 | 39 + 9 | 40 + 9 | 41 + 9 | 42 + 9 | 43 + 9 | 44 + 9 | 45 + 9 | 46 + 9 | 47 + 9 | 48 + 9 | 49 + 9 | 50 + 9 | 51 + 9 | 52 + 9 | 53 + 9 | 54 + 9 | 55 + 9 | 56 + 9 | 57 + 9 | 58 + 9 | 59 + 9 | 60 + 9 | 61 + 9 | 62 + 9 | 63 + 9 | 64 + 9 | 65 + 9 | 66 + 9 | 67 + 9 | 68 + 9 | 69 + 9 | 70 +(70 rows) + +-- Test rescan. +begin; +-- We force the planner to choose a plan with incremental sort on the right side +-- of a nested loop join node. That way we trigger the rescan code path. +set local enable_hashjoin = off; +set local enable_mergejoin = off; +set local enable_material = off; +set local enable_sort = off; +explain (costs off) select * from t left join (select * from (select * from t order by a) v order by a, b) s on s.a = t.a where t.a in (1, 2); + QUERY PLAN +------------------------------------------------ + Nested Loop Left Join + Join Filter: (t_1.a = t.a) + -> Seq Scan on t + Filter: (a = ANY ('{1,2}'::integer[])) + -> Incremental Sort + Sort Key: t_1.a, t_1.b + Presorted Key: t_1.a + -> Sort + Sort Key: t_1.a + -> Seq Scan on t t_1 +(10 rows) + +select * from t left join (select * from (select * from t order by a) v order by a, b) s on s.a = t.a where t.a in (1, 2); + a | b | a | b +---+---+---+--- + 1 | 1 | 1 | 1 + 2 | 2 | 2 | 2 +(2 rows) + +rollback; +-- Test EXPLAIN ANALYZE with both fullsort and presorted groups. 
+select explain_analyze_without_memory('select * from (select * from t order by a) s order by a, b limit 70'); + explain_analyze_without_memory +--------------------------------------------------------------------------------- + Limit (actual rows=70 loops=1) + -> Incremental Sort (actual rows=70 loops=1) + Sort Key: t.a, t.b + Presorted Key: t.a + Full-sort Groups: 1 (Methods: quicksort) Memory: NNkB (avg), NNkB (max) + Presorted Groups: 5 (Methods: quicksort) Memory: NNkB (avg), NNkB (max) + -> Sort (actual rows=100 loops=1) + Sort Key: t.a + Sort Method: quicksort Memory: NNkB + -> Seq Scan on t (actual rows=100 loops=1) +(10 rows) + +select jsonb_pretty(explain_analyze_inc_sort_nodes_without_memory('select * from (select * from t order by a) s order by a, b limit 70')); + jsonb_pretty +-------------------------------------------------- + [ + + { + + "Sort Key": [ + + "t.a", + + "t.b" + + ], + + "Node Type": "Incremental Sort", + + "Actual Rows": 70, + + "Actual Loops": 1, + + "Presorted Key": [ + + "t.a" + + ], + + "Parallel Aware": false, + + "Full-sort Groups": { + + "Group Count": 1, + + "Sort Methods Used": [ + + "quicksort" + + ], + + "Sort Space Memory": { + + "Average Sort Space Used": "NN",+ + "Maximum Sort Space Used": "NN" + + } + + }, + + "Presorted Groups": { + + "Group Count": 5, + + "Sort Methods Used": [ + + "quicksort" + + ], + + "Sort Space Memory": { + + "Average Sort Space Used": "NN",+ + "Maximum Sort Space Used": "NN" + + } + + }, + + "Parent Relationship": "Outer" + + } + + ] +(1 row) + +select explain_analyze_inc_sort_nodes_verify_invariants('select * from (select * from t order by a) s order by a, b limit 70'); + explain_analyze_inc_sort_nodes_verify_invariants +-------------------------------------------------- + t +(1 row) + +delete from t; +-- Small groups of 10 tuples each tested around each mode transition point. 
+insert into t(a, b) select i / 10, i from generate_series(1, 70) n(i); +explain (costs off) select * from (select * from t order by a) s order by a, b limit 31; + QUERY PLAN +--------------------------------- + Limit + -> Incremental Sort + Sort Key: t.a, t.b + Presorted Key: t.a + -> Sort + Sort Key: t.a + -> Seq Scan on t +(7 rows) + +select * from (select * from t order by a) s order by a, b limit 31; + a | b +---+---- + 0 | 1 + 0 | 2 + 0 | 3 + 0 | 4 + 0 | 5 + 0 | 6 + 0 | 7 + 0 | 8 + 0 | 9 + 1 | 10 + 1 | 11 + 1 | 12 + 1 | 13 + 1 | 14 + 1 | 15 + 1 | 16 + 1 | 17 + 1 | 18 + 1 | 19 + 2 | 20 + 2 | 21 + 2 | 22 + 2 | 23 + 2 | 24 + 2 | 25 + 2 | 26 + 2 | 27 + 2 | 28 + 2 | 29 + 3 | 30 + 3 | 31 +(31 rows) + +explain (costs off) select * from (select * from t order by a) s order by a, b limit 32; + QUERY PLAN +--------------------------------- + Limit + -> Incremental Sort + Sort Key: t.a, t.b + Presorted Key: t.a + -> Sort + Sort Key: t.a + -> Seq Scan on t +(7 rows) + +select * from (select * from t order by a) s order by a, b limit 32; + a | b +---+---- + 0 | 1 + 0 | 2 + 0 | 3 + 0 | 4 + 0 | 5 + 0 | 6 + 0 | 7 + 0 | 8 + 0 | 9 + 1 | 10 + 1 | 11 + 1 | 12 + 1 | 13 + 1 | 14 + 1 | 15 + 1 | 16 + 1 | 17 + 1 | 18 + 1 | 19 + 2 | 20 + 2 | 21 + 2 | 22 + 2 | 23 + 2 | 24 + 2 | 25 + 2 | 26 + 2 | 27 + 2 | 28 + 2 | 29 + 3 | 30 + 3 | 31 + 3 | 32 +(32 rows) + +explain (costs off) select * from (select * from t order by a) s order by a, b limit 33; + QUERY PLAN +--------------------------------- + Limit + -> Incremental Sort + Sort Key: t.a, t.b + Presorted Key: t.a + -> Sort + Sort Key: t.a + -> Seq Scan on t +(7 rows) + +select * from (select * from t order by a) s order by a, b limit 33; + a | b +---+---- + 0 | 1 + 0 | 2 + 0 | 3 + 0 | 4 + 0 | 5 + 0 | 6 + 0 | 7 + 0 | 8 + 0 | 9 + 1 | 10 + 1 | 11 + 1 | 12 + 1 | 13 + 1 | 14 + 1 | 15 + 1 | 16 + 1 | 17 + 1 | 18 + 1 | 19 + 2 | 20 + 2 | 21 + 2 | 22 + 2 | 23 + 2 | 24 + 2 | 25 + 2 | 26 + 2 | 27 + 2 | 28 + 2 | 29 + 3 | 30 + 3 | 31 + 3 | 32 + 3 | 33 +(33 rows) + +explain (costs off) select * from (select * from t order by a) s order by a, b limit 65; + QUERY PLAN +--------------------------------- + Limit + -> Incremental Sort + Sort Key: t.a, t.b + Presorted Key: t.a + -> Sort + Sort Key: t.a + -> Seq Scan on t +(7 rows) + +select * from (select * from t order by a) s order by a, b limit 65; + a | b +---+---- + 0 | 1 + 0 | 2 + 0 | 3 + 0 | 4 + 0 | 5 + 0 | 6 + 0 | 7 + 0 | 8 + 0 | 9 + 1 | 10 + 1 | 11 + 1 | 12 + 1 | 13 + 1 | 14 + 1 | 15 + 1 | 16 + 1 | 17 + 1 | 18 + 1 | 19 + 2 | 20 + 2 | 21 + 2 | 22 + 2 | 23 + 2 | 24 + 2 | 25 + 2 | 26 + 2 | 27 + 2 | 28 + 2 | 29 + 3 | 30 + 3 | 31 + 3 | 32 + 3 | 33 + 3 | 34 + 3 | 35 + 3 | 36 + 3 | 37 + 3 | 38 + 3 | 39 + 4 | 40 + 4 | 41 + 4 | 42 + 4 | 43 + 4 | 44 + 4 | 45 + 4 | 46 + 4 | 47 + 4 | 48 + 4 | 49 + 5 | 50 + 5 | 51 + 5 | 52 + 5 | 53 + 5 | 54 + 5 | 55 + 5 | 56 + 5 | 57 + 5 | 58 + 5 | 59 + 6 | 60 + 6 | 61 + 6 | 62 + 6 | 63 + 6 | 64 + 6 | 65 +(65 rows) + +explain (costs off) select * from (select * from t order by a) s order by a, b limit 66; + QUERY PLAN +--------------------------------- + Limit + -> Incremental Sort + Sort Key: t.a, t.b + Presorted Key: t.a + -> Sort + Sort Key: t.a + -> Seq Scan on t +(7 rows) + +select * from (select * from t order by a) s order by a, b limit 66; + a | b +---+---- + 0 | 1 + 0 | 2 + 0 | 3 + 0 | 4 + 0 | 5 + 0 | 6 + 0 | 7 + 0 | 8 + 0 | 9 + 1 | 10 + 1 | 11 + 1 | 12 + 1 | 13 + 1 | 14 + 1 | 15 + 1 | 16 + 1 | 17 + 1 | 18 + 1 | 19 + 2 | 20 + 2 | 21 + 2 | 22 + 2 | 23 + 2 | 24 + 2 | 25 + 2 | 26 + 2 | 27 + 2 | 28 
+ 2 | 29 + 3 | 30 + 3 | 31 + 3 | 32 + 3 | 33 + 3 | 34 + 3 | 35 + 3 | 36 + 3 | 37 + 3 | 38 + 3 | 39 + 4 | 40 + 4 | 41 + 4 | 42 + 4 | 43 + 4 | 44 + 4 | 45 + 4 | 46 + 4 | 47 + 4 | 48 + 4 | 49 + 5 | 50 + 5 | 51 + 5 | 52 + 5 | 53 + 5 | 54 + 5 | 55 + 5 | 56 + 5 | 57 + 5 | 58 + 5 | 59 + 6 | 60 + 6 | 61 + 6 | 62 + 6 | 63 + 6 | 64 + 6 | 65 + 6 | 66 +(66 rows) + +delete from t; +-- Small groups of only 1 tuple each tested around each mode transition point. +insert into t(a, b) select i, i from generate_series(1, 70) n(i); +explain (costs off) select * from (select * from t order by a) s order by a, b limit 31; + QUERY PLAN +--------------------------------- + Limit + -> Incremental Sort + Sort Key: t.a, t.b + Presorted Key: t.a + -> Sort + Sort Key: t.a + -> Seq Scan on t +(7 rows) + +select * from (select * from t order by a) s order by a, b limit 31; + a | b +----+---- + 1 | 1 + 2 | 2 + 3 | 3 + 4 | 4 + 5 | 5 + 6 | 6 + 7 | 7 + 8 | 8 + 9 | 9 + 10 | 10 + 11 | 11 + 12 | 12 + 13 | 13 + 14 | 14 + 15 | 15 + 16 | 16 + 17 | 17 + 18 | 18 + 19 | 19 + 20 | 20 + 21 | 21 + 22 | 22 + 23 | 23 + 24 | 24 + 25 | 25 + 26 | 26 + 27 | 27 + 28 | 28 + 29 | 29 + 30 | 30 + 31 | 31 +(31 rows) + +explain (costs off) select * from (select * from t order by a) s order by a, b limit 32; + QUERY PLAN +--------------------------------- + Limit + -> Incremental Sort + Sort Key: t.a, t.b + Presorted Key: t.a + -> Sort + Sort Key: t.a + -> Seq Scan on t +(7 rows) + +select * from (select * from t order by a) s order by a, b limit 32; + a | b +----+---- + 1 | 1 + 2 | 2 + 3 | 3 + 4 | 4 + 5 | 5 + 6 | 6 + 7 | 7 + 8 | 8 + 9 | 9 + 10 | 10 + 11 | 11 + 12 | 12 + 13 | 13 + 14 | 14 + 15 | 15 + 16 | 16 + 17 | 17 + 18 | 18 + 19 | 19 + 20 | 20 + 21 | 21 + 22 | 22 + 23 | 23 + 24 | 24 + 25 | 25 + 26 | 26 + 27 | 27 + 28 | 28 + 29 | 29 + 30 | 30 + 31 | 31 + 32 | 32 +(32 rows) + +explain (costs off) select * from (select * from t order by a) s order by a, b limit 33; + QUERY PLAN +--------------------------------- + Limit + -> Incremental Sort + Sort Key: t.a, t.b + Presorted Key: t.a + -> Sort + Sort Key: t.a + -> Seq Scan on t +(7 rows) + +select * from (select * from t order by a) s order by a, b limit 33; + a | b +----+---- + 1 | 1 + 2 | 2 + 3 | 3 + 4 | 4 + 5 | 5 + 6 | 6 + 7 | 7 + 8 | 8 + 9 | 9 + 10 | 10 + 11 | 11 + 12 | 12 + 13 | 13 + 14 | 14 + 15 | 15 + 16 | 16 + 17 | 17 + 18 | 18 + 19 | 19 + 20 | 20 + 21 | 21 + 22 | 22 + 23 | 23 + 24 | 24 + 25 | 25 + 26 | 26 + 27 | 27 + 28 | 28 + 29 | 29 + 30 | 30 + 31 | 31 + 32 | 32 + 33 | 33 +(33 rows) + +explain (costs off) select * from (select * from t order by a) s order by a, b limit 65; + QUERY PLAN +--------------------------------- + Limit + -> Incremental Sort + Sort Key: t.a, t.b + Presorted Key: t.a + -> Sort + Sort Key: t.a + -> Seq Scan on t +(7 rows) + +select * from (select * from t order by a) s order by a, b limit 65; + a | b +----+---- + 1 | 1 + 2 | 2 + 3 | 3 + 4 | 4 + 5 | 5 + 6 | 6 + 7 | 7 + 8 | 8 + 9 | 9 + 10 | 10 + 11 | 11 + 12 | 12 + 13 | 13 + 14 | 14 + 15 | 15 + 16 | 16 + 17 | 17 + 18 | 18 + 19 | 19 + 20 | 20 + 21 | 21 + 22 | 22 + 23 | 23 + 24 | 24 + 25 | 25 + 26 | 26 + 27 | 27 + 28 | 28 + 29 | 29 + 30 | 30 + 31 | 31 + 32 | 32 + 33 | 33 + 34 | 34 + 35 | 35 + 36 | 36 + 37 | 37 + 38 | 38 + 39 | 39 + 40 | 40 + 41 | 41 + 42 | 42 + 43 | 43 + 44 | 44 + 45 | 45 + 46 | 46 + 47 | 47 + 48 | 48 + 49 | 49 + 50 | 50 + 51 | 51 + 52 | 52 + 53 | 53 + 54 | 54 + 55 | 55 + 56 | 56 + 57 | 57 + 58 | 58 + 59 | 59 + 60 | 60 + 61 | 61 + 62 | 62 + 63 | 63 + 64 | 64 + 65 | 65 +(65 rows) + +explain (costs off) 
select * from (select * from t order by a) s order by a, b limit 66; + QUERY PLAN +--------------------------------- + Limit + -> Incremental Sort + Sort Key: t.a, t.b + Presorted Key: t.a + -> Sort + Sort Key: t.a + -> Seq Scan on t +(7 rows) + +select * from (select * from t order by a) s order by a, b limit 66; + a | b +----+---- + 1 | 1 + 2 | 2 + 3 | 3 + 4 | 4 + 5 | 5 + 6 | 6 + 7 | 7 + 8 | 8 + 9 | 9 + 10 | 10 + 11 | 11 + 12 | 12 + 13 | 13 + 14 | 14 + 15 | 15 + 16 | 16 + 17 | 17 + 18 | 18 + 19 | 19 + 20 | 20 + 21 | 21 + 22 | 22 + 23 | 23 + 24 | 24 + 25 | 25 + 26 | 26 + 27 | 27 + 28 | 28 + 29 | 29 + 30 | 30 + 31 | 31 + 32 | 32 + 33 | 33 + 34 | 34 + 35 | 35 + 36 | 36 + 37 | 37 + 38 | 38 + 39 | 39 + 40 | 40 + 41 | 41 + 42 | 42 + 43 | 43 + 44 | 44 + 45 | 45 + 46 | 46 + 47 | 47 + 48 | 48 + 49 | 49 + 50 | 50 + 51 | 51 + 52 | 52 + 53 | 53 + 54 | 54 + 55 | 55 + 56 | 56 + 57 | 57 + 58 | 58 + 59 | 59 + 60 | 60 + 61 | 61 + 62 | 62 + 63 | 63 + 64 | 64 + 65 | 65 + 66 | 66 +(66 rows) + +delete from t; +drop table t; diff --git a/src/test/regress/expected/partition_aggregate.out b/src/test/regress/expected/partition_aggregate.out index 69724d54b9..9ac816177e 100644 --- a/src/test/regress/expected/partition_aggregate.out +++ b/src/test/regress/expected/partition_aggregate.out @@ -8,6 +8,8 @@ SET enable_partitionwise_aggregate TO true; SET enable_partitionwise_join TO true; -- Disable parallel plans. SET max_parallel_workers_per_gather TO 0; +-- Disable incremental sort, which can influence selected plans due to fuzz factor. +SET enable_incrementalsort TO off; -- -- Tests for list partitioned tables. -- diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out index 715842b87a..a126f0ad61 100644 --- a/src/test/regress/expected/sysviews.out +++ b/src/test/regress/expected/sysviews.out @@ -78,6 +78,7 @@ select name, setting from pg_settings where name like 'enable%'; enable_hashagg | on enable_hashagg_disk | on enable_hashjoin | on + enable_incrementalsort | on enable_indexonlyscan | on enable_indexscan | on enable_material | on @@ -91,7 +92,7 @@ select name, setting from pg_settings where name like 'enable%'; enable_seqscan | on enable_sort | on enable_tidscan | on -(19 rows) +(20 rows) -- Test that the pg_timezone_names and pg_timezone_abbrevs views are -- more-or-less working. 
We can't test their contents in any great detail diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule index d2b17dd3ea..175c1d5a49 100644 --- a/src/test/regress/parallel_schedule +++ b/src/test/regress/parallel_schedule @@ -78,7 +78,7 @@ test: brin gin gist spgist privileges init_privs security_label collate matview # ---------- # Another group of parallel tests # ---------- -test: create_table_like alter_generic alter_operator misc async dbsize misc_functions sysviews tsrf tidscan collate.icu.utf8 +test: create_table_like alter_generic alter_operator misc async dbsize misc_functions sysviews tsrf tidscan collate.icu.utf8 incremental_sort # rules cannot run concurrently with any test that creates # a view or rule in the public schema diff --git a/src/test/regress/serial_schedule b/src/test/regress/serial_schedule index acba391332..2bcd994361 100644 --- a/src/test/regress/serial_schedule +++ b/src/test/regress/serial_schedule @@ -88,6 +88,7 @@ test: select_distinct_on test: select_implicit test: select_having test: subselect +test: incremental_sort test: union test: case test: join diff --git a/src/test/regress/sql/incremental_sort.sql b/src/test/regress/sql/incremental_sort.sql new file mode 100644 index 0000000000..b990b3b3de --- /dev/null +++ b/src/test/regress/sql/incremental_sort.sql @@ -0,0 +1,194 @@ +-- When we have to sort the entire table, incremental sort will +-- be slower than plain sort, so it should not be used. +explain (costs off) +select * from (select * from tenk1 order by four) t order by four, ten; + +-- When there is a LIMIT clause, incremental sort is beneficial because +-- it only has to sort some of the groups, and not the entire table. +explain (costs off) +select * from (select * from tenk1 order by four) t order by four, ten +limit 1; + +-- When work_mem is not enough to sort the entire table, incremental sort +-- may be faster if individual groups still fit into work_mem. +set work_mem to '2MB'; +explain (costs off) +select * from (select * from tenk1 order by four) t order by four, ten; +reset work_mem; + +create table t(a integer, b integer); + +create or replace function explain_analyze_without_memory(query text) +returns table (out_line text) language plpgsql +as +$$ +declare + line text; +begin + for line in + execute 'explain (analyze, costs off, summary off, timing off) ' || query + loop + out_line := regexp_replace(line, '\d+kB', 'NNkB', 'g'); + return next; + end loop; +end; +$$; + +create or replace function explain_analyze_inc_sort_nodes(query text) +returns jsonb language plpgsql +as +$$ +declare + elements jsonb; + element jsonb; + matching_nodes jsonb := '[]'::jsonb; +begin + execute 'explain (analyze, costs off, summary off, timing off, format ''json'') ' || query into strict elements; + while jsonb_array_length(elements) > 0 loop + element := elements->0; + elements := elements - 0; + case jsonb_typeof(element) + when 'array' then + if jsonb_array_length(element) > 0 then + elements := elements || element; + end if; + when 'object' then + if element ? 'Plan' then + elements := elements || jsonb_build_array(element->'Plan'); + element := element - 'Plan'; + else + if element ? 
'Plans' then + elements := elements || jsonb_build_array(element->'Plans'); + element := element - 'Plans'; + end if; + if (element->>'Node Type')::text = 'Incremental Sort' then + matching_nodes := matching_nodes || element; + end if; + end if; + end case; + end loop; + return matching_nodes; +end; +$$; + +create or replace function explain_analyze_inc_sort_nodes_without_memory(query text) +returns jsonb language plpgsql +as +$$ +declare + nodes jsonb := '[]'::jsonb; + node jsonb; + group_key text; + space_key text; +begin + for node in select * from jsonb_array_elements(explain_analyze_inc_sort_nodes(query)) t loop + for group_key in select unnest(array['Full-sort Groups', 'Presorted Groups']::text[]) t loop + for space_key in select unnest(array['Sort Space Memory', 'Sort Space Disk']::text[]) t loop + node := jsonb_set(node, array[group_key, space_key, 'Average Sort Space Used'], '"NN"', false); + node := jsonb_set(node, array[group_key, space_key, 'Maximum Sort Space Used'], '"NN"', false); + end loop; + end loop; + nodes := nodes || node; + end loop; + return nodes; +end; +$$; + +create or replace function explain_analyze_inc_sort_nodes_verify_invariants(query text) +returns bool language plpgsql +as +$$ +declare + node jsonb; + group_stats jsonb; + group_key text; + space_key text; +begin + for node in select * from jsonb_array_elements(explain_analyze_inc_sort_nodes(query)) t loop + for group_key in select unnest(array['Full-sort Groups', 'Presorted Groups']::text[]) t loop + group_stats := node->group_key; + for space_key in select unnest(array['Sort Space Memory', 'Sort Space Disk']::text[]) t loop + if (group_stats->space_key->'Maximum Sort Space Used')::bigint < (group_stats->space_key->'Average Sort Space Used')::bigint then + raise exception '% has invalid max space < average space', group_key; + end if; + end loop; + end loop; + end loop; + return true; +end; +$$; + +-- A single large group tested around each mode transition point. +insert into t(a, b) select 1, i from generate_series(1, 100) n(i); +explain (costs off) select * from (select * from t order by a) s order by a, b limit 31; +select * from (select * from t order by a) s order by a, b limit 31; +explain (costs off) select * from (select * from t order by a) s order by a, b limit 32; +select * from (select * from t order by a) s order by a, b limit 32; +explain (costs off) select * from (select * from t order by a) s order by a, b limit 33; +select * from (select * from t order by a) s order by a, b limit 33; +explain (costs off) select * from (select * from t order by a) s order by a, b limit 65; +select * from (select * from t order by a) s order by a, b limit 65; +explain (costs off) select * from (select * from t order by a) s order by a, b limit 66; +select * from (select * from t order by a) s order by a, b limit 66; +delete from t; + +-- An initial large group followed by a small group. +insert into t(a, b) select (case when i < 50 then 1 else 2 end), i from generate_series(1, 100) n(i); +explain (costs off) select * from (select * from t order by a) s order by a, b limit 55; +select * from (select * from t order by a) s order by a, b limit 55; +-- Test EXPLAIN ANALYZE with only a fullsort group.
+select explain_analyze_without_memory('select * from (select * from t order by a) s order by a, b limit 55'); +select jsonb_pretty(explain_analyze_inc_sort_nodes_without_memory('select * from (select * from t order by a) s order by a, b limit 55')); +select explain_analyze_inc_sort_nodes_verify_invariants('select * from (select * from t order by a) s order by a, b limit 55'); +delete from t; + +-- An initial small group followed by a large group. +insert into t(a, b) select (case when i < 5 then i else 9 end), i from generate_series(1, 100) n(i); +explain (costs off) select * from (select * from t order by a) s order by a, b limit 70; +select * from (select * from t order by a) s order by a, b limit 70; +-- Test rescan. +begin; +-- We force the planner to choose a plan with incremental sort on the right side +-- of a nested loop join node. That way we trigger the rescan code path. +set local enable_hashjoin = off; +set local enable_mergejoin = off; +set local enable_material = off; +set local enable_sort = off; +explain (costs off) select * from t left join (select * from (select * from t order by a) v order by a, b) s on s.a = t.a where t.a in (1, 2); +select * from t left join (select * from (select * from t order by a) v order by a, b) s on s.a = t.a where t.a in (1, 2); +rollback; +-- Test EXPLAIN ANALYZE with both fullsort and presorted groups. +select explain_analyze_without_memory('select * from (select * from t order by a) s order by a, b limit 70'); +select jsonb_pretty(explain_analyze_inc_sort_nodes_without_memory('select * from (select * from t order by a) s order by a, b limit 70')); +select explain_analyze_inc_sort_nodes_verify_invariants('select * from (select * from t order by a) s order by a, b limit 70'); +delete from t; + +-- Small groups of 10 tuples each tested around each mode transition point. +insert into t(a, b) select i / 10, i from generate_series(1, 70) n(i); +explain (costs off) select * from (select * from t order by a) s order by a, b limit 31; +select * from (select * from t order by a) s order by a, b limit 31; +explain (costs off) select * from (select * from t order by a) s order by a, b limit 32; +select * from (select * from t order by a) s order by a, b limit 32; +explain (costs off) select * from (select * from t order by a) s order by a, b limit 33; +select * from (select * from t order by a) s order by a, b limit 33; +explain (costs off) select * from (select * from t order by a) s order by a, b limit 65; +select * from (select * from t order by a) s order by a, b limit 65; +explain (costs off) select * from (select * from t order by a) s order by a, b limit 66; +select * from (select * from t order by a) s order by a, b limit 66; +delete from t; + +-- Small groups of only 1 tuple each tested around each mode transition point. 
+insert into t(a, b) select i, i from generate_series(1, 70) n(i); +explain (costs off) select * from (select * from t order by a) s order by a, b limit 31; +select * from (select * from t order by a) s order by a, b limit 31; +explain (costs off) select * from (select * from t order by a) s order by a, b limit 32; +select * from (select * from t order by a) s order by a, b limit 32; +explain (costs off) select * from (select * from t order by a) s order by a, b limit 33; +select * from (select * from t order by a) s order by a, b limit 33; +explain (costs off) select * from (select * from t order by a) s order by a, b limit 65; +select * from (select * from t order by a) s order by a, b limit 65; +explain (costs off) select * from (select * from t order by a) s order by a, b limit 66; +select * from (select * from t order by a) s order by a, b limit 66; +delete from t; + +drop table t; diff --git a/src/test/regress/sql/partition_aggregate.sql b/src/test/regress/sql/partition_aggregate.sql index 331d92708d..f63e71c075 100644 --- a/src/test/regress/sql/partition_aggregate.sql +++ b/src/test/regress/sql/partition_aggregate.sql @@ -9,6 +9,8 @@ SET enable_partitionwise_aggregate TO true; SET enable_partitionwise_join TO true; -- Disable parallel plans. SET max_parallel_workers_per_gather TO 0; +-- Disable incremental sort, which can influence selected plans due to fuzz factor. +SET enable_incrementalsort TO off; -- -- Tests for list partitioned tables. -- 2.21.1
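A closing note on the optimizer/cost.h declarations earlier in this patch: cost_incremental_sort() can be understood as "cost one average-sized group, then scale by the number of groups". The sketch below is a simplification, not the patch's exact arithmetic; it uses the existing planner facilities estimate_num_groups() and cpu_tuple_cost, calls the cost_full_sort() helper this patch declares, and charges the first group to startup cost since no tuple can be returned before that group has been fetched and sorted.

    /*
     * Simplified sketch of incremental sort costing (not the patch's exact
     * formula).  Estimate how many presorted-key groups the input contains,
     * cost a sort of one average-sized group, then charge the first group
     * as startup cost and the remaining groups as run cost.
     */
    static void
    sketch_cost_incremental_sort(Cost *startup_cost, Cost *run_cost,
                                 PlannerInfo *root, List *presortedExprs,
                                 Cost input_startup_cost,
                                 Cost input_total_cost,
                                 double input_tuples, int width,
                                 Cost comparison_cost, int sort_mem,
                                 double limit_tuples)
    {
        double  input_groups = estimate_num_groups(root, presortedExprs,
                                                   input_tuples, NULL);
        double  group_tuples = input_tuples / input_groups;
        Cost    input_run_cost = input_total_cost - input_startup_cost;
        Cost    group_input_run_cost = input_run_cost / input_groups;
        Cost    group_startup_cost,
                group_run_cost;

        /* sort cost for one average group, on all sort columns */
        cost_full_sort(&group_startup_cost, &group_run_cost,
                       0.0 /* input cost is charged separately here */,
                       group_tuples, width, comparison_cost, sort_mem,
                       limit_tuples);

        /* the first group must be read and sorted before returning tuples */
        *startup_cost = input_startup_cost + group_input_run_cost
            + group_startup_cost;

        /* remaining groups are paid for while running to completion */
        *run_cost = (input_groups - 1) *
            (group_input_run_cost + group_startup_cost)
            + input_groups * group_run_cost;

        /* detecting group boundaries costs about one comparison per tuple */
        *run_cost += (cpu_tuple_cost + comparison_cost) * input_tuples;
    }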