From d09908f16025319da7771c40892a3a66df5fdb7e Mon Sep 17 00:00:00 2001 From: Henson Choi Date: Tue, 7 Apr 2026 20:49:27 +0900 Subject: [PATCH] Add inline comments for complex RPR algorithms and design notes Document END chain traversal in nfa_match(), fast-forward paths in nfa_advance_end(), absorption safety rules with navigation lookup table, per-context evaluation strategy table, fixed-length group unrolling rationale, and BEGIN/END pointer layout diagram. --- src/backend/executor/execRPR.c | 97 ++++++++++++++++++++++++++------ src/backend/optimizer/plan/rpr.c | 41 ++++++++++++-- 2 files changed, 118 insertions(+), 20 deletions(-) diff --git a/src/backend/executor/execRPR.c b/src/backend/executor/execRPR.c index dede2dfab0d..037d3b2e232 100644 --- a/src/backend/executor/execRPR.c +++ b/src/backend/executor/execRPR.c @@ -440,7 +440,14 @@ * Case 2: GROUP+ with fixed-length children (min == max, recursively) * e.g., (A B)+, (A B{2})+, ((A (B C){2}){2})+ * -> ABSORBABLE_BRANCH on all elements within the group, - * ABSORBABLE | ABSORBABLE_BRANCH on END + * ABSORBABLE | ABSORBABLE_BRANCH on END + * + * Why this is safe: when every child has min == max, the group + * is semantically equivalent to unrolling its body into {1,1} + * elements. E.g., (A B{2})+ behaves like (A B B)+. Each + * iteration consumes a fixed number of rows, so an earlier + * context's count always dominates a later one's (monotonicity). + * * Case 3: GROUP+ whose body starts with VAR+ (e.g., (A+ B)+) * -> Recurses from BEGIN into the body, applying Case 1. * ABSORBABLE | ABSORBABLE_BRANCH set on A. @@ -647,6 +654,19 @@ * variables. The original nav_match_start and currentpos are saved and * restored after re-evaluation. * + * Summary of evaluation strategy by navigation content: + * + * Navigation content evaluation + * ------------------------------------------------------- + * No navigation shared (once per row) + * PREV/NEXT only shared (once per row) + * LAST (no offset) shared (once per row) + * LAST (with offset) per-context + * FIRST (any) per-context + * Compound (inner FIRST) per-context + * Compound (inner LAST, no off.) shared (once per row) + * Compound (inner LAST, w/off.) per-context + * * VI-5. Tuplestore Mark and Trim (nodeWindowAgg.c) * * Navigation functions require access to past rows via the tuplestore. @@ -762,11 +782,26 @@ * (b) Unbounded frame (ROWS BETWEEN CURRENT ROW AND UNBOUNDED * FOLLOWING). Limited frames apply differently to each context, * breaking the monotonicity principle. - * (c) No match_start-dependent navigation in DEFINE. FIRST, - * LAST-with-offset, and compound navigation referencing match_start - * (PREV_FIRST, NEXT_FIRST, PREV_LAST/NEXT_LAST with offset) - * cause different contexts to evaluate to different values for the - * same row, breaking monotonicity. + * (c) No match_start-dependent navigation in DEFINE. + * + * Mechanism: each context has a different matchStartRow, so FIRST + * resolves to a different row for each context at the same + * currentpos. An earlier context's DEFINE result no longer + * subsumes a later one's, making count-dominance comparison + * invalid. Rather than comparing matchStartRow at runtime + * (which would complicate the absorb path), any match_start + * dependency disables absorption entirely. + * + * Navigation content match_start dep. absorption + * ------------------------------------------------------------ + * No navigation none safe + * PREV/NEXT only none safe + * LAST (no offset) none safe + * LAST (with offset) boundary check unsafe + * FIRST (any) direct unsafe + * Compound (inner FIRST) direct unsafe + * Compound (inner LAST, no off.) none safe + * Compound (inner LAST, w/off.) boundary chk unsafe * * Runtime conditions (evaluated per context pair): * @@ -2260,7 +2295,13 @@ nfa_absorb_contexts(WindowAggState *winstate) * nfa_eval_var_match * * Evaluate if a VAR element matches the current row. - * Undefined variables (varId >= defineVariableList length) default to TRUE. + * + * varMatched is a pre-evaluated boolean array indexed by varId, computed + * once per row by evaluating all DEFINE expressions. NULL means no DEFINE + * clauses exist (only possible during early development/testing). + * + * Per SQL:2016 R020, pattern variables not listed in DEFINE are implicitly + * TRUE -- they match every row. This is checked via varId >= list_length. */ static bool nfa_eval_var_match(WindowAggState *winstate, RPRPatternElement *elem, @@ -2337,9 +2378,20 @@ nfa_match(WindowAggState *winstate, RPRNFAContext *ctx, bool *varMatched) /* * For VAR at max count with END next, advance through END - * chain to reach the absorption judgment point. Only + * chain to reach the absorption judgment point. Only * deterministic exits (count >= max, max finite) are handled; * unbounded VARs stay for advance phase. + * + * In nested patterns like ((A B){2}){3}, a VAR reaching its + * max triggers an exit cascade: inner END increments inner + * group count, which may itself reach max, requiring an exit + * to the next outer END. The loop below walks this chain. + * + * ABSORBABLE_BRANCH marks elements inside the absorbable + * region; ABSORBABLE marks the outermost judgment point + * where count-dominance is evaluated. We chain through + * BRANCH elements until reaching the ABSORBABLE point or + * an element that can still loop (count < max). */ if (RPRElemIsAbsorbableBranch(elem) && !RPRElemIsAbsorbable(elem) && @@ -2561,12 +2613,25 @@ nfa_advance_end(WindowAggState *winstate, RPRNFAContext *ctx, RPRPatternElement *jumpElem; RPRNFAState *ffState = NULL; - /* Snapshot state for ff path before modifying for loop-back */ + /* + * Two paths are explored in parallel when the group body is + * nullable (RPR_ELEM_EMPTY_LOOP): + * + * 1. Primary path: loop back and attempt real matches in the + * next iteration (state, modified below). + * + * 2. Fast-forward path: skip directly to after the group, + * treating all remaining required iterations as empty + * matches (ffState, handled after the primary path). + * + * The snapshot must be taken BEFORE modifying state for the + * loop-back, since both paths diverge from the same point. + */ if (RPRElemCanEmptyLoop(elem)) ffState = nfa_state_create(winstate, state->elemIdx, state->counts, state->isAbsorbable); - /* Loop back for real matches (primary path) */ + /* Primary path: loop back for real matches */ for (int d = depth + 1; d < pattern->maxDepth; d++) state->counts[d] = 0; state->elemIdx = elem->jump; @@ -2575,12 +2640,12 @@ nfa_advance_end(WindowAggState *winstate, RPRNFAContext *ctx, currentPos); /* - * Fast-forward fallback for nullable bodies. E.g. (A?){2,3} when A - * doesn't match: the loop-back produces empty iterations that cycle - * detection would kill. Instead, exit directly treating all - * remaining required iterations as empty. Route to elem->next (not - * nfa_advance_end) to avoid creating competing greedy/reluctant loop - * states. + * Fast-forward path for nullable bodies. E.g. (A?){2,3} when + * A doesn't match: the primary loop-back produces empty + * iterations that cycle detection would kill. Instead, exit + * directly with count satisfied. Route to elem->next (not + * nfa_advance_end) to avoid creating competing greedy/reluctant + * loop states. */ if (ffState != NULL) { diff --git a/src/backend/optimizer/plan/rpr.c b/src/backend/optimizer/plan/rpr.c index 767a214016c..754fcd53099 100644 --- a/src/backend/optimizer/plan/rpr.c +++ b/src/backend/optimizer/plan/rpr.c @@ -478,6 +478,19 @@ mergeConsecutiveAlts(List *children) * mergeGroupPrefixSuffix * Merge sequence prefix/suffix into GROUP with matching children. * + * When a GROUP's children appear as a prefix before and/or suffix after + * the GROUP in a SEQ, absorb them by incrementing the GROUP's quantifier. + * This runs iteratively: A B A B (A B)+ A B -> (A B){5,}. + * + * Algorithm: + * For each GROUP encountered in the sequence: + * 1. PREFIX phase: compare the last N elements already in the result + * list against the GROUP's children. On match, remove them from + * result and increment the GROUP's min/max. Repeat until no match. + * 2. SUFFIX phase: compare the next N elements in the input against + * the GROUP's children. On match, skip them (via skipUntil) and + * increment min/max. Repeat until no match. + * * Examples: * A B (A B)+ -> (A B){2,} * (A B)+ A B -> (A B){2,} @@ -813,8 +826,16 @@ tryMultiplyQuantifiers(RPRPatternNode *pattern) } /* - * Case 2/3: Safe when child is finite AND (outer is exact OR child is - * {1,1}) + * Case 2: Outer exact (min == max): (A{2,3}){4} -> A{8,12}. + * Safe because every iteration produces the same range. + * + * Case 3: Child {1,1}: (A){2,5} -> A{2,5}. + * Safe because the child contributes exactly one per + * iteration, so the outer range maps directly. + * + * Unsafe example: (A{2}){2,3} produces counts 4 or 6 only, not + * the full range 4..6, so we cannot flatten when child has a + * non-trivial range AND outer is also a range. */ if (child->max != RPR_QUANTITY_INF && (pattern->min == pattern->max || @@ -824,6 +845,7 @@ tryMultiplyQuantifiers(RPRPatternNode *pattern) if (new_min_64 >= RPR_QUANTITY_INF) return pattern; + /* Outer unbounded: result is unbounded regardless of child */ if (pattern->max == RPR_QUANTITY_INF) new_max_64 = RPR_QUANTITY_INF; else @@ -1186,8 +1208,19 @@ fillRPRPatternVar(RPRPatternNode *node, RPRPattern *pat, int *idx, RPRDepth dept * fillRPRPatternGroup * Fill a GROUP pattern and its children. * - * Creates elements for group content at increased depth, plus an END marker - * if the group has a non-trivial quantifier. + * Creates elements for group content at increased depth, plus BEGIN/END + * marker pair if the group has a non-trivial quantifier (not {1,1}). + * + * Element layout for (A B){2,3}: + * + * [BEGIN] [A] [B] [END] [next element...] + * | | ^ + * | +-- jump --+ (loop back to first child) + * +---- jump -------------------+ (skip to after END) + * + * BEGIN.jump points past END (skip path when count >= max or min == 0). + * END.jump points to the first child (loop-back path). + * BEGIN.next and END.next are set later by finalizeRPRPattern(). * * Returns true if this group is nullable. A group is nullable when its * min is 0 (can be skipped entirely) or its body is nullable (every path -- 2.50.1 (Apple Git-155)