From 25116ddcc61c3d71690b7fc65d60cb06809631bc Mon Sep 17 00:00:00 2001 From: Henson Choi Date: Thu, 15 Jan 2026 12:52:36 +0900 Subject: [PATCH] Row pattern recognition: Test cases from Jacob Champion's branch --- src/backend/executor/nodeWindowAgg.c | 51 ++- src/backend/optimizer/plan/rpr.c | 4 +- src/test/regress/expected/rpr.out | 474 ++++++++++++++++++++++++++- src/test/regress/sql/rpr.sql | 355 +++++++++++++++++++- 4 files changed, 855 insertions(+), 29 deletions(-) diff --git a/src/backend/executor/nodeWindowAgg.c b/src/backend/executor/nodeWindowAgg.c index cf43c1f6127..d56e2e67170 100644 --- a/src/backend/executor/nodeWindowAgg.c +++ b/src/backend/executor/nodeWindowAgg.c @@ -4788,6 +4788,8 @@ update_reduced_frame(WindowObject winobj, int64 pos) if (ctx->states != NULL) nfa_finalize_boundary(winstate, ctx, currentPos - 1); } + /* Absorb completed contexts at partition boundary */ + nfa_absorb_contexts(winstate, NULL, currentPos - 1); break; } @@ -5093,8 +5095,9 @@ nfa_state_clone(WindowAggState *winstate, int16 elemIdx, int16 altPriority, /* * nfa_add_matched_state * - * Record a matched state following SQL standard lexical order preference. - * Priority: lower altPriority wins (lexical order), then longer match. + * Record a matched state following SQL standard semantics. + * For greedy quantifiers, longer match wins. For alternation at the same + * match length, lexical order (lower altPriority) wins. */ static void nfa_add_matched_state(WindowAggState *winstate, RPRNFAContext *ctx, @@ -5109,13 +5112,13 @@ nfa_add_matched_state(WindowAggState *winstate, RPRNFAContext *ctx, } else if (state->altPriority < ctx->matchedState->altPriority) { - /* New state has better lexical order priority */ + /* Better lexical order always wins (SQL standard preference) */ shouldUpdate = true; } else if (state->altPriority == ctx->matchedState->altPriority && matchEndRow > ctx->matchEndRow) { - /* Same priority, but longer match */ + /* Same lexical order, longer match wins (greedy) */ shouldUpdate = true; } @@ -5864,14 +5867,42 @@ nfa_step_single(WindowAggState *winstate, RPRNFAContext *ctx, { /* * Between min and max - can exit or continue. - * Exit state continues processing (pending). - * Loop state waits for next row (ctx->states). + * Exit state handling depends on what comes next: + * - If next is FIN: process on same row (match ends here) + * - If next is VAR/ALT: wait for next row (needs new input) + * Loop state always waits for next row. */ - RPRNFAState *exitState = nfa_state_clone(winstate, elem->next, - state->altPriority, - state->counts, pending); + RPRPatternElement *nextElem = &elements[elem->next]; + RPRElemIdx exitAltPriority; + RPRNFAState *exitState; + + /* + * For greedy extension: if context already has a match recorded, + * preserve its altPriority. This ensures that iterations of a + * quantified group don't compete on lexical order - the first + * iteration's choice sets the preference, and subsequent + * iterations can extend it if they produce a longer match. + */ + exitAltPriority = state->altPriority; + if (ctx->matchedState != NULL) + exitAltPriority = ctx->matchedState->altPriority; + + exitState = nfa_state_clone(winstate, elem->next, + exitAltPriority, + state->counts, NULL); exitState->counts[depth] = 0; - pending = exitState; + + if (RPRElemIsFin(nextElem)) + { + /* Match ends at current row */ + exitState->next = pending; + pending = exitState; + } + else + { + /* Next element (VAR/ALT) needs new row input */ + nfa_add_state_unique(winstate, ctx, exitState); + } state->counts[depth] = count; for (int d = depth + 1; d < pattern->maxDepth; d++) diff --git a/src/backend/optimizer/plan/rpr.c b/src/backend/optimizer/plan/rpr.c index 0eed6b865ae..4385e49d6fb 100644 --- a/src/backend/optimizer/plan/rpr.c +++ b/src/backend/optimizer/plan/rpr.c @@ -995,8 +995,8 @@ computeAbsorbability(RPRPattern *pattern) } } - /* Must have exactly one unbounded element/group */ - if (numUnbounded != 1) + /* Must have at least one unbounded element for absorption */ + if (numUnbounded < 1) return; /* diff --git a/src/test/regress/expected/rpr.out b/src/test/regress/expected/rpr.out index cea986d6704..652b1eb811d 100644 --- a/src/test/regress/expected/rpr.out +++ b/src/test/regress/expected/rpr.out @@ -884,23 +884,23 @@ SELECT company, tdate, price, first_value(tdate) OVER w, last_value(tdate) OVER ----------+------------+-------+-------------+------------ company1 | 07-01-2023 | 100 | | company1 | 07-02-2023 | 200 | 07-02-2023 | 07-05-2023 - company1 | 07-03-2023 | 150 | 07-03-2023 | 07-05-2023 - company1 | 07-04-2023 | 140 | 07-04-2023 | 07-05-2023 + company1 | 07-03-2023 | 150 | | + company1 | 07-04-2023 | 140 | | company1 | 07-05-2023 | 150 | | company1 | 07-06-2023 | 90 | | company1 | 07-07-2023 | 110 | 07-07-2023 | 07-10-2023 - company1 | 07-08-2023 | 130 | 07-08-2023 | 07-10-2023 - company1 | 07-09-2023 | 120 | 07-09-2023 | 07-10-2023 + company1 | 07-08-2023 | 130 | | + company1 | 07-09-2023 | 120 | | company1 | 07-10-2023 | 130 | | company2 | 07-01-2023 | 50 | | company2 | 07-02-2023 | 2000 | 07-02-2023 | 07-05-2023 - company2 | 07-03-2023 | 1500 | 07-03-2023 | 07-05-2023 - company2 | 07-04-2023 | 1400 | 07-04-2023 | 07-05-2023 + company2 | 07-03-2023 | 1500 | | + company2 | 07-04-2023 | 1400 | | company2 | 07-05-2023 | 1500 | | company2 | 07-06-2023 | 60 | | company2 | 07-07-2023 | 1100 | 07-07-2023 | 07-10-2023 - company2 | 07-08-2023 | 1300 | 07-08-2023 | 07-10-2023 - company2 | 07-09-2023 | 1200 | 07-09-2023 | 07-10-2023 + company2 | 07-08-2023 | 1300 | | + company2 | 07-09-2023 | 1200 | | company2 | 07-10-2023 | 1300 | | (20 rows) @@ -2701,7 +2701,7 @@ WINDOW w AS ( -- Pattern optimized: (A B) (A B)+ -> (A B){2,} -- Potential matches: 1-6 (3 reps), 3-6 (2 reps), 5-6 needs 2 reps (fail) -- Row 1 (1-6) absorbs Row 3 (3-6) - same endpoint, top-level unbounded GROUP --- Test absorption 6: Multiple unbounded - NOT absorbable +-- Test absorption 6: Multiple unbounded - first element unbounded enables absorption WITH test_multi_unbounded AS ( SELECT * FROM (VALUES (1, ARRAY['A']), @@ -2725,12 +2725,460 @@ WINDOW w AS ( id | flags | match_start | match_end ----+-------+-------------+----------- 1 | {A} | 1 | 4 - 2 | {A} | 2 | 4 + 2 | {A} | | 3 | {B} | | 4 | {B} | | 5 | {X} | | (5 rows) --- Pattern A+ B+ has TWO unbounded elements - NOT absorbable --- Greedy A+ consumes rows 1-2, then B+ matches 3-4 --- Row 1: 1-4, Row 2: 2-4 (different A+ counts, not absorbed) +-- Row 1: 1-4, Row 2: absorbed (same endpoint 4) +-- ============================================ +-- Jacob's RPR Patterns (from jacob branch) +-- ============================================ +-- Test: A? (optional, greedy) +WITH jacob_optional AS ( + SELECT * FROM (VALUES + (1, ARRAY['A']), + (2, ARRAY['X']) + ) AS t(id, flags) +) +SELECT id, flags, first_value(id) OVER w AS match_start, last_value(id) OVER w AS match_end +FROM jacob_optional +WINDOW w AS ( + ORDER BY id + ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING + AFTER MATCH SKIP PAST LAST ROW + PATTERN (A?) + DEFINE A AS 'A' = ANY(flags) +); + id | flags | match_start | match_end +----+-------+-------------+----------- + 1 | {A} | 1 | 1 + 2 | {X} | | +(2 rows) + +-- Expected: 1-1 (matches A) +-- Test: A{2} (exact count) +WITH jacob_exact AS ( + SELECT * FROM (VALUES + (1, ARRAY['A']), + (2, ARRAY['A']), + (3, ARRAY['A']), + (4, ARRAY['X']) + ) AS t(id, flags) +) +SELECT id, flags, first_value(id) OVER w AS match_start, last_value(id) OVER w AS match_end +FROM jacob_exact +WINDOW w AS ( + ORDER BY id + ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING + AFTER MATCH SKIP PAST LAST ROW + PATTERN (A{2}) + DEFINE A AS 'A' = ANY(flags) +); + id | flags | match_start | match_end +----+-------+-------------+----------- + 1 | {A} | 1 | 2 + 2 | {A} | | + 3 | {A} | | + 4 | {X} | | +(4 rows) + +-- Expected: 1-2 +-- Test: A{1,3} (bounded range, greedy) +WITH jacob_bounded AS ( + SELECT * FROM (VALUES + (1, ARRAY['A']), + (2, ARRAY['A']), + (3, ARRAY['A']), + (4, ARRAY['A']), + (5, ARRAY['X']) + ) AS t(id, flags) +) +SELECT id, flags, first_value(id) OVER w AS match_start, last_value(id) OVER w AS match_end +FROM jacob_bounded +WINDOW w AS ( + ORDER BY id + ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING + AFTER MATCH SKIP PAST LAST ROW + PATTERN (A{1,3}) + DEFINE A AS 'A' = ANY(flags) +); + id | flags | match_start | match_end +----+-------+-------------+----------- + 1 | {A} | 1 | 3 + 2 | {A} | | + 3 | {A} | | + 4 | {A} | 4 | 4 + 5 | {X} | | +(5 rows) + +-- Expected: 1-3 (greedy takes max), then 4-4 +-- Test: A | B (simple alternation) +WITH jacob_simple_alt AS ( + SELECT * FROM (VALUES + (1, ARRAY['A']), + (2, ARRAY['B']), + (3, ARRAY['X']) + ) AS t(id, flags) +) +SELECT id, flags, first_value(id) OVER w AS match_start, last_value(id) OVER w AS match_end +FROM jacob_simple_alt +WINDOW w AS ( + ORDER BY id + ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING + AFTER MATCH SKIP PAST LAST ROW + PATTERN (A | B) + DEFINE + A AS 'A' = ANY(flags), + B AS 'B' = ANY(flags) +); + id | flags | match_start | match_end +----+-------+-------------+----------- + 1 | {A} | 1 | 1 + 2 | {B} | 2 | 2 + 3 | {X} | | +(3 rows) + +-- Expected: 1-1 (A), 2-2 (B) +-- Test: A | B | C (three-way alternation) +WITH jacob_three_alt AS ( + SELECT * FROM (VALUES + (1, ARRAY['B']), + (2, ARRAY['X']) + ) AS t(id, flags) +) +SELECT id, flags, first_value(id) OVER w AS match_start, last_value(id) OVER w AS match_end +FROM jacob_three_alt +WINDOW w AS ( + ORDER BY id + ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING + AFTER MATCH SKIP PAST LAST ROW + PATTERN (A | B | C) + DEFINE + A AS 'A' = ANY(flags), + B AS 'B' = ANY(flags), + C AS 'C' = ANY(flags) +); + id | flags | match_start | match_end +----+-------+-------------+----------- + 1 | {B} | 1 | 1 + 2 | {X} | | +(2 rows) + +-- Expected: 1-1 (B) +-- Test: A B C (concatenation) +WITH jacob_concat AS ( + SELECT * FROM (VALUES + (1, ARRAY['A']), + (2, ARRAY['B']), + (3, ARRAY['C']), + (4, ARRAY['X']) + ) AS t(id, flags) +) +SELECT id, flags, first_value(id) OVER w AS match_start, last_value(id) OVER w AS match_end +FROM jacob_concat +WINDOW w AS ( + ORDER BY id + ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING + AFTER MATCH SKIP PAST LAST ROW + PATTERN (A B C) + DEFINE + A AS 'A' = ANY(flags), + B AS 'B' = ANY(flags), + C AS 'C' = ANY(flags) +); + id | flags | match_start | match_end +----+-------+-------------+----------- + 1 | {A} | 1 | 3 + 2 | {B} | | + 3 | {C} | | + 4 | {X} | | +(4 rows) + +-- Expected: 1-3 +-- Test: A B? C (optional middle) +WITH jacob_optional_mid AS ( + SELECT * FROM (VALUES + (1, ARRAY['A']), + (2, ARRAY['C']), + (3, ARRAY['X']) + ) AS t(id, flags) +) +SELECT id, flags, first_value(id) OVER w AS match_start, last_value(id) OVER w AS match_end +FROM jacob_optional_mid +WINDOW w AS ( + ORDER BY id + ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING + AFTER MATCH SKIP PAST LAST ROW + PATTERN (A B? C) + DEFINE + A AS 'A' = ANY(flags), + B AS 'B' = ANY(flags), + C AS 'C' = ANY(flags) +); + id | flags | match_start | match_end +----+-------+-------------+----------- + 1 | {A} | 1 | 2 + 2 | {C} | | + 3 | {X} | | +(3 rows) + +-- Expected: 1-2 (A C, B skipped) +-- Test: (A B){2} (nested group with quantifier) +WITH jacob_nested_group AS ( + SELECT * FROM (VALUES + (1, ARRAY['A']), + (2, ARRAY['B']), + (3, ARRAY['A']), + (4, ARRAY['B']), + (5, ARRAY['X']) + ) AS t(id, flags) +) +SELECT id, flags, first_value(id) OVER w AS match_start, last_value(id) OVER w AS match_end +FROM jacob_nested_group +WINDOW w AS ( + ORDER BY id + ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING + AFTER MATCH SKIP PAST LAST ROW + PATTERN ((A B){2}) + DEFINE + A AS 'A' = ANY(flags), + B AS 'B' = ANY(flags) +); + id | flags | match_start | match_end +----+-------+-------------+----------- + 1 | {A} | 1 | 4 + 2 | {B} | | + 3 | {A} | | + 4 | {B} | | + 5 | {X} | | +(5 rows) + +-- Expected: 1-4 (A B A B) +-- Test: (A){3} (quantifier on grouped single element) +WITH jacob_group_quant AS ( + SELECT * FROM (VALUES + (1, ARRAY['A']), + (2, ARRAY['A']), + (3, ARRAY['A']), + (4, ARRAY['X']) + ) AS t(id, flags) +) +SELECT id, flags, first_value(id) OVER w AS match_start, last_value(id) OVER w AS match_end +FROM jacob_group_quant +WINDOW w AS ( + ORDER BY id + ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING + AFTER MATCH SKIP PAST LAST ROW + PATTERN ((A){3}) + DEFINE A AS 'A' = ANY(flags) +); + id | flags | match_start | match_end +----+-------+-------------+----------- + 1 | {A} | 1 | 3 + 2 | {A} | | + 3 | {A} | | + 4 | {X} | | +(4 rows) + +-- Expected: 1-3 +-- Test: A B C | A B C D E (lexical order - first alt wins) +WITH jacob_lex_first AS ( + SELECT * FROM (VALUES + (1, ARRAY['A']), + (2, ARRAY['B']), + (3, ARRAY['C']), + (4, ARRAY['D']), + (5, ARRAY['E']) + ) AS t(id, flags) +) +SELECT id, flags, first_value(id) OVER w AS match_start, last_value(id) OVER w AS match_end +FROM jacob_lex_first +WINDOW w AS ( + ORDER BY id + ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING + AFTER MATCH SKIP PAST LAST ROW + PATTERN (A B C | A B C D E) + DEFINE + A AS 'A' = ANY(flags), + B AS 'B' = ANY(flags), + C AS 'C' = ANY(flags), + D AS 'D' = ANY(flags), + E AS 'E' = ANY(flags) +); + id | flags | match_start | match_end +----+-------+-------------+----------- + 1 | {A} | 1 | 3 + 2 | {B} | | + 3 | {C} | | + 4 | {D} | | + 5 | {E} | | +(5 rows) + +-- Expected: 1-3 (A B C wins by lexical order) +-- Test: A B C D E | A B C (lexical order - longer first wins) +WITH jacob_lex_long AS ( + SELECT * FROM (VALUES + (1, ARRAY['A']), + (2, ARRAY['B']), + (3, ARRAY['C']), + (4, ARRAY['D']), + (5, ARRAY['E']) + ) AS t(id, flags) +) +SELECT id, flags, first_value(id) OVER w AS match_start, last_value(id) OVER w AS match_end +FROM jacob_lex_long +WINDOW w AS ( + ORDER BY id + ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING + AFTER MATCH SKIP PAST LAST ROW + PATTERN (A B C D E | A B C) + DEFINE + A AS 'A' = ANY(flags), + B AS 'B' = ANY(flags), + C AS 'C' = ANY(flags), + D AS 'D' = ANY(flags), + E AS 'E' = ANY(flags) +); + id | flags | match_start | match_end +----+-------+-------------+----------- + 1 | {A} | 1 | 5 + 2 | {B} | | + 3 | {C} | | + 4 | {D} | | + 5 | {E} | | +(5 rows) + +-- Expected: 1-5 (A B C D E wins by lexical order) +-- ============================================ +-- Alternation with quantifiers (BUG cases from Jacob's tests) +-- ============================================ +-- Test: (A | B)+ C - alternation inside quantified group followed by C +-- BUG: Currently returns NULL instead of matching 1-4 +WITH jacob_alt_quant AS ( + SELECT * FROM (VALUES + (1, ARRAY['A']), + (2, ARRAY['B']), + (3, ARRAY['A']), + (4, ARRAY['C']) + ) AS t(id, flags) +) +SELECT id, flags, first_value(id) OVER w AS match_start, last_value(id) OVER w AS match_end +FROM jacob_alt_quant +WINDOW w AS ( + ORDER BY id + ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING + AFTER MATCH SKIP PAST LAST ROW + PATTERN ((A | B)+ C) + DEFINE + A AS 'A' = ANY(flags), + B AS 'B' = ANY(flags), + C AS 'C' = ANY(flags) +); + id | flags | match_start | match_end +----+-------+-------------+----------- + 1 | {A} | 1 | 4 + 2 | {B} | | + 3 | {A} | | + 4 | {C} | | +(4 rows) + +-- Expected: 1-4 (A B A C) +-- Test: ((A | B) C)+ - alternation inside group with outer quantifier +-- Currently returns two separate matches (1-2, 3-4) instead of single 1-4 +WITH jacob_alt_group AS ( + SELECT * FROM (VALUES + (1, ARRAY['A']), + (2, ARRAY['C']), + (3, ARRAY['B']), + (4, ARRAY['C']), + (5, ARRAY['X']) + ) AS t(id, flags) +) +SELECT id, flags, first_value(id) OVER w AS match_start, last_value(id) OVER w AS match_end +FROM jacob_alt_group +WINDOW w AS ( + ORDER BY id + ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING + AFTER MATCH SKIP PAST LAST ROW + PATTERN (((A | B) C)+) + DEFINE + A AS 'A' = ANY(flags), + B AS 'B' = ANY(flags), + C AS 'C' = ANY(flags) +); + id | flags | match_start | match_end +----+-------+-------------+----------- + 1 | {A} | 1 | 4 + 2 | {C} | | + 3 | {B} | | + 4 | {C} | | + 5 | {X} | | +(5 rows) + +-- Expected: 1-4 (A C B C) +-- ============================================ +-- RELUCTANT quantifiers (parser only - executor TODO) +-- ============================================ +-- Test: A+? B (reluctant) - CURRENTLY ACTS LIKE A+ (GREEDY) +-- Parser accepts the syntax but executor doesn't differentiate +WITH jacob_reluctant AS ( + SELECT * FROM (VALUES + (1, ARRAY['A']), + (2, ARRAY['A']), + (3, ARRAY['A']), + (4, ARRAY['B']) + ) AS t(id, flags) +) +SELECT id, flags, first_value(id) OVER w AS match_start, last_value(id) OVER w AS match_end +FROM jacob_reluctant +WINDOW w AS ( + ORDER BY id + ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING + AFTER MATCH SKIP PAST LAST ROW + PATTERN (A+? B) + DEFINE + A AS 'A' = ANY(flags), + B AS 'B' = ANY(flags) +); + id | flags | match_start | match_end +----+-------+-------------+----------- + 1 | {A} | 1 | 4 + 2 | {A} | | + 3 | {A} | | + 4 | {B} | | +(4 rows) + +-- Current: 1-4 (A A A B) - greedy behavior +-- Expected when RELUCTANT implemented: 3-4 (A B) - shortest A before B +-- Test: A{1,3}? B (reluctant bounded) +WITH jacob_reluctant_bounded AS ( + SELECT * FROM (VALUES + (1, ARRAY['A']), + (2, ARRAY['A']), + (3, ARRAY['A']), + (4, ARRAY['B']) + ) AS t(id, flags) +) +SELECT id, flags, first_value(id) OVER w AS match_start, last_value(id) OVER w AS match_end +FROM jacob_reluctant_bounded +WINDOW w AS ( + ORDER BY id + ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING + AFTER MATCH SKIP PAST LAST ROW + PATTERN (A{1,3}? B) + DEFINE + A AS 'A' = ANY(flags), + B AS 'B' = ANY(flags) +); + id | flags | match_start | match_end +----+-------+-------------+----------- + 1 | {A} | 1 | 4 + 2 | {A} | | + 3 | {A} | | + 4 | {B} | | +(4 rows) + +-- Current: 1-4 (greedy, takes max A before B) +-- Expected when RELUCTANT implemented: 3-4 (takes min A before B) diff --git a/src/test/regress/sql/rpr.sql b/src/test/regress/sql/rpr.sql index fa8aa50fcb9..19146f7f03d 100644 --- a/src/test/regress/sql/rpr.sql +++ b/src/test/regress/sql/rpr.sql @@ -1368,7 +1368,7 @@ WINDOW w AS ( -- Potential matches: 1-6 (3 reps), 3-6 (2 reps), 5-6 needs 2 reps (fail) -- Row 1 (1-6) absorbs Row 3 (3-6) - same endpoint, top-level unbounded GROUP --- Test absorption 6: Multiple unbounded - NOT absorbable +-- Test absorption 6: Multiple unbounded - first element unbounded enables absorption WITH test_multi_unbounded AS ( SELECT * FROM (VALUES (1, ARRAY['A']), @@ -1389,6 +1389,353 @@ WINDOW w AS ( A AS 'A' = ANY(flags), B AS 'B' = ANY(flags) ); --- Pattern A+ B+ has TWO unbounded elements - NOT absorbable --- Greedy A+ consumes rows 1-2, then B+ matches 3-4 --- Row 1: 1-4, Row 2: 2-4 (different A+ counts, not absorbed) +-- Row 1: 1-4, Row 2: absorbed (same endpoint 4) + +-- ============================================ +-- Jacob's RPR Patterns (from jacob branch) +-- ============================================ + +-- Test: A? (optional, greedy) +WITH jacob_optional AS ( + SELECT * FROM (VALUES + (1, ARRAY['A']), + (2, ARRAY['X']) + ) AS t(id, flags) +) +SELECT id, flags, first_value(id) OVER w AS match_start, last_value(id) OVER w AS match_end +FROM jacob_optional +WINDOW w AS ( + ORDER BY id + ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING + AFTER MATCH SKIP PAST LAST ROW + PATTERN (A?) + DEFINE A AS 'A' = ANY(flags) +); +-- Expected: 1-1 (matches A) + +-- Test: A{2} (exact count) +WITH jacob_exact AS ( + SELECT * FROM (VALUES + (1, ARRAY['A']), + (2, ARRAY['A']), + (3, ARRAY['A']), + (4, ARRAY['X']) + ) AS t(id, flags) +) +SELECT id, flags, first_value(id) OVER w AS match_start, last_value(id) OVER w AS match_end +FROM jacob_exact +WINDOW w AS ( + ORDER BY id + ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING + AFTER MATCH SKIP PAST LAST ROW + PATTERN (A{2}) + DEFINE A AS 'A' = ANY(flags) +); +-- Expected: 1-2 + +-- Test: A{1,3} (bounded range, greedy) +WITH jacob_bounded AS ( + SELECT * FROM (VALUES + (1, ARRAY['A']), + (2, ARRAY['A']), + (3, ARRAY['A']), + (4, ARRAY['A']), + (5, ARRAY['X']) + ) AS t(id, flags) +) +SELECT id, flags, first_value(id) OVER w AS match_start, last_value(id) OVER w AS match_end +FROM jacob_bounded +WINDOW w AS ( + ORDER BY id + ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING + AFTER MATCH SKIP PAST LAST ROW + PATTERN (A{1,3}) + DEFINE A AS 'A' = ANY(flags) +); +-- Expected: 1-3 (greedy takes max), then 4-4 + +-- Test: A | B (simple alternation) +WITH jacob_simple_alt AS ( + SELECT * FROM (VALUES + (1, ARRAY['A']), + (2, ARRAY['B']), + (3, ARRAY['X']) + ) AS t(id, flags) +) +SELECT id, flags, first_value(id) OVER w AS match_start, last_value(id) OVER w AS match_end +FROM jacob_simple_alt +WINDOW w AS ( + ORDER BY id + ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING + AFTER MATCH SKIP PAST LAST ROW + PATTERN (A | B) + DEFINE + A AS 'A' = ANY(flags), + B AS 'B' = ANY(flags) +); +-- Expected: 1-1 (A), 2-2 (B) + +-- Test: A | B | C (three-way alternation) +WITH jacob_three_alt AS ( + SELECT * FROM (VALUES + (1, ARRAY['B']), + (2, ARRAY['X']) + ) AS t(id, flags) +) +SELECT id, flags, first_value(id) OVER w AS match_start, last_value(id) OVER w AS match_end +FROM jacob_three_alt +WINDOW w AS ( + ORDER BY id + ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING + AFTER MATCH SKIP PAST LAST ROW + PATTERN (A | B | C) + DEFINE + A AS 'A' = ANY(flags), + B AS 'B' = ANY(flags), + C AS 'C' = ANY(flags) +); +-- Expected: 1-1 (B) + +-- Test: A B C (concatenation) +WITH jacob_concat AS ( + SELECT * FROM (VALUES + (1, ARRAY['A']), + (2, ARRAY['B']), + (3, ARRAY['C']), + (4, ARRAY['X']) + ) AS t(id, flags) +) +SELECT id, flags, first_value(id) OVER w AS match_start, last_value(id) OVER w AS match_end +FROM jacob_concat +WINDOW w AS ( + ORDER BY id + ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING + AFTER MATCH SKIP PAST LAST ROW + PATTERN (A B C) + DEFINE + A AS 'A' = ANY(flags), + B AS 'B' = ANY(flags), + C AS 'C' = ANY(flags) +); +-- Expected: 1-3 + +-- Test: A B? C (optional middle) +WITH jacob_optional_mid AS ( + SELECT * FROM (VALUES + (1, ARRAY['A']), + (2, ARRAY['C']), + (3, ARRAY['X']) + ) AS t(id, flags) +) +SELECT id, flags, first_value(id) OVER w AS match_start, last_value(id) OVER w AS match_end +FROM jacob_optional_mid +WINDOW w AS ( + ORDER BY id + ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING + AFTER MATCH SKIP PAST LAST ROW + PATTERN (A B? C) + DEFINE + A AS 'A' = ANY(flags), + B AS 'B' = ANY(flags), + C AS 'C' = ANY(flags) +); +-- Expected: 1-2 (A C, B skipped) + +-- Test: (A B){2} (nested group with quantifier) +WITH jacob_nested_group AS ( + SELECT * FROM (VALUES + (1, ARRAY['A']), + (2, ARRAY['B']), + (3, ARRAY['A']), + (4, ARRAY['B']), + (5, ARRAY['X']) + ) AS t(id, flags) +) +SELECT id, flags, first_value(id) OVER w AS match_start, last_value(id) OVER w AS match_end +FROM jacob_nested_group +WINDOW w AS ( + ORDER BY id + ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING + AFTER MATCH SKIP PAST LAST ROW + PATTERN ((A B){2}) + DEFINE + A AS 'A' = ANY(flags), + B AS 'B' = ANY(flags) +); +-- Expected: 1-4 (A B A B) + +-- Test: (A){3} (quantifier on grouped single element) +WITH jacob_group_quant AS ( + SELECT * FROM (VALUES + (1, ARRAY['A']), + (2, ARRAY['A']), + (3, ARRAY['A']), + (4, ARRAY['X']) + ) AS t(id, flags) +) +SELECT id, flags, first_value(id) OVER w AS match_start, last_value(id) OVER w AS match_end +FROM jacob_group_quant +WINDOW w AS ( + ORDER BY id + ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING + AFTER MATCH SKIP PAST LAST ROW + PATTERN ((A){3}) + DEFINE A AS 'A' = ANY(flags) +); +-- Expected: 1-3 + +-- Test: A B C | A B C D E (lexical order - first alt wins) +WITH jacob_lex_first AS ( + SELECT * FROM (VALUES + (1, ARRAY['A']), + (2, ARRAY['B']), + (3, ARRAY['C']), + (4, ARRAY['D']), + (5, ARRAY['E']) + ) AS t(id, flags) +) +SELECT id, flags, first_value(id) OVER w AS match_start, last_value(id) OVER w AS match_end +FROM jacob_lex_first +WINDOW w AS ( + ORDER BY id + ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING + AFTER MATCH SKIP PAST LAST ROW + PATTERN (A B C | A B C D E) + DEFINE + A AS 'A' = ANY(flags), + B AS 'B' = ANY(flags), + C AS 'C' = ANY(flags), + D AS 'D' = ANY(flags), + E AS 'E' = ANY(flags) +); +-- Expected: 1-3 (A B C wins by lexical order) + +-- Test: A B C D E | A B C (lexical order - longer first wins) +WITH jacob_lex_long AS ( + SELECT * FROM (VALUES + (1, ARRAY['A']), + (2, ARRAY['B']), + (3, ARRAY['C']), + (4, ARRAY['D']), + (5, ARRAY['E']) + ) AS t(id, flags) +) +SELECT id, flags, first_value(id) OVER w AS match_start, last_value(id) OVER w AS match_end +FROM jacob_lex_long +WINDOW w AS ( + ORDER BY id + ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING + AFTER MATCH SKIP PAST LAST ROW + PATTERN (A B C D E | A B C) + DEFINE + A AS 'A' = ANY(flags), + B AS 'B' = ANY(flags), + C AS 'C' = ANY(flags), + D AS 'D' = ANY(flags), + E AS 'E' = ANY(flags) +); +-- Expected: 1-5 (A B C D E wins by lexical order) + +-- ============================================ +-- Alternation with quantifiers (BUG cases from Jacob's tests) +-- ============================================ + +-- Test: (A | B)+ C - alternation inside quantified group followed by C +-- BUG: Currently returns NULL instead of matching 1-4 +WITH jacob_alt_quant AS ( + SELECT * FROM (VALUES + (1, ARRAY['A']), + (2, ARRAY['B']), + (3, ARRAY['A']), + (4, ARRAY['C']) + ) AS t(id, flags) +) +SELECT id, flags, first_value(id) OVER w AS match_start, last_value(id) OVER w AS match_end +FROM jacob_alt_quant +WINDOW w AS ( + ORDER BY id + ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING + AFTER MATCH SKIP PAST LAST ROW + PATTERN ((A | B)+ C) + DEFINE + A AS 'A' = ANY(flags), + B AS 'B' = ANY(flags), + C AS 'C' = ANY(flags) +); +-- Expected: 1-4 (A B A C) + +-- Test: ((A | B) C)+ - alternation inside group with outer quantifier +-- Currently returns two separate matches (1-2, 3-4) instead of single 1-4 +WITH jacob_alt_group AS ( + SELECT * FROM (VALUES + (1, ARRAY['A']), + (2, ARRAY['C']), + (3, ARRAY['B']), + (4, ARRAY['C']), + (5, ARRAY['X']) + ) AS t(id, flags) +) +SELECT id, flags, first_value(id) OVER w AS match_start, last_value(id) OVER w AS match_end +FROM jacob_alt_group +WINDOW w AS ( + ORDER BY id + ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING + AFTER MATCH SKIP PAST LAST ROW + PATTERN (((A | B) C)+) + DEFINE + A AS 'A' = ANY(flags), + B AS 'B' = ANY(flags), + C AS 'C' = ANY(flags) +); +-- Expected: 1-4 (A C B C) + +-- ============================================ +-- RELUCTANT quantifiers (parser only - executor TODO) +-- ============================================ + +-- Test: A+? B (reluctant) - CURRENTLY ACTS LIKE A+ (GREEDY) +-- Parser accepts the syntax but executor doesn't differentiate +WITH jacob_reluctant AS ( + SELECT * FROM (VALUES + (1, ARRAY['A']), + (2, ARRAY['A']), + (3, ARRAY['A']), + (4, ARRAY['B']) + ) AS t(id, flags) +) +SELECT id, flags, first_value(id) OVER w AS match_start, last_value(id) OVER w AS match_end +FROM jacob_reluctant +WINDOW w AS ( + ORDER BY id + ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING + AFTER MATCH SKIP PAST LAST ROW + PATTERN (A+? B) + DEFINE + A AS 'A' = ANY(flags), + B AS 'B' = ANY(flags) +); +-- Current: 1-4 (A A A B) - greedy behavior +-- Expected when RELUCTANT implemented: 3-4 (A B) - shortest A before B + +-- Test: A{1,3}? B (reluctant bounded) +WITH jacob_reluctant_bounded AS ( + SELECT * FROM (VALUES + (1, ARRAY['A']), + (2, ARRAY['A']), + (3, ARRAY['A']), + (4, ARRAY['B']) + ) AS t(id, flags) +) +SELECT id, flags, first_value(id) OVER w AS match_start, last_value(id) OVER w AS match_end +FROM jacob_reluctant_bounded +WINDOW w AS ( + ORDER BY id + ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING + AFTER MATCH SKIP PAST LAST ROW + PATTERN (A{1,3}? B) + DEFINE + A AS 'A' = ANY(flags), + B AS 'B' = ANY(flags) +); +-- Current: 1-4 (greedy, takes max A before B) +-- Expected when RELUCTANT implemented: 3-4 (takes min A before B) \ No newline at end of file -- 2.50.1 (Apple Git-155)