From caa1ede6a7b5ac3e19b73943a1a810bf98e32e21 Mon Sep 17 00:00:00 2001 From: Rintaro Ikeda Date: Wed, 9 Jul 2025 23:36:37 +0900 Subject: [PATCH v6 1/3] Add --continue-on-error option When the option is set, client rolls back the failed transaction and starts a new one when its transaction fails due to the reason other than the deadlock and serialization failure. --- doc/src/sgml/ref/pgbench.sgml | 71 +++++++++++++++----- src/bin/pgbench/pgbench.c | 55 +++++++++++++-- src/bin/pgbench/t/001_pgbench_with_server.pl | 22 ++++++ 3 files changed, 124 insertions(+), 24 deletions(-) diff --git a/doc/src/sgml/ref/pgbench.sgml b/doc/src/sgml/ref/pgbench.sgml index ab252d9fc74..15fcb45e223 100644 --- a/doc/src/sgml/ref/pgbench.sgml +++ b/doc/src/sgml/ref/pgbench.sgml @@ -76,9 +76,8 @@ tps = 896.967014 (without initial connection time) and number of transactions per client); these will be equal unless the run failed before completion or some SQL command(s) failed. (In mode, only the actual number of transactions is printed.) - The next line reports the number of failed transactions due to - serialization or deadlock errors (see - for more information). + The next line reports the number of failed transactions (see + for more information). The last line reports the number of transactions per second. @@ -790,6 +789,9 @@ pgbench options d deadlock failures; + + other failures; + See for more information. @@ -914,6 +916,26 @@ pgbench options d + + + + + Allows clients to continue their run even if an SQL statement fails due to + errors other than serialization or deadlock. Unlike serialization and deadlock + failures, clients do not retry the same transactions but start new transaction. + This option is useful when your custom script may raise errors due to some + reason like unique constraints violation. Without this option, the client is + aborted after such errors. 
+ + + Note that serialization and deadlock failures never cause the client to be + aborted even after clients retries times by + default, so they are not affected by this option. + See for more information. + + + + @@ -2409,8 +2431,8 @@ END; will be reported as failed. If you use the option, the time of the failed transaction will be reported as - serialization or - deadlock depending on the type of failure (see + serialization, deadlock, or + other depending on the type of failure (see for more information). @@ -2638,6 +2660,16 @@ END; + + + other_sql_failures + + + number of transactions that got a SQL error + (zero unless is specified) + + + @@ -2646,8 +2678,8 @@ END; pgbench --aggregate-interval=10 --time=20 --client=10 --log --rate=1000 --latency-limit=10 --failures-detailed --max-tries=10 test -1650260552 5178 26171317 177284491527 1136 44462 2647617 7321113867 0 9866 64 7564 28340 4148 0 -1650260562 4808 25573984 220121792172 1171 62083 3037380 9666800914 0 9998 598 7392 26621 4527 0 +1650260552 5178 26171317 177284491527 1136 44462 2647617 7321113867 0 9866 64 7564 28340 4148 0 0 +1650260562 4808 25573984 220121792172 1171 62083 3037380 9666800914 0 9998 598 7392 26621 4527 0 0 @@ -2839,9 +2871,11 @@ statement latencies in milliseconds, failures and retries: is specified. Otherwise in the worst case they only lead to the abortion of the failed client while other clients continue their run (but some client errors are handled without - an abortion of the client and reported separately, see below). Later in - this section it is assumed that the discussed errors are only the - direct client errors and they are not internal + an abortion of the client and reported separately, see below). When + is specified, the client + continues to process new transactions even if it encounters an error. + Later in this section it is assumed that the discussed errors are only + the direct client errors and they are not internal pgbench errors. 
@@ -2851,14 +2885,17 @@ statement latencies in milliseconds, failures and retries: A client's run is aborted in case of a serious error; for example, the connection with the database server was lost or the end of script was reached - without completing the last transaction. In addition, if execution of an SQL + without completing the last transaction. By default, if execution of an SQL or meta command fails for reasons other than serialization or deadlock errors, - the client is aborted. Otherwise, if an SQL command fails with serialization or - deadlock errors, the client is not aborted. In such cases, the current - transaction is rolled back, which also includes setting the client variables - as they were before the run of this transaction (it is assumed that one - transaction script contains only one transaction; see - for more information). + the client is aborted. However, if the --continue-on-error option is specified, + the client does not abort and proceeds to the next transaction regardless of + the error. These cases are reported as "other failures" in the output. + On contrast, if an SQL command fails with serialization or deadlock errors, the + client is not aborted even without . + Instead, the current transaction is rolled back, which also includes setting + the client variables as they were before the run of this transaction + (it is assumed that one transaction script contains only one transaction; + see for more information). Transactions with serialization or deadlock errors are repeated after rollbacks until they complete successfully or reach the maximum number of tries (specified by the option) / the maximum diff --git a/src/bin/pgbench/pgbench.c b/src/bin/pgbench/pgbench.c index 497a936c141..4b3ddb3146f 100644 --- a/src/bin/pgbench/pgbench.c +++ b/src/bin/pgbench/pgbench.c @@ -402,15 +402,23 @@ typedef struct StatsData * directly successful transactions (they were successfully completed on * the first try). 
* - * A failed transaction is defined as unsuccessfully retried transactions. - * It can be one of two types: + * A failed transaction is counted differently depending on whether + * the --continue-on-error option is specified. * + * Without --continue-on-error: * failed (the number of failed transactions) = * 'serialization_failures' (they got a serialization error and were not * successfully retried) + * 'deadlock_failures' (they got a deadlock error and were not * successfully retried). * + * When --continue-on-error is specified: + * + * failed (number of failed transactions) = + * 'serialization_failures' + 'deadlock_failures' + + * 'other_sql_failures' (they got some other SQL error; the transaction was + * not retried and counted as failed due to --continue-on-error). + * * If the transaction was retried after a serialization or a deadlock * error this does not guarantee that this retry was successful. Thus * @@ -440,6 +448,11 @@ typedef struct StatsData int64 deadlock_failures; /* number of transactions that were not * successfully retried after a deadlock * error */ + int64 other_sql_failures; /* number of failed transactions for + * reasons other than + * serialization/deadlock failure, which + * is counted if --continue-on-error is + * specified */ SimpleStats latency; SimpleStats lag; } StatsData; @@ -770,6 +783,7 @@ static int64 total_weight = 0; static bool verbose_errors = false; /* print verbose messages of all errors */ static bool exit_on_abort = false; /* exit when any client is aborted */ +static bool continue_on_error = false; /* continue after errors */ /* Builtin test scripts */ typedef struct BuiltinScript @@ -954,6 +968,7 @@ usage(void) " --log-prefix=PREFIX prefix for transaction time log file\n" " (default: \"pgbench_log\")\n" " --max-tries=NUM max number of tries to run transaction (default: 1)\n" + " --continue-on-error continue running after an SQL error\n" " --progress-timestamp use Unix epoch timestamps for progress\n" " 
--random-seed=SEED set random seed (\"time\", \"rand\", integer)\n" " --sampling-rate=NUM fraction of transactions to log (e.g., 0.01 for 1%%)\n" @@ -1467,6 +1482,7 @@ initStats(StatsData *sd, pg_time_usec_t start) sd->retried = 0; sd->serialization_failures = 0; sd->deadlock_failures = 0; + sd->other_sql_failures = 0; initSimpleStats(&sd->latency); initSimpleStats(&sd->lag); } @@ -1516,6 +1532,9 @@ accumStats(StatsData *stats, bool skipped, double lat, double lag, case ESTATUS_DEADLOCK_ERROR: stats->deadlock_failures++; break; + case ESTATUS_OTHER_SQL_ERROR: + stats->other_sql_failures++; + break; default: /* internal error which should never occur */ pg_fatal("unexpected error status: %d", estatus); @@ -4007,7 +4026,8 @@ advanceConnectionState(TState *thread, CState *st, StatsData *agg) if (PQpipelineStatus(st->con) != PQ_PIPELINE_ON) st->state = CSTATE_END_COMMAND; } - else if (canRetryError(st->estatus)) + else if ((st->estatus == ESTATUS_OTHER_SQL_ERROR && continue_on_error) || + canRetryError(st->estatus)) st->state = CSTATE_ERROR; else st->state = CSTATE_ABORTED; @@ -4528,7 +4548,8 @@ static int64 getFailures(const StatsData *stats) { return (stats->serialization_failures + - stats->deadlock_failures); + stats->deadlock_failures + + stats->other_sql_failures); } /* @@ -4548,6 +4569,8 @@ getResultString(bool skipped, EStatus estatus) return "serialization"; case ESTATUS_DEADLOCK_ERROR: return "deadlock"; + case ESTATUS_OTHER_SQL_ERROR: + return "other"; default: /* internal error which should never occur */ pg_fatal("unexpected error status: %d", estatus); @@ -4603,6 +4626,7 @@ doLog(TState *thread, CState *st, int64 skipped = 0; int64 serialization_failures = 0; int64 deadlock_failures = 0; + int64 other_sql_failures = 0; int64 retried = 0; int64 retries = 0; @@ -4643,10 +4667,12 @@ doLog(TState *thread, CState *st, { serialization_failures = agg->serialization_failures; deadlock_failures = agg->deadlock_failures; + other_sql_failures = 
agg->other_sql_failures; } - fprintf(logfile, " " INT64_FORMAT " " INT64_FORMAT, + fprintf(logfile, " " INT64_FORMAT " " INT64_FORMAT " " INT64_FORMAT, serialization_failures, - deadlock_failures); + deadlock_failures, + other_sql_failures); fputc('\n', logfile); @@ -6285,6 +6311,7 @@ printProgressReport(TState *threads, int64 test_start, pg_time_usec_t now, cur.serialization_failures += threads[i].stats.serialization_failures; cur.deadlock_failures += threads[i].stats.deadlock_failures; + cur.other_sql_failures += threads[i].stats.other_sql_failures; } /* we count only actually executed transactions */ @@ -6427,7 +6454,8 @@ printResults(StatsData *total, /* * Remaining stats are nonsensical if we failed to execute any xacts due - * to others than serialization or deadlock errors + * to other than serialization or deadlock errors and --continue-on-error + * is not set. */ if (total_cnt <= 0) return; @@ -6443,6 +6471,9 @@ printResults(StatsData *total, printf("number of deadlock failures: " INT64_FORMAT " (%.3f%%)\n", total->deadlock_failures, 100.0 * total->deadlock_failures / total_cnt); + printf("number of other failures: " INT64_FORMAT " (%.3f%%)\n", + total->other_sql_failures, + 100.0 * total->other_sql_failures / total_cnt); } /* it can be non-zero only if max_tries is not equal to one */ @@ -6546,6 +6577,10 @@ printResults(StatsData *total, sstats->deadlock_failures, (100.0 * sstats->deadlock_failures / script_total_cnt)); + printf(" - number of other failures: " INT64_FORMAT " (%.3f%%)\n", + sstats->other_sql_failures, + (100.0 * sstats->other_sql_failures / + script_total_cnt)); } /* @@ -6705,6 +6740,7 @@ main(int argc, char **argv) {"verbose-errors", no_argument, NULL, 15}, {"exit-on-abort", no_argument, NULL, 16}, {"debug", no_argument, NULL, 17}, + {"continue-on-error", no_argument, NULL, 18}, {NULL, 0, NULL, 0} }; @@ -7058,6 +7094,10 @@ main(int argc, char **argv) case 17: /* debug */ pg_logging_increase_verbosity(); break; + case 18: /* 
continue-on-error */
+ benchmarking_option_set = true;
+ continue_on_error = true;
+ break;
default: /* getopt_long already emitted a complaint */
pg_log_error_hint("Try \"%s --help\" for more information.",
progname);
@@ -7413,6 +7453,7 @@ main(int argc, char **argv)
stats.retried += thread->stats.retried;
stats.serialization_failures += thread->stats.serialization_failures;
stats.deadlock_failures += thread->stats.deadlock_failures;
+ stats.other_sql_failures += thread->stats.other_sql_failures;
latency_late += thread->latency_late;
conn_total_duration += thread->conn_duration;
diff --git a/src/bin/pgbench/t/001_pgbench_with_server.pl b/src/bin/pgbench/t/001_pgbench_with_server.pl
index 7dd78940300..8bb35dda5f7 100644
--- a/src/bin/pgbench/t/001_pgbench_with_server.pl
+++ b/src/bin/pgbench/t/001_pgbench_with_server.pl
@@ -1813,6 +1813,28 @@ update counter set i = i+1 returning i \gset
# Clean up
$node->safe_psql('postgres', 'DROP TABLE counter;');
+# Test --continue-on-error
+$node->safe_psql('postgres',
+	'CREATE TABLE unique_table(i int unique);' . 'INSERT INTO unique_table VALUES (0);');
+
+$node->pgbench(
+	'-t 10 --continue-on-error --failures-detailed',
+	0,
+	[
+		qr{processed: 0/10\b},
+		qr{other failures: 10\b}
+	],
+	[],
+	'test --continue-on-error',
+	{
+		'002_continue_on_error' => q{
+		insert into unique_table values (0);
+	}
+	});
+
+# Clean up
+$node->safe_psql('postgres', 'DROP TABLE unique_table;');
+
# done
$node->safe_psql('postgres', 'DROP TABLESPACE regress_pgbench_tap_1_ts');
$node->stop;
-- 
2.39.5 (Apple Git-154)