From d0d3ec97abe98d77501dbf38bbb248493535be52 Mon Sep 17 00:00:00 2001
From: Rintaro Ikeda <ikedarintarof@oss.nttdata.com>
Date: Wed, 9 Jul 2025 23:36:37 +0900
Subject: [PATCH v7 1/3] Add --continue-on-error option

When this option is set, the client rolls back the failed transaction and
starts a new one when a transaction fails for a reason other than a
serialization or deadlock failure.
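
An illustrative run (a sketch only; the table and script names below are not
part of this patch):

    $ psql -c 'CREATE TABLE unique_table(i int unique); INSERT INTO unique_table VALUES (0);'
    $ cat insert_dup.sql
    INSERT INTO unique_table VALUES (0);
    $ pgbench -n -t 10 --continue-on-error --failures-detailed -f insert_dup.sql

Every transaction hits a unique-constraint violation, so the run completes with
"number of transactions actually processed: 0/10" and "number of other
failures: 10" in the report; without --continue-on-error the client would abort
on the first such error.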
---
 doc/src/sgml/ref/pgbench.sgml                | 71 +++++++++++++++-----
 src/bin/pgbench/pgbench.c                    | 57 +++++++++++++---
 src/bin/pgbench/t/001_pgbench_with_server.pl | 22 ++++++
 3 files changed, 125 insertions(+), 25 deletions(-)

diff --git a/doc/src/sgml/ref/pgbench.sgml b/doc/src/sgml/ref/pgbench.sgml
index ab252d9fc74..15fcb45e223 100644
--- a/doc/src/sgml/ref/pgbench.sgml
+++ b/doc/src/sgml/ref/pgbench.sgml
@@ -76,9 +76,8 @@ tps = 896.967014 (without initial connection time)
   and number of transactions per client); these will be equal unless the run
   failed before completion or some SQL command(s) failed.  (In
   <option>-T</option> mode, only the actual number of transactions is printed.)
-  The next line reports the number of failed transactions due to
-  serialization or deadlock errors (see <xref linkend="failures-and-retries"/>
-  for more information).
+  The next line reports the number of failed transactions (see
+  <xref linkend="failures-and-retries"/> for more information).
   The last line reports the number of transactions per second.
  </para>
 
@@ -790,6 +789,9 @@ pgbench <optional> <replaceable>options</replaceable> </optional> <replaceable>d
          <listitem>
           <para>deadlock failures;</para>
          </listitem>
+         <listitem>
+          <para>other failures;</para>
+         </listitem>
         </itemizedlist>
         See <xref linkend="failures-and-retries"/> for more information.
        </para>
@@ -914,6 +916,26 @@ pgbench <optional> <replaceable>options</replaceable> </optional> <replaceable>d
       </listitem>
      </varlistentry>
 
+     <varlistentry id="pgbench-option-continue-on-error">
+      <term><option>--continue-on-error</option></term>
+      <listitem>
+       <para>
+        Allows clients to continue their run even if an SQL statement fails due to
+        errors other than serialization or deadlock. Unlike serialization and
+        deadlock failures, clients do not retry the same transaction but start a
+        new one. This option is useful when your custom script may raise errors
+        for reasons such as a unique constraint violation. Without this option,
+        the client is aborted after such errors.
+       </para>
+       <para>
+        Note that serialization and deadlock failures never cause the client to
+        be aborted, even after a transaction has been retried
+        <option>--max-tries</option> times, so they are not affected by this option.
+        See <xref linkend="failures-and-retries"/> for more information.
+       </para>
+      </listitem>
+     </varlistentry>
+
     </variablelist>
    </para>
 
@@ -2409,8 +2431,8 @@ END;
    will be reported as <literal>failed</literal>. If you use the
    <option>--failures-detailed</option> option, the
    <replaceable>time</replaceable> of the failed transaction will be reported as
-   <literal>serialization</literal> or
-   <literal>deadlock</literal> depending on the type of failure (see
+   <literal>serialization</literal>, <literal>deadlock</literal>, or
+   <literal>other</literal> depending on the type of failure (see
    <xref linkend="failures-and-retries"/> for more information).
   </para>
 
@@ -2638,6 +2660,16 @@ END;
       </para>
      </listitem>
     </varlistentry>
+
+    <varlistentry>
+     <term><replaceable>other_sql_failures</replaceable></term>
+     <listitem>
+      <para>
+       number of transactions that got an SQL error other than a serialization
+       or deadlock error (zero unless <option>--failures-detailed</option> is specified)
+      </para>
+     </listitem>
+    </varlistentry>
    </variablelist>
   </para>
 
@@ -2646,8 +2678,8 @@ END;
 <screen>
 <userinput>pgbench --aggregate-interval=10 --time=20 --client=10 --log --rate=1000 --latency-limit=10 --failures-detailed --max-tries=10 test</userinput>
 
-1650260552 5178 26171317 177284491527 1136 44462 2647617 7321113867 0 9866 64 7564 28340 4148 0
-1650260562 4808 25573984 220121792172 1171 62083 3037380 9666800914 0 9998 598 7392 26621 4527 0
+1650260552 5178 26171317 177284491527 1136 44462 2647617 7321113867 0 9866 64 7564 28340 4148 0 0
+1650260562 4808 25573984 220121792172 1171 62083 3037380 9666800914 0 9998 598 7392 26621 4527 0 0
 </screen>
   </para>
 
@@ -2839,9 +2871,11 @@ statement latencies in milliseconds, failures and retries:
          <option>--exit-on-abort</option> is specified. Otherwise in the worst
          case they only lead to the abortion of the failed client while other
          clients continue their run (but some client errors are handled without
-         an abortion of the client and reported separately, see below). Later in
-         this section it is assumed that the discussed errors are only the
-         direct client errors and they are not internal
+         an abortion of the client and reported separately, see below). When
+         <option>--continue-on-error</option> is specified, the client
+         continues its run even after an error that would otherwise abort it.
+         Later in this section it is assumed that the discussed errors are only
+         the direct client errors and they are not internal
          <application>pgbench</application> errors.
        </para>
      </listitem>
@@ -2851,14 +2885,17 @@ statement latencies in milliseconds, failures and retries:
   <para>
    A client's run is aborted in case of a serious error; for example, the
    connection with the database server was lost or the end of script was reached
-   without completing the last transaction. In addition, if execution of an SQL
+   without completing the last transaction. By default, if execution of an SQL
    or meta command fails for reasons other than serialization or deadlock errors,
-   the client is aborted. Otherwise, if an SQL command fails with serialization or
-   deadlock errors, the client is not aborted. In such cases, the current
-   transaction is rolled back, which also includes setting the client variables
-   as they were before the run of this transaction (it is assumed that one
-   transaction script contains only one transaction; see
-   <xref linkend="transactions-and-scripts"/> for more information).
+   the client is aborted. However, if <option>--continue-on-error</option> is
+   specified, the client does not abort and proceeds to the next transaction
+   regardless of the error. These cases are reported as "other failures" in the
+   output. In contrast, if an SQL command fails with serialization or deadlock
+   errors, the client is not aborted even without <option>--continue-on-error</option>.
+   Instead, the current transaction is rolled back, which also includes setting
+   the client variables as they were before the run of this transaction
+   (it is assumed that one transaction script contains only one transaction;
+   see <xref linkend="transactions-and-scripts"/> for more information).
    Transactions with serialization or deadlock errors are repeated after
    rollbacks until they complete successfully or reach the maximum
    number of tries (specified by the <option>--max-tries</option> option) / the maximum
diff --git a/src/bin/pgbench/pgbench.c b/src/bin/pgbench/pgbench.c
index 497a936c141..edd8b01f794 100644
--- a/src/bin/pgbench/pgbench.c
+++ b/src/bin/pgbench/pgbench.c
@@ -402,15 +402,23 @@ typedef struct StatsData
 	 *   directly successful transactions (they were successfully completed on
 	 *                                     the first try).
 	 *
-	 * A failed transaction is defined as unsuccessfully retried transactions.
-	 * It can be one of two types:
+	 * What is counted as a failed transaction depends on whether the
+	 * --continue-on-error option is specified.
 	 *
+	 * Without --continue-on-error:
 	 * failed (the number of failed transactions) =
 	 *   'serialization_failures' (they got a serialization error and were not
 	 *                             successfully retried) +
 	 *   'deadlock_failures' (they got a deadlock error and were not
 	 *                        successfully retried).
 	 *
+	 * With --continue-on-error:
+	 * failed (the number of failed transactions) =
+	 *   'serialization_failures' + 'deadlock_failures' +
+	 *   'other_sql_failures' (they got some other SQL error; the
+	 *                         transaction was not retried and is counted
+	 *                         as failed instead of aborting the client).
+	 *
 	 * If the transaction was retried after a serialization or a deadlock
 	 * error this does not guarantee that this retry was successful. Thus
 	 *
@@ -440,6 +448,11 @@ typedef struct StatsData
 	int64		deadlock_failures;	/* number of transactions that were not
 									 * successfully retried after a deadlock
 									 * error */
+	int64		other_sql_failures; /* number of transactions that failed
+									 * due to an SQL error other than a
+									 * serialization/deadlock error; only
+									 * counted when --continue-on-error is
+									 * specified */
 	SimpleStats latency;
 	SimpleStats lag;
 } StatsData;
@@ -770,6 +783,7 @@ static int64 total_weight = 0;
 static bool verbose_errors = false; /* print verbose messages of all errors */
 
 static bool exit_on_abort = false;	/* exit when any client is aborted */
+static bool continue_on_error = false;	/* continue after errors */
 
 /* Builtin test scripts */
 typedef struct BuiltinScript
@@ -954,6 +968,7 @@ usage(void)
 		   "  --log-prefix=PREFIX      prefix for transaction time log file\n"
 		   "                           (default: \"pgbench_log\")\n"
 		   "  --max-tries=NUM          max number of tries to run transaction (default: 1)\n"
+		   "  --continue-on-error      continue running after an SQL error\n"
 		   "  --progress-timestamp     use Unix epoch timestamps for progress\n"
 		   "  --random-seed=SEED       set random seed (\"time\", \"rand\", integer)\n"
 		   "  --sampling-rate=NUM      fraction of transactions to log (e.g., 0.01 for 1%%)\n"
@@ -1467,6 +1482,7 @@ initStats(StatsData *sd, pg_time_usec_t start)
 	sd->retried = 0;
 	sd->serialization_failures = 0;
 	sd->deadlock_failures = 0;
+	sd->other_sql_failures = 0;
 	initSimpleStats(&sd->latency);
 	initSimpleStats(&sd->lag);
 }
@@ -1516,6 +1532,9 @@ accumStats(StatsData *stats, bool skipped, double lat, double lag,
 		case ESTATUS_DEADLOCK_ERROR:
 			stats->deadlock_failures++;
 			break;
+		case ESTATUS_OTHER_SQL_ERROR:
+			stats->other_sql_failures++;
+			break;
 		default:
 			/* internal error which should never occur */
 			pg_fatal("unexpected error status: %d", estatus);
@@ -3356,7 +3375,7 @@ readCommandResponse(CState *st, MetaCommand meta, char *varprefix)
 			case PGRES_FATAL_ERROR:
 				st->estatus = getSQLErrorStatus(PQresultErrorField(res,
 																   PG_DIAG_SQLSTATE));
-				if (canRetryError(st->estatus))
+				if (continue_on_error || canRetryError(st->estatus))
 				{
 					if (verbose_errors)
 						commandError(st, PQerrorMessage(st->con));
@@ -4007,7 +4026,8 @@ advanceConnectionState(TState *thread, CState *st, StatsData *agg)
 					if (PQpipelineStatus(st->con) != PQ_PIPELINE_ON)
 						st->state = CSTATE_END_COMMAND;
 				}
-				else if (canRetryError(st->estatus))
+				else if ((st->estatus == ESTATUS_OTHER_SQL_ERROR && continue_on_error) ||
+						 canRetryError(st->estatus))
 					st->state = CSTATE_ERROR;
 				else
 					st->state = CSTATE_ABORTED;
@@ -4528,7 +4548,8 @@ static int64
 getFailures(const StatsData *stats)
 {
 	return (stats->serialization_failures +
-			stats->deadlock_failures);
+			stats->deadlock_failures +
+			stats->other_sql_failures);
 }
 
 /*
@@ -4548,6 +4569,8 @@ getResultString(bool skipped, EStatus estatus)
 				return "serialization";
 			case ESTATUS_DEADLOCK_ERROR:
 				return "deadlock";
+			case ESTATUS_OTHER_SQL_ERROR:
+				return "other";
 			default:
 				/* internal error which should never occur */
 				pg_fatal("unexpected error status: %d", estatus);
@@ -4603,6 +4626,7 @@ doLog(TState *thread, CState *st,
 			int64		skipped = 0;
 			int64		serialization_failures = 0;
 			int64		deadlock_failures = 0;
+			int64		other_sql_failures = 0;
 			int64		retried = 0;
 			int64		retries = 0;
 
@@ -4643,10 +4667,12 @@ doLog(TState *thread, CState *st,
 			{
 				serialization_failures = agg->serialization_failures;
 				deadlock_failures = agg->deadlock_failures;
+				other_sql_failures = agg->other_sql_failures;
 			}
-			fprintf(logfile, " " INT64_FORMAT " " INT64_FORMAT,
+			fprintf(logfile, " " INT64_FORMAT " " INT64_FORMAT " " INT64_FORMAT,
 					serialization_failures,
-					deadlock_failures);
+					deadlock_failures,
+					other_sql_failures);
 
 			fputc('\n', logfile);
 
@@ -6285,6 +6311,7 @@ printProgressReport(TState *threads, int64 test_start, pg_time_usec_t now,
 		cur.serialization_failures +=
 			threads[i].stats.serialization_failures;
 		cur.deadlock_failures += threads[i].stats.deadlock_failures;
+		cur.other_sql_failures += threads[i].stats.other_sql_failures;
 	}
 
 	/* we count only actually executed transactions */
@@ -6427,7 +6454,8 @@ printResults(StatsData *total,
 
 	/*
 	 * Remaining stats are nonsensical if we failed to execute any xacts due
-	 * to others than serialization or deadlock errors
+	 * to errors other than serialization or deadlock while
+	 * --continue-on-error is not set.
 	 */
 	if (total_cnt <= 0)
 		return;
@@ -6443,6 +6471,9 @@ printResults(StatsData *total,
 		printf("number of deadlock failures: " INT64_FORMAT " (%.3f%%)\n",
 			   total->deadlock_failures,
 			   100.0 * total->deadlock_failures / total_cnt);
+		printf("number of other failures: " INT64_FORMAT " (%.3f%%)\n",
+			   total->other_sql_failures,
+			   100.0 * total->other_sql_failures / total_cnt);
 	}
 
 	/* it can be non-zero only if max_tries is not equal to one */
@@ -6546,6 +6577,10 @@ printResults(StatsData *total,
 							   sstats->deadlock_failures,
 							   (100.0 * sstats->deadlock_failures /
 								script_total_cnt));
+						printf(" - number of other failures: " INT64_FORMAT " (%.3f%%)\n",
+							   sstats->other_sql_failures,
+							   (100.0 * sstats->other_sql_failures /
+								script_total_cnt));
 					}
 
 					/*
@@ -6705,6 +6740,7 @@ main(int argc, char **argv)
 		{"verbose-errors", no_argument, NULL, 15},
 		{"exit-on-abort", no_argument, NULL, 16},
 		{"debug", no_argument, NULL, 17},
+		{"continue-on-error", no_argument, NULL, 18},
 		{NULL, 0, NULL, 0}
 	};
 
@@ -7058,6 +7094,10 @@ main(int argc, char **argv)
 			case 17:			/* debug */
 				pg_logging_increase_verbosity();
 				break;
+			case 18:			/* continue-on-error */
+				benchmarking_option_set = true;
+				continue_on_error = true;
+				break;
 			default:
 				/* getopt_long already emitted a complaint */
 				pg_log_error_hint("Try \"%s --help\" for more information.", progname);
@@ -7413,6 +7453,7 @@ main(int argc, char **argv)
 		stats.retried += thread->stats.retried;
 		stats.serialization_failures += thread->stats.serialization_failures;
 		stats.deadlock_failures += thread->stats.deadlock_failures;
+		stats.other_sql_failures += thread->stats.other_sql_failures;
 		latency_late += thread->latency_late;
 		conn_total_duration += thread->conn_duration;
 
diff --git a/src/bin/pgbench/t/001_pgbench_with_server.pl b/src/bin/pgbench/t/001_pgbench_with_server.pl
index 7dd78940300..8bb35dda5f7 100644
--- a/src/bin/pgbench/t/001_pgbench_with_server.pl
+++ b/src/bin/pgbench/t/001_pgbench_with_server.pl
@@ -1813,6 +1813,28 @@ update counter set i = i+1 returning i \gset
 # Clean up
 $node->safe_psql('postgres', 'DROP TABLE counter;');
 
+# Test --continue-on-error
+$node->safe_psql('postgres',
+	'CREATE TABLE unique_table(i int unique);' . 'INSERT INTO unique_table VALUES (0);');
+
+$node->pgbench(
+	'-t 10 --continue-on-error --failures-detailed',
+	0,
+	[
+		qr{processed: 0/10\b},
+		qr{other failures: 10\b}
+	],
+	[],
+	'test --continue-on-error',
+	{
+		'002_continue_on_error' => q{
+		insert into unique_table values (0);
+		}
+	});
+
+# Clean up
+$node->safe_psql('postgres', 'DROP TABLE unique_table;');
+
 # done
 $node->safe_psql('postgres', 'DROP TABLESPACE regress_pgbench_tap_1_ts');
 $node->stop;
-- 
2.43.0

