#!/usr/bin/env bash set -euo pipefail prefix="${PGPREFIX:-$PWD/tmp_install/usr/local/pgsql}" bindir="$prefix/bin" sharedir="$prefix/share" libdir="$prefix/lib" rows="${ROWS:-80000}" followup_calls="${FOLLOWUP_CALLS:-5}" followup_sleep="${FOLLOWUP_SLEEP:-0.2}" post_run_sleep="${POST_RUN_SLEEP:-8}" if [[ ! -x "$bindir/initdb" || ! -x "$bindir/pg_ctl" || ! -x "$bindir/psql" ]]; then cat >&2 <&2 <"$base/pg_ctl_stop.out" 2>&1 || true fi echo "repro artifacts left in: $base" } trap cleanup EXIT initdb --no-data-checksums --auth trust --no-sync --no-instructions -D "$datadir" >"$base/initdb.out" cat >>"$datadir/postgresql.conf" <"$base/pg_ctl_start.out" started=1 psql_args=(-h "$base" -p "$port" -v ON_ERROR_STOP=1 -X postgres) count_log() { grep -F -c "$1" "$logfile" 2>/dev/null || true } wait_for_log_count() { local needle="$1" local expected="$2" local timeout_seconds="$3" local end=$((SECONDS + timeout_seconds)) local count while (( SECONDS < end )); do count=$(count_log "$needle") if (( count >= expected )); then return 0 fi sleep 0.1 done count=$(count_log "$needle") echo "timed out waiting for at least $expected occurrence(s) of: $needle" >&2 echo "saw $count occurrence(s)" >&2 return 1 } psql "${psql_args[@]}" -c "CREATE EXTENSION test_checksums;" >"$base/setup-extension.out" psql "${psql_args[@]}" -c "CREATE TABLE t AS SELECT g, repeat('x', 2000) AS payload FROM generate_series(1, $rows) g;" >"$base/setup-table.out" psql "${psql_args[@]}" -c "CHECKPOINT;" >"$base/checkpoint.out" # Delay the two initial pg_enable_data_checksums() callers at the launcher # registration entry point, so both callers enter the launch path together. psql "${psql_args[@]}" -c "SELECT dcw_inject_startup_delay(true);" >"$base/attach-startup-delay.out" # Delay launcher main before it claims DataChecksumState->launcher_running. This # makes the same race window deterministic instead of relying on scheduler luck. psql "${psql_args[@]}" -c "SELECT dcw_inject_launcher_delay(true);" >"$base/attach-launcher-delay.out" # Delay the final transition to on, keeping the winning launcher active long # enough for follow-up pg_enable_data_checksums() calls to probe the state. psql "${psql_args[@]}" -c "SELECT dcw_inject_delay_barrier(true);" >"$base/attach-final-delay.out" psql "${psql_args[@]}" -c "SELECT pg_enable_data_checksums(20, 1);" >"$base/enable1.out" 2>&1 & pid1=$! psql "${psql_args[@]}" -c "SELECT pg_enable_data_checksums(20, 1);" >"$base/enable2.out" 2>&1 & pid2=$! wait "$pid1" || true wait "$pid2" || true # Later pg_enable_data_checksums() calls should not be delayed at the SQL entry # point; they need to probe the shared launcher_running state promptly. psql "${psql_args[@]}" -c "SELECT dcw_inject_startup_delay(false);" >"$base/detach-startup-delay.out" wait_for_log_count 'background worker "datachecksums launcher" started' 2 15 wait_for_log_count 'background worker "datachecksums launcher" already running, exiting' 1 10 wait_for_log_count 'initiating data checksum processing in database "postgres"' 1 10 # Stop delaying later launchers. Follow-up calls are fired immediately after # the redundant launcher has exited, while the owning launcher is still # processing or waiting at the final checksum-on barrier. psql "${psql_args[@]}" -c "SELECT dcw_inject_launcher_delay(false);" >"$base/detach-launcher-delay.out" # A single follow-up call is often enough after the redundant launcher exits. # Use a small handful by default to avoid depending on exact worker timing; set # FOLLOWUP_CALLS=1 for the smallest variant. pids=() for _ in $(seq 1 "$followup_calls"); do psql "${psql_args[@]}" -c "SELECT pg_enable_data_checksums(20, 1);" >"$base/enable-followup-$_.out" 2>&1 & pids+=("$!") sleep "$followup_sleep" done for pid in "${pids[@]}"; do wait "$pid" || true done sleep "$post_run_sleep" echo echo "--- relevant server log lines ---" grep -E 'datachecksums launcher|data checksum processing already running|data checksums already in desired state|enabling data checksums requested|initiating data checksum processing|cannot set data checksums|data checksums are now|data checksum processing was aborted|disabling data checksums requested' "$logfile" || true echo echo "--- summary ---" launcher_workers_started=$(count_log 'background worker "datachecksums launcher" started') checksum_processing_starts=$(count_log 'enabling data checksums requested') already_running_responses=$(count_log 'data checksum processing already running') checksum_state_warnings=$(count_log 'cannot set data checksums') echo "launcher workers started: $launcher_workers_started" echo "checksum processing starts: $checksum_processing_starts" echo "already-running responses: $already_running_responses" echo "checksum state warnings: $checksum_state_warnings" echo "final data_checksums setting: $(psql "${psql_args[@]}" -At -c 'SHOW data_checksums;' || true)" echo if (( checksum_processing_starts > 1 )); then echo "BUG REPRODUCED: more than one launcher started checksum processing." else echo "BUG NOT REPRODUCED: only one launcher started checksum processing." exit 1 fi echo cat <<'EOF' Expected result on the unfixed patchset: After the redundant launcher exits, a follow-up call can start another datachecksums launcher while the original launcher is still active. The log shows more than one "enabling data checksums requested" line. Depending on timing, it may also show "cannot set data checksums to \"on\"...". Expected result with the launcher_exit() ownership fix: Two launcher workers may start, but the redundant one only logs "already running, exiting". While the owning launcher is still active, follow-up calls log "data checksum processing already running"; after it completes, they may log "data checksums already in desired state, exiting". No additional launcher starts checksum processing. EOF