From 202490e15daca3b9ed67a3672ced09efce4f5cfa Mon Sep 17 00:00:00 2001 From: Tatsuo Ishii Date: Sun, 26 Apr 2026 14:45:11 +0900 Subject: [PATCH v2] Fix pcp main process to remember child pids upon restarting. Upon failove/failback, pgpool main process sends a signal to pcp main process to request restarting, and pcp main restarts. Problem is, when pcp main restarts, it forgets the list of pids of its child pcp worker process. As a result, when pgpool main sends SIGUSR2 to pcp main to wake up pcp worker process, it cannot find the pid to send the signal to, which causes the infinite loop in pcp worker process. Below is a reproducer script. pcp_promote_node hangs. ------------------------------------------------ pgpool_setup -n 3 --no-stop pg_ctl -D data2 stop while true do psql -p 11000 -c "show pool_nodes" test if [ $? = 0 ];then break; fi sleep 1 done psql -p 11000 -c "show pool_nodes" test pcp_recovery_node -p 11001 -n 2;pcp_promote_node -p 11001 -n 2 -s -g ------------------------------------------------ To fix the problem, we could delay the restarting of pcp main until it delivers the signal. Unfortunately this does not work, since pgpool main waits for pcp main process to exit. Thus processing failover does not proceed in pgpool main. So I decided to add a new shared memory area (Req_info->pcp_worker_pids) to remember the pcp workers pids across pcp main restarting. The array size is MAX_REQUEST_QUEUE_SIZE + some room (currently 10). When child process is forked from pcp main, its pid is added to the shared memory array. When child process exits, its pid in the array is cleared to 0, representing an empty slot. Author: Tatsuo Ishii Reported-by: Taiki Koshino Discussion: https://www.postgresql.org/message-id/20260425.223051.1207744844622514060.ishii%40postgresql.org --- src/include/pool.h | 13 +++++++++++++ src/main/pgpool_main.c | 2 ++ src/pcp_con/pcp_child.c | 38 +++++++++++++++++++++++++++++++++++++- 3 files changed, 52 insertions(+), 1 deletion(-) diff --git a/src/include/pool.h b/src/include/pool.h index 65907dcf1..fea5744f3 100644 --- a/src/include/pool.h +++ b/src/include/pool.h @@ -487,6 +487,13 @@ typedef struct int count; /* request node ids count */ } POOL_REQUEST_NODE; +/* + * Maximum number of pcp worker child process. * Since pcp worker process is + * forked whenever failover/failback request is made, it should be equal to + * MAX_REQUEST_QUEUE_SIZE + some room. 10 is an arbitrary number. +*/ +#define MAX_PCP_WORKER_PIDS MAX_REQUEST_QUEUE_SIZE + 10 + typedef struct { POOL_REQUEST_NODE request[MAX_REQUEST_QUEUE_SIZE]; @@ -524,6 +531,12 @@ typedef struct bool query_cache_invalidate_request; /* true if * pcp_invalidate_query_cache * requested */ + + /* + * pcp worker child pids. This is inherited to new pcp main process to + * track pcp worker child when new pcp worker child starts. + */ + pid_t pcp_worker_pids[MAX_PCP_WORKER_PIDS]; } POOL_REQUEST_INFO; /* description of row. corresponding to RowDescription message */ diff --git a/src/main/pgpool_main.c b/src/main/pgpool_main.c index 32bcb0a1f..4112074e2 100644 --- a/src/main/pgpool_main.c +++ b/src/main/pgpool_main.c @@ -3200,6 +3200,8 @@ initialize_shared_mem_objects(bool clear_memcache_oidmaps) wd_ipc_initialize_data(); } + /* initialize pcp worker child pids */ + memset(Req_info->pcp_worker_pids, 0, sizeof(Req_info->pcp_worker_pids)); } /* diff --git a/src/pcp_con/pcp_child.c b/src/pcp_con/pcp_child.c index e07c8897e..23d233146 100644 --- a/src/pcp_con/pcp_child.c +++ b/src/pcp_con/pcp_child.c @@ -5,7 +5,7 @@ * pgpool: a language independent connection pool server for PostgreSQL * written by Tatsuo Ishii * - * Copyright (c) 2003-2023 PgPool Global Development Group + * Copyright (c) 2003-2026 PgPool Global Development Group * * Permission to use, copy, modify, and distribute this software and * its documentation for any purpose and without fee is hereby @@ -154,6 +154,17 @@ pcp_main(int *fds) /* We can now handle ereport(ERROR) */ PG_exception_stack = &local_sigjmp_buf; + /* + * Restore pcp woker child pids from shmem + */ + for (int i = 0; i < MAX_PCP_WORKER_PIDS; i++) + { + pid_t pid = Req_info->pcp_worker_pids[i]; + + if (pid != 0) + pcp_worker_children = lappend_int(pcp_worker_children, (int) pid); + } + /* * Unblock signals */ @@ -326,6 +337,8 @@ start_pcp_command_processor_process(int port, int *fds) } else /* parent */ { + int i; + if (pool_config->log_pcp_processes) ereport(LOG, (errmsg("forked new pcp worker, pid=%d socket=%d", @@ -334,6 +347,18 @@ start_pcp_command_processor_process(int port, int *fds) close(port); /* Add it to the list */ pcp_worker_children = lappend_int(pcp_worker_children, (int) pid); + /* save it to shmem */ + for (i = 0; i < MAX_PCP_WORKER_PIDS; i++) + { + if (Req_info->pcp_worker_pids[i] == 0) + { + Req_info->pcp_worker_pids[i] = pid; + break; + } + } + if (i == MAX_PCP_WORKER_PIDS) + ereport(WARNING, + (errmsg("no empty slot in pcp worker table"))); } } @@ -405,6 +430,17 @@ reaper(void) (errmsg("going to remove pid: %d from pid list having %d elements", pid, list_length(pcp_worker_children)))); /* remove the pid of process from the list */ pcp_worker_children = list_delete_int(pcp_worker_children, pid); + + /* remove the pid from shmem */ + for (int i = 0; i < MAX_PCP_WORKER_PIDS; i++) + { + if (Req_info->pcp_worker_pids[i] == pid) + { + Req_info->pcp_worker_pids[i] = 0; + break; + } + } + ereport(DEBUG2, (errmsg("new list have %d elements", list_length(pcp_worker_children)))); } -- 2.43.0