From f353f922222f70fad1d104f76adca1010578d40a Mon Sep 17 00:00:00 2001 From: Taiki Koshino Date: Thu, 26 Mar 2026 17:04:37 +0900 Subject: [PATCH v9] Add Lifecheck Started status to pcp_watchdog_info output. This commit enhances the pcp_watchdog_info command by adding a new field, Lifecheck Started, which indicates whether lifecheck has been started on each watchdog node (NO: not started, YES: started). This allows users to check the lifecheck status directly from the command output without inspecting logs. Add a lifecheck_started member to WatchdogNode. When the lifecheck process detects that lifecheck has started, it notifies the watchdog process, which sets lifecheck_started to true. When set to true, the status is propagated across the cluster. Add a lifecheck_started field to pcp_watchdog_info so that the latest lifecheck_started status is displayed when the command is called. --- doc.ja/src/sgml/ref/pcp_watchdog_info.sgml | 11 ++++++++--- doc/src/sgml/ref/pcp_watchdog_info.sgml | 10 +++++++--- src/include/pcp/pcp.h | 2 ++ src/include/watchdog/watchdog.h | 2 ++ src/include/watchdog/wd_commands.h | 2 ++ src/include/watchdog/wd_ipc_defines.h | 2 +- src/include/watchdog/wd_lifecheck.h | 3 ++- src/libs/pcp/pcp.c | 6 ++++++ src/tools/pcp/pcp_frontend_client.c | 8 +++++--- src/watchdog/watchdog.c | 13 +++++++++++++ src/watchdog/wd_commands.c | 7 +++++++ src/watchdog/wd_json_data.c | 4 +++- src/watchdog/wd_lifecheck.c | 10 +++++++++- 13 files changed, 67 insertions(+), 13 deletions(-) diff --git a/doc.ja/src/sgml/ref/pcp_watchdog_info.sgml b/doc.ja/src/sgml/ref/pcp_watchdog_info.sgml index 85ca81e10..61837969c 100644 --- a/doc.ja/src/sgml/ref/pcp_watchdog_info.sgml +++ b/doc.ja/src/sgml/ref/pcp_watchdog_info.sgml @@ -108,9 +108,9 @@ $ pcp_watchdog_info -h localhost -p 9898 -U postgres Password: 3 3 YES server1:9999 Linux server1.localdomain server1 -server1:9999 Linux server1.localdomain server1 9999 9000 4 LEADER 0 MEMBER -server2:9999 Linux server2.localdomain 
server2 9999 9000 7 STANDBY 0 MEMBER -server3:9999 Linux server3.localdomain server3 9999 9000 7 STANDBY 0 MEMBER +server1:9999 Linux server1.localdomain server1 9999 9000 4 LEADER 0 MEMBER YES +server2:9999 Linux server2.localdomain server2 9999 9000 7 STANDBY 0 MEMBER YES +server3:9999 Linux server3.localdomain server3 9999 9000 7 STANDBY 0 MEMBER YES @@ -149,6 +149,7 @@ server3:9999 Linux server3.localdomain server3 9999 9000 7 STANDBY 0 MEMBER 6. current node state name 7. current cluster membership status 8. current cluster membership status name + 9. Lifecheck start status --> それ以降は watchdog ノードのリストが出力されます: @@ -160,6 +161,7 @@ server3:9999 Linux server3.localdomain server3 9999 9000 7 STANDBY 0 MEMBER 6. 現在のノードステータス名 7. 現在のメンバーシップステータス 8. 現在のメンバーシップステータス名 + 9. ライフチェックの開始状況 @@ -192,6 +194,7 @@ Node priority : 1 Status : 4 Status Name : LEADER Membership Status : MEMBER +Lifecheck Started : YES Node Name : server2:9999 Linux server2.localdomain Host Name : server2 @@ -202,6 +205,7 @@ Node priority : 1 Status : 7 Status Name : STANDBY Membership Status : MEMBER +Lifecheck Started : YES Node Name : server3:9999 Linux server3.localdomain Host Name : server3 @@ -212,6 +216,7 @@ Node priority : 1 Status : 7 Status Name : STANDBY Membership Status : MEMBER +Lifecheck Started : YES diff --git a/doc/src/sgml/ref/pcp_watchdog_info.sgml b/doc/src/sgml/ref/pcp_watchdog_info.sgml index ce357e93d..81da9a651 100644 --- a/doc/src/sgml/ref/pcp_watchdog_info.sgml +++ b/doc/src/sgml/ref/pcp_watchdog_info.sgml @@ -78,9 +78,9 @@ $ pcp_watchdog_info -h localhost -p 9898 -U postgres Password: 3 3 YES server1:9999 Linux server1.localdomain server1 -server1:9999 Linux server1.localdomain server1 9999 9000 4 LEADER 0 MEMBER -server2:9999 Linux server2.localdomain server2 9999 9000 7 STANDBY 0 MEMBER -server3:9999 Linux server3.localdomain server3 9999 9000 7 STANDBY 0 MEMBER +server1:9999 Linux server1.localdomain server1 9999 9000 4 LEADER 0 MEMBER YES +server2:9999 Linux 
server2.localdomain server2 9999 9000 7 STANDBY 0 MEMBER YES +server3:9999 Linux server3.localdomain server3 9999 9000 7 STANDBY 0 MEMBER YES @@ -105,6 +105,7 @@ server3:9999 Linux server3.localdomain server3 9999 9000 7 STANDBY 0 MEMBER 6. current node state name 7. current cluster membership status 8. current cluster membership status name + 9. Lifecheck start status @@ -134,6 +135,7 @@ Node priority : 1 Status : 4 Status Name : LEADER Membership Status : MEMBER +Lifecheck Started : YES Node Name : server2:9999 Linux server2.localdomain Host Name : server2 @@ -144,6 +146,7 @@ Node priority : 1 Status : 7 Status Name : STANDBY Membership Status : MEMBER +Lifecheck Started : YES Node Name : server3:9999 Linux server3.localdomain Host Name : server3 @@ -154,6 +157,7 @@ Node priority : 1 Status : 7 Status Name : STANDBY Membership Status : MEMBER +Lifecheck Started : YES diff --git a/src/include/pcp/pcp.h b/src/include/pcp/pcp.h index e40b96bdc..15a4abb01 100644 --- a/src/include/pcp/pcp.h +++ b/src/include/pcp/pcp.h @@ -48,6 +48,8 @@ typedef struct PCPWDNodeInfo int wd_priority; /* node priority in leader election */ int pgpool_port; /* pgpool port */ char delegate_ip[WD_MAX_HOST_NAMELEN]; /* delegate IP */ + bool lifecheck_started; /* True means lifecheck is started, + * false means lifecheck is not started */ int id; } PCPWDNodeInfo; diff --git a/src/include/watchdog/watchdog.h b/src/include/watchdog/watchdog.h index 8803283f5..f7699e564 100644 --- a/src/include/watchdog/watchdog.h +++ b/src/include/watchdog/watchdog.h @@ -206,6 +206,8 @@ typedef struct WatchdogNode * initiated by remote */ SocketConnection client_socket; /* socket connections for this node * initiated by local */ + bool lifecheck_started; /* True means lifecheck is started, + * false means lifecheck is not started */ } WatchdogNode; /* diff --git a/src/include/watchdog/wd_commands.h b/src/include/watchdog/wd_commands.h index a016772f6..f3d579efb 100644 --- a/src/include/watchdog/wd_commands.h +++ 
b/src/include/watchdog/wd_commands.h @@ -42,6 +42,8 @@ typedef struct WDNodeInfo int wd_priority; /* node priority */ char delegate_ip[WD_MAX_HOST_NAMELEN]; /* delegate IP */ int id; + bool lifecheck_started /* True means lifecheck is started, + * false means lifecheck is not started */; } WDNodeInfo; typedef struct WDGenericData diff --git a/src/include/watchdog/wd_ipc_defines.h b/src/include/watchdog/wd_ipc_defines.h index 7546bfa7e..9a8b85e7d 100644 --- a/src/include/watchdog/wd_ipc_defines.h +++ b/src/include/watchdog/wd_ipc_defines.h @@ -124,7 +124,7 @@ typedef enum WDValueDataType /* Use to inform node new node status by lifecheck */ #define WD_LIFECHECK_NODE_STATUS_DEAD 1 #define WD_LIFECHECK_NODE_STATUS_ALIVE 2 - +#define WD_LIFECHECK_NODE_LIFECHECK_STARTED 3 #endif diff --git a/src/include/watchdog/wd_lifecheck.h b/src/include/watchdog/wd_lifecheck.h index 9460dc346..669ad5b7d 100644 --- a/src/include/watchdog/wd_lifecheck.h +++ b/src/include/watchdog/wd_lifecheck.h @@ -33,7 +33,8 @@ typedef enum NodeState { NODE_EMPTY, NODE_DEAD, - NODE_ALIVE + NODE_ALIVE, + NODE_LIFECHECK_STARTED } NodeStates; typedef struct LifeCheckNode diff --git a/src/libs/pcp/pcp.c b/src/libs/pcp/pcp.c index f8a635065..e0df470b0 100644 --- a/src/libs/pcp/pcp.c +++ b/src/libs/pcp/pcp.c @@ -1772,6 +1772,12 @@ process_watchdog_info_response(PCPConnInfo * pcpConn, char *buf, int len) goto INVALID_RESPONSE; } + if (json_get_bool_value_for_key(nodeInfoValue, "LifecheckStarted", &wdNodeInfo->lifecheck_started)) + { + json_value_free(root); + goto INVALID_RESPONSE; + } + } json_value_free(root); diff --git a/src/tools/pcp/pcp_frontend_client.c b/src/tools/pcp/pcp_frontend_client.c index 9f63a78f4..928749d99 100644 --- a/src/tools/pcp/pcp_frontend_client.c +++ b/src/tools/pcp/pcp_frontend_client.c @@ -835,7 +835,8 @@ output_watchdog_info_result(PCPResultInfo * pcpResInfo, bool verbose) printf("Node priority : %d\n", watchdog_info->wd_priority); printf("Status : %d\n", watchdog_info->state); 
printf("Status Name : %s\n", watchdog_info->stateName); - printf("Membership Status : %s\n\n", watchdog_info->membership_status_string); + printf("Membership Status : %s\n", watchdog_info->membership_status_string); + printf("Lifecheck Started : %s\n\n", watchdog_info->lifecheck_started ? "YES" : "NO"); } } else @@ -851,7 +852,7 @@ output_watchdog_info_result(PCPResultInfo * pcpResInfo, bool verbose) { PCPWDNodeInfo *watchdog_info = &cluster->nodeList[i]; - printf("%s %s %d %d %d %s %d %s\n", + printf("%s %s %d %d %d %s %d %s %s\n", watchdog_info->nodeName, watchdog_info->hostName, watchdog_info->pgpool_port, @@ -859,7 +860,8 @@ output_watchdog_info_result(PCPResultInfo * pcpResInfo, bool verbose) watchdog_info->state, watchdog_info->stateName, watchdog_info->membership_status, - watchdog_info->membership_status_string); + watchdog_info->membership_status_string, + watchdog_info->lifecheck_started ? "YES": "NO"); } } } diff --git a/src/watchdog/watchdog.c b/src/watchdog/watchdog.c index f59e4373a..8d4c20951 100644 --- a/src/watchdog/watchdog.c +++ b/src/watchdog/watchdog.c @@ -2486,6 +2486,17 @@ fire_node_status_event(int nodeID, int nodeStatus) else watchdog_state_machine(WD_EVENT_REMOTE_NODE_FOUND, wdNode, NULL, NULL); } + else if (nodeStatus == WD_LIFECHECK_NODE_LIFECHECK_STARTED) + { + ereport(LOG, + (errmsg("processing node status changed to LIFECHECK STARTED event for node ID:%d", nodeID))); + + if (wdNode == g_cluster.localNode) + { + wdNode->lifecheck_started = true; + send_message_of_type(NULL, WD_INFO_MESSAGE, NULL); + } + } else ereport(LOG, (errmsg("failed to process node status change event"), @@ -3856,6 +3867,7 @@ add_nodeinfo_to_json(JsonNode *jNode, WatchdogNode *node) jw_put_int(jNode, "WdPort", nodeIfNull_int(wd_port, 0)); jw_put_int(jNode, "PgpoolPort", nodeIfNull_int(pgpool_port, 0)); jw_put_int(jNode, "Priority", nodeIfNull_int(wd_priority, 0)); + jw_put_int(jNode, "LifecheckStarted", nodeIfNull_int(lifecheck_started, 0)); 
jw_end_element(jNode); @@ -4510,6 +4522,7 @@ standard_packet_processor(WatchdogNode *wdNode, WDPacketData *pkt) wdNode->escalated = tempNode->escalated; wdNode->standby_nodes_count = tempNode->standby_nodes_count; wdNode->quorum_status = tempNode->quorum_status; + wdNode->lifecheck_started = tempNode->lifecheck_started; print_watchdog_node_info(wdNode); diff --git a/src/watchdog/wd_commands.c b/src/watchdog/wd_commands.c index 4b313e6c7..ddc13a0fb 100644 --- a/src/watchdog/wd_commands.c +++ b/src/watchdog/wd_commands.c @@ -425,6 +425,13 @@ parse_watchdog_node_info_from_wd_node_json(json_value *source) errdetail("unable to find state"))); } + if (json_get_bool_value_for_key(source, "LifecheckStarted", &wdNodeInfo->lifecheck_started)) + { + ereport(ERROR, + (errmsg("invalid json data"), + errdetail("unable to find lifecheckStarted"))); + } + return wdNodeInfo; } diff --git a/src/watchdog/wd_json_data.c b/src/watchdog/wd_json_data.c index 91dd26a86..26ebd4b5e 100644 --- a/src/watchdog/wd_json_data.c +++ b/src/watchdog/wd_json_data.c @@ -517,6 +517,7 @@ get_watchdog_node_info_json(WatchdogNode *wdNode, char *authkey) jw_put_int(jNode, "QuorumStatus", wdNode->quorum_status); jw_put_int(jNode, "AliveNodeCount", wdNode->standby_nodes_count); jw_put_bool(jNode, "Escalated", wdNode->escalated == 0 ? 
false : true); + jw_put_bool(jNode, "LifecheckStarted", wdNode->lifecheck_started); if (authkey) jw_put_string(jNode, "authkey", authkey); @@ -589,7 +590,8 @@ get_watchdog_node_from_json(char *json_data, int data_len, char **authkey) goto ERROR_EXIT; if (json_get_int_value_for_key(root, "PgpoolNodeId", &wdNode->pgpool_node_id)) goto ERROR_EXIT; - + if (json_get_bool_value_for_key(root, "LifecheckStarted", &wdNode->lifecheck_started)) + goto ERROR_EXIT; ptr = json_get_string_value_for_key(root, "NodeName"); if (ptr == NULL) diff --git a/src/watchdog/wd_lifecheck.c b/src/watchdog/wd_lifecheck.c index a6958a395..77ac48a45 100644 --- a/src/watchdog/wd_lifecheck.c +++ b/src/watchdog/wd_lifecheck.c @@ -107,7 +107,7 @@ static int is_wd_lifecheck_ready(void); static int wd_lifecheck(void); static int wd_ping_pgpool(LifeCheckNode *node, char *password); static pid_t fork_lifecheck_child(void); - +static bool inform_node_status(LifeCheckNode *node, char *message); LifeCheckCluster *gslifeCheckCluster = NULL; /* lives in shared memory */ @@ -452,6 +452,9 @@ lifecheck_main(void) ereport(LOG, (errmsg("watchdog: lifecheck started"))); + LifeCheckNode *node = &gslifeCheckCluster->lifeCheckNodes[0]; + node->nodeState = NODE_LIFECHECK_STARTED; + inform_node_status(node, "lifecheck started"); if (sigsetjmp(local_sigjmp_buf, 1) != 0) { @@ -547,6 +550,11 @@ inform_node_status(LifeCheckNode *node, char *message) new_status = "NODE ALIVE"; node_status = WD_LIFECHECK_NODE_STATUS_ALIVE; } + else if (node->nodeState == NODE_LIFECHECK_STARTED) + { + new_status = "NODE LIFECHECK STARTED"; + node_status = WD_LIFECHECK_NODE_LIFECHECK_STARTED; + } else return false; -- 2.47.3