"wal receiver" process hang in syslog() while exiting after receiving SIGTERM while the postgres has been promoted.

From: "Chen, Yan-Jack (NSB - CN/Hangzhou)" <yan-jack(dot)chen(at)nokia-sbell(dot)com>
To: PostgreSQL Hackers <pgsql-hackers(at)postgresql(dot)org>
Subject: "wal receiver" process hang in syslog() while exiting after receiving SIGTERM while the postgres has been promoted.
Date: 2018-06-21 05:11:02
Message-ID: HE1PR0702MB36260727D800D630F3DBFB52F8760@HE1PR0702MB3626.eurprd07.prod.outlook.com
Views: Raw Message | Whole Thread | Download mbox | Resend email
Thread:
Lists: pgsql-hackers

Hi Hackers,
We encounter one problem happened while we try to promote standby postgres(version 9.6.9) server to active. From the trace(we triggered the process abort). We can see the process was hang in syslog() while handling SIGTERM. According to below article. Looks it is risky to write syslog in signal handling. Any idea to avoid it?

https://cwe.mitre.org/data/definitions/479.html

[New LWP 7741]
b[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib64/libthread_db.so.1".
Core was generated by `postgres: wal receiver process '.
Program terminated with signal SIGABRT, Aborted.
#0 0x00007ff25122d8db in __lll_lock_wait_private () from /lib64/libc.so.6
Missing separate debuginfos, use: dnf debuginfo-install postgresql-server-9.6.9-3.wf28.x86_64
(gdb) bt full

#0 0x00007ff25122d8db in __lll_lock_wait_private () from /lib64/libc.so.6
No symbol table info available.
#1 0x00007ff25121aa7b in __vsyslog_chk () from /lib64/libc.so.6
No symbol table info available.
#2 0x00007ff25121ae5c in syslog () from /lib64/libc.so.6
No symbol table info available.
#3 0x0000000000705e88 in write_syslog (level=3, line=0x2b19cd8 "FATAL: terminating walreceiver process due to administrator command\n") at elog.c:2033
buf = "FATAL: terminating walreceiver process due to administrator command\000\200\310\a\000\000\000\000\000U\211*Q\362\177\000\000\000\000\000\000\000\000\000\000P\317\033\373\377\177", '\000' <repeats 42 times>, "\020\272\261\002\000\000\000\000\020", '\000' <repeats 11 times>, "\001", '\000' <repeats 35 times>, "\240\311\033"...
buflen = 68
i = <optimized out>
chunk_nr = <optimized out>
seq = 4
len = 69
nlpos = 0x2b19d1c "\n"
#4 0x0000000000707105 in send_message_to_server_log (edata=0xb4ded8 <errordata+184>) at elog.c:3003
syslog_level = <optimized out>
buf = {data = 0x2b19cd8 "FATAL: terminating walreceiver process due to administrator command\n", len = 69, maxlen = 1024, cursor = 0}
buf = <optimized out>
syslog_level = <optimized out>
#5 EmitErrorReport () at elog.c:1479
edata = 0xb4ded8 <errordata+184>
oldcontext = <optimized out>
#6 0x0000000000707d8f in errfinish (dummy=<optimized out>) at elog.c:483
edata = 0xb4ded8 <errordata+184>
elevel = 21
oldcontext = <optimized out>
econtext = 0x0
__func__ = "errfinish"
#7 0x0000000000622e58 in ProcessWalRcvInterrupts () at walreceiver.c:171
No locals.
#8 0x0000000000622e9f in WalRcvShutdownHandler (postgres_signal_arg=<optimized out>) at walreceiver.c:821
save_errno = 0
#9 <signal handler called>
No symbol table info available.
#10 0x00007ff2512218fb in send () from /lib64/libc.so.6
No symbol table info available.
#11 0x00007ff25121ab72 in __vsyslog_chk () from /lib64/libc.so.6
No symbol table info available.
#12 0x00007ff25121ae5c in syslog () from /lib64/libc.so.6
No symbol table info available.
#13 0x0000000000705e88 in write_syslog (level=3, line=0x2b19929 "\t\tIs the server running on host \"192.168.1.8\" and accepting\n\t\tTCP/IP connections on port 5432?\n\t\n") at elog.c:2033
buf = "\t\tIs the server running on host \"192.168.1.8\" and accepting\000onnect to server: Connection refused\000\377\177\000\000x\376\263\002", '\000' <repeats 12 times>, "\003\000\000\000\000\000\000\000\000\300\rR\362\177\000\000\377\377\377\377\000\000\000\000\343\030bm\225k\330\001\270\267\337E\362\177\000\000\360\372\263\002\000\000\000\000(at)\324\033\373\377\177\000\000\000\267\261\030\275\036y=`\324\033\373\377\177\000\000`\324\033\373\377\177\000\000\220\325\033"...
buflen = 59
i = <optimized out>
chunk_nr = <optimized out>
seq = 4
len = 97
nlpos = 0x2b19964 "\n\t\tTCP/IP connections on port 5432?\n\t\n"
#14 0x0000000000707105 in send_message_to_server_log (edata=0xb4de20 <errordata>) at elog.c:3003
syslog_level = <optimized out>
buf = {data = 0x2b198c8 "terminating walreceiver process due to administrator command", len = 194, maxlen = 1024, cursor = 0}
buf = <optimized out>
syslog_level = <optimized out>
#15 EmitErrorReport () at elog.c:1479
edata = 0xb4de20 <errordata>
oldcontext = <optimized out>
#16 0x0000000000707d8f in errfinish (dummy=<optimized out>) at elog.c:483
edata = 0xb4de20 <errordata>
elevel = 21
oldcontext = <optimized out>
econtext = 0x0
__func__ = "errfinish"
#17 0x00007ff245dfcc90 in libpqrcv_connect () from /usr/lib/libpqwalreceiver.so
No symbol table info available.
#18 0x000000000062352b in WalReceiverMain () at walreceiver.c:314
conninfo = "host=192.168.1.8 port=5432 user=_nokfssystestpostgres\000ION\000\354Q\362\177\000\000\001", '\000' <repeats 16 times>, "[\fR\362\177\000\000\250o\023Q\362\177\000\000\060\335\033\373\377\177\000\000:\274\354Q\362\177\000\000x", '\000' <repeats 16 times>, "\020NQ\362\177\000\000x\000\000\000\000\000\000\000\222\335\033\373\377\177\000\000=\245*Q\362\177\000\000\222\335\033\373\377\177\000\000:\274\354Q\362\177\000\000=\036", '\000' <repeats 14 times>, "\001\000\000\000\000\000\000\000"...
tmp_conninfo = <optimized out>
slotname = "\000\000\000\000\000\000\000\000?\002", '\000' <repeats 12 times>, " \264MQ\362\177\000\000\t\000\000\000\000\000\000\000\260\335\033\373\377\177\000\000\060?\002\000\000\000\000\316l\031Q\362\177\000"
startpoint = 578813952
startpointTLI = 55
primaryTLI = 1360676704
first_stream = <optimized out>
walrcv = 0x7ff24fc0a500
last_recv_timestamp = <optimized out>
now = 582800572427736
ping_sent = <optimized out>
__func__ = "WalReceiverMain"
#19 0x00000000004d56f7 in AuxiliaryProcessMain (argc=argc(at)entry=2, argv=argv(at)entry=0x7ffffb1bddb0) at bootstrap.c:440
progname = 0x73f59e "postgres"
flag = -1
userDoption = 0x0
__func__ = "AuxiliaryProcessMain"
#20 0x00000000006096ea in StartChildProcess (type=WalReceiverProcess) at postmaster.c:5300
pid = <optimized out>
av = {0x73f59e "postgres", 0x7ffffb1bdd90 "-x6", 0x0, 0x1dc0 <error: Cannot access memory at address 0x1dc0>, 0x2b3c850 "\220\061\264\002", 0x7ff2520c5b00 "", 0x4 <error: Cannot access memory at address 0x4>, 0x0,
0x2b3c830 "\220\061\264\002", 0x60a781 <reaper+1490> "E\211,$H\203\304\020[]A\\A]A^?\005\310DW"}
ac = 2
typebuf = "-x6", '\000' <repeats 21 times>, "\306jp\000\000\000\000"
save_errno = <optimized out>
#21 0x0000000000609e3f in MaybeStartWalReceiver () at postmaster.c:5447
No locals.
#22 0x000000000060aae3 in sigusr1_handler (postgres_signal_arg=<optimized out>) at postmaster.c:5084
save_errno = 4
__func__ = "sigusr1_handler"
#23 <signal handler called>
No symbol table info available.
#24 0x00007ff251217f57 in select () from /lib64/libc.so.6
No symbol table info available.
#25 0x000000000046ec0a in ServerLoop () at postmaster.c:1685
timeout = {tv_sec = 58, tv_usec = 886165}
rmask = {fds_bits = {40, 0 <repeats 15 times>}}
selres = <optimized out>
now = <optimized out>
readmask = {fds_bits = {40, 0 <repeats 15 times>}}
nSockets = <optimized out>
last_lockfile_recheck_time = 1529485360
last_touch_time = 1529485360
__func__ = "ServerLoop"
#26 0x000000000060b94b in PostmasterMain (argc=argc(at)entry=3, argv=argv(at)entry=0x2b176f0) at postmaster.c:1329
opt = <optimized out>
status = <optimized out>
userDoption = <optimized out>
listen_addr_saved = <optimized out>
i = <optimized out>
output_config_variable = <optimized out>
__func__ = "PostmasterMain"
#27 0x000000000047001e in main (argc=3, argv=0x2b176f0) at main.c:228
No locals.

Br. :)
Chen Yan(Jack)

Responses

Browse pgsql-hackers by date

  From Date Subject
Next Message Michael Paquier 2018-06-21 05:32:24 Re: PATCH: backtraces for error messages
Previous Message Craig Ringer 2018-06-21 04:35:10 Re: PATCH: backtraces for error messages