Re: SIGSEGV in BRIN autosummarize

From: Justin Pryzby <pryzby(at)telsasoft(dot)com>
To: pgsql-hackers(at)postgresql(dot)org
Cc: Alvaro Herrera <alvherre(at)alvh(dot)no-ip(dot)org>
Subject: Re: SIGSEGV in BRIN autosummarize
Date: 2017-10-14 22:42:20
Message-ID: 20171014224220.GB22678@telsasoft.com
Views: Raw Message | Whole Thread | Download mbox | Resend email
Thread:
Lists: pgsql-hackers

On Fri, Oct 13, 2017 at 10:57:32PM -0500, Justin Pryzby wrote:
> I don't have any reason to believe there's memory issue on the server, So I
> suppose this is just a "heads up" to early adopters until/in case it happens
> again and I can at least provide a stack trace.

I'm back; find stacktrace below.

> Today I see:
> < 2017-10-13 17:22:47.839 -04 >LOG: server process (PID 32127) was terminated by signal 11: Segmentation fault
> < 2017-10-13 17:22:47.839 -04 >DETAIL: Failed process was running: autovacuum: BRIN summarize public.gtt 747263

Is it a coincidence the server failed within 45m of yesterday's failure ?

postmaster[26500] general protection ip:84a177 sp:7ffd9b349b88 error:0 in postgres[400000+692000]
< 2017-10-14 18:05:36.432 -04 >DETAIL: Failed process was running: autovacuum: BRIN summarize public.gtt 41087

> It looks like this table was being inserted into simultaneously by a python
> program using multiprocessing. It looks like each subprocess was INSERTing
> into several tables, each of which has one BRIN index on timestamp column.

I should add:
These are insert-only child tables in a heirarchy (not PG10 partitions), being
inserted into directly (not via trigger/rule).

Also notice the vacuum process was interrupted, same as yesterday (think
goodness for full logs). Our INSERT script is using python
multiprocessing.pool() with "maxtasksperchild=1", which I think means we load
one file and then exit the subprocess, and pool() creates a new subproc, which
starts a new PG session and transaction. Which explains why autovacuum starts
processing the table only to be immediately interrupted.

postgres=# SELECT * FROM postgres_log_2017_10_14_1800 WHERE pid=26500 ORDER BY log_time DESC LIMIT 9;
log_time | 2017-10-14 18:05:34.132-04
pid | 26500
session_id | 59e289b4.6784
session_line | 2
session_start_time | 2017-10-14 18:03:32-04
error_severity | ERROR
sql_state_code | 57014
message | canceling autovacuum task
context | processing work entry for relation "gtt.public.cdrs_eric_ggsnpdprecord_2017_10_14_recordopeningtime_idx"
-----------------------+---------------------------------------------------------------------------------------------------------
log_time | 2017-10-14 18:05:32.925-04
pid | 26500
session_id | 59e289b4.6784
session_line | 1
session_start_time | 2017-10-14 18:03:32-04
error_severity | ERROR
sql_state_code | 57014
message | canceling autovacuum task
context | automatic analyze of table "gtt.public.cdrs_eric_egsnpdprecord_2017_10_14"

gtt=# \dt+ *record_2017_10_14
public | cdrs_eric_egsnpdprecord_2017_10_14 | table | gtt | 1642 MB |
public | cdrs_eric_ggsnpdprecord_2017_10_14 | table | gtt | 492 MB |

gtt=# \di+ *_2017_10_14*_recordopeningtime_idx
public | cdrs_eric_egsnpdprecord_2017_10_14_recordopeningtime_idx | index | gtt | cdrs_eric_egsnpdprecord_2017_10_14 | 72 kB |
public | cdrs_eric_ggsnpdprecord_2017_10_14_recordopeningtime_idx | index | gtt | cdrs_eric_ggsnpdprecord_2017_10_14 | 48 kB |

Due to a .."behavioral deficiency" in the loader for those tables, the crashed
backend causes the loader to get stuck, so the tables should be untouched since
the crash, should it be desirable to inspect them.

#0 pfree (pointer=0x298c740) at mcxt.c:954
context = 0x7474617261763a20
#1 0x00000000006a52e9 in perform_work_item (workitem=0x7f8ad1f94824) at autovacuum.c:2676
cur_datname = 0x298c740 "no 1 :vartype 1184 :vartypmod -1 :varcollid 0 :varlevelsup 0 :varnoold 1 :varoattno 1 :location 146} {CONST :consttype 1184 :consttypmod -1 :constcollid 0 :constlen 8 :constbyval true :constisnull fal"...
cur_nspname = 0x298c728 "s ({VAR :varno 1 :varattno 1 :vartype 1184 :vartypmod -1 :varcollid 0 :varlevelsup 0 :varnoold 1 :varoattno 1 :location 146} {CONST :consttype 1184 :consttypmod -1 :constcollid 0 :constlen 8 :constbyv"...
cur_relname = 0x298cd68 "cdrs_eric_msc_sms_2017_10_14_startofcharge_idx"
__func__ = "perform_work_item"
#2 0x00000000006a6fd9 in do_autovacuum () at autovacuum.c:2533
workitem = 0x7f8ad1f94824
classRel = 0x7f89c26d0e58
tuple = <value optimized out>
relScan = <value optimized out>
dbForm = <value optimized out>
table_oids = <value optimized out>
orphan_oids = 0x0
ctl = {num_partitions = 0, ssize = 0, dsize = 0, max_dsize = 0, ffactor = 0, keysize = 4, entrysize = 80, hash = 0, match = 0, keycopy = 0, alloc = 0, hcxt = 0x0, hctl = 0x0}
table_toast_map = 0x29c8188
cell = 0x0
shared = 0x298ce18
dbentry = 0x298d0a0
bstrategy = 0x2a61c18
key = {sk_flags = 0, sk_attno = 16, sk_strategy = 3, sk_subtype = 0, sk_collation = 100, sk_func = {fn_addr = 0x750430 <chareq>, fn_oid = 61, fn_nargs = 2, fn_strict = 1 '\001', fn_retset = 0 '\000',
fn_stats = 2 '\002', fn_extra = 0x0, fn_mcxt = 0x28f3608, fn_expr = 0x0}, sk_argument = 116}
pg_class_desc = 0x29b66a8
effective_multixact_freeze_max_age = 400000000
did_vacuum = <value optimized out>
found_concurrent_worker = <value optimized out>
i = <value optimized out>
__func__ = "do_autovacuum"
#3 0x00000000006a7dc8 in AutoVacWorkerMain (argc=<value optimized out>, argv=<value optimized out>) at autovacuum.c:1694
dbname = "gtt\000\000\000\000\000\325I\001\327;\000\000\000\204g\000\000\000\000\000\000E\352\200\327;\000\000\000\001\000\000\000\000\000\000\000\002\000\000\000\000\000\000\000\374\024\316", '\000' <repeats 12 times>
local_sigjmp_buf = {{__jmpbuf = {0, -5392960675600994449, 42895168, 1508018581, 26499, 1508015202, 5391772750539845487, -5392961587993972881}, __mask_was_saved = 1, __saved_mask = {__val = {
18446744066192964103, 13184024, 1, 13182664, 0, 26499, 1508015202, 0, 257010229664, 1, 0, 140727207371280, 257014425560, 4, 0, 4}}}}
dbid = 16400
__func__ = "AutoVacWorkerMain"
#4 0x00000000006a7e96 in StartAutoVacWorker () at autovacuum.c:1499
worker_pid = 0
__func__ = "StartAutoVacWorker"
#5 0x00000000006b595a in StartAutovacuumWorker (postgres_signal_arg=<value optimized out>) at postmaster.c:5414
bn = 0x28e8740
#6 sigusr1_handler (postgres_signal_arg=<value optimized out>) at postmaster.c:5111
save_errno = 4
__func__ = "sigusr1_handler"
#7 <signal handler called>
No symbol table info available.
#8 0x0000003bd74e1603 in __select_nocancel () at ../sysdeps/unix/syscall-template.S:82
No locals.
#9 0x00000000006b6c96 in ServerLoop (argc=<value optimized out>, argv=<value optimized out>) at postmaster.c:1717
timeout = {tv_sec = 59, tv_usec = 686521}
rmask = {fds_bits = {120, 0 <repeats 15 times>}}
selres = <value optimized out>
now = <value optimized out>
readmask = {fds_bits = {120, 0 <repeats 15 times>}}
nSockets = 7
last_lockfile_recheck_time = 1508018581
last_touch_time = 1508015202
#10 PostmasterMain (argc=<value optimized out>, argv=<value optimized out>) at postmaster.c:1361
opt = <value optimized out>
status = <value optimized out>
userDoption = <value optimized out>
listen_addr_saved = <value optimized out>
i = <value optimized out>
output_config_variable = <value optimized out>
__func__ = "PostmasterMain"
#11 0x0000000000638450 in main (argc=3, argv=0x28c03a0) at main.c:228

In response to

Responses

Browse pgsql-hackers by date

  From Date Subject
Next Message Tomas Vondra 2017-10-14 23:57:14 Re: SIGSEGV in BRIN autosummarize
Previous Message Peter Geoghegan 2017-10-14 21:47:29 Re: [PATCH] pageinspect function to decode infomasks