Line data Source code
1 : /*-------------------------------------------------------------------------
2 : * worker.c
3 : * PostgreSQL logical replication worker (apply)
4 : *
5 : * Copyright (c) 2016-2020, PostgreSQL Global Development Group
6 : *
7 : * IDENTIFICATION
8 : * src/backend/replication/logical/worker.c
9 : *
10 : * NOTES
11 : * This file contains the worker which applies logical changes as they come
12 : * from remote logical replication stream.
13 : *
14 : * The main worker (apply) is started by logical replication worker
15 : * launcher for every enabled subscription in a database. It uses
16 : * walsender protocol to communicate with publisher.
17 : *
18 : * This module includes server facing code and shares libpqwalreceiver
19 : * module with walreceiver for providing the libpq specific functionality.
20 : *
21 : *
22 : * STREAMED TRANSACTIONS
23 : * ---------------------
24 : * Streamed transactions (large transactions exceeding a memory limit on the
25 : * upstream) are not applied immediately, but instead, the data is written
26 : * to temporary files and then applied at once when the final commit arrives.
27 : *
28 : * Unlike the regular (non-streamed) case, handling streamed transactions has
29 : * to handle aborts of both the toplevel transaction and subtransactions. This
30 : * is achieved by tracking offsets for subtransactions, which is then used
31 : * to truncate the file with serialized changes.
32 : *
33 : * The files are placed in tmp file directory by default, and the filenames
34 : * include both the XID of the toplevel transaction and OID of the
35 : * subscription. This is necessary so that different workers processing a
36 : * remote transaction with the same XID don't interfere with each other.
37 : *
38 : * We use BufFiles instead of using normal temporary files because (a) the
39 : * BufFile infrastructure supports temporary files that exceed the OS file size
40 : * limit, (b) provides a way for automatic clean up on the error and (c) provides
41 : * a way to survive these files across local transactions and allow to open and
42 : * close at stream start and close. We decided to use SharedFileSet
43 : * infrastructure as without that it deletes the files on the closure of the
44 : * file and if we decide to keep stream files open across the start/stop stream
45 : * then it will consume a lot of memory (more than 8K for each BufFile and
46 : * there could be multiple such BufFiles as the subscriber could receive
47 : * multiple start/stop streams for different transactions before getting the
48 : * commit). Moreover, if we don't use SharedFileSet then we also need to invent
49 : * a new way to pass filenames to BufFile APIs so that we are allowed to open
50 : * the file we desired across multiple stream-open calls for the same
51 : * transaction.
52 : *-------------------------------------------------------------------------
53 : */
54 :
55 : #include "postgres.h"
56 :
57 : #include <sys/stat.h>
58 : #include <unistd.h>
59 :
60 : #include "access/table.h"
61 : #include "access/tableam.h"
62 : #include "access/xact.h"
63 : #include "access/xlog_internal.h"
64 : #include "catalog/catalog.h"
65 : #include "catalog/namespace.h"
66 : #include "catalog/partition.h"
67 : #include "catalog/pg_inherits.h"
68 : #include "catalog/pg_subscription.h"
69 : #include "catalog/pg_subscription_rel.h"
70 : #include "catalog/pg_tablespace.h"
71 : #include "commands/tablecmds.h"
72 : #include "commands/tablespace.h"
73 : #include "commands/trigger.h"
74 : #include "executor/executor.h"
75 : #include "executor/execPartition.h"
76 : #include "executor/nodeModifyTable.h"
77 : #include "funcapi.h"
78 : #include "libpq/pqformat.h"
79 : #include "libpq/pqsignal.h"
80 : #include "mb/pg_wchar.h"
81 : #include "miscadmin.h"
82 : #include "nodes/makefuncs.h"
83 : #include "optimizer/optimizer.h"
84 : #include "pgstat.h"
85 : #include "postmaster/bgworker.h"
86 : #include "postmaster/interrupt.h"
87 : #include "postmaster/postmaster.h"
88 : #include "postmaster/walwriter.h"
89 : #include "replication/decode.h"
90 : #include "replication/logical.h"
91 : #include "replication/logicalproto.h"
92 : #include "replication/logicalrelation.h"
93 : #include "replication/logicalworker.h"
94 : #include "replication/origin.h"
95 : #include "replication/reorderbuffer.h"
96 : #include "replication/snapbuild.h"
97 : #include "replication/walreceiver.h"
98 : #include "replication/worker_internal.h"
99 : #include "rewrite/rewriteHandler.h"
100 : #include "storage/buffile.h"
101 : #include "storage/bufmgr.h"
102 : #include "storage/fd.h"
103 : #include "storage/ipc.h"
104 : #include "storage/lmgr.h"
105 : #include "storage/proc.h"
106 : #include "storage/procarray.h"
107 : #include "tcop/tcopprot.h"
108 : #include "utils/builtins.h"
109 : #include "utils/catcache.h"
110 : #include "utils/dynahash.h"
111 : #include "utils/datum.h"
112 : #include "utils/fmgroids.h"
113 : #include "utils/guc.h"
114 : #include "utils/inval.h"
115 : #include "utils/lsyscache.h"
116 : #include "utils/memutils.h"
117 : #include "utils/rel.h"
118 : #include "utils/syscache.h"
119 : #include "utils/timeout.h"
120 :
121 : #define NAPTIME_PER_CYCLE 1000 /* max sleep time between cycles (1s) */
122 :
123 : typedef struct FlushPosition
124 : {
125 : dlist_node node;
126 : XLogRecPtr local_end;
127 : XLogRecPtr remote_end;
128 : } FlushPosition;
129 :
130 : static dlist_head lsn_mapping = DLIST_STATIC_INIT(lsn_mapping);
131 :
132 : typedef struct SlotErrCallbackArg
133 : {
134 : LogicalRepRelMapEntry *rel;
135 : int local_attnum;
136 : int remote_attnum;
137 : } SlotErrCallbackArg;
138 :
139 : /*
140 : * Stream xid hash entry. Whenever we see a new xid we create this entry in the
141 : * xidhash and along with it create the streaming file and store the fileset handle.
142 : * The subxact file is created iff there is any subxact info under this xid. This
143 : * entry is used on the subsequent streams for the xid to get the corresponding
144 : * fileset handles, so storing them in hash makes the search faster.
145 : */
146 : typedef struct StreamXidHash
147 : {
148 : TransactionId xid; /* xid is the hash key and must be first */
149 : SharedFileSet *stream_fileset; /* shared file set for stream data */
150 : SharedFileSet *subxact_fileset; /* shared file set for subxact info */
151 : } StreamXidHash;
152 :
153 : static MemoryContext ApplyMessageContext = NULL;
154 : MemoryContext ApplyContext = NULL;
155 :
156 : /* per stream context for streaming transactions */
157 : static MemoryContext LogicalStreamingContext = NULL;
158 :
159 : WalReceiverConn *wrconn = NULL;
160 :
161 : Subscription *MySubscription = NULL;
162 : bool MySubscriptionValid = false;
163 :
164 : bool in_remote_transaction = false;
165 : static XLogRecPtr remote_final_lsn = InvalidXLogRecPtr;
166 :
167 : /* fields valid only when processing streamed transaction */
168 : bool in_streamed_transaction = false;
169 :
170 : static TransactionId stream_xid = InvalidTransactionId;
171 :
172 : /*
173 : * Hash table for storing the streaming xid information along with shared file
174 : * set for streaming and subxact files.
175 : */
176 : static HTAB *xidhash = NULL;
177 :
178 : /* BufFile handle of the current streaming file */
179 : static BufFile *stream_fd = NULL;
180 :
181 : typedef struct SubXactInfo
182 : {
183 : TransactionId xid; /* XID of the subxact */
184 : int fileno; /* file number in the buffile */
185 : off_t offset; /* offset in the file */
186 : } SubXactInfo;
187 :
188 : /* Sub-transaction data for the current streaming transaction */
189 : typedef struct ApplySubXactData
190 : {
191 : uint32 nsubxacts; /* number of sub-transactions */
192 : uint32 nsubxacts_max; /* current capacity of subxacts */
193 : TransactionId subxact_last; /* xid of the last sub-transaction */
194 : SubXactInfo *subxacts; /* sub-xact offset in changes file */
195 : } ApplySubXactData;
196 :
197 : static ApplySubXactData subxact_data = {0, 0, InvalidTransactionId, NULL};
198 :
199 : static inline void subxact_filename(char *path, Oid subid, TransactionId xid);
200 : static inline void changes_filename(char *path, Oid subid, TransactionId xid);
201 :
202 : /*
203 : * Information about subtransactions of a given toplevel transaction.
204 : */
205 : static void subxact_info_write(Oid subid, TransactionId xid);
206 : static void subxact_info_read(Oid subid, TransactionId xid);
207 : static void subxact_info_add(TransactionId xid);
208 : static inline void cleanup_subxact_info(void);
209 :
210 : /*
211 : * Serialize and deserialize changes for a toplevel transaction.
212 : */
213 : static void stream_cleanup_files(Oid subid, TransactionId xid);
214 : static void stream_open_file(Oid subid, TransactionId xid, bool first);
215 : static void stream_write_change(char action, StringInfo s);
216 : static void stream_close_file(void);
217 :
218 : static void send_feedback(XLogRecPtr recvpos, bool force, bool requestReply);
219 :
220 : static void store_flush_position(XLogRecPtr remote_lsn);
221 :
222 : static void maybe_reread_subscription(void);
223 :
224 : /* prototype needed because of stream_commit */
225 : static void apply_dispatch(StringInfo s);
226 :
227 : static void apply_handle_insert_internal(ResultRelInfo *relinfo,
228 : EState *estate, TupleTableSlot *remoteslot);
229 : static void apply_handle_update_internal(ResultRelInfo *relinfo,
230 : EState *estate, TupleTableSlot *remoteslot,
231 : LogicalRepTupleData *newtup,
232 : LogicalRepRelMapEntry *relmapentry);
233 : static void apply_handle_delete_internal(ResultRelInfo *relinfo, EState *estate,
234 : TupleTableSlot *remoteslot,
235 : LogicalRepRelation *remoterel);
236 : static bool FindReplTupleInLocalRel(EState *estate, Relation localrel,
237 : LogicalRepRelation *remoterel,
238 : TupleTableSlot *remoteslot,
239 : TupleTableSlot **localslot);
240 : static void apply_handle_tuple_routing(ResultRelInfo *relinfo,
241 : EState *estate,
242 : TupleTableSlot *remoteslot,
243 : LogicalRepTupleData *newtup,
244 : LogicalRepRelMapEntry *relmapentry,
245 : CmdType operation);
246 :
247 : static int apply_spooled_messages(TransactionId xid, XLogRecPtr lsn);
248 :
249 : /*
250 : * Should this worker apply changes for given relation.
251 : *
252 : * This is mainly needed for initial relation data sync as that runs in
253 : * separate worker process running in parallel and we need some way to skip
254 : * changes coming to the main apply worker during the sync of a table.
255 : *
256 : * Note we need to do smaller or equals comparison for SYNCDONE state because
257 : * it might hold position of end of initial slot consistent point WAL
258 : * record + 1 (ie start of next record) and next record can be COMMIT of
259 : * transaction we are now processing (which is what we set remote_final_lsn
260 : * to in apply_handle_begin).
261 : */
262 : static bool
263 227768 : should_apply_changes_for_rel(LogicalRepRelMapEntry *rel)
264 : {
265 227768 : if (am_tablesync_worker())
266 2 : return MyLogicalRepWorker->relid == rel->localreloid;
267 : else
268 455536 : return (rel->state == SUBREL_STATE_READY ||
269 34 : (rel->state == SUBREL_STATE_SYNCDONE &&
270 4 : rel->statelsn <= remote_final_lsn));
271 : }
272 :
273 : /*
274 : * Make sure that we started local transaction.
275 : *
276 : * Also switches to ApplyMessageContext as necessary.
277 : */
278 : static bool
279 228452 : ensure_transaction(void)
280 : {
281 228452 : if (IsTransactionState())
282 : {
283 227378 : SetCurrentStatementStartTimestamp();
284 :
285 227378 : if (CurrentMemoryContext != ApplyMessageContext)
286 0 : MemoryContextSwitchTo(ApplyMessageContext);
287 :
288 227378 : return false;
289 : }
290 :
291 1074 : SetCurrentStatementStartTimestamp();
292 1074 : StartTransactionCommand();
293 :
294 1074 : maybe_reread_subscription();
295 :
296 1070 : MemoryContextSwitchTo(ApplyMessageContext);
297 1070 : return true;
298 : }
299 :
300 : /*
301 : * Handle streamed transactions.
302 : *
303 : * If in streaming mode (receiving a block of streamed transaction), we
304 : * simply redirect it to a file for the proper toplevel transaction.
305 : *
306 : * Returns true for streamed transactions, false otherwise (regular mode).
307 : */
308 : static bool
309 475286 : handle_streamed_transaction(const char action, StringInfo s)
310 : {
311 : TransactionId xid;
312 :
313 : /* not in streaming mode */
314 475286 : if (!in_streamed_transaction)
315 228040 : return false;
316 :
317 247246 : Assert(stream_fd != NULL);
318 247246 : Assert(TransactionIdIsValid(stream_xid));
319 :
320 : /*
321 : * We should have received XID of the subxact as the first part of the
322 : * message, so extract it.
323 : */
324 247246 : xid = pq_getmsgint(s, 4);
325 :
326 247246 : Assert(TransactionIdIsValid(xid));
327 :
328 : /* Add the new subxact to the array (unless already there). */
329 247246 : subxact_info_add(xid);
330 :
331 : /* write the change to the current file */
332 247246 : stream_write_change(action, s);
333 :
334 247246 : return true;
335 : }
336 :
337 : /*
338 : * Executor state preparation for evaluation of constraint expressions,
339 : * indexes and triggers.
340 : *
341 : * This is based on similar code in copy.c
342 : */
343 : static EState *
344 227702 : create_estate_for_relation(LogicalRepRelMapEntry *rel)
345 : {
346 : EState *estate;
347 : RangeTblEntry *rte;
348 :
349 227702 : estate = CreateExecutorState();
350 :
351 227702 : rte = makeNode(RangeTblEntry);
352 227702 : rte->rtekind = RTE_RELATION;
353 227702 : rte->relid = RelationGetRelid(rel->localrel);
354 227702 : rte->relkind = rel->localrel->rd_rel->relkind;
355 227702 : rte->rellockmode = AccessShareLock;
356 227702 : ExecInitRangeTable(estate, list_make1(rte));
357 :
358 227702 : estate->es_output_cid = GetCurrentCommandId(true);
359 :
360 : /* Prepare to catch AFTER triggers. */
361 227702 : AfterTriggerBeginQuery();
362 :
363 227702 : return estate;
364 : }
365 :
366 : /*
367 : * Executes default values for columns for which we can't map to remote
368 : * relation columns.
369 : *
370 : * This allows us to support tables which have more columns on the downstream
371 : * than on the upstream.
372 : */
373 : static void
374 102826 : slot_fill_defaults(LogicalRepRelMapEntry *rel, EState *estate,
375 : TupleTableSlot *slot)
376 : {
377 102826 : TupleDesc desc = RelationGetDescr(rel->localrel);
378 102826 : int num_phys_attrs = desc->natts;
379 : int i;
380 : int attnum,
381 102826 : num_defaults = 0;
382 : int *defmap;
383 : ExprState **defexprs;
384 : ExprContext *econtext;
385 :
386 102826 : econtext = GetPerTupleExprContext(estate);
387 :
388 : /* We got all the data via replication, no need to evaluate anything. */
389 102826 : if (num_phys_attrs == rel->remoterel.natts)
390 103594 : return;
391 :
392 102058 : defmap = (int *) palloc(num_phys_attrs * sizeof(int));
393 102058 : defexprs = (ExprState **) palloc(num_phys_attrs * sizeof(ExprState *));
394 :
395 102058 : Assert(rel->attrmap->maplen == num_phys_attrs);
396 537176 : for (attnum = 0; attnum < num_phys_attrs; attnum++)
397 : {
398 : Expr *defexpr;
399 :
400 435118 : if (TupleDescAttr(desc, attnum)->attisdropped || TupleDescAttr(desc, attnum)->attgenerated)
401 4 : continue;
402 :
403 435114 : if (rel->attrmap->attnums[attnum] >= 0)
404 229104 : continue;
405 :
406 206010 : defexpr = (Expr *) build_column_default(rel->localrel, attnum + 1);
407 :
408 206010 : if (defexpr != NULL)
409 : {
410 : /* Run the expression through planner */
411 170042 : defexpr = expression_planner(defexpr);
412 :
413 : /* Initialize executable expression in copycontext */
414 170042 : defexprs[num_defaults] = ExecInitExpr(defexpr, NULL);
415 170042 : defmap[num_defaults] = attnum;
416 170042 : num_defaults++;
417 : }
418 :
419 : }
420 :
421 272100 : for (i = 0; i < num_defaults; i++)
422 340084 : slot->tts_values[defmap[i]] =
423 170042 : ExecEvalExpr(defexprs[i], econtext, &slot->tts_isnull[defmap[i]]);
424 : }
425 :
426 : /*
427 : * Error callback to give more context info about type conversion failure.
428 : */
429 : static void
430 0 : slot_store_error_callback(void *arg)
431 : {
432 0 : SlotErrCallbackArg *errarg = (SlotErrCallbackArg *) arg;
433 : LogicalRepRelMapEntry *rel;
434 : char *remotetypname;
435 : Oid remotetypoid,
436 : localtypoid;
437 :
438 : /* Nothing to do if remote attribute number is not set */
439 0 : if (errarg->remote_attnum < 0)
440 0 : return;
441 :
442 0 : rel = errarg->rel;
443 0 : remotetypoid = rel->remoterel.atttyps[errarg->remote_attnum];
444 :
445 : /* Fetch remote type name from the LogicalRepTypMap cache */
446 0 : remotetypname = logicalrep_typmap_gettypname(remotetypoid);
447 :
448 : /* Fetch local type OID from the local sys cache */
449 0 : localtypoid = get_atttype(rel->localreloid, errarg->local_attnum + 1);
450 :
451 0 : errcontext("processing remote data for replication target relation \"%s.%s\" column \"%s\", "
452 : "remote type %s, local type %s",
453 : rel->remoterel.nspname, rel->remoterel.relname,
454 0 : rel->remoterel.attnames[errarg->remote_attnum],
455 : remotetypname,
456 : format_type_be(localtypoid));
457 : }
458 :
459 : /*
460 : * Store tuple data into slot.
461 : *
462 : * Incoming data can be either text or binary format.
463 : */
464 : static void
465 227702 : slot_store_data(TupleTableSlot *slot, LogicalRepRelMapEntry *rel,
466 : LogicalRepTupleData *tupleData)
467 : {
468 227702 : int natts = slot->tts_tupleDescriptor->natts;
469 : int i;
470 : SlotErrCallbackArg errarg;
471 : ErrorContextCallback errcallback;
472 :
473 227702 : ExecClearTuple(slot);
474 :
475 : /* Push callback + info on the error context stack */
476 227702 : errarg.rel = rel;
477 227702 : errarg.local_attnum = -1;
478 227702 : errarg.remote_attnum = -1;
479 227702 : errcallback.callback = slot_store_error_callback;
480 227702 : errcallback.arg = (void *) &errarg;
481 227702 : errcallback.previous = error_context_stack;
482 227702 : error_context_stack = &errcallback;
483 :
484 : /* Call the "in" function for each non-dropped, non-null attribute */
485 227702 : Assert(natts == rel->attrmap->maplen);
486 1161872 : for (i = 0; i < natts; i++)
487 : {
488 934170 : Form_pg_attribute att = TupleDescAttr(slot->tts_tupleDescriptor, i);
489 934170 : int remoteattnum = rel->attrmap->attnums[i];
490 :
491 934170 : if (!att->attisdropped && remoteattnum >= 0)
492 479710 : {
493 479710 : StringInfo colvalue = &tupleData->colvalues[remoteattnum];
494 :
495 479710 : Assert(remoteattnum < tupleData->ncols);
496 :
497 479710 : errarg.local_attnum = i;
498 479710 : errarg.remote_attnum = remoteattnum;
499 :
500 479710 : if (tupleData->colstatus[remoteattnum] == LOGICALREP_COLUMN_TEXT)
501 : {
502 : Oid typinput;
503 : Oid typioparam;
504 :
505 350828 : getTypeInputInfo(att->atttypid, &typinput, &typioparam);
506 701656 : slot->tts_values[i] =
507 350828 : OidInputFunctionCall(typinput, colvalue->data,
508 : typioparam, att->atttypmod);
509 350828 : slot->tts_isnull[i] = false;
510 : }
511 128882 : else if (tupleData->colstatus[remoteattnum] == LOGICALREP_COLUMN_BINARY)
512 : {
513 : Oid typreceive;
514 : Oid typioparam;
515 :
516 : /*
517 : * In some code paths we may be asked to re-parse the same
518 : * tuple data. Reset the StringInfo's cursor so that works.
519 : */
520 66738 : colvalue->cursor = 0;
521 :
522 66738 : getTypeBinaryInputInfo(att->atttypid, &typreceive, &typioparam);
523 133476 : slot->tts_values[i] =
524 66738 : OidReceiveFunctionCall(typreceive, colvalue,
525 : typioparam, att->atttypmod);
526 :
527 : /* Trouble if it didn't eat the whole buffer */
528 66738 : if (colvalue->cursor != colvalue->len)
529 0 : ereport(ERROR,
530 : (errcode(ERRCODE_INVALID_BINARY_REPRESENTATION),
531 : errmsg("incorrect binary data format in logical replication column %d",
532 : remoteattnum + 1)));
533 66738 : slot->tts_isnull[i] = false;
534 : }
535 : else
536 : {
537 : /*
538 : * NULL value from remote. (We don't expect to see
539 : * LOGICALREP_COLUMN_UNCHANGED here, but if we do, treat it as
540 : * NULL.)
541 : */
542 62144 : slot->tts_values[i] = (Datum) 0;
543 62144 : slot->tts_isnull[i] = true;
544 : }
545 :
546 479710 : errarg.local_attnum = -1;
547 479710 : errarg.remote_attnum = -1;
548 : }
549 : else
550 : {
551 : /*
552 : * We assign NULL to dropped attributes and missing values
553 : * (missing values should be later filled using
554 : * slot_fill_defaults).
555 : */
556 454460 : slot->tts_values[i] = (Datum) 0;
557 454460 : slot->tts_isnull[i] = true;
558 : }
559 : }
560 :
561 : /* Pop the error context stack */
562 227702 : error_context_stack = errcallback.previous;
563 :
564 227702 : ExecStoreVirtualTuple(slot);
565 227702 : }
566 :
567 : /*
568 : * Replace updated columns with data from the LogicalRepTupleData struct.
569 : * This is somewhat similar to heap_modify_tuple but also calls the type
570 : * input functions on the user data.
571 : *
572 : * "slot" is filled with a copy of the tuple in "srcslot", replacing
573 : * columns provided in "tupleData" and leaving others as-is.
574 : *
575 : * Caution: unreplaced pass-by-ref columns in "slot" will point into the
576 : * storage for "srcslot". This is OK for current usage, but someday we may
577 : * need to materialize "slot" at the end to make it independent of "srcslot".
578 : */
579 : static void
580 62776 : slot_modify_data(TupleTableSlot *slot, TupleTableSlot *srcslot,
581 : LogicalRepRelMapEntry *rel,
582 : LogicalRepTupleData *tupleData)
583 : {
584 62776 : int natts = slot->tts_tupleDescriptor->natts;
585 : int i;
586 : SlotErrCallbackArg errarg;
587 : ErrorContextCallback errcallback;
588 :
589 : /* We'll fill "slot" with a virtual tuple, so we must start with ... */
590 62776 : ExecClearTuple(slot);
591 :
592 : /*
593 : * Copy all the column data from srcslot, so that we'll have valid values
594 : * for unreplaced columns.
595 : */
596 62776 : Assert(natts == srcslot->tts_tupleDescriptor->natts);
597 62776 : slot_getallattrs(srcslot);
598 62776 : memcpy(slot->tts_values, srcslot->tts_values, natts * sizeof(Datum));
599 62776 : memcpy(slot->tts_isnull, srcslot->tts_isnull, natts * sizeof(bool));
600 :
601 : /* For error reporting, push callback + info on the error context stack */
602 62776 : errarg.rel = rel;
603 62776 : errarg.local_attnum = -1;
604 62776 : errarg.remote_attnum = -1;
605 62776 : errcallback.callback = slot_store_error_callback;
606 62776 : errcallback.arg = (void *) &errarg;
607 62776 : errcallback.previous = error_context_stack;
608 62776 : error_context_stack = &errcallback;
609 :
610 : /* Call the "in" function for each replaced attribute */
611 62776 : Assert(natts == rel->attrmap->maplen);
612 313368 : for (i = 0; i < natts; i++)
613 : {
614 250592 : Form_pg_attribute att = TupleDescAttr(slot->tts_tupleDescriptor, i);
615 250592 : int remoteattnum = rel->attrmap->attnums[i];
616 :
617 250592 : if (remoteattnum < 0)
618 125080 : continue;
619 :
620 125512 : Assert(remoteattnum < tupleData->ncols);
621 :
622 125512 : if (tupleData->colstatus[remoteattnum] != LOGICALREP_COLUMN_UNCHANGED)
623 : {
624 125506 : StringInfo colvalue = &tupleData->colvalues[remoteattnum];
625 :
626 125506 : errarg.local_attnum = i;
627 125506 : errarg.remote_attnum = remoteattnum;
628 :
629 125506 : if (tupleData->colstatus[remoteattnum] == LOGICALREP_COLUMN_TEXT)
630 : {
631 : Oid typinput;
632 : Oid typioparam;
633 :
634 82060 : getTypeInputInfo(att->atttypid, &typinput, &typioparam);
635 164120 : slot->tts_values[i] =
636 82060 : OidInputFunctionCall(typinput, colvalue->data,
637 : typioparam, att->atttypmod);
638 82060 : slot->tts_isnull[i] = false;
639 : }
640 43446 : else if (tupleData->colstatus[remoteattnum] == LOGICALREP_COLUMN_BINARY)
641 : {
642 : Oid typreceive;
643 : Oid typioparam;
644 :
645 : /*
646 : * In some code paths we may be asked to re-parse the same
647 : * tuple data. Reset the StringInfo's cursor so that works.
648 : */
649 43362 : colvalue->cursor = 0;
650 :
651 43362 : getTypeBinaryInputInfo(att->atttypid, &typreceive, &typioparam);
652 86724 : slot->tts_values[i] =
653 43362 : OidReceiveFunctionCall(typreceive, colvalue,
654 : typioparam, att->atttypmod);
655 :
656 : /* Trouble if it didn't eat the whole buffer */
657 43362 : if (colvalue->cursor != colvalue->len)
658 0 : ereport(ERROR,
659 : (errcode(ERRCODE_INVALID_BINARY_REPRESENTATION),
660 : errmsg("incorrect binary data format in logical replication column %d",
661 : remoteattnum + 1)));
662 43362 : slot->tts_isnull[i] = false;
663 : }
664 : else
665 : {
666 : /* must be LOGICALREP_COLUMN_NULL */
667 84 : slot->tts_values[i] = (Datum) 0;
668 84 : slot->tts_isnull[i] = true;
669 : }
670 :
671 125506 : errarg.local_attnum = -1;
672 125506 : errarg.remote_attnum = -1;
673 : }
674 : }
675 :
676 : /* Pop the error context stack */
677 62776 : error_context_stack = errcallback.previous;
678 :
679 : /* And finally, declare that "slot" contains a valid virtual tuple */
680 62776 : ExecStoreVirtualTuple(slot);
681 62776 : }
682 :
683 : /*
684 : * Handle BEGIN message.
685 : */
686 : static void
687 500 : apply_handle_begin(StringInfo s)
688 : {
689 : LogicalRepBeginData begin_data;
690 :
691 500 : logicalrep_read_begin(s, &begin_data);
692 :
693 500 : remote_final_lsn = begin_data.final_lsn;
694 :
695 500 : in_remote_transaction = true;
696 :
697 500 : pgstat_report_activity(STATE_RUNNING, NULL);
698 500 : }
699 :
700 : /*
701 : * Handle COMMIT message.
702 : *
703 : * TODO, support tracking of multiple origins
704 : */
705 : static void
706 488 : apply_handle_commit(StringInfo s)
707 : {
708 : LogicalRepCommitData commit_data;
709 :
710 488 : logicalrep_read_commit(s, &commit_data);
711 :
712 488 : Assert(commit_data.commit_lsn == remote_final_lsn);
713 :
714 : /* The synchronization worker runs in single transaction. */
715 488 : if (IsTransactionState() && !am_tablesync_worker())
716 : {
717 : /*
718 : * Update origin state so we can restart streaming from correct
719 : * position in case of crash.
720 : */
721 368 : replorigin_session_origin_lsn = commit_data.end_lsn;
722 368 : replorigin_session_origin_timestamp = commit_data.committime;
723 :
724 368 : CommitTransactionCommand();
725 368 : pgstat_report_stat(false);
726 :
727 368 : store_flush_position(commit_data.end_lsn);
728 : }
729 : else
730 : {
731 : /* Process any invalidation messages that might have accumulated. */
732 120 : AcceptInvalidationMessages();
733 120 : maybe_reread_subscription();
734 : }
735 :
736 488 : in_remote_transaction = false;
737 :
738 : /* Process any tables that are being synchronized in parallel. */
739 488 : process_syncing_tables(commit_data.end_lsn);
740 :
741 484 : pgstat_report_activity(STATE_IDLE, NULL);
742 484 : }
743 :
744 : /*
745 : * Called from apply_handle_prepare to handle a PREPARE TRANSACTION.
746 : */
747 : static void
748 6 : apply_handle_prepare_txn(LogicalRepPrepareData *prepare_data)
749 : {
750 6 : Assert(prepare_data->prepare_lsn == remote_final_lsn);
751 :
752 : /* The synchronization worker runs in single transaction. */
753 6 : if (IsTransactionState() && !am_tablesync_worker())
754 : {
755 : /*
756 : * BeginTransactionBlock is necessary to balance the
757 : * EndTransactionBlock called within the PrepareTransactionBlock
758 : * below.
759 : */
760 6 : BeginTransactionBlock();
761 6 : CommitTransactionCommand();
762 :
763 : /*
764 : * Update origin state so we can restart streaming from correct
765 : * position in case of crash.
766 : */
767 6 : replorigin_session_origin_lsn = prepare_data->end_lsn;
768 6 : replorigin_session_origin_timestamp = prepare_data->preparetime;
769 :
770 6 : PrepareTransactionBlock(prepare_data->gid);
771 6 : CommitTransactionCommand();
772 6 : pgstat_report_stat(false);
773 :
774 6 : store_flush_position(prepare_data->end_lsn);
775 : }
776 : else
777 : {
778 : /* Process any invalidation messages that might have accumulated. */
779 0 : AcceptInvalidationMessages();
780 0 : maybe_reread_subscription();
781 : }
782 :
783 6 : in_remote_transaction = false;
784 :
785 : /* Process any tables that are being synchronized in parallel. */
786 6 : process_syncing_tables(prepare_data->end_lsn);
787 :
788 6 : pgstat_report_activity(STATE_IDLE, NULL);
789 6 : }
790 :
791 : /*
792 : * Called from apply_handle_prepare to handle a COMMIT PREPARED of a previously
793 : * PREPARED transaction.
794 : */
795 : static void
796 16 : apply_handle_commit_prepared_txn(LogicalRepPrepareData *prepare_data)
797 : {
798 : /* there is no transaction when COMMIT PREPARED is called */
799 16 : ensure_transaction();
800 :
801 : /*
802 : * Update origin state so we can restart streaming from correct position
803 : * in case of crash.
804 : */
805 16 : replorigin_session_origin_lsn = prepare_data->end_lsn;
806 16 : replorigin_session_origin_timestamp = prepare_data->preparetime;
807 :
808 16 : FinishPreparedTransaction(prepare_data->gid, true);
809 16 : CommitTransactionCommand();
810 16 : pgstat_report_stat(false);
811 :
812 16 : store_flush_position(prepare_data->end_lsn);
813 16 : in_remote_transaction = false;
814 :
815 : /* Process any tables that are being synchronized in parallel. */
816 16 : process_syncing_tables(prepare_data->end_lsn);
817 :
818 16 : pgstat_report_activity(STATE_IDLE, NULL);
819 16 : }
820 :
821 : /*
822 : * Called from apply_handle_prepare to handle a ROLLBACK PREPARED of a previously
823 : * PREPARED TRANSACTION.
824 : */
825 : static void
826 4 : apply_handle_rollback_prepared_txn(LogicalRepPrepareData *prepare_data)
827 : {
828 : /*
829 : * Update origin state so we can restart streaming from correct position
830 : * in case of crash.
831 : */
832 4 : replorigin_session_origin_lsn = prepare_data->end_lsn;
833 4 : replorigin_session_origin_timestamp = prepare_data->preparetime;
834 :
835 : /*
836 : * During logical decoding, on the apply side, it's possible that a
837 : * prepared transaction got aborted while decoding. In that case, we stop
838 : * the decoding and abort the transaction immediately. However the
839 : * ROLLBACK prepared processing still reaches the subscriber. In that case
840 : * it's ok to have a missing gid
841 : */
842 4 : if (LookupGXact(prepare_data->gid))
843 : {
844 : /* there is no transaction when ABORT/ROLLBACK PREPARED is called */
845 4 : ensure_transaction();
846 4 : FinishPreparedTransaction(prepare_data->gid, false);
847 4 : CommitTransactionCommand();
848 : }
849 :
850 4 : pgstat_report_stat(false);
851 :
852 4 : store_flush_position(prepare_data->end_lsn);
853 4 : in_remote_transaction = false;
854 :
855 : /* Process any tables that are being synchronized in parallel. */
856 4 : process_syncing_tables(prepare_data->end_lsn);
857 :
858 4 : pgstat_report_activity(STATE_IDLE, NULL);
859 4 : }
860 :
861 : /*
862 : * Handle PREPARE message.
863 : */
864 : static void
865 26 : apply_handle_prepare(StringInfo s)
866 : {
867 : LogicalRepPrepareData prepare_data;
868 :
869 26 : logicalrep_read_prepare(s, &prepare_data);
870 :
871 26 : switch (prepare_data.prepare_type)
872 : {
873 : case LOGICALREP_IS_PREPARE:
874 6 : apply_handle_prepare_txn(&prepare_data);
875 6 : break;
876 :
877 : case LOGICALREP_IS_COMMIT_PREPARED:
878 16 : apply_handle_commit_prepared_txn(&prepare_data);
879 16 : break;
880 :
881 : case LOGICALREP_IS_ROLLBACK_PREPARED:
882 4 : apply_handle_rollback_prepared_txn(&prepare_data);
883 4 : break;
884 :
885 : default:
886 0 : ereport(ERROR,
887 : (errcode(ERRCODE_PROTOCOL_VIOLATION),
888 : errmsg("unexpected type of prepare message: %d",
889 : prepare_data.prepare_type)));
890 : }
891 26 : }
892 :
893 : /*
894 : * Handle STREAM PREPARE.
895 : *
896 : * Logic is in two parts:
897 : * 1. Replay all the spooled operations
898 : * 2. Mark the transaction as prepared
899 : */
900 : static void
901 12 : apply_handle_stream_prepare(StringInfo s)
902 : {
903 12 : int nchanges = 0;
904 : LogicalRepPrepareData prepare_data;
905 : TransactionId xid;
906 :
907 12 : Assert(!in_streamed_transaction);
908 :
909 12 : xid = logicalrep_read_stream_prepare(s, &prepare_data);
910 12 : elog(DEBUG1, "received prepare for streamed transaction %u", xid);
911 :
912 : /*
913 : * This should be a PREPARE only. The COMMIT PREPARED and ROLLBACK
914 : * PREPARED for streaming are handled by the non-streaming APIs.
915 : */
916 12 : Assert(prepare_data.prepare_type == LOGICALREP_IS_PREPARE);
917 :
918 : /*
919 : *
920 : * --------------------------------------------------------------------------
921 : * 1. Replay all the spooled operations - Similar code as for
922 : * apply_handle_stream_commit (i.e. non two-phase stream commit)
923 : * --------------------------------------------------------------------------
924 : */
925 :
926 12 : ensure_transaction();
927 :
928 : /*
929 : * BeginTransactionBlock is necessary to balance the EndTransactionBlock
930 : * called within the PrepareTransactionBlock below.
931 : */
932 12 : BeginTransactionBlock();
933 12 : CommitTransactionCommand();
934 :
935 12 : nchanges = apply_spooled_messages(xid, prepare_data.prepare_lsn);
936 :
937 : /*
938 : *
939 : * --------------------------------------------------------------------------
940 : * 2. Mark the transaction as prepared. - Similar code as for
941 : * apply_handle_prepare_txn (i.e. two-phase non-streamed prepare)
942 : * --------------------------------------------------------------------------
943 : */
944 :
945 : /*
946 : * Update origin state so we can restart streaming from correct position
947 : * in case of crash.
948 : */
949 12 : replorigin_session_origin_lsn = prepare_data.end_lsn;
950 12 : replorigin_session_origin_timestamp = prepare_data.preparetime;
951 :
952 12 : PrepareTransactionBlock(prepare_data.gid);
953 12 : CommitTransactionCommand();
954 :
955 12 : pgstat_report_stat(false);
956 :
957 12 : store_flush_position(prepare_data.end_lsn);
958 :
959 12 : elog(DEBUG1, "apply_handle_stream_prepare_txn: replayed %d (all) changes.", nchanges);
960 :
961 12 : in_remote_transaction = false;
962 :
963 : /* Process any tables that are being synchronized in parallel. */
964 12 : process_syncing_tables(prepare_data.end_lsn);
965 :
966 : /* unlink the files with serialized changes and subxact info */
967 12 : stream_cleanup_files(MyLogicalRepWorker->subid, xid);
968 :
969 12 : pgstat_report_activity(STATE_IDLE, NULL);
970 12 : }
971 :
972 : /*
973 : * Handle ORIGIN message.
974 : *
975 : * TODO, support tracking of multiple origins
976 : */
977 : static void
978 0 : apply_handle_origin(StringInfo s)
979 : {
980 : /*
981 : * ORIGIN message can only come inside streaming transaction or inside
982 : * remote transaction and before any actual writes.
983 : */
984 0 : if (!in_streamed_transaction &&
985 0 : (!in_remote_transaction ||
986 0 : (IsTransactionState() && !am_tablesync_worker())))
987 0 : ereport(ERROR,
988 : (errcode(ERRCODE_PROTOCOL_VIOLATION),
989 : errmsg("ORIGIN message sent out of order")));
990 0 : }
991 :
992 : /*
993 : * Handle STREAM START message.
994 : */
995 : static void
996 610 : apply_handle_stream_start(StringInfo s)
997 : {
998 : bool first_segment;
999 : HASHCTL hash_ctl;
1000 :
1001 610 : Assert(!in_streamed_transaction);
1002 :
1003 : /*
1004 : * Start a transaction on stream start, this transaction will be committed
1005 : * on the stream stop. We need the transaction for handling the buffile,
1006 : * used for serializing the streaming data and subxact info.
1007 : */
1008 610 : ensure_transaction();
1009 :
1010 : /* notify handle methods we're processing a remote transaction */
1011 610 : in_streamed_transaction = true;
1012 :
1013 : /* extract XID of the top-level transaction */
1014 610 : stream_xid = logicalrep_read_stream_start(s, &first_segment);
1015 :
1016 : /*
1017 : * Initialize the xidhash table if we haven't yet. This will be used for
1018 : * the entire duration of the apply worker so create it in permanent
1019 : * context.
1020 : */
1021 610 : if (xidhash == NULL)
1022 : {
1023 16 : hash_ctl.keysize = sizeof(TransactionId);
1024 16 : hash_ctl.entrysize = sizeof(StreamXidHash);
1025 16 : hash_ctl.hcxt = ApplyContext;
1026 16 : xidhash = hash_create("StreamXidHash", 1024, &hash_ctl,
1027 : HASH_ELEM | HASH_CONTEXT);
1028 : }
1029 :
1030 : /* open the spool file for this transaction */
1031 610 : stream_open_file(MyLogicalRepWorker->subid, stream_xid, first_segment);
1032 :
1033 : /* if this is not the first segment, open existing subxact file */
1034 610 : if (!first_segment)
1035 566 : subxact_info_read(MyLogicalRepWorker->subid, stream_xid);
1036 :
1037 610 : pgstat_report_activity(STATE_RUNNING, NULL);
1038 610 : }
1039 :
1040 : /*
1041 : * Handle STREAM STOP message.
1042 : */
1043 : static void
1044 610 : apply_handle_stream_stop(StringInfo s)
1045 : {
1046 610 : Assert(in_streamed_transaction);
1047 :
1048 : /*
1049 : * Close the file with serialized changes, and serialize information about
1050 : * subxacts for the toplevel transaction.
1051 : */
1052 610 : subxact_info_write(MyLogicalRepWorker->subid, stream_xid);
1053 610 : stream_close_file();
1054 :
1055 : /* We must be in a valid transaction state */
1056 610 : Assert(IsTransactionState());
1057 :
1058 : /* Commit the per-stream transaction */
1059 610 : CommitTransactionCommand();
1060 :
1061 610 : in_streamed_transaction = false;
1062 :
1063 : /* Reset per-stream context */
1064 610 : MemoryContextReset(LogicalStreamingContext);
1065 :
1066 610 : pgstat_report_activity(STATE_IDLE, NULL);
1067 610 : }
1068 :
1069 : /*
1070 : * Handle STREAM abort message.
1071 : */
1072 : static void
1073 26 : apply_handle_stream_abort(StringInfo s)
1074 : {
1075 : TransactionId xid;
1076 : TransactionId subxid;
1077 :
1078 26 : Assert(!in_streamed_transaction);
1079 :
1080 26 : logicalrep_read_stream_abort(s, &xid, &subxid);
1081 :
1082 : /*
1083 : * If the two XIDs are the same, it's in fact abort of toplevel xact, so
1084 : * just delete the files with serialized info.
1085 : */
1086 26 : if (xid == subxid)
1087 2 : stream_cleanup_files(MyLogicalRepWorker->subid, xid);
1088 : else
1089 : {
1090 : /*
1091 : * OK, so it's a subxact. We need to read the subxact file for the
1092 : * toplevel transaction, determine the offset tracked for the subxact,
1093 : * and truncate the file with changes. We also remove the subxacts
1094 : * with higher offsets (or rather higher XIDs).
1095 : *
1096 : * We intentionally scan the array from the tail, because we're likely
1097 : * aborting a change for the most recent subtransactions.
1098 : *
1099 : * We can't use the binary search here as subxact XIDs won't
1100 : * necessarily arrive in sorted order, consider the case where we have
1101 : * released the savepoint for multiple subtransactions and then
1102 : * performed rollback to savepoint for one of the earlier
1103 : * sub-transaction.
1104 : */
1105 :
1106 : int64 i;
1107 : int64 subidx;
1108 : BufFile *fd;
1109 24 : bool found = false;
1110 : char path[MAXPGPATH];
1111 : StreamXidHash *ent;
1112 :
1113 24 : subidx = -1;
1114 24 : ensure_transaction();
1115 24 : subxact_info_read(MyLogicalRepWorker->subid, xid);
1116 :
1117 28 : for (i = subxact_data.nsubxacts; i > 0; i--)
1118 : {
1119 20 : if (subxact_data.subxacts[i - 1].xid == subxid)
1120 : {
1121 16 : subidx = (i - 1);
1122 16 : found = true;
1123 16 : break;
1124 : }
1125 : }
1126 :
1127 : /*
1128 : * If it's an empty sub-transaction then we will not find the subxid
1129 : * here so just cleanup the subxact info and return.
1130 : */
1131 24 : if (!found)
1132 : {
1133 : /* Cleanup the subxact info */
1134 8 : cleanup_subxact_info();
1135 8 : CommitTransactionCommand();
1136 34 : return;
1137 : }
1138 :
1139 16 : Assert((subidx >= 0) && (subidx < subxact_data.nsubxacts));
1140 :
1141 16 : ent = (StreamXidHash *) hash_search(xidhash,
1142 : (void *) &xid,
1143 : HASH_FIND,
1144 : &found);
1145 16 : Assert(found);
1146 :
1147 : /* open the changes file */
1148 16 : changes_filename(path, MyLogicalRepWorker->subid, xid);
1149 16 : fd = BufFileOpenShared(ent->stream_fileset, path, O_RDWR);
1150 :
1151 : /* OK, truncate the file at the right offset */
1152 16 : BufFileTruncateShared(fd, subxact_data.subxacts[subidx].fileno,
1153 16 : subxact_data.subxacts[subidx].offset);
1154 16 : BufFileClose(fd);
1155 :
1156 : /* discard the subxacts added later */
1157 16 : subxact_data.nsubxacts = subidx;
1158 :
1159 : /* write the updated subxact list */
1160 16 : subxact_info_write(MyLogicalRepWorker->subid, xid);
1161 16 : CommitTransactionCommand();
1162 : }
1163 : }
1164 :
1165 : /*
1166 : * Common spoolfile processing.
1167 : * Returns how many changes were applied.
1168 : */
1169 : static int
1170 40 : apply_spooled_messages(TransactionId xid, XLogRecPtr lsn)
1171 : {
1172 : StringInfoData s2;
1173 : int nchanges;
1174 : char path[MAXPGPATH];
1175 40 : char *buffer = NULL;
1176 : bool found;
1177 : StreamXidHash *ent;
1178 : MemoryContext oldcxt;
1179 : BufFile *fd;
1180 :
1181 : /*
1182 : * Allocate file handle and memory required to process all the messages in
1183 : * TopTransactionContext to avoid them getting reset after each message is
1184 : * processed.
1185 : */
1186 40 : oldcxt = MemoryContextSwitchTo(TopTransactionContext);
1187 :
1188 : /* open the spool file for the committed/prepared transaction */
1189 40 : changes_filename(path, MyLogicalRepWorker->subid, xid);
1190 40 : elog(DEBUG1, "replaying changes from file \"%s\"", path);
1191 40 : ent = (StreamXidHash *) hash_search(xidhash,
1192 : (void *) &xid,
1193 : HASH_FIND,
1194 : &found);
1195 40 : Assert(found);
1196 40 : fd = BufFileOpenShared(ent->stream_fileset, path, O_RDONLY);
1197 :
1198 40 : buffer = palloc(BLCKSZ);
1199 40 : initStringInfo(&s2);
1200 :
1201 40 : MemoryContextSwitchTo(oldcxt);
1202 :
1203 40 : remote_final_lsn = lsn;
1204 :
1205 : /*
1206 : * Make sure the handle apply_dispatch methods are aware we're in a remote
1207 : * transaction.
1208 : */
1209 40 : in_remote_transaction = true;
1210 40 : pgstat_report_activity(STATE_RUNNING, NULL);
1211 :
1212 : /*
1213 : * Read the entries one by one and pass them through the same logic as in
1214 : * apply_dispatch.
1215 : */
1216 40 : nchanges = 0;
1217 : while (true)
1218 : {
1219 : int nbytes;
1220 : int len;
1221 :
1222 226192 : CHECK_FOR_INTERRUPTS();
1223 :
1224 : /* read length of the on-disk record */
1225 226192 : nbytes = BufFileRead(fd, &len, sizeof(len));
1226 :
1227 : /* have we reached end of the file? */
1228 226192 : if (nbytes == 0)
1229 40 : break;
1230 :
1231 : /* do we have a correct length? */
1232 226152 : if (nbytes != sizeof(len))
1233 0 : ereport(ERROR,
1234 : (errcode_for_file_access(),
1235 : errmsg("could not read from streaming transaction's changes file \"%s\": %m",
1236 : path)));
1237 :
1238 226152 : Assert(len > 0);
1239 :
1240 : /* make sure we have sufficiently large buffer */
1241 226152 : buffer = repalloc(buffer, len);
1242 :
1243 : /* and finally read the data into the buffer */
1244 226152 : if (BufFileRead(fd, buffer, len) != len)
1245 0 : ereport(ERROR,
1246 : (errcode_for_file_access(),
1247 : errmsg("could not read from streaming transaction's changes file \"%s\": %m",
1248 : path)));
1249 :
1250 : /* copy the buffer to the stringinfo and call apply_dispatch */
1251 226152 : resetStringInfo(&s2);
1252 226152 : appendBinaryStringInfo(&s2, buffer, len);
1253 :
1254 : /* Ensure we are reading the data into our memory context. */
1255 226152 : oldcxt = MemoryContextSwitchTo(ApplyMessageContext);
1256 :
1257 226152 : apply_dispatch(&s2);
1258 :
1259 226152 : MemoryContextReset(ApplyMessageContext);
1260 :
1261 226152 : MemoryContextSwitchTo(oldcxt);
1262 :
1263 226152 : nchanges++;
1264 :
1265 226152 : if (nchanges % 1000 == 0)
1266 214 : elog(DEBUG1, "replayed %d changes from file '%s'",
1267 : nchanges, path);
1268 226152 : }
1269 :
1270 40 : BufFileClose(fd);
1271 :
1272 40 : pfree(buffer);
1273 40 : pfree(s2.data);
1274 :
1275 40 : elog(DEBUG1, "replayed %d (all) changes from file \"%s\"",
1276 : nchanges, path);
1277 :
1278 40 : return nchanges;
1279 : }
1280 :
1281 : /*
1282 : * Handle STREAM COMMIT message.
1283 : */
1284 : static void
1285 28 : apply_handle_stream_commit(StringInfo s)
1286 : {
1287 : TransactionId xid;
1288 : LogicalRepCommitData commit_data;
1289 28 : int nchanges = 0;
1290 :
1291 28 : Assert(!in_streamed_transaction);
1292 :
1293 28 : xid = logicalrep_read_stream_commit(s, &commit_data);
1294 :
1295 28 : elog(DEBUG1, "received commit for streamed transaction %u", xid);
1296 :
1297 28 : ensure_transaction();
1298 :
1299 28 : nchanges = apply_spooled_messages(xid, commit_data.commit_lsn);
1300 :
1301 : /*
1302 : * Update origin state so we can restart streaming from correct position
1303 : * in case of crash.
1304 : */
1305 28 : replorigin_session_origin_lsn = commit_data.end_lsn;
1306 28 : replorigin_session_origin_timestamp = commit_data.committime;
1307 :
1308 28 : CommitTransactionCommand();
1309 28 : pgstat_report_stat(false);
1310 :
1311 28 : store_flush_position(commit_data.end_lsn);
1312 :
1313 28 : elog(DEBUG1, "apply_handle_stream_commit: replayed %d (all) changes.", nchanges);
1314 :
1315 28 : in_remote_transaction = false;
1316 :
1317 : /* Process any tables that are being synchronized in parallel. */
1318 28 : process_syncing_tables(commit_data.end_lsn);
1319 :
1320 : /* unlink the files with serialized changes and subxact info */
1321 28 : stream_cleanup_files(MyLogicalRepWorker->subid, xid);
1322 :
1323 28 : pgstat_report_activity(STATE_IDLE, NULL);
1324 28 : }
1325 :
1326 : /*
1327 : * Handle RELATION message.
1328 : *
1329 : * Note we don't do validation against local schema here. The validation
1330 : * against local schema is postponed until first change for given relation
1331 : * comes as we only care about it when applying changes for it anyway and we
1332 : * do less locking this way.
1333 : */
1334 : static void
1335 310 : apply_handle_relation(StringInfo s)
1336 : {
1337 : LogicalRepRelation *rel;
1338 :
1339 310 : if (handle_streamed_transaction('R', s))
1340 370 : return;
1341 :
1342 250 : rel = logicalrep_read_rel(s);
1343 250 : logicalrep_relmap_update(rel);
1344 : }
1345 :
1346 : /*
1347 : * Handle TYPE message.
1348 : *
1349 : * Note we don't do local mapping here, that's done when the type is
1350 : * actually used.
1351 : */
1352 : static void
1353 32 : apply_handle_type(StringInfo s)
1354 : {
1355 : LogicalRepTyp typ;
1356 :
1357 32 : if (handle_streamed_transaction('Y', s))
1358 32 : return;
1359 :
1360 32 : logicalrep_read_typ(s, &typ);
1361 32 : logicalrep_typmap_update(&typ);
1362 : }
1363 :
1364 : /*
1365 : * Get replica identity index or if it is not defined a primary key.
1366 : *
1367 : * If neither is defined, returns InvalidOid
1368 : */
1369 : static Oid
1370 124878 : GetRelationIdentityOrPK(Relation rel)
1371 : {
1372 : Oid idxoid;
1373 :
1374 124878 : idxoid = RelationGetReplicaIndex(rel);
1375 :
1376 124878 : if (!OidIsValid(idxoid))
1377 246 : idxoid = RelationGetPrimaryKeyIndex(rel);
1378 :
1379 124878 : return idxoid;
1380 : }
1381 :
1382 : /*
1383 : * Handle INSERT message.
1384 : */
1385 :
1386 : static void
1387 224746 : apply_handle_insert(StringInfo s)
1388 : {
1389 : ResultRelInfo *resultRelInfo;
1390 : LogicalRepRelMapEntry *rel;
1391 : LogicalRepTupleData newtup;
1392 : LogicalRepRelId relid;
1393 : EState *estate;
1394 : TupleTableSlot *remoteslot;
1395 : MemoryContext oldctx;
1396 :
1397 224746 : if (handle_streamed_transaction('I', s))
1398 243802 : return;
1399 :
1400 102858 : ensure_transaction();
1401 :
1402 102854 : relid = logicalrep_read_insert(s, &newtup);
1403 102854 : rel = logicalrep_rel_open(relid, RowExclusiveLock);
1404 102852 : if (!should_apply_changes_for_rel(rel))
1405 : {
1406 : /*
1407 : * The relation can't become interesting in the middle of the
1408 : * transaction so it's safe to unlock it.
1409 : */
1410 26 : logicalrep_rel_close(rel, RowExclusiveLock);
1411 26 : return;
1412 : }
1413 :
1414 : /* Initialize the executor state. */
1415 102826 : estate = create_estate_for_relation(rel);
1416 102826 : remoteslot = ExecInitExtraTupleSlot(estate,
1417 102826 : RelationGetDescr(rel->localrel),
1418 : &TTSOpsVirtual);
1419 102826 : resultRelInfo = makeNode(ResultRelInfo);
1420 102826 : InitResultRelInfo(resultRelInfo, rel->localrel, 1, NULL, 0);
1421 :
1422 : /* Input functions may need an active snapshot, so get one */
1423 102826 : PushActiveSnapshot(GetTransactionSnapshot());
1424 :
1425 : /* Process and store remote tuple in the slot */
1426 102826 : oldctx = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate));
1427 102826 : slot_store_data(remoteslot, rel, &newtup);
1428 102826 : slot_fill_defaults(rel, estate, remoteslot);
1429 102826 : MemoryContextSwitchTo(oldctx);
1430 :
1431 : /* For a partitioned table, insert the tuple into a partition. */
1432 102826 : if (rel->localrel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
1433 36 : apply_handle_tuple_routing(resultRelInfo, estate,
1434 : remoteslot, NULL, rel, CMD_INSERT);
1435 : else
1436 102790 : apply_handle_insert_internal(resultRelInfo, estate,
1437 : remoteslot);
1438 :
1439 102826 : PopActiveSnapshot();
1440 :
1441 : /* Handle queued AFTER triggers. */
1442 102826 : AfterTriggerEndQuery(estate);
1443 :
1444 102826 : ExecResetTupleTable(estate->es_tupleTable, false);
1445 102826 : FreeExecutorState(estate);
1446 :
1447 102826 : logicalrep_rel_close(rel, NoLock);
1448 :
1449 102826 : CommandCounterIncrement();
1450 : }
1451 :
1452 : /* Workhorse for apply_handle_insert() */
1453 : static void
1454 102828 : apply_handle_insert_internal(ResultRelInfo *relinfo,
1455 : EState *estate, TupleTableSlot *remoteslot)
1456 : {
1457 102828 : ExecOpenIndices(relinfo, false);
1458 :
1459 : /* Do the insert. */
1460 102828 : ExecSimpleRelationInsert(relinfo, estate, remoteslot);
1461 :
1462 : /* Cleanup. */
1463 102828 : ExecCloseIndices(relinfo);
1464 102828 : }
1465 :
1466 : /*
1467 : * Check if the logical replication relation is updatable and throw
1468 : * appropriate error if it isn't.
1469 : */
1470 : static void
1471 124876 : check_relation_updatable(LogicalRepRelMapEntry *rel)
1472 : {
1473 : /* Updatable, no error. */
1474 124876 : if (rel->updatable)
1475 249752 : return;
1476 :
1477 : /*
1478 : * We are in error mode so it's fine this is somewhat slow. It's better to
1479 : * give user correct error.
1480 : */
1481 0 : if (OidIsValid(GetRelationIdentityOrPK(rel->localrel)))
1482 : {
1483 0 : ereport(ERROR,
1484 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1485 : errmsg("publisher did not send replica identity column "
1486 : "expected by the logical replication target relation \"%s.%s\"",
1487 : rel->remoterel.nspname, rel->remoterel.relname)));
1488 : }
1489 :
1490 0 : ereport(ERROR,
1491 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1492 : errmsg("logical replication target relation \"%s.%s\" has "
1493 : "neither REPLICA IDENTITY index nor PRIMARY "
1494 : "KEY and published relation does not have "
1495 : "REPLICA IDENTITY FULL",
1496 : rel->remoterel.nspname, rel->remoterel.relname)));
1497 : }
1498 :
1499 : /*
1500 : * Handle UPDATE message.
1501 : *
1502 : * TODO: FDW support
1503 : */
1504 : static void
1505 126428 : apply_handle_update(StringInfo s)
1506 : {
1507 : ResultRelInfo *resultRelInfo;
1508 : LogicalRepRelMapEntry *rel;
1509 : LogicalRepRelId relid;
1510 : EState *estate;
1511 : LogicalRepTupleData oldtup;
1512 : LogicalRepTupleData newtup;
1513 : bool has_oldtup;
1514 : TupleTableSlot *remoteslot;
1515 : RangeTblEntry *target_rte;
1516 : MemoryContext oldctx;
1517 :
1518 126428 : if (handle_streamed_transaction('U', s))
1519 127304 : return;
1520 :
1521 62776 : ensure_transaction();
1522 :
1523 62776 : relid = logicalrep_read_update(s, &has_oldtup, &oldtup,
1524 : &newtup);
1525 62776 : rel = logicalrep_rel_open(relid, RowExclusiveLock);
1526 62776 : if (!should_apply_changes_for_rel(rel))
1527 : {
1528 : /*
1529 : * The relation can't become interesting in the middle of the
1530 : * transaction so it's safe to unlock it.
1531 : */
1532 0 : logicalrep_rel_close(rel, RowExclusiveLock);
1533 0 : return;
1534 : }
1535 :
1536 : /* Check if we can do the update. */
1537 62776 : check_relation_updatable(rel);
1538 :
1539 : /* Initialize the executor state. */
1540 62776 : estate = create_estate_for_relation(rel);
1541 62776 : remoteslot = ExecInitExtraTupleSlot(estate,
1542 62776 : RelationGetDescr(rel->localrel),
1543 : &TTSOpsVirtual);
1544 62776 : resultRelInfo = makeNode(ResultRelInfo);
1545 62776 : InitResultRelInfo(resultRelInfo, rel->localrel, 1, NULL, 0);
1546 :
1547 : /*
1548 : * Populate updatedCols so that per-column triggers can fire. This could
1549 : * include more columns than were actually changed on the publisher
1550 : * because the logical replication protocol doesn't contain that
1551 : * information. But it would for example exclude columns that only exist
1552 : * on the subscriber, since we are not touching those.
1553 : */
1554 62776 : target_rte = list_nth(estate->es_range_table, 0);
1555 313368 : for (int i = 0; i < remoteslot->tts_tupleDescriptor->natts; i++)
1556 : {
1557 250592 : Form_pg_attribute att = TupleDescAttr(remoteslot->tts_tupleDescriptor, i);
1558 250592 : int remoteattnum = rel->attrmap->attnums[i];
1559 :
1560 250592 : if (!att->attisdropped && remoteattnum >= 0)
1561 : {
1562 125512 : Assert(remoteattnum < newtup.ncols);
1563 125512 : if (newtup.colstatus[remoteattnum] != LOGICALREP_COLUMN_UNCHANGED)
1564 125506 : target_rte->updatedCols =
1565 125506 : bms_add_member(target_rte->updatedCols,
1566 : i + 1 - FirstLowInvalidHeapAttributeNumber);
1567 : }
1568 : }
1569 :
1570 : /* Also populate extraUpdatedCols, in case we have generated columns */
1571 62776 : fill_extraUpdatedCols(target_rte, rel->localrel);
1572 :
1573 62776 : PushActiveSnapshot(GetTransactionSnapshot());
1574 :
1575 : /* Build the search tuple. */
1576 62776 : oldctx = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate));
1577 62776 : slot_store_data(remoteslot, rel,
1578 : has_oldtup ? &oldtup : &newtup);
1579 62776 : MemoryContextSwitchTo(oldctx);
1580 :
1581 : /* For a partitioned table, apply update to correct partition. */
1582 62776 : if (rel->localrel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
1583 10 : apply_handle_tuple_routing(resultRelInfo, estate,
1584 : remoteslot, &newtup, rel, CMD_UPDATE);
1585 : else
1586 62766 : apply_handle_update_internal(resultRelInfo, estate,
1587 : remoteslot, &newtup, rel);
1588 :
1589 62776 : PopActiveSnapshot();
1590 :
1591 : /* Handle queued AFTER triggers. */
1592 62776 : AfterTriggerEndQuery(estate);
1593 :
1594 62776 : ExecResetTupleTable(estate->es_tupleTable, false);
1595 62776 : FreeExecutorState(estate);
1596 :
1597 62776 : logicalrep_rel_close(rel, NoLock);
1598 :
1599 62776 : CommandCounterIncrement();
1600 : }
1601 :
1602 : /* Workhorse for apply_handle_update() */
1603 : static void
1604 62766 : apply_handle_update_internal(ResultRelInfo *relinfo,
1605 : EState *estate, TupleTableSlot *remoteslot,
1606 : LogicalRepTupleData *newtup,
1607 : LogicalRepRelMapEntry *relmapentry)
1608 : {
1609 62766 : Relation localrel = relinfo->ri_RelationDesc;
1610 : EPQState epqstate;
1611 : TupleTableSlot *localslot;
1612 : bool found;
1613 : MemoryContext oldctx;
1614 :
1615 62766 : EvalPlanQualInit(&epqstate, estate, NULL, NIL, -1);
1616 62766 : ExecOpenIndices(relinfo, false);
1617 :
1618 62766 : found = FindReplTupleInLocalRel(estate, localrel,
1619 : &relmapentry->remoterel,
1620 : remoteslot, &localslot);
1621 62766 : ExecClearTuple(remoteslot);
1622 :
1623 : /*
1624 : * Tuple found.
1625 : *
1626 : * Note this will fail if there are other conflicting unique indexes.
1627 : */
1628 62766 : if (found)
1629 : {
1630 : /* Process and store remote tuple in the slot */
1631 62766 : oldctx = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate));
1632 62766 : slot_modify_data(remoteslot, localslot, relmapentry, newtup);
1633 62766 : MemoryContextSwitchTo(oldctx);
1634 :
1635 62766 : EvalPlanQualSetSlot(&epqstate, remoteslot);
1636 :
1637 : /* Do the actual update. */
1638 62766 : ExecSimpleRelationUpdate(relinfo, estate, &epqstate, localslot,
1639 : remoteslot);
1640 : }
1641 : else
1642 : {
1643 : /*
1644 : * The tuple to be updated could not be found.
1645 : *
1646 : * TODO what to do here, change the log level to LOG perhaps?
1647 : */
1648 0 : elog(DEBUG1,
1649 : "logical replication did not find row for update "
1650 : "in replication target relation \"%s\"",
1651 : RelationGetRelationName(localrel));
1652 : }
1653 :
1654 : /* Cleanup. */
1655 62766 : ExecCloseIndices(relinfo);
1656 62766 : EvalPlanQualEnd(&epqstate);
1657 62766 : }
1658 :
1659 : /*
1660 : * Handle DELETE message.
1661 : *
1662 : * TODO: FDW support
1663 : */
1664 : static void
1665 123746 : apply_handle_delete(StringInfo s)
1666 : {
1667 : ResultRelInfo *resultRelInfo;
1668 : LogicalRepRelMapEntry *rel;
1669 : LogicalRepTupleData oldtup;
1670 : LogicalRepRelId relid;
1671 : EState *estate;
1672 : TupleTableSlot *remoteslot;
1673 : MemoryContext oldctx;
1674 :
1675 123746 : if (handle_streamed_transaction('D', s))
1676 123292 : return;
1677 :
1678 62100 : ensure_transaction();
1679 :
1680 62100 : relid = logicalrep_read_delete(s, &oldtup);
1681 62100 : rel = logicalrep_rel_open(relid, RowExclusiveLock);
1682 62100 : if (!should_apply_changes_for_rel(rel))
1683 : {
1684 : /*
1685 : * The relation can't become interesting in the middle of the
1686 : * transaction so it's safe to unlock it.
1687 : */
1688 0 : logicalrep_rel_close(rel, RowExclusiveLock);
1689 0 : return;
1690 : }
1691 :
1692 : /* Check if we can do the delete. */
1693 62100 : check_relation_updatable(rel);
1694 :
1695 : /* Initialize the executor state. */
1696 62100 : estate = create_estate_for_relation(rel);
1697 62100 : remoteslot = ExecInitExtraTupleSlot(estate,
1698 62100 : RelationGetDescr(rel->localrel),
1699 : &TTSOpsVirtual);
1700 62100 : resultRelInfo = makeNode(ResultRelInfo);
1701 62100 : InitResultRelInfo(resultRelInfo, rel->localrel, 1, NULL, 0);
1702 :
1703 62100 : PushActiveSnapshot(GetTransactionSnapshot());
1704 :
1705 : /* Build the search tuple. */
1706 62100 : oldctx = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate));
1707 62100 : slot_store_data(remoteslot, rel, &oldtup);
1708 62100 : MemoryContextSwitchTo(oldctx);
1709 :
1710 : /* For a partitioned table, apply delete to correct partition. */
1711 62100 : if (rel->localrel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
1712 24 : apply_handle_tuple_routing(resultRelInfo, estate,
1713 : remoteslot, NULL, rel, CMD_DELETE);
1714 : else
1715 62076 : apply_handle_delete_internal(resultRelInfo, estate,
1716 : remoteslot, &rel->remoterel);
1717 :
1718 62100 : PopActiveSnapshot();
1719 :
1720 : /* Handle queued AFTER triggers. */
1721 62100 : AfterTriggerEndQuery(estate);
1722 :
1723 62100 : ExecResetTupleTable(estate->es_tupleTable, false);
1724 62100 : FreeExecutorState(estate);
1725 :
1726 62100 : logicalrep_rel_close(rel, NoLock);
1727 :
1728 62100 : CommandCounterIncrement();
1729 : }
1730 :
1731 : /* Workhorse for apply_handle_delete() */
1732 : static void
1733 62102 : apply_handle_delete_internal(ResultRelInfo *relinfo, EState *estate,
1734 : TupleTableSlot *remoteslot,
1735 : LogicalRepRelation *remoterel)
1736 : {
1737 62102 : Relation localrel = relinfo->ri_RelationDesc;
1738 : EPQState epqstate;
1739 : TupleTableSlot *localslot;
1740 : bool found;
1741 :
1742 62102 : EvalPlanQualInit(&epqstate, estate, NULL, NIL, -1);
1743 62102 : ExecOpenIndices(relinfo, false);
1744 :
1745 62102 : found = FindReplTupleInLocalRel(estate, localrel, remoterel,
1746 : remoteslot, &localslot);
1747 :
1748 : /* If found delete it. */
1749 62102 : if (found)
1750 : {
1751 62102 : EvalPlanQualSetSlot(&epqstate, localslot);
1752 :
1753 : /* Do the actual delete. */
1754 62102 : ExecSimpleRelationDelete(relinfo, estate, &epqstate, localslot);
1755 : }
1756 : else
1757 : {
1758 : /* The tuple to be deleted could not be found. */
1759 0 : elog(DEBUG1,
1760 : "logical replication could not find row for delete "
1761 : "in replication target relation \"%s\"",
1762 : RelationGetRelationName(localrel));
1763 : }
1764 :
1765 : /* Cleanup. */
1766 62102 : ExecCloseIndices(relinfo);
1767 62102 : EvalPlanQualEnd(&epqstate);
1768 62102 : }
1769 :
1770 : /*
1771 : * Try to find a tuple received from the publication side (in 'remoteslot') in
1772 : * the corresponding local relation using either replica identity index,
1773 : * primary key or if needed, sequential scan.
1774 : *
1775 : * Local tuple, if found, is returned in '*localslot'.
1776 : */
1777 : static bool
1778 124878 : FindReplTupleInLocalRel(EState *estate, Relation localrel,
1779 : LogicalRepRelation *remoterel,
1780 : TupleTableSlot *remoteslot,
1781 : TupleTableSlot **localslot)
1782 : {
1783 : Oid idxoid;
1784 : bool found;
1785 :
1786 124878 : *localslot = table_slot_create(localrel, &estate->es_tupleTable);
1787 :
1788 124878 : idxoid = GetRelationIdentityOrPK(localrel);
1789 124878 : Assert(OidIsValid(idxoid) ||
1790 : (remoterel->replident == REPLICA_IDENTITY_FULL));
1791 :
1792 124878 : if (OidIsValid(idxoid))
1793 124634 : found = RelationFindReplTupleByIndex(localrel, idxoid,
1794 : LockTupleExclusive,
1795 : remoteslot, *localslot);
1796 : else
1797 244 : found = RelationFindReplTupleSeq(localrel, LockTupleExclusive,
1798 : remoteslot, *localslot);
1799 :
1800 124878 : return found;
1801 : }
1802 :
1803 : /*
1804 : * This handles insert, update, delete on a partitioned table.
1805 : */
1806 : static void
1807 70 : apply_handle_tuple_routing(ResultRelInfo *relinfo,
1808 : EState *estate,
1809 : TupleTableSlot *remoteslot,
1810 : LogicalRepTupleData *newtup,
1811 : LogicalRepRelMapEntry *relmapentry,
1812 : CmdType operation)
1813 : {
1814 70 : Relation parentrel = relinfo->ri_RelationDesc;
1815 70 : ModifyTableState *mtstate = NULL;
1816 70 : PartitionTupleRouting *proute = NULL;
1817 : ResultRelInfo *partrelinfo;
1818 : Relation partrel;
1819 : TupleTableSlot *remoteslot_part;
1820 : TupleConversionMap *map;
1821 : MemoryContext oldctx;
1822 :
1823 : /* ModifyTableState is needed for ExecFindPartition(). */
1824 70 : mtstate = makeNode(ModifyTableState);
1825 70 : mtstate->ps.plan = NULL;
1826 70 : mtstate->ps.state = estate;
1827 70 : mtstate->operation = operation;
1828 70 : mtstate->resultRelInfo = relinfo;
1829 70 : proute = ExecSetupPartitionTupleRouting(estate, mtstate, parentrel);
1830 :
1831 : /*
1832 : * Find the partition to which the "search tuple" belongs.
1833 : */
1834 70 : Assert(remoteslot != NULL);
1835 70 : oldctx = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate));
1836 70 : partrelinfo = ExecFindPartition(mtstate, relinfo, proute,
1837 : remoteslot, estate);
1838 70 : Assert(partrelinfo != NULL);
1839 70 : partrel = partrelinfo->ri_RelationDesc;
1840 :
1841 : /*
1842 : * To perform any of the operations below, the tuple must match the
1843 : * partition's rowtype. Convert if needed or just copy, using a dedicated
1844 : * slot to store the tuple in any case.
1845 : */
1846 70 : remoteslot_part = partrelinfo->ri_PartitionTupleSlot;
1847 70 : if (remoteslot_part == NULL)
1848 22 : remoteslot_part = table_slot_create(partrel, &estate->es_tupleTable);
1849 70 : map = partrelinfo->ri_RootToPartitionMap;
1850 70 : if (map != NULL)
1851 48 : remoteslot_part = execute_attr_map_slot(map->attrMap, remoteslot,
1852 : remoteslot_part);
1853 : else
1854 : {
1855 22 : remoteslot_part = ExecCopySlot(remoteslot_part, remoteslot);
1856 22 : slot_getallattrs(remoteslot_part);
1857 : }
1858 70 : MemoryContextSwitchTo(oldctx);
1859 :
1860 70 : switch (operation)
1861 : {
1862 : case CMD_INSERT:
1863 36 : apply_handle_insert_internal(partrelinfo, estate,
1864 : remoteslot_part);
1865 36 : break;
1866 :
1867 : case CMD_DELETE:
1868 24 : apply_handle_delete_internal(partrelinfo, estate,
1869 : remoteslot_part,
1870 : &relmapentry->remoterel);
1871 24 : break;
1872 :
1873 : case CMD_UPDATE:
1874 :
1875 : /*
1876 : * For UPDATE, depending on whether or not the updated tuple
1877 : * satisfies the partition's constraint, perform a simple UPDATE
1878 : * of the partition or move the updated tuple into a different
1879 : * suitable partition.
1880 : */
1881 : {
1882 10 : AttrMap *attrmap = map ? map->attrMap : NULL;
1883 : LogicalRepRelMapEntry *part_entry;
1884 : TupleTableSlot *localslot;
1885 : ResultRelInfo *partrelinfo_new;
1886 : bool found;
1887 :
1888 10 : part_entry = logicalrep_partition_open(relmapentry, partrel,
1889 : attrmap);
1890 :
1891 : /* Get the matching local tuple from the partition. */
1892 10 : found = FindReplTupleInLocalRel(estate, partrel,
1893 : &part_entry->remoterel,
1894 : remoteslot_part, &localslot);
1895 :
1896 10 : oldctx = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate));
1897 10 : if (found)
1898 : {
1899 : /* Apply the update. */
1900 10 : slot_modify_data(remoteslot_part, localslot,
1901 : part_entry,
1902 : newtup);
1903 10 : MemoryContextSwitchTo(oldctx);
1904 : }
1905 : else
1906 : {
1907 : /*
1908 : * The tuple to be updated could not be found.
1909 : *
1910 : * TODO what to do here, change the log level to LOG
1911 : * perhaps?
1912 : */
1913 0 : elog(DEBUG1,
1914 : "logical replication did not find row for update "
1915 : "in replication target relation \"%s\"",
1916 : RelationGetRelationName(partrel));
1917 : }
1918 :
1919 : /*
1920 : * Does the updated tuple still satisfy the current
1921 : * partition's constraint?
1922 : */
1923 20 : if (!partrel->rd_rel->relispartition ||
1924 10 : ExecPartitionCheck(partrelinfo, remoteslot_part, estate,
1925 : false))
1926 8 : {
1927 : /*
1928 : * Yes, so simply UPDATE the partition. We don't call
1929 : * apply_handle_update_internal() here, which would
1930 : * normally do the following work, to avoid repeating some
1931 : * work already done above to find the local tuple in the
1932 : * partition.
1933 : */
1934 : EPQState epqstate;
1935 :
1936 8 : EvalPlanQualInit(&epqstate, estate, NULL, NIL, -1);
1937 8 : ExecOpenIndices(partrelinfo, false);
1938 :
1939 8 : EvalPlanQualSetSlot(&epqstate, remoteslot_part);
1940 8 : ExecSimpleRelationUpdate(partrelinfo, estate, &epqstate,
1941 : localslot, remoteslot_part);
1942 8 : ExecCloseIndices(partrelinfo);
1943 8 : EvalPlanQualEnd(&epqstate);
1944 : }
1945 : else
1946 : {
1947 : /* Move the tuple into the new partition. */
1948 :
1949 : /*
1950 : * New partition will be found using tuple routing, which
1951 : * can only occur via the parent table. We might need to
1952 : * convert the tuple to the parent's rowtype. Note that
1953 : * this is the tuple found in the partition, not the
1954 : * original search tuple received by this function.
1955 : */
1956 2 : if (map)
1957 : {
1958 2 : TupleConversionMap *PartitionToRootMap =
1959 2 : convert_tuples_by_name(RelationGetDescr(partrel),
1960 : RelationGetDescr(parentrel));
1961 :
1962 2 : remoteslot =
1963 2 : execute_attr_map_slot(PartitionToRootMap->attrMap,
1964 : remoteslot_part, remoteslot);
1965 : }
1966 : else
1967 : {
1968 0 : remoteslot = ExecCopySlot(remoteslot, remoteslot_part);
1969 0 : slot_getallattrs(remoteslot);
1970 : }
1971 :
1972 :
1973 : /* Find the new partition. */
1974 2 : oldctx = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate));
1975 2 : partrelinfo_new = ExecFindPartition(mtstate, relinfo,
1976 : proute, remoteslot,
1977 : estate);
1978 2 : MemoryContextSwitchTo(oldctx);
1979 2 : Assert(partrelinfo_new != partrelinfo);
1980 :
1981 : /* DELETE old tuple found in the old partition. */
1982 2 : apply_handle_delete_internal(partrelinfo, estate,
1983 : localslot,
1984 : &relmapentry->remoterel);
1985 :
1986 : /* INSERT new tuple into the new partition. */
1987 :
1988 : /*
1989 : * Convert the replacement tuple to match the destination
1990 : * partition rowtype.
1991 : */
1992 2 : oldctx = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate));
1993 2 : partrel = partrelinfo_new->ri_RelationDesc;
1994 2 : remoteslot_part = partrelinfo_new->ri_PartitionTupleSlot;
1995 2 : if (remoteslot_part == NULL)
1996 2 : remoteslot_part = table_slot_create(partrel,
1997 : &estate->es_tupleTable);
1998 2 : map = partrelinfo_new->ri_RootToPartitionMap;
1999 2 : if (map != NULL)
2000 : {
2001 0 : remoteslot_part = execute_attr_map_slot(map->attrMap,
2002 : remoteslot,
2003 : remoteslot_part);
2004 : }
2005 : else
2006 : {
2007 2 : remoteslot_part = ExecCopySlot(remoteslot_part,
2008 : remoteslot);
2009 2 : slot_getallattrs(remoteslot);
2010 : }
2011 2 : MemoryContextSwitchTo(oldctx);
2012 2 : apply_handle_insert_internal(partrelinfo_new, estate,
2013 : remoteslot_part);
2014 : }
2015 : }
2016 10 : break;
2017 :
2018 : default:
2019 0 : elog(ERROR, "unrecognized CmdType: %d", (int) operation);
2020 : break;
2021 : }
2022 :
2023 70 : ExecCleanupTupleRouting(mtstate, proute);
2024 70 : }
2025 :
2026 : /*
2027 : * Handle TRUNCATE message.
2028 : *
2029 : * TODO: FDW support
2030 : */
2031 : static void
2032 24 : apply_handle_truncate(StringInfo s)
2033 : {
2034 24 : bool cascade = false;
2035 24 : bool restart_seqs = false;
2036 24 : List *remote_relids = NIL;
2037 24 : List *remote_rels = NIL;
2038 24 : List *rels = NIL;
2039 24 : List *part_rels = NIL;
2040 24 : List *relids = NIL;
2041 24 : List *relids_logged = NIL;
2042 : ListCell *lc;
2043 :
2044 24 : if (handle_streamed_transaction('T', s))
2045 24 : return;
2046 :
2047 24 : ensure_transaction();
2048 :
2049 24 : remote_relids = logicalrep_read_truncate(s, &cascade, &restart_seqs);
2050 :
2051 64 : foreach(lc, remote_relids)
2052 : {
2053 40 : LogicalRepRelId relid = lfirst_oid(lc);
2054 : LogicalRepRelMapEntry *rel;
2055 :
2056 40 : rel = logicalrep_rel_open(relid, RowExclusiveLock);
2057 40 : if (!should_apply_changes_for_rel(rel))
2058 : {
2059 : /*
2060 : * The relation can't become interesting in the middle of the
2061 : * transaction so it's safe to unlock it.
2062 : */
2063 0 : logicalrep_rel_close(rel, RowExclusiveLock);
2064 0 : continue;
2065 : }
2066 :
2067 40 : remote_rels = lappend(remote_rels, rel);
2068 40 : rels = lappend(rels, rel->localrel);
2069 40 : relids = lappend_oid(relids, rel->localreloid);
2070 40 : if (RelationIsLogicallyLogged(rel->localrel))
2071 40 : relids_logged = lappend_oid(relids_logged, rel->localreloid);
2072 :
2073 : /*
2074 : * Truncate partitions if we got a message to truncate a partitioned
2075 : * table.
2076 : */
2077 40 : if (rel->localrel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
2078 : {
2079 : ListCell *child;
2080 8 : List *children = find_all_inheritors(rel->localreloid,
2081 : RowExclusiveLock,
2082 : NULL);
2083 :
2084 30 : foreach(child, children)
2085 : {
2086 22 : Oid childrelid = lfirst_oid(child);
2087 : Relation childrel;
2088 :
2089 22 : if (list_member_oid(relids, childrelid))
2090 8 : continue;
2091 :
2092 : /* find_all_inheritors already got lock */
2093 14 : childrel = table_open(childrelid, NoLock);
2094 :
2095 : /*
2096 : * Ignore temp tables of other backends. See similar code in
2097 : * ExecuteTruncate().
2098 : */
2099 14 : if (RELATION_IS_OTHER_TEMP(childrel))
2100 : {
2101 0 : table_close(childrel, RowExclusiveLock);
2102 0 : continue;
2103 : }
2104 :
2105 14 : rels = lappend(rels, childrel);
2106 14 : part_rels = lappend(part_rels, childrel);
2107 14 : relids = lappend_oid(relids, childrelid);
2108 : /* Log this relation only if needed for logical decoding */
2109 14 : if (RelationIsLogicallyLogged(childrel))
2110 14 : relids_logged = lappend_oid(relids_logged, childrelid);
2111 : }
2112 : }
2113 : }
2114 :
2115 : /*
2116 : * Even if we used CASCADE on the upstream primary we explicitly default
2117 : * to replaying changes without further cascading. This might be later
2118 : * changeable with a user specified option.
2119 : */
2120 24 : ExecuteTruncateGuts(rels, relids, relids_logged, DROP_RESTRICT, restart_seqs);
2121 :
2122 64 : foreach(lc, remote_rels)
2123 : {
2124 40 : LogicalRepRelMapEntry *rel = lfirst(lc);
2125 :
2126 40 : logicalrep_rel_close(rel, NoLock);
2127 : }
2128 38 : foreach(lc, part_rels)
2129 : {
2130 14 : Relation rel = lfirst(lc);
2131 :
2132 14 : table_close(rel, NoLock);
2133 : }
2134 :
2135 24 : CommandCounterIncrement();
2136 : }
2137 :
2138 :
2139 : /*
2140 : * Logical replication protocol message dispatcher.
2141 : */
2142 : static void
2143 477586 : apply_dispatch(StringInfo s)
2144 : {
2145 477586 : LogicalRepMsgType action = pq_getmsgbyte(s);
2146 :
2147 477586 : switch (action)
2148 : {
2149 : case LOGICAL_REP_MSG_BEGIN:
2150 500 : apply_handle_begin(s);
2151 500 : return;
2152 :
2153 : case LOGICAL_REP_MSG_COMMIT:
2154 488 : apply_handle_commit(s);
2155 484 : return;
2156 :
2157 : case LOGICAL_REP_MSG_INSERT:
2158 224746 : apply_handle_insert(s);
2159 224740 : return;
2160 :
2161 : case LOGICAL_REP_MSG_UPDATE:
2162 126428 : apply_handle_update(s);
2163 126428 : return;
2164 :
2165 : case LOGICAL_REP_MSG_DELETE:
2166 123746 : apply_handle_delete(s);
2167 123746 : return;
2168 :
2169 : case LOGICAL_REP_MSG_TRUNCATE:
2170 24 : apply_handle_truncate(s);
2171 24 : return;
2172 :
2173 : case LOGICAL_REP_MSG_RELATION:
2174 310 : apply_handle_relation(s);
2175 310 : return;
2176 :
2177 : case LOGICAL_REP_MSG_TYPE:
2178 32 : apply_handle_type(s);
2179 32 : return;
2180 :
2181 : case LOGICAL_REP_MSG_ORIGIN:
2182 0 : apply_handle_origin(s);
2183 0 : return;
2184 :
2185 : case LOGICAL_REP_MSG_STREAM_START:
2186 610 : apply_handle_stream_start(s);
2187 610 : return;
2188 :
2189 : case LOGICAL_REP_MSG_STREAM_END:
2190 610 : apply_handle_stream_stop(s);
2191 610 : return;
2192 :
2193 : case LOGICAL_REP_MSG_STREAM_ABORT:
2194 26 : apply_handle_stream_abort(s);
2195 26 : return;
2196 :
2197 : case LOGICAL_REP_MSG_STREAM_COMMIT:
2198 28 : apply_handle_stream_commit(s);
2199 28 : return;
2200 :
2201 : case LOGICAL_REP_MSG_PREPARE:
2202 26 : apply_handle_prepare(s);
2203 26 : return;
2204 :
2205 : case LOGICAL_REP_MSG_STREAM_PREPARE:
2206 12 : apply_handle_stream_prepare(s);
2207 12 : return;
2208 : }
2209 :
2210 0 : ereport(ERROR,
2211 : (errcode(ERRCODE_PROTOCOL_VIOLATION),
2212 : errmsg("invalid logical replication message type \"%c\"", action)));
2213 : }
2214 :
2215 : /*
2216 : * Figure out which write/flush positions to report to the walsender process.
2217 : *
2218 : * We can't simply report back the last LSN the walsender sent us because the
2219 : * local transaction might not yet be flushed to disk locally. Instead we
2220 : * build a list that associates local with remote LSNs for every commit. When
2221 : * reporting back the flush position to the sender we iterate that list and
2222 : * check which entries on it are already locally flushed. Those we can report
2223 : * as having been flushed.
2224 : *
2225 : * The have_pending_txes is true if there are outstanding transactions that
2226 : * need to be flushed.
2227 : */
2228 : static void
2229 61186 : get_flush_position(XLogRecPtr *write, XLogRecPtr *flush,
2230 : bool *have_pending_txes)
2231 : {
2232 : dlist_mutable_iter iter;
2233 61186 : XLogRecPtr local_flush = GetFlushRecPtr();
2234 :
2235 61186 : *write = InvalidXLogRecPtr;
2236 61186 : *flush = InvalidXLogRecPtr;
2237 :
2238 61582 : dlist_foreach_modify(iter, &lsn_mapping)
2239 : {
2240 27986 : FlushPosition *pos =
2241 27986 : dlist_container(FlushPosition, node, iter.cur);
2242 :
2243 27986 : *write = pos->remote_end;
2244 :
2245 27986 : if (pos->local_end <= local_flush)
2246 : {
2247 396 : *flush = pos->remote_end;
2248 396 : dlist_delete(iter.cur);
2249 396 : pfree(pos);
2250 : }
2251 : else
2252 : {
2253 : /*
2254 : * Don't want to uselessly iterate over the rest of the list which
2255 : * could potentially be long. Instead get the last element and
2256 : * grab the write position from there.
2257 : */
2258 27590 : pos = dlist_tail_element(FlushPosition, node,
2259 : &lsn_mapping);
2260 27590 : *write = pos->remote_end;
2261 27590 : *have_pending_txes = true;
2262 55180 : return;
2263 : }
2264 : }
2265 :
2266 33596 : *have_pending_txes = !dlist_is_empty(&lsn_mapping);
2267 : }
2268 :
2269 : /*
2270 : * Store current remote/local lsn pair in the tracking list.
2271 : */
2272 : static void
2273 434 : store_flush_position(XLogRecPtr remote_lsn)
2274 : {
2275 : FlushPosition *flushpos;
2276 :
2277 : /* Need to do this in permanent context */
2278 434 : MemoryContextSwitchTo(ApplyContext);
2279 :
2280 : /* Track commit lsn */
2281 434 : flushpos = (FlushPosition *) palloc(sizeof(FlushPosition));
2282 434 : flushpos->local_end = XactLastCommitEnd;
2283 434 : flushpos->remote_end = remote_lsn;
2284 :
2285 434 : dlist_push_tail(&lsn_mapping, &flushpos->node);
2286 434 : MemoryContextSwitchTo(ApplyMessageContext);
2287 434 : }
2288 :
2289 :
2290 : /* Update statistics of the worker. */
2291 : static void
2292 253816 : UpdateWorkerStats(XLogRecPtr last_lsn, TimestampTz send_time, bool reply)
2293 : {
2294 253816 : MyLogicalRepWorker->last_lsn = last_lsn;
2295 253816 : MyLogicalRepWorker->last_send_time = send_time;
2296 253816 : MyLogicalRepWorker->last_recv_time = GetCurrentTimestamp();
2297 253816 : if (reply)
2298 : {
2299 2382 : MyLogicalRepWorker->reply_lsn = last_lsn;
2300 2382 : MyLogicalRepWorker->reply_time = send_time;
2301 : }
2302 253816 : }
2303 :
2304 : /*
2305 : * Apply main loop.
2306 : */
2307 : static void
2308 198 : LogicalRepApplyLoop(XLogRecPtr last_received)
2309 : {
2310 198 : TimestampTz last_recv_timestamp = GetCurrentTimestamp();
2311 198 : bool ping_sent = false;
2312 : TimeLineID tli;
2313 :
2314 : /*
2315 : * Init the ApplyMessageContext which we clean up after each replication
2316 : * protocol message.
2317 : */
2318 198 : ApplyMessageContext = AllocSetContextCreate(ApplyContext,
2319 : "ApplyMessageContext",
2320 : ALLOCSET_DEFAULT_SIZES);
2321 :
2322 : /*
2323 : * This memory context is used for per-stream data when the streaming mode
2324 : * is enabled. This context is reset on each stream stop.
2325 : */
2326 198 : LogicalStreamingContext = AllocSetContextCreate(ApplyContext,
2327 : "LogicalStreamingContext",
2328 : ALLOCSET_DEFAULT_SIZES);
2329 :
2330 : /* mark as idle, before starting to loop */
2331 198 : pgstat_report_activity(STATE_IDLE, NULL);
2332 :
2333 : /* This outer loop iterates once per wait. */
2334 : for (;;)
2335 : {
2336 58788 : pgsocket fd = PGINVALID_SOCKET;
2337 : int rc;
2338 : int len;
2339 58788 : char *buf = NULL;
2340 58788 : bool endofstream = false;
2341 : long wait_time;
2342 :
2343 58788 : CHECK_FOR_INTERRUPTS();
2344 :
2345 58788 : MemoryContextSwitchTo(ApplyMessageContext);
2346 :
2347 58788 : len = walrcv_receive(wrconn, &buf, &fd);
2348 :
2349 58774 : if (len != 0)
2350 : {
2351 : /* Loop to process all available data (without blocking). */
2352 : for (;;)
2353 : {
2354 312340 : CHECK_FOR_INTERRUPTS();
2355 :
2356 312340 : if (len == 0)
2357 : {
2358 58520 : break;
2359 : }
2360 253820 : else if (len < 0)
2361 : {
2362 4 : ereport(LOG,
2363 : (errmsg("data stream from publisher has ended")));
2364 4 : endofstream = true;
2365 4 : break;
2366 : }
2367 : else
2368 : {
2369 : int c;
2370 : StringInfoData s;
2371 :
2372 : /* Reset timeout. */
2373 253816 : last_recv_timestamp = GetCurrentTimestamp();
2374 253816 : ping_sent = false;
2375 :
2376 : /* Ensure we are reading the data into our memory context. */
2377 253816 : MemoryContextSwitchTo(ApplyMessageContext);
2378 :
2379 253816 : s.data = buf;
2380 253816 : s.len = len;
2381 253816 : s.cursor = 0;
2382 253816 : s.maxlen = -1;
2383 :
2384 253816 : c = pq_getmsgbyte(&s);
2385 :
2386 253816 : if (c == 'w')
2387 : {
2388 : XLogRecPtr start_lsn;
2389 : XLogRecPtr end_lsn;
2390 : TimestampTz send_time;
2391 :
2392 251434 : start_lsn = pq_getmsgint64(&s);
2393 251434 : end_lsn = pq_getmsgint64(&s);
2394 251434 : send_time = pq_getmsgint64(&s);
2395 :
2396 251434 : if (last_received < start_lsn)
2397 229640 : last_received = start_lsn;
2398 :
2399 251434 : if (last_received < end_lsn)
2400 0 : last_received = end_lsn;
2401 :
2402 251434 : UpdateWorkerStats(last_received, send_time, false);
2403 :
2404 251434 : apply_dispatch(&s);
2405 : }
2406 2382 : else if (c == 'k')
2407 : {
2408 : XLogRecPtr end_lsn;
2409 : TimestampTz timestamp;
2410 : bool reply_requested;
2411 :
2412 2382 : end_lsn = pq_getmsgint64(&s);
2413 2382 : timestamp = pq_getmsgint64(&s);
2414 2382 : reply_requested = pq_getmsgbyte(&s);
2415 :
2416 2382 : if (last_received < end_lsn)
2417 306 : last_received = end_lsn;
2418 :
2419 2382 : send_feedback(last_received, reply_requested, false);
2420 2382 : UpdateWorkerStats(last_received, timestamp, true);
2421 : }
2422 : /* other message types are purposefully ignored */
2423 :
2424 253806 : MemoryContextReset(ApplyMessageContext);
2425 : }
2426 :
2427 253806 : len = walrcv_receive(wrconn, &buf, &fd);
2428 253806 : }
2429 : }
2430 :
2431 : /* confirm all writes so far */
2432 58764 : send_feedback(last_received, false, false);
2433 :
2434 58764 : if (!in_remote_transaction && !in_streamed_transaction)
2435 : {
2436 : /*
2437 : * If we didn't get any transactions for a while there might be
2438 : * unconsumed invalidation messages in the queue, consume them
2439 : * now.
2440 : */
2441 2966 : AcceptInvalidationMessages();
2442 2966 : maybe_reread_subscription();
2443 :
2444 : /* Process any table synchronization changes. */
2445 2958 : process_syncing_tables(last_received);
2446 : }
2447 :
2448 : /* Cleanup the memory. */
2449 58640 : MemoryContextResetAndDeleteChildren(ApplyMessageContext);
2450 58640 : MemoryContextSwitchTo(TopMemoryContext);
2451 :
2452 : /* Check if we need to exit the streaming loop. */
2453 58640 : if (endofstream)
2454 4 : break;
2455 :
2456 : /*
2457 : * Wait for more data or latch. If we have unflushed transactions,
2458 : * wake up after WalWriterDelay to see if they've been flushed yet (in
2459 : * which case we should send a feedback message). Otherwise, there's
2460 : * no particular urgency about waking up unless we get data or a
2461 : * signal.
2462 : */
2463 58636 : if (!dlist_is_empty(&lsn_mapping))
2464 25858 : wait_time = WalWriterDelay;
2465 : else
2466 32778 : wait_time = NAPTIME_PER_CYCLE;
2467 :
2468 58636 : rc = WaitLatchOrSocket(MyLatch,
2469 : WL_SOCKET_READABLE | WL_LATCH_SET |
2470 : WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
2471 : fd, wait_time,
2472 : WAIT_EVENT_LOGICAL_APPLY_MAIN);
2473 :
2474 58636 : if (rc & WL_LATCH_SET)
2475 : {
2476 234 : ResetLatch(MyLatch);
2477 234 : CHECK_FOR_INTERRUPTS();
2478 : }
2479 :
2480 58590 : if (ConfigReloadPending)
2481 : {
2482 0 : ConfigReloadPending = false;
2483 0 : ProcessConfigFile(PGC_SIGHUP);
2484 : }
2485 :
2486 58590 : if (rc & WL_TIMEOUT)
2487 : {
2488 : /*
2489 : * We didn't receive anything new. If we haven't heard anything
2490 : * from the server for more than wal_receiver_timeout / 2, ping
2491 : * the server. Also, if it's been longer than
2492 : * wal_receiver_status_interval since the last update we sent,
2493 : * send a status update to the primary anyway, to report any
2494 : * progress in applying WAL.
2495 : */
2496 40 : bool requestReply = false;
2497 :
2498 : /*
2499 : * Check if time since last receive from standby has reached the
2500 : * configured limit.
2501 : */
2502 40 : if (wal_receiver_timeout > 0)
2503 : {
2504 40 : TimestampTz now = GetCurrentTimestamp();
2505 : TimestampTz timeout;
2506 :
2507 40 : timeout =
2508 40 : TimestampTzPlusMilliseconds(last_recv_timestamp,
2509 : wal_receiver_timeout);
2510 :
2511 40 : if (now >= timeout)
2512 0 : ereport(ERROR,
2513 : (errmsg("terminating logical replication worker due to timeout")));
2514 :
2515 : /* Check to see if it's time for a ping. */
2516 40 : if (!ping_sent)
2517 : {
2518 40 : timeout = TimestampTzPlusMilliseconds(last_recv_timestamp,
2519 : (wal_receiver_timeout / 2));
2520 40 : if (now >= timeout)
2521 : {
2522 0 : requestReply = true;
2523 0 : ping_sent = true;
2524 : }
2525 : }
2526 : }
2527 :
2528 40 : send_feedback(last_received, requestReply, requestReply);
2529 : }
2530 58590 : }
2531 :
2532 : /* All done */
2533 4 : walrcv_endstreaming(wrconn, &tli);
2534 0 : }
2535 :
2536 : /*
2537 : * Send a Standby Status Update message to server.
2538 : *
2539 : * 'recvpos' is the latest LSN we've received data to, force is set if we need
2540 : * to send a response to avoid timeouts.
2541 : */
2542 : static void
2543 61186 : send_feedback(XLogRecPtr recvpos, bool force, bool requestReply)
2544 : {
2545 : static StringInfo reply_message = NULL;
2546 : static TimestampTz send_time = 0;
2547 :
2548 : static XLogRecPtr last_recvpos = InvalidXLogRecPtr;
2549 : static XLogRecPtr last_writepos = InvalidXLogRecPtr;
2550 : static XLogRecPtr last_flushpos = InvalidXLogRecPtr;
2551 :
2552 : XLogRecPtr writepos;
2553 : XLogRecPtr flushpos;
2554 : TimestampTz now;
2555 : bool have_pending_txes;
2556 :
2557 : /*
2558 : * If the user doesn't want status to be reported to the publisher, be
2559 : * sure to exit before doing anything at all.
2560 : */
2561 61186 : if (!force && wal_receiver_status_interval <= 0)
2562 30480 : return;
2563 :
2564 : /* It's legal to not pass a recvpos */
2565 61186 : if (recvpos < last_recvpos)
2566 0 : recvpos = last_recvpos;
2567 :
2568 61186 : get_flush_position(&writepos, &flushpos, &have_pending_txes);
2569 :
2570 : /*
2571 : * No outstanding transactions to flush, we can report the latest received
2572 : * position. This is important for synchronous replication.
2573 : */
2574 61186 : if (!have_pending_txes)
2575 33596 : flushpos = writepos = recvpos;
2576 :
2577 61186 : if (writepos < last_writepos)
2578 0 : writepos = last_writepos;
2579 :
2580 61186 : if (flushpos < last_flushpos)
2581 27548 : flushpos = last_flushpos;
2582 :
2583 61186 : now = GetCurrentTimestamp();
2584 :
2585 : /* if we've already reported everything we're good */
2586 121090 : if (!force &&
2587 90414 : writepos == last_writepos &&
2588 60994 : flushpos == last_flushpos &&
2589 30484 : !TimestampDifferenceExceeds(send_time, now,
2590 : wal_receiver_status_interval * 1000))
2591 30480 : return;
2592 30706 : send_time = now;
2593 :
2594 30706 : if (!reply_message)
2595 : {
2596 198 : MemoryContext oldctx = MemoryContextSwitchTo(ApplyContext);
2597 :
2598 198 : reply_message = makeStringInfo();
2599 198 : MemoryContextSwitchTo(oldctx);
2600 : }
2601 : else
2602 30508 : resetStringInfo(reply_message);
2603 :
2604 30706 : pq_sendbyte(reply_message, 'r');
2605 30706 : pq_sendint64(reply_message, recvpos); /* write */
2606 30706 : pq_sendint64(reply_message, flushpos); /* flush */
2607 30706 : pq_sendint64(reply_message, writepos); /* apply */
2608 30706 : pq_sendint64(reply_message, now); /* sendTime */
2609 30706 : pq_sendbyte(reply_message, requestReply); /* replyRequested */
2610 :
2611 30706 : elog(DEBUG2, "sending feedback (force %d) to recv %X/%X, write %X/%X, flush %X/%X",
2612 : force,
2613 : (uint32) (recvpos >> 32), (uint32) recvpos,
2614 : (uint32) (writepos >> 32), (uint32) writepos,
2615 : (uint32) (flushpos >> 32), (uint32) flushpos
2616 : );
2617 :
2618 30706 : walrcv_send(wrconn, reply_message->data, reply_message->len);
2619 :
2620 30706 : if (recvpos > last_recvpos)
2621 29400 : last_recvpos = recvpos;
2622 30706 : if (writepos > last_writepos)
2623 29396 : last_writepos = writepos;
2624 30706 : if (flushpos > last_flushpos)
2625 29098 : last_flushpos = flushpos;
2626 : }
2627 :
2628 : /*
2629 : * Reread subscription info if needed. Most changes will be exit.
2630 : */
2631 : static void
2632 4160 : maybe_reread_subscription(void)
2633 : {
2634 : MemoryContext oldctx;
2635 : Subscription *newsub;
2636 4160 : bool started_tx = false;
2637 :
2638 : /* When cache state is valid there is nothing to do here. */
2639 4160 : if (MySubscriptionValid)
2640 8286 : return;
2641 :
2642 : /* This function might be called inside or outside of transaction. */
2643 22 : if (!IsTransactionState())
2644 : {
2645 16 : StartTransactionCommand();
2646 16 : started_tx = true;
2647 : }
2648 :
2649 : /* Ensure allocations in permanent context. */
2650 22 : oldctx = MemoryContextSwitchTo(ApplyContext);
2651 :
2652 22 : newsub = GetSubscription(MyLogicalRepWorker->subid, true);
2653 :
2654 : /*
2655 : * Exit if the subscription was removed. This normally should not happen
2656 : * as the worker gets killed during DROP SUBSCRIPTION.
2657 : */
2658 22 : if (!newsub)
2659 : {
2660 0 : ereport(LOG,
2661 : (errmsg("logical replication apply worker for subscription \"%s\" will "
2662 : "stop because the subscription was removed",
2663 : MySubscription->name)));
2664 :
2665 0 : proc_exit(0);
2666 : }
2667 :
2668 : /*
2669 : * Exit if the subscription was disabled. This normally should not happen
2670 : * as the worker gets killed during ALTER SUBSCRIPTION ... DISABLE.
2671 : */
2672 22 : if (!newsub->enabled)
2673 : {
2674 0 : ereport(LOG,
2675 : (errmsg("logical replication apply worker for subscription \"%s\" will "
2676 : "stop because the subscription was disabled",
2677 : MySubscription->name)));
2678 :
2679 0 : proc_exit(0);
2680 : }
2681 :
2682 : /* !slotname should never happen when enabled is true. */
2683 22 : Assert(newsub->slotname);
2684 :
2685 : /*
2686 : * Exit if any parameter that affects the remote connection was changed.
2687 : * The launcher will start a new worker.
2688 : */
2689 42 : if (strcmp(newsub->conninfo, MySubscription->conninfo) != 0 ||
2690 38 : strcmp(newsub->name, MySubscription->name) != 0 ||
2691 36 : strcmp(newsub->slotname, MySubscription->slotname) != 0 ||
2692 30 : newsub->binary != MySubscription->binary ||
2693 24 : newsub->stream != MySubscription->stream ||
2694 12 : !equal(newsub->publications, MySubscription->publications))
2695 : {
2696 12 : ereport(LOG,
2697 : (errmsg("logical replication apply worker for subscription \"%s\" will restart because of a parameter change",
2698 : MySubscription->name)));
2699 :
2700 12 : proc_exit(0);
2701 : }
2702 :
2703 : /* Check for other changes that should never happen too. */
2704 10 : if (newsub->dbid != MySubscription->dbid)
2705 : {
2706 0 : elog(ERROR, "subscription %u changed unexpectedly",
2707 : MyLogicalRepWorker->subid);
2708 : }
2709 :
2710 : /* Clean old subscription info and switch to new one. */
2711 10 : FreeSubscription(MySubscription);
2712 10 : MySubscription = newsub;
2713 :
2714 10 : MemoryContextSwitchTo(oldctx);
2715 :
2716 : /* Change synchronous commit according to the user's wishes */
2717 10 : SetConfigOption("synchronous_commit", MySubscription->synccommit,
2718 : PGC_BACKEND, PGC_S_OVERRIDE);
2719 :
2720 10 : if (started_tx)
2721 8 : CommitTransactionCommand();
2722 :
2723 10 : MySubscriptionValid = true;
2724 : }
2725 :
2726 : /*
2727 : * Callback from subscription syscache invalidation.
2728 : */
2729 : static void
2730 22 : subscription_change_cb(Datum arg, int cacheid, uint32 hashvalue)
2731 : {
2732 22 : MySubscriptionValid = false;
2733 22 : }
2734 :
2735 : /*
2736 : * subxact_info_write
2737 : * Store information about subxacts for a toplevel transaction.
2738 : *
2739 : * For each subxact we store offset of it's first change in the main file.
2740 : * The file is always over-written as a whole.
2741 : *
2742 : * XXX We should only store subxacts that were not aborted yet.
2743 : */
2744 : static void
2745 626 : subxact_info_write(Oid subid, TransactionId xid)
2746 : {
2747 : char path[MAXPGPATH];
2748 : bool found;
2749 : Size len;
2750 : StreamXidHash *ent;
2751 : BufFile *fd;
2752 :
2753 626 : Assert(TransactionIdIsValid(xid));
2754 :
2755 : /* find the xid entry in the xidhash */
2756 626 : ent = (StreamXidHash *) hash_search(xidhash,
2757 : (void *) &xid,
2758 : HASH_FIND,
2759 : &found);
2760 : /* we must found the entry for its top transaction by this time */
2761 626 : Assert(found);
2762 :
2763 : /*
2764 : * If there is no subtransaction then nothing to do, but if already have
2765 : * subxact file then delete that.
2766 : */
2767 626 : if (subxact_data.nsubxacts == 0)
2768 : {
2769 544 : if (ent->subxact_fileset)
2770 : {
2771 6 : cleanup_subxact_info();
2772 6 : SharedFileSetDeleteAll(ent->subxact_fileset);
2773 6 : pfree(ent->subxact_fileset);
2774 6 : ent->subxact_fileset = NULL;
2775 : }
2776 1170 : return;
2777 : }
2778 :
2779 82 : subxact_filename(path, subid, xid);
2780 :
2781 : /*
2782 : * Create the subxact file if it not already created, otherwise open the
2783 : * existing file.
2784 : */
2785 82 : if (ent->subxact_fileset == NULL)
2786 : {
2787 : MemoryContext oldctx;
2788 :
2789 : /*
2790 : * We need to maintain shared fileset across multiple stream
2791 : * start/stop calls. So, need to allocate it in a persistent context.
2792 : */
2793 14 : oldctx = MemoryContextSwitchTo(ApplyContext);
2794 14 : ent->subxact_fileset = palloc(sizeof(SharedFileSet));
2795 14 : SharedFileSetInit(ent->subxact_fileset, NULL);
2796 14 : MemoryContextSwitchTo(oldctx);
2797 :
2798 14 : fd = BufFileCreateShared(ent->subxact_fileset, path);
2799 : }
2800 : else
2801 68 : fd = BufFileOpenShared(ent->subxact_fileset, path, O_RDWR);
2802 :
2803 82 : len = sizeof(SubXactInfo) * subxact_data.nsubxacts;
2804 :
2805 : /* Write the subxact count and subxact info */
2806 82 : BufFileWrite(fd, &subxact_data.nsubxacts, sizeof(subxact_data.nsubxacts));
2807 82 : BufFileWrite(fd, subxact_data.subxacts, len);
2808 :
2809 82 : BufFileClose(fd);
2810 :
2811 : /* free the memory allocated for subxact info */
2812 82 : cleanup_subxact_info();
2813 : }
2814 :
2815 : /*
2816 : * subxact_info_read
2817 : * Restore information about subxacts of a streamed transaction.
2818 : *
2819 : * Read information about subxacts into the structure subxact_data that can be
2820 : * used later.
2821 : */
2822 : static void
2823 590 : subxact_info_read(Oid subid, TransactionId xid)
2824 : {
2825 : char path[MAXPGPATH];
2826 : bool found;
2827 : Size len;
2828 : BufFile *fd;
2829 : StreamXidHash *ent;
2830 : MemoryContext oldctx;
2831 :
2832 590 : Assert(TransactionIdIsValid(xid));
2833 590 : Assert(!subxact_data.subxacts);
2834 590 : Assert(subxact_data.nsubxacts == 0);
2835 590 : Assert(subxact_data.nsubxacts_max == 0);
2836 :
2837 : /* Find the stream xid entry in the xidhash */
2838 590 : ent = (StreamXidHash *) hash_search(xidhash,
2839 : (void *) &xid,
2840 : HASH_FIND,
2841 : &found);
2842 :
2843 : /*
2844 : * If subxact_fileset is not valid that mean we don't have any subxact
2845 : * info
2846 : */
2847 590 : if (ent->subxact_fileset == NULL)
2848 1104 : return;
2849 :
2850 76 : subxact_filename(path, subid, xid);
2851 :
2852 76 : fd = BufFileOpenShared(ent->subxact_fileset, path, O_RDONLY);
2853 :
2854 : /* read number of subxact items */
2855 76 : if (BufFileRead(fd, &subxact_data.nsubxacts,
2856 : sizeof(subxact_data.nsubxacts)) !=
2857 : sizeof(subxact_data.nsubxacts))
2858 0 : ereport(ERROR,
2859 : (errcode_for_file_access(),
2860 : errmsg("could not read from streaming transaction's subxact file \"%s\": %m",
2861 : path)));
2862 :
2863 76 : len = sizeof(SubXactInfo) * subxact_data.nsubxacts;
2864 :
2865 : /* we keep the maximum as a power of 2 */
2866 76 : subxact_data.nsubxacts_max = 1 << my_log2(subxact_data.nsubxacts);
2867 :
2868 : /*
2869 : * Allocate subxact information in the logical streaming context. We need
2870 : * this information during the complete stream so that we can add the sub
2871 : * transaction info to this. On stream stop we will flush this information
2872 : * to the subxact file and reset the logical streaming context.
2873 : */
2874 76 : oldctx = MemoryContextSwitchTo(LogicalStreamingContext);
2875 76 : subxact_data.subxacts = palloc(subxact_data.nsubxacts_max *
2876 : sizeof(SubXactInfo));
2877 76 : MemoryContextSwitchTo(oldctx);
2878 :
2879 76 : if ((len > 0) && ((BufFileRead(fd, subxact_data.subxacts, len)) != len))
2880 0 : ereport(ERROR,
2881 : (errcode_for_file_access(),
2882 : errmsg("could not read from streaming transaction's subxact file \"%s\": %m",
2883 : path)));
2884 :
2885 76 : BufFileClose(fd);
2886 : }
2887 :
2888 : /*
2889 : * subxact_info_add
2890 : * Add information about a subxact (offset in the main file).
2891 : */
2892 : static void
2893 247246 : subxact_info_add(TransactionId xid)
2894 : {
2895 247246 : SubXactInfo *subxacts = subxact_data.subxacts;
2896 : int64 i;
2897 :
2898 : /* We must have a valid top level stream xid and a stream fd. */
2899 247246 : Assert(TransactionIdIsValid(stream_xid));
2900 247246 : Assert(stream_fd != NULL);
2901 :
2902 : /*
2903 : * If the XID matches the toplevel transaction, we don't want to add it.
2904 : */
2905 247246 : if (stream_xid == xid)
2906 221616 : return;
2907 :
2908 : /*
2909 : * In most cases we're checking the same subxact as we've already seen in
2910 : * the last call, so make sure to ignore it (this change comes later).
2911 : */
2912 25630 : if (subxact_data.subxact_last == xid)
2913 25536 : return;
2914 :
2915 : /* OK, remember we're processing this XID. */
2916 94 : subxact_data.subxact_last = xid;
2917 :
2918 : /*
2919 : * Check if the transaction is already present in the array of subxact. We
2920 : * intentionally scan the array from the tail, because we're likely adding
2921 : * a change for the most recent subtransactions.
2922 : *
2923 : * XXX Can we rely on the subxact XIDs arriving in sorted order? That
2924 : * would allow us to use binary search here.
2925 : */
2926 132 : for (i = subxact_data.nsubxacts; i > 0; i--)
2927 : {
2928 : /* found, so we're done */
2929 96 : if (subxacts[i - 1].xid == xid)
2930 58 : return;
2931 : }
2932 :
2933 : /* This is a new subxact, so we need to add it to the array. */
2934 36 : if (subxact_data.nsubxacts == 0)
2935 : {
2936 : MemoryContext oldctx;
2937 :
2938 14 : subxact_data.nsubxacts_max = 128;
2939 :
2940 : /*
2941 : * Allocate this memory for subxacts in per-stream context, see
2942 : * subxact_info_read.
2943 : */
2944 14 : oldctx = MemoryContextSwitchTo(LogicalStreamingContext);
2945 14 : subxacts = palloc(subxact_data.nsubxacts_max * sizeof(SubXactInfo));
2946 14 : MemoryContextSwitchTo(oldctx);
2947 : }
2948 22 : else if (subxact_data.nsubxacts == subxact_data.nsubxacts_max)
2949 : {
2950 20 : subxact_data.nsubxacts_max *= 2;
2951 20 : subxacts = repalloc(subxacts,
2952 20 : subxact_data.nsubxacts_max * sizeof(SubXactInfo));
2953 : }
2954 :
2955 36 : subxacts[subxact_data.nsubxacts].xid = xid;
2956 :
2957 : /*
2958 : * Get the current offset of the stream file and store it as offset of
2959 : * this subxact.
2960 : */
2961 72 : BufFileTell(stream_fd,
2962 36 : &subxacts[subxact_data.nsubxacts].fileno,
2963 36 : &subxacts[subxact_data.nsubxacts].offset);
2964 :
2965 36 : subxact_data.nsubxacts++;
2966 36 : subxact_data.subxacts = subxacts;
2967 : }
2968 :
2969 : /* format filename for file containing the info about subxacts */
2970 : static inline void
2971 166 : subxact_filename(char *path, Oid subid, TransactionId xid)
2972 : {
2973 166 : snprintf(path, MAXPGPATH, "%u-%u.subxacts", subid, xid);
2974 166 : }
2975 :
2976 : /* format filename for file containing serialized changes */
2977 : static inline void
2978 708 : changes_filename(char *path, Oid subid, TransactionId xid)
2979 : {
2980 708 : snprintf(path, MAXPGPATH, "%u-%u.changes", subid, xid);
2981 708 : }
2982 :
2983 : /*
2984 : * stream_cleanup_files
2985 : * Cleanup files for a subscription / toplevel transaction.
2986 : *
2987 : * Remove files with serialized changes and subxact info for a particular
2988 : * toplevel transaction. Each subscription has a separate set of files.
2989 : */
2990 : static void
2991 42 : stream_cleanup_files(Oid subid, TransactionId xid)
2992 : {
2993 : char path[MAXPGPATH];
2994 : StreamXidHash *ent;
2995 :
2996 : /* Remove the xid entry from the stream xid hash */
2997 42 : ent = (StreamXidHash *) hash_search(xidhash,
2998 : (void *) &xid,
2999 : HASH_REMOVE,
3000 : NULL);
3001 : /* By this time we must have created the transaction entry */
3002 42 : Assert(ent != NULL);
3003 :
3004 : /* Delete the change file and release the stream fileset memory */
3005 42 : changes_filename(path, subid, xid);
3006 42 : SharedFileSetDeleteAll(ent->stream_fileset);
3007 42 : pfree(ent->stream_fileset);
3008 42 : ent->stream_fileset = NULL;
3009 :
3010 : /* Delete the subxact file and release the memory, if it exist */
3011 42 : if (ent->subxact_fileset)
3012 : {
3013 8 : subxact_filename(path, subid, xid);
3014 8 : SharedFileSetDeleteAll(ent->subxact_fileset);
3015 8 : pfree(ent->subxact_fileset);
3016 8 : ent->subxact_fileset = NULL;
3017 : }
3018 42 : }
3019 :
3020 : /*
3021 : * stream_open_file
3022 : * Open a file that we'll use to serialize changes for a toplevel
3023 : * transaction.
3024 : *
3025 : * Open a file for streamed changes from a toplevel transaction identified
3026 : * by stream_xid (global variable). If it's the first chunk of streamed
3027 : * changes for this transaction, initialize the shared fileset and create the
3028 : * buffile, otherwise open the previously created file.
3029 : *
3030 : * This can only be called at the beginning of a "streaming" block, i.e.
3031 : * between stream_start/stream_stop messages from the upstream.
3032 : */
3033 : static void
3034 610 : stream_open_file(Oid subid, TransactionId xid, bool first_segment)
3035 : {
3036 : char path[MAXPGPATH];
3037 : bool found;
3038 : MemoryContext oldcxt;
3039 : StreamXidHash *ent;
3040 :
3041 610 : Assert(in_streamed_transaction);
3042 610 : Assert(OidIsValid(subid));
3043 610 : Assert(TransactionIdIsValid(xid));
3044 610 : Assert(stream_fd == NULL);
3045 :
3046 : /* create or find the xid entry in the xidhash */
3047 610 : ent = (StreamXidHash *) hash_search(xidhash,
3048 : (void *) &xid,
3049 : HASH_ENTER | HASH_FIND,
3050 : &found);
3051 610 : Assert(first_segment || found);
3052 610 : changes_filename(path, subid, xid);
3053 610 : elog(DEBUG1, "opening file \"%s\" for streamed changes", path);
3054 :
3055 : /*
3056 : * Create/open the buffiles under the logical streaming context so that we
3057 : * have those files until stream stop.
3058 : */
3059 610 : oldcxt = MemoryContextSwitchTo(LogicalStreamingContext);
3060 :
3061 : /*
3062 : * If this is the first streamed segment, the file must not exist, so make
3063 : * sure we're the ones creating it. Otherwise just open the file for
3064 : * writing, in append mode.
3065 : */
3066 610 : if (first_segment)
3067 : {
3068 : MemoryContext savectx;
3069 : SharedFileSet *fileset;
3070 :
3071 : /*
3072 : * We need to maintain shared fileset across multiple stream
3073 : * start/stop calls. So, need to allocate it in a persistent context.
3074 : */
3075 44 : savectx = MemoryContextSwitchTo(ApplyContext);
3076 44 : fileset = palloc(sizeof(SharedFileSet));
3077 :
3078 44 : SharedFileSetInit(fileset, NULL);
3079 44 : MemoryContextSwitchTo(savectx);
3080 :
3081 44 : stream_fd = BufFileCreateShared(fileset, path);
3082 :
3083 : /* Remember the fileset for the next stream of the same transaction */
3084 44 : ent->xid = xid;
3085 44 : ent->stream_fileset = fileset;
3086 44 : ent->subxact_fileset = NULL;
3087 : }
3088 : else
3089 : {
3090 : /*
3091 : * Open the file and seek to the end of the file because we always
3092 : * append the changes file.
3093 : */
3094 566 : stream_fd = BufFileOpenShared(ent->stream_fileset, path, O_RDWR);
3095 566 : BufFileSeek(stream_fd, 0, 0, SEEK_END);
3096 : }
3097 :
3098 610 : MemoryContextSwitchTo(oldcxt);
3099 610 : }
3100 :
3101 : /*
3102 : * stream_close_file
3103 : * Close the currently open file with streamed changes.
3104 : *
3105 : * This can only be called at the end of a streaming block, i.e. at stream_stop
3106 : * message from the upstream.
3107 : */
3108 : static void
3109 610 : stream_close_file(void)
3110 : {
3111 610 : Assert(in_streamed_transaction);
3112 610 : Assert(TransactionIdIsValid(stream_xid));
3113 610 : Assert(stream_fd != NULL);
3114 :
3115 610 : BufFileClose(stream_fd);
3116 :
3117 610 : stream_xid = InvalidTransactionId;
3118 610 : stream_fd = NULL;
3119 610 : }
3120 :
3121 : /*
3122 : * stream_write_change
3123 : * Serialize a change to a file for the current toplevel transaction.
3124 : *
3125 : * The change is serialized in a simple format, with length (not including
3126 : * the length), action code (identifying the message type) and message
3127 : * contents (without the subxact TransactionId value).
3128 : */
3129 : static void
3130 247246 : stream_write_change(char action, StringInfo s)
3131 : {
3132 : int len;
3133 :
3134 247246 : Assert(in_streamed_transaction);
3135 247246 : Assert(TransactionIdIsValid(stream_xid));
3136 247246 : Assert(stream_fd != NULL);
3137 :
3138 : /* total on-disk size, including the action type character */
3139 247246 : len = (s->len - s->cursor) + sizeof(char);
3140 :
3141 : /* first write the size */
3142 247246 : BufFileWrite(stream_fd, &len, sizeof(len));
3143 :
3144 : /* then the action */
3145 247246 : BufFileWrite(stream_fd, &action, sizeof(action));
3146 :
3147 : /* and finally the remaining part of the buffer (after the XID) */
3148 247246 : len = (s->len - s->cursor);
3149 :
3150 247246 : BufFileWrite(stream_fd, &s->data[s->cursor], len);
3151 247246 : }
3152 :
3153 : /*
3154 : * Cleanup the memory for subxacts and reset the related variables.
3155 : */
3156 : static inline void
3157 96 : cleanup_subxact_info()
3158 : {
3159 96 : if (subxact_data.subxacts)
3160 90 : pfree(subxact_data.subxacts);
3161 :
3162 96 : subxact_data.subxacts = NULL;
3163 96 : subxact_data.subxact_last = InvalidTransactionId;
3164 96 : subxact_data.nsubxacts = 0;
3165 96 : subxact_data.nsubxacts_max = 0;
3166 96 : }
3167 :
3168 : /* Logical Replication Apply worker entry point */
3169 : void
3170 210 : ApplyWorkerMain(Datum main_arg)
3171 : {
3172 210 : int worker_slot = DatumGetInt32(main_arg);
3173 : MemoryContext oldctx;
3174 : char originname[NAMEDATALEN];
3175 : XLogRecPtr origin_startpos;
3176 : char *myslotname;
3177 : WalRcvStreamOptions options;
3178 :
3179 : /* Attach to slot */
3180 210 : logicalrep_worker_attach(worker_slot);
3181 :
3182 : /* Setup signal handling */
3183 210 : pqsignal(SIGHUP, SignalHandlerForConfigReload);
3184 210 : pqsignal(SIGTERM, die);
3185 210 : BackgroundWorkerUnblockSignals();
3186 :
3187 : /*
3188 : * We don't currently need any ResourceOwner in a walreceiver process, but
3189 : * if we did, we could call CreateAuxProcessResourceOwner here.
3190 : */
3191 :
3192 : /* Initialise stats to a sanish value */
3193 420 : MyLogicalRepWorker->last_send_time = MyLogicalRepWorker->last_recv_time =
3194 210 : MyLogicalRepWorker->reply_time = GetCurrentTimestamp();
3195 :
3196 : /* Load the libpq-specific functions */
3197 210 : load_file("libpqwalreceiver", false);
3198 :
3199 : /* Run as replica session replication role. */
3200 210 : SetConfigOption("session_replication_role", "replica",
3201 : PGC_SUSET, PGC_S_OVERRIDE);
3202 :
3203 : /* Connect to our database. */
3204 210 : BackgroundWorkerInitializeConnectionByOid(MyLogicalRepWorker->dbid,
3205 210 : MyLogicalRepWorker->userid,
3206 : 0);
3207 :
3208 : /*
3209 : * Set always-secure search path, so malicious users can't redirect user
3210 : * code (e.g. pg_index.indexprs).
3211 : */
3212 210 : SetConfigOption("search_path", "", PGC_SUSET, PGC_S_OVERRIDE);
3213 :
3214 : /* Load the subscription into persistent memory context. */
3215 210 : ApplyContext = AllocSetContextCreate(TopMemoryContext,
3216 : "ApplyContext",
3217 : ALLOCSET_DEFAULT_SIZES);
3218 210 : StartTransactionCommand();
3219 210 : oldctx = MemoryContextSwitchTo(ApplyContext);
3220 :
3221 210 : MySubscription = GetSubscription(MyLogicalRepWorker->subid, true);
3222 210 : if (!MySubscription)
3223 : {
3224 0 : ereport(LOG,
3225 : (errmsg("logical replication apply worker for subscription %u will not "
3226 : "start because the subscription was removed during startup",
3227 : MyLogicalRepWorker->subid)));
3228 0 : proc_exit(0);
3229 : }
3230 :
3231 210 : MySubscriptionValid = true;
3232 210 : MemoryContextSwitchTo(oldctx);
3233 :
3234 210 : if (!MySubscription->enabled)
3235 : {
3236 0 : ereport(LOG,
3237 : (errmsg("logical replication apply worker for subscription \"%s\" will not "
3238 : "start because the subscription was disabled during startup",
3239 : MySubscription->name)));
3240 :
3241 0 : proc_exit(0);
3242 : }
3243 :
3244 : /* Setup synchronous commit according to the user's wishes */
3245 210 : SetConfigOption("synchronous_commit", MySubscription->synccommit,
3246 : PGC_BACKEND, PGC_S_OVERRIDE);
3247 :
3248 : /* Keep us informed about subscription changes. */
3249 210 : CacheRegisterSyscacheCallback(SUBSCRIPTIONOID,
3250 : subscription_change_cb,
3251 : (Datum) 0);
3252 :
3253 210 : if (am_tablesync_worker())
3254 126 : ereport(LOG,
3255 : (errmsg("logical replication table synchronization worker for subscription \"%s\", table \"%s\" has started",
3256 : MySubscription->name, get_rel_name(MyLogicalRepWorker->relid))));
3257 : else
3258 84 : ereport(LOG,
3259 : (errmsg("logical replication apply worker for subscription \"%s\" has started",
3260 : MySubscription->name)));
3261 :
3262 210 : CommitTransactionCommand();
3263 :
3264 : /* Connect to the origin and start the replication. */
3265 210 : elog(DEBUG1, "connecting to publisher using connection string \"%s\"",
3266 : MySubscription->conninfo);
3267 :
3268 210 : if (am_tablesync_worker())
3269 : {
3270 : char *syncslotname;
3271 :
3272 : /* This is table synchronization worker, call initial sync. */
3273 126 : syncslotname = LogicalRepSyncTableStart(&origin_startpos);
3274 :
3275 : /* allocate slot name in long-lived context */
3276 122 : myslotname = MemoryContextStrdup(ApplyContext, syncslotname);
3277 :
3278 122 : pfree(syncslotname);
3279 : }
3280 : else
3281 : {
3282 : /* This is main apply worker */
3283 : RepOriginId originid;
3284 : TimeLineID startpointTLI;
3285 : char *err;
3286 :
3287 84 : myslotname = MySubscription->slotname;
3288 :
3289 : /*
3290 : * This shouldn't happen if the subscription is enabled, but guard
3291 : * against DDL bugs or manual catalog changes. (libpqwalreceiver will
3292 : * crash if slot is NULL.)
3293 : */
3294 84 : if (!myslotname)
3295 0 : ereport(ERROR,
3296 : (errmsg("subscription has no replication slot set")));
3297 :
3298 : /* Setup replication origin tracking. */
3299 84 : StartTransactionCommand();
3300 84 : snprintf(originname, sizeof(originname), "pg_%u", MySubscription->oid);
3301 84 : originid = replorigin_by_name(originname, true);
3302 84 : if (!OidIsValid(originid))
3303 0 : originid = replorigin_create(originname);
3304 84 : replorigin_session_setup(originid);
3305 84 : replorigin_session_origin = originid;
3306 84 : origin_startpos = replorigin_session_get_progress(false);
3307 84 : CommitTransactionCommand();
3308 :
3309 84 : wrconn = walrcv_connect(MySubscription->conninfo, true, MySubscription->name,
3310 : &err);
3311 84 : if (wrconn == NULL)
3312 8 : ereport(ERROR,
3313 : (errmsg("could not connect to the publisher: %s", err)));
3314 :
3315 : /*
3316 : * We don't really use the output identify_system for anything but it
3317 : * does some initializations on the upstream so let's still call it.
3318 : */
3319 76 : (void) walrcv_identify_system(wrconn, &startpointTLI);
3320 : }
3321 :
3322 : /*
3323 : * Setup callback for syscache so that we know when something changes in
3324 : * the subscription relation state.
3325 : */
3326 198 : CacheRegisterSyscacheCallback(SUBSCRIPTIONRELMAP,
3327 : invalidate_syncing_table_states,
3328 : (Datum) 0);
3329 :
3330 : /* Build logical replication streaming options. */
3331 198 : options.logical = true;
3332 198 : options.startpoint = origin_startpos;
3333 198 : options.slotname = myslotname;
3334 198 : options.proto.logical.proto_version =
3335 198 : walrcv_server_version(wrconn) >= 140000 ?
3336 : LOGICALREP_PROTO_STREAM_VERSION_NUM : LOGICALREP_PROTO_VERSION_NUM;
3337 198 : options.proto.logical.publication_names = MySubscription->publications;
3338 198 : options.proto.logical.binary = MySubscription->binary;
3339 198 : options.proto.logical.streaming = MySubscription->stream;
3340 :
3341 : /* Start normal logical streaming replication. */
3342 198 : walrcv_startstreaming(wrconn, &options);
3343 :
3344 : /* Run the main loop. */
3345 198 : LogicalRepApplyLoop(origin_startpos);
3346 :
3347 0 : proc_exit(0);
3348 : }
3349 :
3350 : /*
3351 : * Is current process a logical replication worker?
3352 : */
3353 : bool
3354 130 : IsLogicalWorker(void)
3355 : {
3356 130 : return MyLogicalRepWorker != NULL;
3357 : }
|