Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * reorderbuffer.c
4 : * PostgreSQL logical replay/reorder buffer management
5 : *
6 : *
7 : * Copyright (c) 2012-2020, PostgreSQL Global Development Group
8 : *
9 : *
10 : * IDENTIFICATION
11 : * src/backend/replication/reorderbuffer.c
12 : *
13 : * NOTES
14 : * This module gets handed individual pieces of transactions in the order
15 : * they are written to the WAL and is responsible to reassemble them into
16 : * toplevel transaction sized pieces. When a transaction is completely
17 : * reassembled - signaled by reading the transaction commit record - it
18 : * will then call the output plugin (cf. ReorderBufferCommit()) with the
19 : * individual changes. The output plugins rely on snapshots built by
20 : * snapbuild.c which hands them to us.
21 : *
22 : * Transactions and subtransactions/savepoints in postgres are not
23 : * immediately linked to each other from outside the performing
24 : * backend. Only at commit/abort (or special xact_assignment records) they
25 : * are linked together. Which means that we will have to splice together a
26 : * toplevel transaction from its subtransactions. To do that efficiently we
27 : * build a binary heap indexed by the smallest current lsn of the individual
28 : * subtransactions' changestreams. As the individual streams are inherently
29 : * ordered by LSN - since that is where we build them from - the transaction
30 : * can easily be reassembled by always using the subtransaction with the
31 : * smallest current LSN from the heap.
32 : *
33 : * In order to cope with large transactions - which can be several times as
34 : * big as the available memory - this module supports spooling the contents
35 : * of large transactions to disk. When the transaction is replayed the
36 : * contents of individual (sub-)transactions will be read from disk in
37 : * chunks.
38 : *
39 : * This module also has to deal with reassembling toast records from the
40 : * individual chunks stored in WAL. When a new (or initial) version of a
41 : * tuple is stored in WAL it will always be preceded by the toast chunks
42 : * emitted for the columns stored out of line. Within a single toplevel
43 : * transaction there will be no other data carrying records between a row's
44 : * toast chunks and the row data itself. See ReorderBufferToast* for
45 : * details.
46 : *
47 : * ReorderBuffer uses two special memory context types - SlabContext for
48 : * allocations of fixed-length structures (changes and transactions), and
49 : * GenerationContext for the variable-length transaction data (allocated
50 : * and freed in groups with similar lifespans).
51 : *
52 : * To limit the amount of memory used by decoded changes, we track memory
53 : * used at the reorder buffer level (i.e. total amount of memory), and for
54 : * each transaction. When the total amount of used memory exceeds the
55 : * limit, the transaction consuming the most memory is then serialized to
56 : * disk.
57 : *
58 : * Only decoded changes are evicted from memory (spilled to disk), not the
59 : * transaction records. The number of toplevel transactions is limited,
60 : * but a transaction with many subtransactions may still consume significant
61 : * amounts of memory. However, the transaction records are fairly small and
62 : * are not included in the memory limit.
63 : *
64 : * The current eviction algorithm is very simple - the transaction is
65 : * picked merely by size, while it might be useful to also consider age
66 : * (LSN) of the changes for example. With the new Generational memory
67 : * allocator, evicting the oldest changes would make it more likely the
68 : * memory gets actually freed.
69 : *
70 : * We still rely on max_changes_in_memory when loading serialized changes
71 : * back into memory. At that point we can't use the memory limit directly
72 : * as we load the subxacts independently. One option to deal with this
73 : * would be to count the subxacts, and allow each to allocate 1/N of the
74 : * memory limit. That however does not seem very appealing, because with
75 : * many subtransactions it may easily cause thrashing (short cycles of
76 : * deserializing and applying very few changes). We probably should give
77 : * a bit more memory to the oldest subtransactions, because it's likely
78 : * they are the source for the next sequence of changes.
79 : *
80 : * -------------------------------------------------------------------------
81 : */
82 : #include "postgres.h"
83 :
84 : #include <unistd.h>
85 : #include <sys/stat.h>
86 :
87 : #include "access/detoast.h"
88 : #include "access/heapam.h"
89 : #include "access/rewriteheap.h"
90 : #include "access/transam.h"
91 : #include "access/xact.h"
92 : #include "access/xlog_internal.h"
93 : #include "catalog/catalog.h"
94 : #include "lib/binaryheap.h"
95 : #include "miscadmin.h"
96 : #include "pgstat.h"
97 : #include "replication/logical.h"
98 : #include "replication/reorderbuffer.h"
99 : #include "replication/slot.h"
100 : #include "replication/snapbuild.h" /* just for SnapBuildSnapDecRefcount */
101 : #include "storage/bufmgr.h"
102 : #include "storage/fd.h"
103 : #include "storage/sinval.h"
104 : #include "utils/builtins.h"
105 : #include "utils/combocid.h"
106 : #include "utils/memdebug.h"
107 : #include "utils/memutils.h"
108 : #include "utils/rel.h"
109 : #include "utils/relfilenodemap.h"
110 :
111 :
112 : /* entry of the by_txn hash table, which maps an xid to our transaction state */
113 : typedef struct ReorderBufferTXNByIdEnt
114 : {
115 : TransactionId xid; /* hash key (keysize is sizeof(TransactionId)) */
116 : ReorderBufferTXN *txn; /* transaction state registered for that xid */
117 : } ReorderBufferTXNByIdEnt;
118 :
119 : /* data structures for (relfilenode, ctid) => (cmin, cmax) mapping */
120 : typedef struct ReorderBufferTupleCidKey
121 : {
122 : RelFileNode relnode; /* relfilenode part of the lookup key */
123 : ItemPointerData tid; /* ctid part of the lookup key */
124 : } ReorderBufferTupleCidKey;
125 :
126 : typedef struct ReorderBufferTupleCidEnt
127 : {
128 : ReorderBufferTupleCidKey key; /* (relfilenode, ctid) this entry is stored under */
129 : CommandId cmin; /* cmin mapped to by the key */
130 : CommandId cmax; /* cmax mapped to by the key */
131 : CommandId combocid; /* just for debugging */
132 : } ReorderBufferTupleCidEnt;
133 :
134 : /* Virtual file descriptor paired with the offset of the next read or write */
135 : typedef struct TXNEntryFile
136 : {
137 : File vfd; /* -1 when the file is closed */
138 : off_t curOffset; /* offset for next write or read. Reset to 0
139 : * when vfd is opened. */
140 : } TXNEntryFile;
141 :
142 : /* k-way in-order change iteration support structures (heap merge of the change streams of a transaction and its subtransactions) */
143 : typedef struct ReorderBufferIterTXNEntry
144 : {
145 : XLogRecPtr lsn; /* NOTE(review): presumably the LSN of 'change', i.e. the merge ordering key -- confirm */
146 : ReorderBufferChange *change; /* NOTE(review): presumably the current change of this stream -- confirm */
147 : ReorderBufferTXN *txn; /* NOTE(review): presumably the (sub)transaction this stream belongs to -- confirm */
148 : TXNEntryFile file; /* file handle plus offset; same type ReorderBufferRestoreChanges() takes */
149 : XLogSegNo segno; /* segment number; same type ReorderBufferRestoreChanges() takes */
150 : } ReorderBufferIterTXNEntry;
151 :
152 : typedef struct ReorderBufferIterTXNState
153 : {
154 : binaryheap *heap; /* binary heap driving the k-way merge */
155 : Size nr_txns; /* NOTE(review): presumably the number of usable entries[] -- confirm */
156 : dlist_head old_change; /* NOTE(review): list of changes; exact role not visible here -- confirm */
157 : ReorderBufferIterTXNEntry entries[FLEXIBLE_ARRAY_MEMBER]; /* variable length, must remain the last member */
158 : } ReorderBufferIterTXNState;
159 :
160 : /* toast data structures: reassembly state of one toasted value, collected chunk by chunk */
161 : typedef struct ReorderBufferToastEnt
162 : {
163 : Oid chunk_id; /* toast_table.chunk_id */
164 : int32 last_chunk_seq; /* toast_table.chunk_seq of the last chunk we
165 : * have seen */
166 : Size num_chunks; /* number of chunks we've already seen */
167 : Size size; /* combined size of chunks seen */
168 : dlist_head chunks; /* linked list of chunks */
169 : struct varlena *reconstructed; /* reconstructed varlena now pointed to in
170 : * main tuple */
171 : } ReorderBufferToastEnt;
172 :
173 : /* Disk serialization support data structures: fixed-size header of a change spilled to disk */
174 : typedef struct ReorderBufferDiskChange
175 : {
176 : Size size; /* NOTE(review): presumably the total size of the on-disk record, trailing data included -- confirm in ReorderBufferSerializeChange */
177 : ReorderBufferChange change; /* the change struct itself */
178 : /* variable-length data follows */
179 : } ReorderBufferDiskChange;
180 : /* Predicates on a change's action code, used while tracking partial (toast / speculative insert) changes */
181 : #define IsSpecInsert(action) \
182 : ( \
183 : ((action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT) \
184 : )
185 : #define IsSpecConfirm(action) \
186 : ( \
187 : ((action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM) \
188 : )
189 : #define IsInsertOrUpdate(action) \
190 : ( \
191 : (((action) == REORDER_BUFFER_CHANGE_INSERT) || \
192 : ((action) == REORDER_BUFFER_CHANGE_UPDATE) || \
193 : ((action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT)) \
194 : )
195 :
196 : /*
197 : * Maximum number of changes kept in memory, per transaction. After that,
198 : * changes are spooled to disk.
199 : *
200 : * The current value should be sufficient to decode the entire transaction
201 : * without hitting disk in OLTP workloads, while starting to spool to disk in
202 : * other workloads reasonably fast.
203 : *
204 : * At some point in the future it probably makes sense to have a more elaborate
205 : * resource management here, but it's not entirely clear what that would look
206 : * like.
207 : */
208 : int logical_decoding_work_mem; /* memory threshold for decoded changes; reaching it leads to streaming or spilling to disk */
209 : static const Size max_changes_in_memory = 4096; /* XXX for restore only: per-transaction cap on changes when loading serialized changes back into memory */
210 :
211 : /* ---------------------------------------
212 : * primary reorderbuffer support routines
213 : * ---------------------------------------
214 : */
215 : static ReorderBufferTXN *ReorderBufferGetTXN(ReorderBuffer *rb);
216 : static void ReorderBufferReturnTXN(ReorderBuffer *rb, ReorderBufferTXN *txn);
217 : static ReorderBufferTXN *ReorderBufferTXNByXid(ReorderBuffer *rb,
218 : TransactionId xid, bool create, bool *is_new,
219 : XLogRecPtr lsn, bool create_as_top);
220 : static void ReorderBufferTransferSnapToParent(ReorderBufferTXN *txn,
221 : ReorderBufferTXN *subtxn);
222 : /* LSN-order consistency check; no-op if assertions are not in use */
223 : static void AssertTXNLsnOrder(ReorderBuffer *rb);
224 :
225 : /* ---------------------------------------
226 : * support functions for iterating, in LSN order, over the ->changes of a
227 : * transaction and its subtransactions
228 : *
229 : * used for iteration over the k-way heap merge of a transaction and its
230 : * subtransactions
231 : * ---------------------------------------
232 : */
233 : static void ReorderBufferIterTXNInit(ReorderBuffer *rb, ReorderBufferTXN *txn,
234 : ReorderBufferIterTXNState *volatile *iter_state);
235 : static ReorderBufferChange *ReorderBufferIterTXNNext(ReorderBuffer *rb, ReorderBufferIterTXNState *state);
236 : static void ReorderBufferIterTXNFinish(ReorderBuffer *rb,
237 : ReorderBufferIterTXNState *state);
238 : static void ReorderBufferExecuteInvalidations(uint32 nmsgs, SharedInvalidationMessage *msgs);
239 :
240 : /*
241 : * ---------------------------------------
242 : * Disk serialization support functions
243 : * ---------------------------------------
244 : */
245 : static void ReorderBufferCheckMemoryLimit(ReorderBuffer *rb);
246 : static void ReorderBufferSerializeTXN(ReorderBuffer *rb, ReorderBufferTXN *txn);
247 : static void ReorderBufferSerializeChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
248 : int fd, ReorderBufferChange *change);
249 : static Size ReorderBufferRestoreChanges(ReorderBuffer *rb, ReorderBufferTXN *txn,
250 : TXNEntryFile *file, XLogSegNo *segno);
251 : static void ReorderBufferRestoreChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
252 : char *change);
253 : static void ReorderBufferRestoreCleanup(ReorderBuffer *rb, ReorderBufferTXN *txn);
254 : static void ReorderBufferTruncateTXN(ReorderBuffer *rb, ReorderBufferTXN *txn,
255 : bool txn_prepared);
256 : static void ReorderBufferCleanupSerializedTXNs(const char *slotname);
257 : static void ReorderBufferSerializedPath(char *path, ReplicationSlot *slot,
258 : TransactionId xid, XLogSegNo segno);
259 : /* snapshot copy/free helpers */
260 : static void ReorderBufferFreeSnap(ReorderBuffer *rb, Snapshot snap);
261 : static Snapshot ReorderBufferCopySnap(ReorderBuffer *rb, Snapshot orig_snap,
262 : ReorderBufferTXN *txn, CommandId cid);
263 :
264 : /*
265 : * ---------------------------------------
266 : * Streaming support functions (streaming of in-progress transactions)
267 : * ---------------------------------------
268 : */
269 : static inline bool ReorderBufferCanStream(ReorderBuffer *rb);
270 : static inline bool ReorderBufferCanStartStreaming(ReorderBuffer *rb);
271 : static void ReorderBufferStreamTXN(ReorderBuffer *rb, ReorderBufferTXN *txn);
272 : static void ReorderBufferStreamCommit(ReorderBuffer *rb, ReorderBufferTXN *txn);
273 :
274 : /* ---------------------------------------
275 : * toast reassembly support
276 : * ---------------------------------------
277 : */
278 : static void ReorderBufferToastInitHash(ReorderBuffer *rb, ReorderBufferTXN *txn);
279 : static void ReorderBufferToastReset(ReorderBuffer *rb, ReorderBufferTXN *txn);
280 : static void ReorderBufferToastReplace(ReorderBuffer *rb, ReorderBufferTXN *txn,
281 : Relation relation, ReorderBufferChange *change);
282 : static void ReorderBufferToastAppendChunk(ReorderBuffer *rb, ReorderBufferTXN *txn,
283 : Relation relation, ReorderBufferChange *change);
284 :
285 : /*
286 : * ---------------------------------------
287 : * memory accounting
288 : * ---------------------------------------
289 : */
290 : static Size ReorderBufferChangeSize(ReorderBufferChange *change);
291 : static void ReorderBufferChangeMemoryUpdate(ReorderBuffer *rb,
292 : ReorderBufferChange *change, bool addition);
293 :
294 : /*
295 : * Allocate a new ReorderBuffer and clean out any old serialized state from
296 : * prior ReorderBuffer instances for the same slot.
297 : */
298 : ReorderBuffer *
299 836 : ReorderBufferAllocate(void)
300 : {
301 : ReorderBuffer *buffer;
302 : HASHCTL hash_ctl;
303 : MemoryContext new_ctx;
304 :
305 836 : Assert(MyReplicationSlot != NULL);
306 :
307 : /* allocate memory in own context, to have better accountability */
308 836 : new_ctx = AllocSetContextCreate(CurrentMemoryContext,
309 : "ReorderBuffer",
310 : ALLOCSET_DEFAULT_SIZES);
311 :
312 836 : buffer =
313 : (ReorderBuffer *) MemoryContextAlloc(new_ctx, sizeof(ReorderBuffer));
314 :
315 836 : memset(&hash_ctl, 0, sizeof(hash_ctl));
316 :
317 836 : buffer->context = new_ctx;
318 :
319 836 : buffer->change_context = SlabContextCreate(new_ctx,
320 : "Change",
321 : SLAB_DEFAULT_BLOCK_SIZE,
322 : sizeof(ReorderBufferChange));
323 :
324 836 : buffer->txn_context = SlabContextCreate(new_ctx,
325 : "TXN",
326 : SLAB_DEFAULT_BLOCK_SIZE,
327 : sizeof(ReorderBufferTXN));
328 :
329 836 : buffer->tup_context = GenerationContextCreate(new_ctx,
330 : "Tuples",
331 : SLAB_LARGE_BLOCK_SIZE);
332 :
333 836 : hash_ctl.keysize = sizeof(TransactionId);
334 836 : hash_ctl.entrysize = sizeof(ReorderBufferTXNByIdEnt);
335 836 : hash_ctl.hcxt = buffer->context;
336 :
337 836 : buffer->by_txn = hash_create("ReorderBufferByXid", 1000, &hash_ctl,
338 : HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
339 :
340 836 : buffer->by_txn_last_xid = InvalidTransactionId;
341 836 : buffer->by_txn_last_txn = NULL;
342 :
343 836 : buffer->outbuf = NULL;
344 836 : buffer->outbufsize = 0;
345 836 : buffer->size = 0;
346 :
347 836 : buffer->spillTxns = 0;
348 836 : buffer->spillCount = 0;
349 836 : buffer->spillBytes = 0;
350 836 : buffer->streamTxns = 0;
351 836 : buffer->streamCount = 0;
352 836 : buffer->streamBytes = 0;
353 :
354 836 : buffer->current_restart_decoding_lsn = InvalidXLogRecPtr;
355 :
356 836 : dlist_init(&buffer->toplevel_by_lsn);
357 836 : dlist_init(&buffer->txns_by_base_snapshot_lsn);
358 :
359 : /*
360 : * Ensure there's no stale data from prior uses of this slot, in case some
361 : * prior exit avoided calling ReorderBufferFree. Failure to do this can
362 : * produce duplicated txns, and it's very cheap if there's nothing there.
363 : */
364 836 : ReorderBufferCleanupSerializedTXNs(NameStr(MyReplicationSlot->data.name));
365 :
366 836 : return buffer;
367 : }
368 :
369 : /*
370 : * Free a ReorderBuffer
371 : */
372 : void
373 752 : ReorderBufferFree(ReorderBuffer *rb)
374 : {
375 752 : MemoryContext context = rb->context;
376 :
377 : /*
378 : * We free separately allocated data by entirely scrapping reorderbuffer's
379 : * memory context.
380 : */
381 752 : MemoryContextDelete(context);
382 :
383 : /* Free disk space used by unconsumed reorder buffers */
384 752 : ReorderBufferCleanupSerializedTXNs(NameStr(MyReplicationSlot->data.name));
385 752 : }
386 :
387 : /*
388 : * Get an unused, possibly preallocated, ReorderBufferTXN.
389 : */
390 : static ReorderBufferTXN *
391 4592 : ReorderBufferGetTXN(ReorderBuffer *rb)
392 : {
393 : ReorderBufferTXN *txn;
394 :
395 4592 : txn = (ReorderBufferTXN *)
396 4592 : MemoryContextAlloc(rb->txn_context, sizeof(ReorderBufferTXN));
397 :
398 4592 : memset(txn, 0, sizeof(ReorderBufferTXN));
399 :
400 4592 : dlist_init(&txn->changes);
401 4592 : dlist_init(&txn->tuplecids);
402 4592 : dlist_init(&txn->subtxns);
403 :
404 : /* InvalidCommandId is not zero, so set it explicitly */
405 4592 : txn->command_id = InvalidCommandId;
406 :
407 4592 : return txn;
408 : }
409 :
410 : /*
411 : * Free a ReorderBufferTXN.
412 : */
413 : static void
414 4528 : ReorderBufferReturnTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
415 : {
416 : /* clean the lookup cache if we were cached (quite likely) */
417 4528 : if (rb->by_txn_last_xid == txn->xid)
418 : {
419 4174 : rb->by_txn_last_xid = InvalidTransactionId;
420 4174 : rb->by_txn_last_txn = NULL;
421 : }
422 :
423 : /* free data that's contained */
424 :
425 4528 : if (txn->gid != NULL)
426 : {
427 56 : pfree(txn->gid);
428 56 : txn->gid = NULL;
429 : }
430 :
431 4528 : if (txn->tuplecid_hash != NULL)
432 : {
433 406 : hash_destroy(txn->tuplecid_hash);
434 406 : txn->tuplecid_hash = NULL;
435 : }
436 :
437 4528 : if (txn->invalidations)
438 : {
439 1186 : pfree(txn->invalidations);
440 1186 : txn->invalidations = NULL;
441 : }
442 :
443 4528 : pfree(txn);
444 4528 : }
445 :
446 : /*
447 : * Get an fresh ReorderBufferChange.
448 : */
449 : ReorderBufferChange *
450 3361926 : ReorderBufferGetChange(ReorderBuffer *rb)
451 : {
452 : ReorderBufferChange *change;
453 :
454 3361926 : change = (ReorderBufferChange *)
455 3361926 : MemoryContextAlloc(rb->change_context, sizeof(ReorderBufferChange));
456 :
457 3361926 : memset(change, 0, sizeof(ReorderBufferChange));
458 3361926 : return change;
459 : }
460 :
461 : /*
462 : * Free a ReorderBufferChange and update memory accounting, if requested.
463 : */
464 : void
465 3358716 : ReorderBufferReturnChange(ReorderBuffer *rb, ReorderBufferChange *change,
466 : bool upd_mem)
467 : {
468 : /* update memory accounting info */
469 3358716 : if (upd_mem)
470 3340054 : ReorderBufferChangeMemoryUpdate(rb, change, false);
471 :
472 : /* free contained data */
473 3358716 : switch (change->action)
474 : {
475 : case REORDER_BUFFER_CHANGE_INSERT:
476 : case REORDER_BUFFER_CHANGE_UPDATE:
477 : case REORDER_BUFFER_CHANGE_DELETE:
478 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT:
479 3248992 : if (change->data.tp.newtuple)
480 : {
481 2834376 : ReorderBufferReturnTupleBuf(rb, change->data.tp.newtuple);
482 2834376 : change->data.tp.newtuple = NULL;
483 : }
484 :
485 3248992 : if (change->data.tp.oldtuple)
486 : {
487 275986 : ReorderBufferReturnTupleBuf(rb, change->data.tp.oldtuple);
488 275986 : change->data.tp.oldtuple = NULL;
489 : }
490 3248992 : break;
491 : case REORDER_BUFFER_CHANGE_MESSAGE:
492 76 : if (change->data.msg.prefix != NULL)
493 76 : pfree(change->data.msg.prefix);
494 76 : change->data.msg.prefix = NULL;
495 76 : if (change->data.msg.message != NULL)
496 76 : pfree(change->data.msg.message);
497 76 : change->data.msg.message = NULL;
498 76 : break;
499 : case REORDER_BUFFER_CHANGE_INVALIDATION:
500 5964 : if (change->data.inval.invalidations)
501 5964 : pfree(change->data.inval.invalidations);
502 5964 : change->data.inval.invalidations = NULL;
503 5964 : break;
504 : case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT:
505 1218 : if (change->data.snapshot)
506 : {
507 1218 : ReorderBufferFreeSnap(rb, change->data.snapshot);
508 1218 : change->data.snapshot = NULL;
509 : }
510 1218 : break;
511 : /* no data in addition to the struct itself */
512 : case REORDER_BUFFER_CHANGE_TRUNCATE:
513 50 : if (change->data.truncate.relids != NULL)
514 : {
515 50 : ReorderBufferReturnRelids(rb, change->data.truncate.relids);
516 50 : change->data.truncate.relids = NULL;
517 : }
518 50 : break;
519 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM:
520 : case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID:
521 : case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID:
522 102416 : break;
523 : }
524 :
525 3358716 : pfree(change);
526 3358716 : }
527 :
528 : /*
529 : * Get a fresh ReorderBufferTupleBuf fitting at least a tuple of size
530 : * tuple_len (excluding header overhead).
531 : */
532 : ReorderBufferTupleBuf *
533 3113388 : ReorderBufferGetTupleBuf(ReorderBuffer *rb, Size tuple_len)
534 : {
535 : ReorderBufferTupleBuf *tuple;
536 : Size alloc_len;
537 :
538 3113388 : alloc_len = tuple_len + SizeofHeapTupleHeader;
539 :
540 3113388 : tuple = (ReorderBufferTupleBuf *)
541 3113388 : MemoryContextAlloc(rb->tup_context,
542 : sizeof(ReorderBufferTupleBuf) +
543 : MAXIMUM_ALIGNOF + alloc_len);
544 3113388 : tuple->alloc_tuple_size = alloc_len;
545 3113388 : tuple->tuple.t_data = ReorderBufferTupleBufData(tuple);
546 :
547 3113388 : return tuple;
548 : }
549 :
550 : /*
551 : * Free an ReorderBufferTupleBuf.
552 : */
553 : void
554 3110362 : ReorderBufferReturnTupleBuf(ReorderBuffer *rb, ReorderBufferTupleBuf *tuple)
555 : {
556 3110362 : pfree(tuple);
557 3110362 : }
558 :
559 : /*
560 : * Get an array for relids of truncated relations.
561 : *
562 : * We use the global memory context (for the whole reorder buffer), because
563 : * none of the existing ones seems like a good match (some are SLAB, so we
564 : * can't use those, and tup_context is meant for tuple data, not relids). We
565 : * could add yet another context, but it seems like an overkill - TRUNCATE is
566 : * not particularly common operation, so it does not seem worth it.
567 : */
568 : Oid *
569 50 : ReorderBufferGetRelids(ReorderBuffer *rb, int nrelids)
570 : {
571 : Oid *relids;
572 : Size alloc_len;
573 :
574 50 : alloc_len = sizeof(Oid) * nrelids;
575 :
576 50 : relids = (Oid *) MemoryContextAlloc(rb->context, alloc_len);
577 :
578 50 : return relids;
579 : }
580 :
581 : /*
582 : * Free an array of relids.
583 : */
584 : void
585 50 : ReorderBufferReturnRelids(ReorderBuffer *rb, Oid *relids)
586 : {
587 50 : pfree(relids);
588 50 : }
589 :
590 : /*
591 : * Return the ReorderBufferTXN from the given buffer, specified by Xid.
592 : * If create is true, and a transaction doesn't already exist, create it
593 : * (with the given LSN, and as top transaction if that's specified);
594 : * when this happens, is_new is set to true.
595 : */
596 : static ReorderBufferTXN *
597 11455058 : ReorderBufferTXNByXid(ReorderBuffer *rb, TransactionId xid, bool create,
598 : bool *is_new, XLogRecPtr lsn, bool create_as_top)
599 : {
600 : ReorderBufferTXN *txn;
601 : ReorderBufferTXNByIdEnt *ent;
602 : bool found;
603 :
604 11455058 : Assert(TransactionIdIsValid(xid));
605 :
606 : /*
607 : * Check the one-entry lookup cache first
608 : */
609 22905904 : if (TransactionIdIsValid(rb->by_txn_last_xid) &&
610 11450846 : rb->by_txn_last_xid == xid)
611 : {
612 9452670 : txn = rb->by_txn_last_txn;
613 :
614 9452670 : if (txn != NULL)
615 : {
616 : /* found it, and it's valid */
617 9452664 : if (is_new)
618 3118 : *is_new = false;
619 9452664 : return txn;
620 : }
621 :
622 : /*
623 : * cached as non-existent, and asked not to create? Then nothing else
624 : * to do.
625 : */
626 6 : if (!create)
627 6 : return NULL;
628 : /* otherwise fall through to create it */
629 : }
630 :
631 : /*
632 : * If the cache wasn't hit or it yielded an "does-not-exist" and we want
633 : * to create an entry.
634 : */
635 :
636 : /* search the lookup table */
637 2002388 : ent = (ReorderBufferTXNByIdEnt *)
638 2002388 : hash_search(rb->by_txn,
639 : (void *) &xid,
640 : create ? HASH_ENTER : HASH_FIND,
641 : &found);
642 2002388 : if (found)
643 1995246 : txn = ent->txn;
644 7142 : else if (create)
645 : {
646 : /* initialize the new entry, if creation was requested */
647 4592 : Assert(ent != NULL);
648 4592 : Assert(lsn != InvalidXLogRecPtr);
649 :
650 4592 : ent->txn = ReorderBufferGetTXN(rb);
651 4592 : ent->txn->xid = xid;
652 4592 : txn = ent->txn;
653 4592 : txn->first_lsn = lsn;
654 4592 : txn->restart_decoding_lsn = rb->current_restart_decoding_lsn;
655 :
656 4592 : if (create_as_top)
657 : {
658 3298 : dlist_push_tail(&rb->toplevel_by_lsn, &txn->node);
659 3298 : AssertTXNLsnOrder(rb);
660 : }
661 : }
662 : else
663 2550 : txn = NULL; /* not found and not asked to create */
664 :
665 : /* update cache */
666 2002388 : rb->by_txn_last_xid = xid;
667 2002388 : rb->by_txn_last_txn = txn;
668 :
669 2002388 : if (is_new)
670 3442 : *is_new = !found;
671 :
672 2002388 : Assert(!create || txn != NULL);
673 2002388 : return txn;
674 : }
675 :
676 : /*
677 : * Record the partial change for the streaming of in-progress transactions. We
678 : * can stream only complete changes so if we have a partial change like toast
679 : * table insert or speculative insert then we mark such a 'txn' so that it
680 : * can't be streamed. We also ensure that if the changes in such a 'txn' are
681 : * above logical_decoding_work_mem threshold then we stream them as soon as we
682 : * have a complete change.
683 : */
684 : static void
685 2994836 : ReorderBufferProcessPartialChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
686 : ReorderBufferChange *change,
687 : bool toast_insert)
688 : {
689 : ReorderBufferTXN *toptxn;
690 :
691 : /*
692 : * The partial changes need to be processed only while streaming
693 : * in-progress transactions.
694 : */
695 2994836 : if (!ReorderBufferCanStream(rb))
696 5427072 : return;
697 :
698 : /* Get the top transaction. */
699 562600 : if (txn->toptxn != NULL)
700 30532 : toptxn = txn->toptxn;
701 : else
702 532068 : toptxn = txn;
703 :
704 : /*
705 : * Set the toast insert bit whenever we get toast insert to indicate a
706 : * partial change and clear it when we get the insert or update on main
707 : * table (Both update and insert will do the insert in the toast table).
708 : */
709 562600 : if (toast_insert)
710 2726 : toptxn->txn_flags |= RBTXN_HAS_TOAST_INSERT;
711 559896 : else if (rbtxn_has_toast_insert(toptxn) &&
712 22 : IsInsertOrUpdate(change->action))
713 22 : toptxn->txn_flags &= ~RBTXN_HAS_TOAST_INSERT;
714 :
715 : /*
716 : * Set the spec insert bit whenever we get the speculative insert to
717 : * indicate the partial change and clear the same on speculative confirm.
718 : */
719 562600 : if (IsSpecInsert(change->action))
720 0 : toptxn->txn_flags |= RBTXN_HAS_SPEC_INSERT;
721 562600 : else if (IsSpecConfirm(change->action))
722 : {
723 : /*
724 : * Speculative confirm change must be preceded by speculative
725 : * insertion.
726 : */
727 0 : Assert(rbtxn_has_spec_insert(toptxn));
728 0 : toptxn->txn_flags &= ~RBTXN_HAS_SPEC_INSERT;
729 : }
730 :
731 : /*
732 : * Stream the transaction if it is serialized before and the changes are
733 : * now complete in the top-level transaction.
734 : *
735 : * The reason for doing the streaming of such a transaction as soon as we
736 : * get the complete change for it is that previously it would have reached
737 : * the memory threshold and wouldn't get streamed because of incomplete
738 : * changes. Delaying such transactions would increase apply lag for them.
739 : */
740 887454 : if (ReorderBufferCanStartStreaming(rb) &&
741 969110 : !(rbtxn_has_incomplete_tuple(toptxn)) &&
742 322128 : rbtxn_is_serialized(txn))
743 4 : ReorderBufferStreamTXN(rb, toptxn);
744 : }
745 :
746 : /*
747 : * Queue a change into a transaction so it can be replayed upon commit or will be
748 : * streamed when we reach logical_decoding_work_mem threshold.
749 : */
750 : void
751 3013498 : ReorderBufferQueueChange(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn,
752 : ReorderBufferChange *change, bool toast_insert)
753 : {
754 : ReorderBufferTXN *txn;
755 :
756 3013498 : txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
757 :
758 : /*
759 : * While streaming the previous changes we have detected that the
760 : * transaction is aborted. So there is no point in collecting further
761 : * changes for it.
762 : */
763 3013498 : if (txn->concurrent_abort)
764 : {
765 : /*
766 : * We don't need to update memory accounting for this change as we
767 : * have not added it to the queue yet.
768 : */
769 18662 : ReorderBufferReturnChange(rb, change, false);
770 3032156 : return;
771 : }
772 :
773 2994836 : change->lsn = lsn;
774 2994836 : change->txn = txn;
775 :
776 2994836 : Assert(InvalidXLogRecPtr != lsn);
777 2994836 : dlist_push_tail(&txn->changes, &change->node);
778 2994836 : txn->nentries++;
779 2994836 : txn->nentries_mem++;
780 :
781 : /* update memory accounting information */
782 2994836 : ReorderBufferChangeMemoryUpdate(rb, change, true);
783 :
784 : /* process partial change */
785 2994836 : ReorderBufferProcessPartialChange(rb, txn, change, toast_insert);
786 :
787 : /* check the memory limits and evict something if needed */
788 2994836 : ReorderBufferCheckMemoryLimit(rb);
789 : }
790 :
791 : /*
792 : * Queue message into a transaction so it can be processed upon commit.
793 : */
794 : void
795 80 : ReorderBufferQueueMessage(ReorderBuffer *rb, TransactionId xid,
796 : Snapshot snapshot, XLogRecPtr lsn,
797 : bool transactional, const char *prefix,
798 : Size message_size, const char *message)
799 : {
800 80 : if (transactional)
801 : {
802 : MemoryContext oldcontext;
803 : ReorderBufferChange *change;
804 :
805 74 : Assert(xid != InvalidTransactionId);
806 :
807 74 : oldcontext = MemoryContextSwitchTo(rb->context);
808 :
809 74 : change = ReorderBufferGetChange(rb);
810 74 : change->action = REORDER_BUFFER_CHANGE_MESSAGE;
811 74 : change->data.msg.prefix = pstrdup(prefix);
812 74 : change->data.msg.message_size = message_size;
813 74 : change->data.msg.message = palloc(message_size);
814 74 : memcpy(change->data.msg.message, message, message_size);
815 :
816 74 : ReorderBufferQueueChange(rb, xid, lsn, change, false);
817 :
818 74 : MemoryContextSwitchTo(oldcontext);
819 : }
820 : else
821 : {
822 6 : ReorderBufferTXN *txn = NULL;
823 6 : volatile Snapshot snapshot_now = snapshot;
824 :
825 6 : if (xid != InvalidTransactionId)
826 4 : txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
827 :
828 : /* setup snapshot to allow catalog access */
829 6 : SetupHistoricSnapshot(snapshot_now, NULL);
830 6 : PG_TRY();
831 : {
832 6 : rb->message(rb, txn, lsn, false, prefix, message_size, message);
833 :
834 6 : TeardownHistoricSnapshot(false);
835 : }
836 0 : PG_CATCH();
837 : {
838 0 : TeardownHistoricSnapshot(true);
839 0 : PG_RE_THROW();
840 : }
841 6 : PG_END_TRY();
842 : }
843 80 : }
844 :
845 : /*
846 : * AssertTXNLsnOrder
847 : * Verify LSN ordering of transaction lists in the reorderbuffer
848 : *
849 : * Other LSN-related invariants are checked too.
850 : *
851 : * No-op if assertions are not in use.
852 : */
853 : static void
854 8182 : AssertTXNLsnOrder(ReorderBuffer *rb)
855 : {
856 : #ifdef USE_ASSERT_CHECKING
857 : dlist_iter iter;
858 8182 : XLogRecPtr prev_first_lsn = InvalidXLogRecPtr;
859 8182 : XLogRecPtr prev_base_snap_lsn = InvalidXLogRecPtr;
860 :
861 17010 : dlist_foreach(iter, &rb->toplevel_by_lsn)
862 : {
863 8828 : ReorderBufferTXN *cur_txn = dlist_container(ReorderBufferTXN, node,
864 : iter.cur);
865 :
866 : /* start LSN must be set */
867 8828 : Assert(cur_txn->first_lsn != InvalidXLogRecPtr);
868 :
869 : /* If there is an end LSN, it must be higher than start LSN */
870 8828 : if (cur_txn->end_lsn != InvalidXLogRecPtr)
871 8 : Assert(cur_txn->first_lsn <= cur_txn->end_lsn);
872 :
873 : /* Current initial LSN must be strictly higher than previous */
874 8828 : if (prev_first_lsn != InvalidXLogRecPtr)
875 944 : Assert(prev_first_lsn < cur_txn->first_lsn);
876 :
877 : /* known-as-subtxn txns must not be listed */
878 8828 : Assert(!rbtxn_is_known_subxact(cur_txn));
879 :
880 8828 : prev_first_lsn = cur_txn->first_lsn;
881 : }
882 :
883 13438 : dlist_foreach(iter, &rb->txns_by_base_snapshot_lsn)
884 : {
885 5256 : ReorderBufferTXN *cur_txn = dlist_container(ReorderBufferTXN,
886 : base_snapshot_node,
887 : iter.cur);
888 :
889 : /* base snapshot (and its LSN) must be set */
890 5256 : Assert(cur_txn->base_snapshot != NULL);
891 5256 : Assert(cur_txn->base_snapshot_lsn != InvalidXLogRecPtr);
892 :
893 : /* current LSN must be strictly higher than previous */
894 5256 : if (prev_base_snap_lsn != InvalidXLogRecPtr)
895 836 : Assert(prev_base_snap_lsn < cur_txn->base_snapshot_lsn);
896 :
897 : /* known-as-subtxn txns must not be listed */
898 5256 : Assert(!rbtxn_is_known_subxact(cur_txn));
899 :
900 5256 : prev_base_snap_lsn = cur_txn->base_snapshot_lsn;
901 : }
902 : #endif
903 8182 : }
904 :
905 : /*
906 : * AssertChangeLsnOrder
907 : *
908 : * Check ordering of changes in the (sub)transaction.
909 : */
910 : static void
911 2428 : AssertChangeLsnOrder(ReorderBufferTXN *txn)
912 : {
913 : #ifdef USE_ASSERT_CHECKING
914 : dlist_iter iter;
915 2428 : XLogRecPtr prev_lsn = txn->first_lsn;
916 :
917 362046 : dlist_foreach(iter, &txn->changes)
918 : {
919 : ReorderBufferChange *cur_change;
920 :
921 359618 : cur_change = dlist_container(ReorderBufferChange, node, iter.cur);
922 :
923 359618 : Assert(txn->first_lsn != InvalidXLogRecPtr);
924 359618 : Assert(cur_change->lsn != InvalidXLogRecPtr);
925 359618 : Assert(txn->first_lsn <= cur_change->lsn);
926 :
927 359618 : if (txn->end_lsn != InvalidXLogRecPtr)
928 46774 : Assert(cur_change->lsn <= txn->end_lsn);
929 :
930 359618 : Assert(prev_lsn <= cur_change->lsn);
931 :
932 359618 : prev_lsn = cur_change->lsn;
933 : }
934 : #endif
935 2428 : }
936 :
937 : /*
938 : * ReorderBufferGetOldestTXN
939 : * Return oldest transaction in reorderbuffer
940 : */
941 : ReorderBufferTXN *
942 158 : ReorderBufferGetOldestTXN(ReorderBuffer *rb)
943 : {
944 : ReorderBufferTXN *txn;
945 :
946 158 : AssertTXNLsnOrder(rb);
947 :
948 158 : if (dlist_is_empty(&rb->toplevel_by_lsn))
949 144 : return NULL;
950 :
951 14 : txn = dlist_head_element(ReorderBufferTXN, node, &rb->toplevel_by_lsn);
952 :
953 14 : Assert(!rbtxn_is_known_subxact(txn));
954 14 : Assert(txn->first_lsn != InvalidXLogRecPtr);
955 14 : return txn;
956 : }
957 :
958 : /*
959 : * ReorderBufferGetOldestXmin
960 : * Return oldest Xmin in reorderbuffer
961 : *
962 : * Returns oldest possibly running Xid from the point of view of snapshots
963 : * used in the transactions kept by reorderbuffer, or InvalidTransactionId if
964 : * there are none.
965 : *
966 : * Since snapshots are assigned monotonically, this equals the Xmin of the
967 : * base snapshot with minimal base_snapshot_lsn.
968 : */
969 : TransactionId
970 172 : ReorderBufferGetOldestXmin(ReorderBuffer *rb)
971 : {
972 : ReorderBufferTXN *txn;
973 :
974 172 : AssertTXNLsnOrder(rb);
975 :
976 172 : if (dlist_is_empty(&rb->txns_by_base_snapshot_lsn))
977 158 : return InvalidTransactionId;
978 :
979 14 : txn = dlist_head_element(ReorderBufferTXN, base_snapshot_node,
980 : &rb->txns_by_base_snapshot_lsn);
981 14 : return txn->base_snapshot->xmin;
982 : }
983 :
984 : void
985 188 : ReorderBufferSetRestartPoint(ReorderBuffer *rb, XLogRecPtr ptr)
986 : {
987 188 : rb->current_restart_decoding_lsn = ptr;
988 188 : }
989 :
990 : /*
991 : * ReorderBufferAssignChild
992 : *
993 : * Make note that we know that subxid is a subtransaction of xid, seen as of
994 : * the given lsn.
995 : */
996 : void
997 1650 : ReorderBufferAssignChild(ReorderBuffer *rb, TransactionId xid,
998 : TransactionId subxid, XLogRecPtr lsn)
999 : {
1000 : ReorderBufferTXN *txn;
1001 : ReorderBufferTXN *subtxn;
1002 : bool new_top;
1003 : bool new_sub;
1004 :
1005 1650 : txn = ReorderBufferTXNByXid(rb, xid, true, &new_top, lsn, true);
1006 1650 : subtxn = ReorderBufferTXNByXid(rb, subxid, true, &new_sub, lsn, false);
1007 :
1008 1650 : if (!new_sub)
1009 : {
1010 356 : if (rbtxn_is_known_subxact(subtxn))
1011 : {
1012 : /* already associated, nothing to do */
1013 2006 : return;
1014 : }
1015 : else
1016 : {
1017 : /*
1018 : * We already saw this transaction, but initially added it to the
1019 : * list of top-level txns. Now that we know it's not top-level,
1020 : * remove it from there.
1021 : */
1022 0 : dlist_delete(&subtxn->node);
1023 : }
1024 : }
1025 :
1026 1294 : subtxn->txn_flags |= RBTXN_IS_SUBXACT;
1027 1294 : subtxn->toplevel_xid = xid;
1028 1294 : Assert(subtxn->nsubtxns == 0);
1029 :
1030 : /* set the reference to top-level transaction */
1031 1294 : subtxn->toptxn = txn;
1032 :
1033 : /* add to subtransaction list */
1034 1294 : dlist_push_tail(&txn->subtxns, &subtxn->node);
1035 1294 : txn->nsubtxns++;
1036 :
1037 : /* Possibly transfer the subtxn's snapshot to its top-level txn. */
1038 1294 : ReorderBufferTransferSnapToParent(txn, subtxn);
1039 :
1040 : /* Verify LSN-ordering invariant */
1041 1294 : AssertTXNLsnOrder(rb);
1042 : }
1043 :
1044 : /*
1045 : * ReorderBufferTransferSnapToParent
1046 : * Transfer base snapshot from subtxn to top-level txn, if needed
1047 : *
1048 : * This is done if the top-level txn doesn't have a base snapshot, or if the
1049 : * subtxn's base snapshot has an earlier LSN than the top-level txn's base
1050 : * snapshot's LSN. This can happen if there are no changes in the toplevel
1051 : * txn but there are some in the subtxn, or the first change in subtxn has
1052 : * earlier LSN than first change in the top-level txn and we learned about
1053 : * their kinship only now.
1054 : *
1055 : * The subtransaction's snapshot is cleared regardless of the transfer
1056 : * happening, since it's not needed anymore in either case.
1057 : *
1058 : * We do this as soon as we become aware of their kinship, to avoid queueing
1059 : * extra snapshots to txns known-as-subtxns -- only top-level txns will
1060 : * receive further snapshots.
1061 : */
1062 : static void
1063 1302 : ReorderBufferTransferSnapToParent(ReorderBufferTXN *txn,
1064 : ReorderBufferTXN *subtxn)
1065 : {
1066 1302 : Assert(subtxn->toplevel_xid == txn->xid);
1067 :
1068 1302 : if (subtxn->base_snapshot != NULL)
1069 : {
1070 0 : if (txn->base_snapshot == NULL ||
1071 0 : subtxn->base_snapshot_lsn < txn->base_snapshot_lsn)
1072 : {
1073 : /*
1074 : * If the toplevel transaction already has a base snapshot but
1075 : * it's newer than the subxact's, purge it.
1076 : */
1077 0 : if (txn->base_snapshot != NULL)
1078 : {
1079 0 : SnapBuildSnapDecRefcount(txn->base_snapshot);
1080 0 : dlist_delete(&txn->base_snapshot_node);
1081 : }
1082 :
1083 : /*
1084 : * The snapshot is now the top transaction's; transfer it, and
1085 : * adjust the list position of the top transaction in the list by
1086 : * moving it to where the subtransaction is.
1087 : */
1088 0 : txn->base_snapshot = subtxn->base_snapshot;
1089 0 : txn->base_snapshot_lsn = subtxn->base_snapshot_lsn;
1090 0 : dlist_insert_before(&subtxn->base_snapshot_node,
1091 : &txn->base_snapshot_node);
1092 :
1093 : /*
1094 : * The subtransaction doesn't have a snapshot anymore (so it
1095 : * mustn't be in the list.)
1096 : */
1097 0 : subtxn->base_snapshot = NULL;
1098 0 : subtxn->base_snapshot_lsn = InvalidXLogRecPtr;
1099 0 : dlist_delete(&subtxn->base_snapshot_node);
1100 : }
1101 : else
1102 : {
1103 : /* Base snap of toplevel is fine, so subxact's is not needed */
1104 0 : SnapBuildSnapDecRefcount(subtxn->base_snapshot);
1105 0 : dlist_delete(&subtxn->base_snapshot_node);
1106 0 : subtxn->base_snapshot = NULL;
1107 0 : subtxn->base_snapshot_lsn = InvalidXLogRecPtr;
1108 : }
1109 : }
1110 1302 : }
1111 :
1112 : /*
1113 : * Associate a subtransaction with its toplevel transaction at commit
1114 : * time. There may be no further changes added after this.
1115 : */
1116 : void
1117 518 : ReorderBufferCommitChild(ReorderBuffer *rb, TransactionId xid,
1118 : TransactionId subxid, XLogRecPtr commit_lsn,
1119 : XLogRecPtr end_lsn)
1120 : {
1121 : ReorderBufferTXN *subtxn;
1122 :
1123 518 : subtxn = ReorderBufferTXNByXid(rb, subxid, false, NULL,
1124 : InvalidXLogRecPtr, false);
1125 :
1126 : /*
1127 : * No need to do anything if that subtxn didn't contain any changes
1128 : */
1129 518 : if (!subtxn)
1130 680 : return;
1131 :
1132 356 : subtxn->final_lsn = commit_lsn;
1133 356 : subtxn->end_lsn = end_lsn;
1134 :
1135 : /*
1136 : * Assign this subxact as a child of the toplevel xact (no-op if already
1137 : * done.)
1138 : */
1139 356 : ReorderBufferAssignChild(rb, xid, subxid, InvalidXLogRecPtr);
1140 : }
1141 :
1142 :
/*
 * Support for efficiently iterating over a transaction's and its
 * subtransactions' changes.
 *
 * We do this by performing a k-way merge between the transaction and its
 * subtransactions. For that we model the current heads of the different
 * transactions as a binary heap, so we easily know which (sub-)transaction
 * has the change with the smallest LSN next.
 *
 * We assume the changes in individual transactions are already sorted by LSN.
 */
1154 :
1155 : /*
1156 : * Binary heap comparison function.
1157 : */
1158 : static int
1159 110866 : ReorderBufferIterCompare(Datum a, Datum b, void *arg)
1160 : {
1161 110866 : ReorderBufferIterTXNState *state = (ReorderBufferIterTXNState *) arg;
1162 110866 : XLogRecPtr pos_a = state->entries[DatumGetInt32(a)].lsn;
1163 110866 : XLogRecPtr pos_b = state->entries[DatumGetInt32(b)].lsn;
1164 :
1165 110866 : if (pos_a < pos_b)
1166 108214 : return 1;
1167 2652 : else if (pos_a == pos_b)
1168 0 : return 0;
1169 2652 : return -1;
1170 : }
1171 :
1172 : /*
1173 : * Allocate & initialize an iterator which iterates in lsn order over a
1174 : * transaction and all its subtransactions.
1175 : *
1176 : * Note: The iterator state is returned through iter_state parameter rather
1177 : * than the function's return value. This is because the state gets cleaned up
1178 : * in a PG_CATCH block in the caller, so we want to make sure the caller gets
1179 : * back the state even if this function throws an exception.
1180 : */
1181 : static void
1182 1910 : ReorderBufferIterTXNInit(ReorderBuffer *rb, ReorderBufferTXN *txn,
1183 : ReorderBufferIterTXNState *volatile *iter_state)
1184 : {
1185 1910 : Size nr_txns = 0;
1186 : ReorderBufferIterTXNState *state;
1187 : dlist_iter cur_txn_i;
1188 : int32 off;
1189 :
1190 1910 : *iter_state = NULL;
1191 :
1192 : /* Check ordering of changes in the toplevel transaction. */
1193 1910 : AssertChangeLsnOrder(txn);
1194 :
1195 : /*
1196 : * Calculate the size of our heap: one element for every transaction that
1197 : * contains changes. (Besides the transactions already in the reorder
1198 : * buffer, we count the one we were directly passed.)
1199 : */
1200 1910 : if (txn->nentries > 0)
1201 1804 : nr_txns++;
1202 :
1203 2428 : dlist_foreach(cur_txn_i, &txn->subtxns)
1204 : {
1205 : ReorderBufferTXN *cur_txn;
1206 :
1207 518 : cur_txn = dlist_container(ReorderBufferTXN, node, cur_txn_i.cur);
1208 :
1209 : /* Check ordering of changes in this subtransaction. */
1210 518 : AssertChangeLsnOrder(cur_txn);
1211 :
1212 518 : if (cur_txn->nentries > 0)
1213 442 : nr_txns++;
1214 : }
1215 :
1216 : /* allocate iteration state */
1217 1910 : state = (ReorderBufferIterTXNState *)
1218 1910 : MemoryContextAllocZero(rb->context,
1219 : sizeof(ReorderBufferIterTXNState) +
1220 1910 : sizeof(ReorderBufferIterTXNEntry) * nr_txns);
1221 :
1222 1910 : state->nr_txns = nr_txns;
1223 1910 : dlist_init(&state->old_change);
1224 :
1225 4156 : for (off = 0; off < state->nr_txns; off++)
1226 : {
1227 2246 : state->entries[off].file.vfd = -1;
1228 2246 : state->entries[off].segno = 0;
1229 : }
1230 :
1231 : /* allocate heap */
1232 1910 : state->heap = binaryheap_allocate(state->nr_txns,
1233 : ReorderBufferIterCompare,
1234 : state);
1235 :
1236 : /* Now that the state fields are initialized, it is safe to return it. */
1237 1910 : *iter_state = state;
1238 :
1239 : /*
1240 : * Now insert items into the binary heap, in an unordered fashion. (We
1241 : * will run a heap assembly step at the end; this is more efficient.)
1242 : */
1243 :
1244 1910 : off = 0;
1245 :
1246 : /* add toplevel transaction if it contains changes */
1247 1910 : if (txn->nentries > 0)
1248 : {
1249 : ReorderBufferChange *cur_change;
1250 :
1251 1804 : if (rbtxn_is_serialized(txn))
1252 : {
1253 : /* serialize remaining changes */
1254 34 : ReorderBufferSerializeTXN(rb, txn);
1255 34 : ReorderBufferRestoreChanges(rb, txn, &state->entries[off].file,
1256 : &state->entries[off].segno);
1257 : }
1258 :
1259 1804 : cur_change = dlist_head_element(ReorderBufferChange, node,
1260 : &txn->changes);
1261 :
1262 1804 : state->entries[off].lsn = cur_change->lsn;
1263 1804 : state->entries[off].change = cur_change;
1264 1804 : state->entries[off].txn = txn;
1265 :
1266 1804 : binaryheap_add_unordered(state->heap, Int32GetDatum(off++));
1267 : }
1268 :
1269 : /* add subtransactions if they contain changes */
1270 2428 : dlist_foreach(cur_txn_i, &txn->subtxns)
1271 : {
1272 : ReorderBufferTXN *cur_txn;
1273 :
1274 518 : cur_txn = dlist_container(ReorderBufferTXN, node, cur_txn_i.cur);
1275 :
1276 518 : if (cur_txn->nentries > 0)
1277 : {
1278 : ReorderBufferChange *cur_change;
1279 :
1280 442 : if (rbtxn_is_serialized(cur_txn))
1281 : {
1282 : /* serialize remaining changes */
1283 32 : ReorderBufferSerializeTXN(rb, cur_txn);
1284 32 : ReorderBufferRestoreChanges(rb, cur_txn,
1285 : &state->entries[off].file,
1286 : &state->entries[off].segno);
1287 : }
1288 442 : cur_change = dlist_head_element(ReorderBufferChange, node,
1289 : &cur_txn->changes);
1290 :
1291 442 : state->entries[off].lsn = cur_change->lsn;
1292 442 : state->entries[off].change = cur_change;
1293 442 : state->entries[off].txn = cur_txn;
1294 :
1295 442 : binaryheap_add_unordered(state->heap, Int32GetDatum(off++));
1296 : }
1297 : }
1298 :
1299 : /* assemble a valid binary heap */
1300 1910 : binaryheap_build(state->heap);
1301 1910 : }
1302 :
1303 : /*
1304 : * Return the next change when iterating over a transaction and its
1305 : * subtransactions.
1306 : *
1307 : * Returns NULL when no further changes exist.
1308 : */
1309 : static ReorderBufferChange *
1310 665390 : ReorderBufferIterTXNNext(ReorderBuffer *rb, ReorderBufferIterTXNState *state)
1311 : {
1312 : ReorderBufferChange *change;
1313 : ReorderBufferIterTXNEntry *entry;
1314 : int32 off;
1315 :
1316 : /* nothing there anymore */
1317 665390 : if (state->heap->bh_size == 0)
1318 1886 : return NULL;
1319 :
1320 663504 : off = DatumGetInt32(binaryheap_first(state->heap));
1321 663504 : entry = &state->entries[off];
1322 :
1323 : /* free memory we might have "leaked" in the previous *Next call */
1324 663504 : if (!dlist_is_empty(&state->old_change))
1325 : {
1326 82 : change = dlist_container(ReorderBufferChange, node,
1327 : dlist_pop_head_node(&state->old_change));
1328 82 : ReorderBufferReturnChange(rb, change, true);
1329 82 : Assert(dlist_is_empty(&state->old_change));
1330 : }
1331 :
1332 663504 : change = entry->change;
1333 :
1334 : /*
1335 : * update heap with information about which transaction has the next
1336 : * relevant change in LSN order
1337 : */
1338 :
1339 : /* there are in-memory changes */
1340 663504 : if (dlist_has_next(&entry->txn->changes, &entry->change->node))
1341 : {
1342 661218 : dlist_node *next = dlist_next_node(&entry->txn->changes, &change->node);
1343 661218 : ReorderBufferChange *next_change =
1344 661218 : dlist_container(ReorderBufferChange, node, next);
1345 :
1346 : /* txn stays the same */
1347 661218 : state->entries[off].lsn = next_change->lsn;
1348 661218 : state->entries[off].change = next_change;
1349 :
1350 661218 : binaryheap_replace_first(state->heap, Int32GetDatum(off));
1351 661218 : return change;
1352 : }
1353 :
1354 : /* try to load changes from disk */
1355 2286 : if (entry->txn->nentries != entry->txn->nentries_mem)
1356 : {
1357 : /*
1358 : * Ugly: restoring changes will reuse *Change records, thus delete the
1359 : * current one from the per-tx list and only free in the next call.
1360 : */
1361 116 : dlist_delete(&change->node);
1362 116 : dlist_push_tail(&state->old_change, &change->node);
1363 :
1364 116 : if (ReorderBufferRestoreChanges(rb, entry->txn, &entry->file,
1365 : &state->entries[off].segno))
1366 : {
1367 : /* successfully restored changes from disk */
1368 64 : ReorderBufferChange *next_change =
1369 64 : dlist_head_element(ReorderBufferChange, node,
1370 : &entry->txn->changes);
1371 :
1372 64 : elog(DEBUG2, "restored %u/%u changes from disk",
1373 : (uint32) entry->txn->nentries_mem,
1374 : (uint32) entry->txn->nentries);
1375 :
1376 64 : Assert(entry->txn->nentries_mem);
1377 : /* txn stays the same */
1378 64 : state->entries[off].lsn = next_change->lsn;
1379 64 : state->entries[off].change = next_change;
1380 64 : binaryheap_replace_first(state->heap, Int32GetDatum(off));
1381 :
1382 64 : return change;
1383 : }
1384 : }
1385 :
1386 : /* ok, no changes there anymore, remove */
1387 2222 : binaryheap_remove_first(state->heap);
1388 :
1389 2222 : return change;
1390 : }
1391 :
1392 : /*
1393 : * Deallocate the iterator
1394 : */
1395 : static void
1396 1906 : ReorderBufferIterTXNFinish(ReorderBuffer *rb,
1397 : ReorderBufferIterTXNState *state)
1398 : {
1399 : int32 off;
1400 :
1401 4148 : for (off = 0; off < state->nr_txns; off++)
1402 : {
1403 2242 : if (state->entries[off].file.vfd != -1)
1404 0 : FileClose(state->entries[off].file.vfd);
1405 : }
1406 :
1407 : /* free memory we might have "leaked" in the last *Next call */
1408 1906 : if (!dlist_is_empty(&state->old_change))
1409 : {
1410 : ReorderBufferChange *change;
1411 :
1412 32 : change = dlist_container(ReorderBufferChange, node,
1413 : dlist_pop_head_node(&state->old_change));
1414 32 : ReorderBufferReturnChange(rb, change, true);
1415 32 : Assert(dlist_is_empty(&state->old_change));
1416 : }
1417 :
1418 1906 : binaryheap_free(state->heap);
1419 1906 : pfree(state);
1420 1906 : }
1421 :
1422 : /*
1423 : * Cleanup the contents of a transaction, usually after the transaction
1424 : * committed or aborted.
1425 : */
1426 : static void
1427 4528 : ReorderBufferCleanupTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
1428 : {
1429 : bool found;
1430 : dlist_mutable_iter iter;
1431 :
1432 : /* cleanup subtransactions & their changes */
1433 4882 : dlist_foreach_modify(iter, &txn->subtxns)
1434 : {
1435 : ReorderBufferTXN *subtxn;
1436 :
1437 354 : subtxn = dlist_container(ReorderBufferTXN, node, iter.cur);
1438 :
1439 : /*
1440 : * Subtransactions are always associated to the toplevel TXN, even if
1441 : * they originally were happening inside another subtxn, so we won't
1442 : * ever recurse more than one level deep here.
1443 : */
1444 354 : Assert(rbtxn_is_known_subxact(subtxn));
1445 354 : Assert(subtxn->nsubtxns == 0);
1446 :
1447 354 : ReorderBufferCleanupTXN(rb, subtxn);
1448 : }
1449 :
1450 : /* cleanup changes in the txn */
1451 135302 : dlist_foreach_modify(iter, &txn->changes)
1452 : {
1453 : ReorderBufferChange *change;
1454 :
1455 130774 : change = dlist_container(ReorderBufferChange, node, iter.cur);
1456 :
1457 : /* Check we're not mixing changes from different transactions. */
1458 130774 : Assert(change->txn == txn);
1459 :
1460 130774 : ReorderBufferReturnChange(rb, change, true);
1461 : }
1462 :
1463 : /*
1464 : * Cleanup the tuplecids we stored for decoding catalog snapshot access.
1465 : * They are always stored in the toplevel transaction.
1466 : */
1467 35650 : dlist_foreach_modify(iter, &txn->tuplecids)
1468 : {
1469 : ReorderBufferChange *change;
1470 :
1471 31122 : change = dlist_container(ReorderBufferChange, node, iter.cur);
1472 :
1473 : /* Check we're not mixing changes from different transactions. */
1474 31122 : Assert(change->txn == txn);
1475 31122 : Assert(change->action == REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID);
1476 :
1477 31122 : ReorderBufferReturnChange(rb, change, true);
1478 : }
1479 :
1480 : /*
1481 : * Cleanup the base snapshot, if set.
1482 : */
1483 4528 : if (txn->base_snapshot != NULL)
1484 : {
1485 3210 : SnapBuildSnapDecRefcount(txn->base_snapshot);
1486 3210 : dlist_delete(&txn->base_snapshot_node);
1487 : }
1488 :
1489 : /*
1490 : * Cleanup the snapshot for the last streamed run.
1491 : */
1492 4528 : if (txn->snapshot_now != NULL)
1493 : {
1494 54 : Assert(rbtxn_is_streamed(txn));
1495 54 : ReorderBufferFreeSnap(rb, txn->snapshot_now);
1496 : }
1497 :
1498 : /*
1499 : * Remove TXN from its containing list.
1500 : *
1501 : * Note: if txn is known as subxact, we are deleting the TXN from its
1502 : * parent's list of known subxacts; this leaves the parent's nsubxacts
1503 : * count too high, but we don't care. Otherwise, we are deleting the TXN
1504 : * from the LSN-ordered list of toplevel TXNs.
1505 : */
1506 4528 : dlist_delete(&txn->node);
1507 :
1508 : /* now remove reference from buffer */
1509 4528 : hash_search(rb->by_txn,
1510 4528 : (void *) &txn->xid,
1511 : HASH_REMOVE,
1512 : &found);
1513 4528 : Assert(found);
1514 :
1515 : /* remove entries spilled to disk */
1516 4528 : if (rbtxn_is_serialized(txn))
1517 440 : ReorderBufferRestoreCleanup(rb, txn);
1518 :
1519 : /* deallocate */
1520 4528 : ReorderBufferReturnTXN(rb, txn);
1521 4528 : }
1522 :
1523 : /*
1524 : * Discard changes from a transaction (and subtransactions), either after streaming or
1525 : * after a PREPARE.
1526 : * The flag txn_prepared indicates if this is called after a PREPARE.
1527 : * If streaming, keep the remaining info - transactions, tuplecids, invalidations and
1528 : * snapshots. If after a PREPARE, keep only the invalidations and snapshots.
1529 : */
1530 : static void
1531 1064 : ReorderBufferTruncateTXN(ReorderBuffer *rb, ReorderBufferTXN *txn, bool txn_prepared)
1532 : {
1533 : dlist_mutable_iter iter;
1534 :
1535 : /* cleanup subtransactions & their changes */
1536 1256 : dlist_foreach_modify(iter, &txn->subtxns)
1537 : {
1538 : ReorderBufferTXN *subtxn;
1539 :
1540 192 : subtxn = dlist_container(ReorderBufferTXN, node, iter.cur);
1541 :
1542 : /*
1543 : * Subtransactions are always associated to the toplevel TXN, even if
1544 : * they originally were happening inside another subtxn, so we won't
1545 : * ever recurse more than one level deep here.
1546 : */
1547 192 : Assert(rbtxn_is_known_subxact(subtxn));
1548 192 : Assert(subtxn->nsubtxns == 0);
1549 :
1550 192 : ReorderBufferTruncateTXN(rb, subtxn, txn_prepared);
1551 : }
1552 :
1553 : /* cleanup changes in the txn */
1554 320200 : dlist_foreach_modify(iter, &txn->changes)
1555 : {
1556 : ReorderBufferChange *change;
1557 :
1558 319136 : change = dlist_container(ReorderBufferChange, node, iter.cur);
1559 :
1560 : /* Check we're not mixing changes from different transactions. */
1561 319136 : Assert(change->txn == txn);
1562 :
1563 : /* remove the change from it's containing list */
1564 319136 : dlist_delete(&change->node);
1565 :
1566 319136 : ReorderBufferReturnChange(rb, change, true);
1567 : }
1568 :
1569 : /*
1570 : * Mark the transaction as streamed.
1571 : *
1572 : * The toplevel transaction, identified by (toptxn==NULL), is marked as
1573 : * streamed always, even if it does not contain any changes (that is, when
1574 : * all the changes are in subtransactions).
1575 : *
1576 : * For subtransactions, we only mark them as streamed when there are
1577 : * changes in them.
1578 : *
1579 : * We do it this way because of aborts - we don't want to send aborts for
1580 : * XIDs the downstream is not aware of. And of course, it always knows
1581 : * about the toplevel xact (we send the XID in all messages), but we never
1582 : * stream XIDs of empty subxacts.
1583 : */
1584 1064 : if ((!txn_prepared) && ((!txn->toptxn) || (txn->nentries_mem != 0)))
1585 908 : txn->txn_flags |= RBTXN_IS_STREAMED;
1586 :
1587 1064 : if (txn_prepared)
1588 : {
1589 : /*
1590 : * If this is a prepared txn, cleanup the tuplecids we stored for
1591 : * decoding catalog snapshot access. They are always stored in the
1592 : * toplevel transaction.
1593 : */
1594 358 : dlist_foreach_modify(iter, &txn->tuplecids)
1595 : {
1596 : ReorderBufferChange *change;
1597 :
1598 276 : change = dlist_container(ReorderBufferChange, node, iter.cur);
1599 :
1600 : /* Check we're not mixing changes from different transactions. */
1601 276 : Assert(change->txn == txn);
1602 276 : Assert(change->action == REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID);
1603 :
1604 : /* Remove the change from its containing list. */
1605 276 : dlist_delete(&change->node);
1606 :
1607 276 : ReorderBufferReturnChange(rb, change, true);
1608 : }
1609 : }
1610 :
1611 : /*
1612 : * Destroy the (relfilenode, ctid) hashtable, so that we don't leak any
1613 : * memory. We could also keep the hash table and update it with new ctid
1614 : * values, but this seems simpler and good enough for now.
1615 : */
1616 1064 : if (txn->tuplecid_hash != NULL)
1617 : {
1618 40 : hash_destroy(txn->tuplecid_hash);
1619 40 : txn->tuplecid_hash = NULL;
1620 : }
1621 :
1622 : /* If this txn is serialized then clean the disk space. */
1623 1064 : if (rbtxn_is_serialized(txn))
1624 : {
1625 4 : ReorderBufferRestoreCleanup(rb, txn);
1626 4 : txn->txn_flags &= ~RBTXN_IS_SERIALIZED;
1627 :
1628 : /*
1629 : * We set this flag to indicate if the transaction is ever serialized.
1630 : * We need this to accurately update the stats as otherwise the same
1631 : * transaction can be counted as serialized multiple times.
1632 : */
1633 4 : txn->txn_flags |= RBTXN_IS_SERIALIZED_CLEAR;
1634 : }
1635 :
1636 : /* also reset the number of entries in the transaction */
1637 1064 : txn->nentries_mem = 0;
1638 1064 : txn->nentries = 0;
1639 1064 : }
1640 :
1641 : /*
1642 : * Build a hash with a (relfilenode, ctid) -> (cmin, cmax) mapping for use by
1643 : * HeapTupleSatisfiesHistoricMVCC.
1644 : */
1645 : static void
1646 1910 : ReorderBufferBuildTupleCidHash(ReorderBuffer *rb, ReorderBufferTXN *txn)
1647 : {
1648 : dlist_iter iter;
1649 : HASHCTL hash_ctl;
1650 :
1651 1910 : if (!rbtxn_has_catalog_changes(txn) || dlist_is_empty(&txn->tuplecids))
1652 3374 : return;
1653 :
1654 446 : memset(&hash_ctl, 0, sizeof(hash_ctl));
1655 :
1656 446 : hash_ctl.keysize = sizeof(ReorderBufferTupleCidKey);
1657 446 : hash_ctl.entrysize = sizeof(ReorderBufferTupleCidEnt);
1658 446 : hash_ctl.hcxt = rb->context;
1659 :
1660 : /*
1661 : * create the hash with the exact number of to-be-stored tuplecids from
1662 : * the start
1663 : */
1664 446 : txn->tuplecid_hash =
1665 446 : hash_create("ReorderBufferTupleCid", txn->ntuplecids, &hash_ctl,
1666 : HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
1667 :
1668 12620 : dlist_foreach(iter, &txn->tuplecids)
1669 : {
1670 : ReorderBufferTupleCidKey key;
1671 : ReorderBufferTupleCidEnt *ent;
1672 : bool found;
1673 : ReorderBufferChange *change;
1674 :
1675 12174 : change = dlist_container(ReorderBufferChange, node, iter.cur);
1676 :
1677 12174 : Assert(change->action == REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID);
1678 :
1679 : /* be careful about padding */
1680 12174 : memset(&key, 0, sizeof(ReorderBufferTupleCidKey));
1681 :
1682 12174 : key.relnode = change->data.tuplecid.node;
1683 :
1684 12174 : ItemPointerCopy(&change->data.tuplecid.tid,
1685 : &key.tid);
1686 :
1687 12174 : ent = (ReorderBufferTupleCidEnt *)
1688 12174 : hash_search(txn->tuplecid_hash,
1689 : (void *) &key,
1690 : HASH_ENTER | HASH_FIND,
1691 : &found);
1692 12174 : if (!found)
1693 : {
1694 9262 : ent->cmin = change->data.tuplecid.cmin;
1695 9262 : ent->cmax = change->data.tuplecid.cmax;
1696 9262 : ent->combocid = change->data.tuplecid.combocid;
1697 : }
1698 : else
1699 : {
1700 : /*
1701 : * Maybe we already saw this tuple before in this transaction, but
1702 : * if so it must have the same cmin.
1703 : */
1704 2912 : Assert(ent->cmin == change->data.tuplecid.cmin);
1705 :
1706 : /*
1707 : * cmax may be initially invalid, but once set it can only grow,
1708 : * and never become invalid again.
1709 : */
1710 2912 : Assert((ent->cmax == InvalidCommandId) ||
1711 : ((change->data.tuplecid.cmax != InvalidCommandId) &&
1712 : (change->data.tuplecid.cmax > ent->cmax)));
1713 2912 : ent->cmax = change->data.tuplecid.cmax;
1714 : }
1715 : }
1716 : }
1717 :
1718 : /*
1719 : * Copy a provided snapshot so we can modify it privately. This is needed so
1720 : * that catalog modifying transactions can look into intermediate catalog
1721 : * states.
1722 : */
1723 : static Snapshot
1724 1650 : ReorderBufferCopySnap(ReorderBuffer *rb, Snapshot orig_snap,
1725 : ReorderBufferTXN *txn, CommandId cid)
1726 : {
1727 : Snapshot snap;
1728 : dlist_iter iter;
1729 1650 : int i = 0;
1730 : Size size;
1731 :
1732 1650 : size = sizeof(SnapshotData) +
1733 3300 : sizeof(TransactionId) * orig_snap->xcnt +
1734 1650 : sizeof(TransactionId) * (txn->nsubtxns + 1);
1735 :
1736 1650 : snap = MemoryContextAllocZero(rb->context, size);
1737 1650 : memcpy(snap, orig_snap, sizeof(SnapshotData));
1738 :
1739 1650 : snap->copied = true;
1740 1650 : snap->active_count = 1; /* mark as active so nobody frees it */
1741 1650 : snap->regd_count = 0;
1742 1650 : snap->xip = (TransactionId *) (snap + 1);
1743 :
1744 1650 : memcpy(snap->xip, orig_snap->xip, sizeof(TransactionId) * snap->xcnt);
1745 :
1746 : /*
1747 : * snap->subxip contains all txids that belong to our transaction which we
1748 : * need to check via cmin/cmax. That's why we store the toplevel
1749 : * transaction in there as well.
1750 : */
1751 1650 : snap->subxip = snap->xip + snap->xcnt;
1752 1650 : snap->subxip[i++] = txn->xid;
1753 :
1754 : /*
1755 : * subxcnt isn't decreased when subtransactions abort, so count manually.
1756 : * Since it's an upper boundary it is safe to use it for the allocation
1757 : * above.
1758 : */
1759 1650 : snap->subxcnt = 1;
1760 :
1761 1856 : dlist_foreach(iter, &txn->subtxns)
1762 : {
1763 : ReorderBufferTXN *sub_txn;
1764 :
1765 206 : sub_txn = dlist_container(ReorderBufferTXN, node, iter.cur);
1766 206 : snap->subxip[i++] = sub_txn->xid;
1767 206 : snap->subxcnt++;
1768 : }
1769 :
1770 : /* sort so we can bsearch() later */
1771 1650 : qsort(snap->subxip, snap->subxcnt, sizeof(TransactionId), xidComparator);
1772 :
1773 : /* store the specified current CommandId */
1774 1650 : snap->curcid = cid;
1775 :
1776 1650 : return snap;
1777 : }
1778 :
1779 : /*
1780 : * Free a previously ReorderBufferCopySnap'ed snapshot
1781 : */
1782 : static void
1783 2854 : ReorderBufferFreeSnap(ReorderBuffer *rb, Snapshot snap)
1784 : {
1785 2854 : if (snap->copied)
1786 1640 : pfree(snap);
1787 : else
1788 1214 : SnapBuildSnapDecRefcount(snap);
1789 2854 : }
1790 :
1791 : /*
1792 : * If the transaction was (partially) streamed, we need to commit it in a
1793 : * 'streamed' way. That is, we first stream the remaining part of the
1794 : * transaction, and then invoke stream_commit message.
1795 : */
1796 : static void
1797 56 : ReorderBufferStreamCommit(ReorderBuffer *rb, ReorderBufferTXN *txn)
1798 : {
1799 : /* we should only call this for previously streamed transactions */
1800 56 : Assert(rbtxn_is_streamed(txn));
1801 :
1802 56 : ReorderBufferStreamTXN(rb, txn);
1803 :
1804 56 : if (rbtxn_prepared(txn))
1805 : {
1806 22 : rb->stream_prepare(rb, txn, txn->final_lsn);
1807 :
1808 : /*
1809 : * This is a PREPARED transaction, part of a two-phase commit. The
1810 : * full cleanup will happen as part of the COMMIT PREPAREDs, so now
1811 : * just truncate txn by removing changes and tuple_cids
1812 : */
1813 22 : ReorderBufferTruncateTXN(rb, txn, true);
1814 : /* Reset the CheckXidAlive */
1815 22 : CheckXidAlive = InvalidTransactionId;
1816 : }
1817 : else
1818 : {
1819 34 : rb->stream_commit(rb, txn, txn->final_lsn);
1820 34 : ReorderBufferCleanupTXN(rb, txn);
1821 : }
1822 56 : }
1823 :
1824 : /*
1825 : * Set xid to detect concurrent aborts.
1826 : *
1827 : * While streaming an in-progress transaction there is a possibility that the
1828 : * (sub)transaction might get aborted concurrently. In such case if the
1829 : * (sub)transaction has catalog update then we might decode the tuple using
1830 : * wrong catalog version. For example, suppose there is one catalog tuple with
1831 : * (xmin: 500, xmax: 0). Now, the transaction 501 updates the catalog tuple
1832 : * and after that we will have two tuples (xmin: 500, xmax: 501) and
1833 : * (xmin: 501, xmax: 0). Now, if 501 is aborted and some other transaction
1834 : * say 502 updates the same catalog tuple then the first tuple will be changed
1835 : * to (xmin: 500, xmax: 502). So, the problem is that when we try to decode
1836 : * the tuple inserted/updated in 501 after the catalog update, we will see the
1837 : * catalog tuple with (xmin: 500, xmax: 502) as visible because it will
1838 : * consider that the tuple is deleted by xid 502 which is not visible to our
1839 : * snapshot. And when we will try to decode with that catalog tuple, it can
1840 : * lead to a wrong result or a crash. So, it is necessary to detect
1841 : * concurrent aborts to allow streaming of in-progress transactions.
1842 : *
1843 : * For detecting the concurrent abort we set CheckXidAlive to the current
1844 : * (sub)transaction's xid for which this change belongs to. And, during
1845 : * catalog scan we can check the status of the xid and if it is aborted we will
1846 : * report a specific error so that we can stop streaming current transaction
1847 : * and discard the already streamed changes on such an error. We might have
1848 : * already streamed some of the changes for the aborted (sub)transaction, but
1849 : * that is fine because when we decode the abort we will stream abort message
1850 : * to truncate the changes in the subscriber.
1851 : */
1852 : static inline void
1853 320252 : SetupCheckXidLive(TransactionId xid)
1854 : {
1855 : /*
1856 : * If the input transaction id is already set as a CheckXidAlive then
1857 : * nothing to do.
1858 : */
1859 320252 : if (TransactionIdEquals(CheckXidAlive, xid))
1860 529520 : return;
1861 :
1862 : /*
1863 : * setup CheckXidAlive if it's not committed yet. We don't check if the
1864 : * xid is aborted. That will happen during catalog access.
1865 : */
1866 110984 : if (!TransactionIdDidCommit(xid))
1867 590 : CheckXidAlive = xid;
1868 : else
1869 110394 : CheckXidAlive = InvalidTransactionId;
1870 : }
1871 :
1872 : /*
1873 : * Helper function for ReorderBufferProcessTXN for applying change.
1874 : */
1875 : static inline void
1876 637388 : ReorderBufferApplyChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
1877 : Relation relation, ReorderBufferChange *change,
1878 : bool streaming)
1879 : {
1880 637388 : if (streaming)
1881 316972 : rb->stream_change(rb, txn, relation, change);
1882 : else
1883 320416 : rb->apply_change(rb, txn, relation, change);
1884 637382 : }
1885 :
1886 : /*
1887 : * Helper function for ReorderBufferProcessTXN for applying the truncate.
1888 : */
1889 : static inline void
1890 20 : ReorderBufferApplyTruncate(ReorderBuffer *rb, ReorderBufferTXN *txn,
1891 : int nrelations, Relation *relations,
1892 : ReorderBufferChange *change, bool streaming)
1893 : {
1894 20 : if (streaming)
1895 0 : rb->stream_truncate(rb, txn, nrelations, relations, change);
1896 : else
1897 20 : rb->apply_truncate(rb, txn, nrelations, relations, change);
1898 20 : }
1899 :
1900 : /*
1901 : * Helper function for ReorderBufferProcessTXN for applying the message.
1902 : */
1903 : static inline void
1904 18 : ReorderBufferApplyMessage(ReorderBuffer *rb, ReorderBufferTXN *txn,
1905 : ReorderBufferChange *change, bool streaming)
1906 : {
1907 18 : if (streaming)
1908 24 : rb->stream_message(rb, txn, change->lsn, true,
1909 8 : change->data.msg.prefix,
1910 : change->data.msg.message_size,
1911 8 : change->data.msg.message);
1912 : else
1913 30 : rb->message(rb, txn, change->lsn, true,
1914 10 : change->data.msg.prefix,
1915 : change->data.msg.message_size,
1916 10 : change->data.msg.message);
1917 18 : }
1918 :
1919 : /*
1920 : * Function to store the command id and snapshot at the end of the current
1921 : * stream so that we can reuse the same while sending the next stream.
1922 : */
1923 : static inline void
1924 820 : ReorderBufferSaveTXNSnapshot(ReorderBuffer *rb, ReorderBufferTXN *txn,
1925 : Snapshot snapshot_now, CommandId command_id)
1926 : {
1927 820 : txn->command_id = command_id;
1928 :
1929 : /* Avoid copying if it's already copied. */
1930 820 : if (snapshot_now->copied)
1931 820 : txn->snapshot_now = snapshot_now;
1932 : else
1933 0 : txn->snapshot_now = ReorderBufferCopySnap(rb, snapshot_now,
1934 : txn, command_id);
1935 820 : }
1936 :
1937 : /*
1938 : * Helper function for ReorderBufferProcessTXN to handle the concurrent
1939 : * abort of the streaming transaction. This resets the TXN such that it
1940 : * can be used to stream the remaining data of transaction being processed.
1941 : */
1942 : static void
1943 16 : ReorderBufferResetTXN(ReorderBuffer *rb, ReorderBufferTXN *txn,
1944 : Snapshot snapshot_now,
1945 : CommandId command_id,
1946 : XLogRecPtr last_lsn,
1947 : ReorderBufferChange *specinsert)
1948 : {
1949 : /*
1950 : * Discard the changes that we just streamed.
1951 : */
1952 16 : ReorderBufferTruncateTXN(rb, txn, false);
1953 :
1954 : /* Free all resources allocated for toast reconstruction */
1955 16 : ReorderBufferToastReset(rb, txn);
1956 :
1957 : /* Return the spec insert change if it is not NULL */
1958 16 : if (specinsert != NULL)
1959 : {
1960 0 : ReorderBufferReturnChange(rb, specinsert, true);
1961 0 : specinsert = NULL;
1962 : }
1963 :
1964 : /* Stop the stream. */
1965 16 : rb->stream_stop(rb, txn, last_lsn);
1966 :
1967 : /* Remember the command ID and snapshot for the streaming run */
1968 16 : ReorderBufferSaveTXNSnapshot(rb, txn, snapshot_now, command_id);
1969 16 : }
1970 :
1971 : /*
1972 : * Helper function for ReorderBufferCommit and ReorderBufferStreamTXN.
1973 : *
1974 : * Send data of a transaction (and its subtransactions) to the
1975 : * output plugin. We iterate over the top and subtransactions (using a k-way
1976 : * merge) and replay the changes in lsn order.
1977 : *
1978 : * If streaming is true then data will be sent using stream API.
1979 : *
 * Parameters:
 *   rb           - the reorder buffer whose callbacks are invoked
 *   txn          - the toplevel transaction to replay
 *   commit_lsn   - passed to the prepare/commit callback (non-streaming only)
 *   snapshot_now - historic snapshot to start decoding with
 *   command_id   - command id to start decoding with
 *   streaming    - use the stream_* callbacks instead of begin/change/commit
 *
 * On normal completion the transaction is truncated (prepared or streaming)
 * or cleaned up entirely (plain commit). An error with sqlerrcode
 * ERRCODE_TRANSACTION_ROLLBACK is handled here as a concurrent abort; any
 * other error is re-thrown after cleaning up the transaction.
 *
1980 : * Note: "volatile" markers on some parameters are to avoid trouble with
1981 : * PG_TRY inside the function.
1982 : */
1983 : static void
1984 1910 : ReorderBufferProcessTXN(ReorderBuffer *rb, ReorderBufferTXN *txn,
1985 : XLogRecPtr commit_lsn,
1986 : volatile Snapshot snapshot_now,
1987 : volatile CommandId command_id,
1988 : bool streaming)
1989 : {
1990 : bool using_subtxn;
1991 1910 : MemoryContext ccxt = CurrentMemoryContext;
1992 1910 : ReorderBufferIterTXNState *volatile iterstate = NULL;
1993 1910 : volatile XLogRecPtr prev_lsn = InvalidXLogRecPtr;
1994 1910 : ReorderBufferChange *volatile specinsert = NULL;
1995 1910 : volatile bool stream_started = false;
1996 1910 : ReorderBufferTXN *volatile curtxn = NULL;
1997 :
1998 : /* build data to be able to lookup the CommandIds of catalog tuples */
1999 1910 : ReorderBufferBuildTupleCidHash(rb, txn);
2000 :
2001 : /* setup the initial snapshot */
2002 1910 : SetupHistoricSnapshot(snapshot_now, txn->tuplecid_hash);
2003 :
2004 : /*
2005 : * Decoding needs access to syscaches et al., which in turn use
2006 : * heavyweight locks and such. Thus we need to have enough state around to
2007 : * keep track of those. The easiest way is to simply use a transaction
2008 : * internally. That also allows us to easily enforce that nothing writes
2009 : * to the database by checking for xid assignments.
2010 : *
2011 : * When we're called via the SQL SRF there's already a transaction
2012 : * started, so start an explicit subtransaction there.
2013 : */
2014 1910 : using_subtxn = IsTransactionOrTransactionBlock();
2015 :
2016 1910 : PG_TRY();
2017 : {
2018 : ReorderBufferChange *change;
2019 :
2020 1910 : if (using_subtxn)
2021 754 : BeginInternalSubTransaction(streaming ? "stream" : "replay");
2022 : else
2023 1156 : StartTransactionCommand();
2024 :
2025 : /* We only need to send begin/commit for non-streamed transactions. */
2026 1910 : if (!streaming)
2027 1086 : rb->begin(rb, txn);
2028 :
2029 1910 : ReorderBufferIterTXNInit(rb, txn, &iterstate);
2030 1910 : while ((change = ReorderBufferIterTXNNext(rb, iterstate)) != NULL)
2031 : {
2032 663504 : Relation relation = NULL;
2033 : Oid reloid;
2034 :
2035 : /*
2036 : * We can't call start stream callback before processing first
2037 : * change.
2038 : */
2039 663504 : if (prev_lsn == InvalidXLogRecPtr)
2040 : {
2041 1902 : if (streaming)
2042 : {
2043 816 : txn->origin_id = change->origin_id;
2044 816 : rb->stream_start(rb, txn, change->lsn);
2045 816 : stream_started = true;
2046 : }
2047 : }
2048 :
2049 : /*
2050 : * Enforce correct ordering of changes, merged from multiple
2051 : * subtransactions. The changes may have the same LSN due to
2052 : * MULTI_INSERT xlog records.
2053 : */
2054 663504 : Assert(prev_lsn == InvalidXLogRecPtr || prev_lsn <= change->lsn);
2055 :
2056 663504 : prev_lsn = change->lsn;
2057 :
2058 : /* Set the current xid to detect concurrent aborts. */
2059 663504 : if (streaming || rbtxn_prepared(change->txn))
2060 : {
2061 320252 : curtxn = change->txn;
2062 320252 : SetupCheckXidLive(curtxn->xid);
2063 : }
2064 :
2065 663504 : switch (change->action)
2066 : {
2067 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM:
2068 :
2069 : /*
2070 : * Confirmation for speculative insertion arrived. Simply
2071 : * use as a normal record. It'll be cleaned up at the end
2072 : * of INSERT processing.
2073 : */
2074 3564 : if (specinsert == NULL)
2075 0 : elog(ERROR, "invalid ordering of speculative insertion changes");
2076 3564 : Assert(specinsert->data.tp.oldtuple == NULL);
2077 3564 : change = specinsert;
2078 3564 : change->action = REORDER_BUFFER_CHANGE_INSERT;
2079 :
2080 : /* intentionally fall through */
2081 : case REORDER_BUFFER_CHANGE_INSERT:
2082 : case REORDER_BUFFER_CHANGE_UPDATE:
2083 : case REORDER_BUFFER_CHANGE_DELETE:
2084 645444 : Assert(snapshot_now);
2085 :
2086 645444 : reloid = RelidByRelfilenode(change->data.tp.relnode.spcNode,
2087 : change->data.tp.relnode.relNode);
2088 :
2089 : /*
2090 : * Mapped catalog tuple without data, emitted while
2091 : * catalog table was in the process of being rewritten. We
2092 : * can fail to look up the relfilenode, because the
2093 : * relmapper has no "historic" view, in contrast to the
2094 : * normal catalog during decoding. Thus repeated
2095 : * rewrites can cause a lookup failure. That's OK because
2096 : * we do not decode catalog changes anyway. Normally such
2097 : * tuples would be skipped over below, but we can't
2098 : * identify whether the table should be logically logged
2099 : * without mapping the relfilenode to the oid.
2100 : */
2101 645578 : if (reloid == InvalidOid &&
2102 304 : change->data.tp.newtuple == NULL &&
2103 152 : change->data.tp.oldtuple == NULL)
2104 : goto change_done;
2105 645274 : else if (reloid == InvalidOid)
2106 0 : elog(ERROR, "could not map filenode \"%s\" to relation OID",
2107 : relpathperm(change->data.tp.relnode,
2108 : MAIN_FORKNUM));
2109 :
2110 645274 : relation = RelationIdGetRelation(reloid);
2111 :
2112 645274 : if (!RelationIsValid(relation))
2113 0 : elog(ERROR, "could not open relation with OID %u (for filenode \"%s\")",
2114 : reloid,
2115 : relpathperm(change->data.tp.relnode,
2116 : MAIN_FORKNUM));
2117 :
2118 645274 : if (!RelationIsLogicallyLogged(relation))
2119 : goto change_done;
2120 :
2121 : /*
2122 : * Ignore temporary heaps created during DDL unless the
2123 : * plugin has asked for them.
2124 : */
2125 641228 : if (relation->rd_rel->relrewrite && !rb->output_rewrites)
2126 40 : goto change_done;
2127 :
2128 : /*
2129 : * For now ignore sequence changes entirely. Most of the
2130 : * time they don't log changes using records we
2131 : * understand, so it doesn't make sense to handle the few
2132 : * cases we do.
2133 : */
2134 641188 : if (relation->rd_rel->relkind == RELKIND_SEQUENCE)
2135 0 : goto change_done;
2136 :
2137 : /* user-triggered change */
2138 641188 : if (!IsToastRelation(relation))
2139 : {
2140 637388 : ReorderBufferToastReplace(rb, txn, relation, change);
2141 637388 : ReorderBufferApplyChange(rb, txn, relation, change,
2142 : streaming);
2143 :
2144 : /*
2145 : * Only clear reassembled toast chunks if we're sure
2146 : * they're not required anymore. The creator of the
2147 : * tuple tells us.
2148 : */
2149 637382 : if (change->data.tp.clear_toast_afterwards)
2150 636978 : ReorderBufferToastReset(rb, txn);
2151 : }
2152 : /* we're not interested in toast deletions */
2153 3800 : else if (change->action == REORDER_BUFFER_CHANGE_INSERT)
2154 : {
2155 : /*
2156 : * Need to reassemble the full toasted Datum in
2157 : * memory, to ensure the chunks don't get reused till
2158 : * we're done, remove it from the list of this
2159 : * transaction's changes. Otherwise it will get
2160 : * freed/reused while restoring spooled data from
2161 : * disk.
2162 : */
2163 3338 : Assert(change->data.tp.newtuple != NULL);
2164 :
2165 3338 : dlist_delete(&change->node);
2166 3338 : ReorderBufferToastAppendChunk(rb, txn, relation,
2167 : change);
2168 : }
2169 :
2170 : change_done:
2171 :
2172 : /*
2173 : * Either speculative insertion was confirmed, or it was
2174 : * unsuccessful and the record isn't needed anymore.
2175 : */
2176 645420 : if (specinsert != NULL)
2177 : {
2178 3564 : ReorderBufferReturnChange(rb, specinsert, true);
2179 3564 : specinsert = NULL;
2180 : }
2181 :
2182 645420 : if (RelationIsValid(relation))
2183 : {
2184 645268 : RelationClose(relation);
2185 645268 : relation = NULL;
2186 : }
2187 645420 : break;
2188 :
2189 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT:
2190 :
2191 : /*
2192 : * Speculative insertions are dealt with by delaying the
2193 : * processing of the insert until the confirmation record
2194 : * arrives. For that we simply unlink the record from the
2195 : * chain, so it does not get freed/reused while restoring
2196 : * spooled data from disk.
2197 : *
2198 : * This is safe in the face of concurrent catalog changes
2199 : * because the relevant relation can't be changed between
2200 : * speculative insertion and confirmation due to
2201 : * CheckTableNotInUse() and locking.
2202 : */
2203 :
2204 : /* clear out a pending (and thus failed) speculation */
2205 3564 : if (specinsert != NULL)
2206 : {
2207 0 : ReorderBufferReturnChange(rb, specinsert, true);
2208 0 : specinsert = NULL;
2209 : }
2210 :
2211 : /* and memorize the pending insertion */
2212 3564 : dlist_delete(&change->node);
2213 3564 : specinsert = change;
2214 3564 : break;
2215 :
2216 : case REORDER_BUFFER_CHANGE_TRUNCATE:
2217 : {
2218 : int i;
2219 20 : int nrelids = change->data.truncate.nrelids;
2220 20 : int nrelations = 0;
2221 : Relation *relations;
2222 :
/* Collect the logically logged relations named by this truncate. */
2223 20 : relations = palloc0(nrelids * sizeof(Relation));
2224 50 : for (i = 0; i < nrelids; i++)
2225 : {
2226 30 : Oid relid = change->data.truncate.relids[i];
2227 : Relation relation;
2228 :
2229 30 : relation = RelationIdGetRelation(relid);
2230 :
2231 30 : if (!RelationIsValid(relation))
2232 0 : elog(ERROR, "could not open relation with OID %u", relid);
2233 :
2234 30 : if (!RelationIsLogicallyLogged(relation))
/*
 * NOTE(review): the relation opened above is not closed on this
 * path; presumably the reference is released when the internal
 * (sub)transaction is aborted below -- confirm.
 */
2235 0 : continue;
2236 :
2237 30 : relations[nrelations++] = relation;
2238 : }
2239 :
2240 : /* Apply the truncate. */
2241 20 : ReorderBufferApplyTruncate(rb, txn, nrelations,
2242 : relations, change,
2243 : streaming);
2244 :
2245 50 : for (i = 0; i < nrelations; i++)
2246 30 : RelationClose(relations[i]);
2247 :
2248 20 : break;
2249 : }
2250 :
2251 : case REORDER_BUFFER_CHANGE_MESSAGE:
2252 18 : ReorderBufferApplyMessage(rb, txn, change, streaming);
2253 18 : break;
2254 :
2255 : case REORDER_BUFFER_CHANGE_INVALIDATION:
2256 : /* Execute the invalidation messages locally */
2257 2174 : ReorderBufferExecuteInvalidations(
2258 : change->data.inval.ninvalidations,
2259 : change->data.inval.invalidations);
2260 2174 : break;
2261 :
2262 : case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT:
2263 : /* get rid of the old */
2264 452 : TeardownHistoricSnapshot(false);
2265 :
2266 452 : if (snapshot_now->copied)
2267 : {
2268 414 : ReorderBufferFreeSnap(rb, snapshot_now);
2269 414 : snapshot_now =
2270 414 : ReorderBufferCopySnap(rb, change->data.snapshot,
2271 : txn, command_id);
2272 : }
2273 :
2274 : /*
2275 : * Restored from disk, need to be careful not to double
2276 : * free. We could introduce refcounting for that, but for
2277 : * now this seems infrequent enough not to care.
2278 : */
2279 38 : else if (change->data.snapshot->copied)
2280 : {
2281 0 : snapshot_now =
2282 0 : ReorderBufferCopySnap(rb, change->data.snapshot,
2283 : txn, command_id);
2284 : }
2285 : else
2286 : {
2287 38 : snapshot_now = change->data.snapshot;
2288 : }
2289 :
2290 : /* and continue with the new one */
2291 452 : SetupHistoricSnapshot(snapshot_now, txn->tuplecid_hash);
2292 452 : break;
2293 :
2294 : case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID:
2295 11832 : Assert(change->data.command_id != InvalidCommandId);
2296 :
2297 11832 : if (command_id < change->data.command_id)
2298 : {
2299 1760 : command_id = change->data.command_id;
2300 :
2301 1760 : if (!snapshot_now->copied)
2302 : {
2303 : /* we don't use the global one anymore */
2304 412 : snapshot_now = ReorderBufferCopySnap(rb, snapshot_now,
2305 : txn, command_id);
2306 : }
2307 :
2308 1760 : snapshot_now->curcid = command_id;
2309 :
2310 1760 : TeardownHistoricSnapshot(false);
2311 1760 : SetupHistoricSnapshot(snapshot_now, txn->tuplecid_hash);
2312 : }
2313 :
2314 11832 : break;
2315 :
2316 : case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID:
2317 0 : elog(ERROR, "tuplecid value in changequeue");
2318 : break;
2319 : }
2320 : }
2321 :
2322 : /*
2323 : * There's a speculative insertion remaining, just clean it up, it
2324 : * can't have been successful, otherwise we'd have gotten a confirmation
2325 : * record.
2326 : */
2327 1886 : if (specinsert)
2328 : {
2329 0 : ReorderBufferReturnChange(rb, specinsert, true);
2330 0 : specinsert = NULL;
2331 : }
2332 :
2333 : /* clean up the iterator */
2334 1886 : ReorderBufferIterTXNFinish(rb, iterstate);
2335 1886 : iterstate = NULL;
2336 :
2337 : /*
2338 : * Done with current changes, send the last message for this set of
2339 : * changes depending upon streaming mode.
2340 : */
2341 1886 : if (streaming)
2342 : {
2343 804 : if (stream_started)
2344 : {
2345 796 : rb->stream_stop(rb, txn, prev_lsn);
2346 796 : stream_started = false;
2347 : }
2348 : }
2349 : else
2350 : {
2351 : /*
2352 : * Call either PREPARE (for two-phase transactions) or COMMIT (for
2353 : * regular ones).
2354 : */
2355 1082 : if (rbtxn_prepared(txn))
2356 26 : rb->prepare(rb, txn, commit_lsn);
2357 : else
2358 1056 : rb->commit(rb, txn, commit_lsn);
2359 : }
2360 :
2361 : /* this is just a sanity check against bad output plugin behaviour */
2362 1886 : if (GetCurrentTransactionIdIfAny() != InvalidTransactionId)
2363 0 : elog(ERROR, "output plugin used XID %u",
2364 : GetCurrentTransactionId());
2365 :
2366 : /*
2367 : * Remember the command ID and snapshot for the next set of changes in
2368 : * streaming mode.
2369 : */
2370 1886 : if (streaming)
2371 804 : ReorderBufferSaveTXNSnapshot(rb, txn, snapshot_now, command_id);
2372 1082 : else if (snapshot_now->copied)
2373 412 : ReorderBufferFreeSnap(rb, snapshot_now);
2374 :
2375 : /* cleanup */
2376 1886 : TeardownHistoricSnapshot(false);
2377 :
2378 : /*
2379 : * Aborting the current (sub-)transaction as a whole has the right
2380 : * semantics. We want all locks acquired in here to be released, not
2381 : * reassigned to the parent and we do not want any database access
2382 : * have persistent effects.
2383 : */
2384 1886 : AbortCurrentTransaction();
2385 :
2386 : /* make sure there's no cache pollution */
2387 1886 : ReorderBufferExecuteInvalidations(txn->ninvalidations, txn->invalidations);
2388 :
2389 1886 : if (using_subtxn)
2390 742 : RollbackAndReleaseCurrentSubTransaction();
2391 :
2392 : /*
2393 : * We are here due to one of the 4 scenarios: 1. Prepare of a
2394 : * two-phase commit. 2. Prepare of a two-phase commit and a part of
2395 : * streaming in-progress txn. 3. streaming of an in-progress txn. 4.
2396 : * Commit of a transaction.
2397 : *
2398 : * Scenario 1 and 2, we handle the same way, pass in prepared as true
2399 : * to ReorderBufferTruncateTXN and allow more elaborate truncation of
2400 : * txn data as the entire transaction has been decoded, only commit is
2401 : * pending. Scenario 3, we pass in prepared as false to
2402 : * ReorderBufferTruncateTXN as the txn is not yet completely decoded.
2403 : * Scenario 4, all txn has been decoded and we can fully cleanup the
2404 : * TXN reorder buffer.
2405 : */
2406 1886 : if (rbtxn_prepared(txn))
2407 : {
2408 48 : ReorderBufferTruncateTXN(rb, txn, true);
2409 :
2410 : /* Reset the CheckXidAlive */
2411 48 : CheckXidAlive = InvalidTransactionId;
2412 : }
2413 1838 : else if (streaming)
2414 : {
2415 782 : ReorderBufferTruncateTXN(rb, txn, false);
2416 : /* Reset the CheckXidAlive */
2417 782 : CheckXidAlive = InvalidTransactionId;
2418 : }
2419 : else
2420 1056 : ReorderBufferCleanupTXN(rb, txn);
2421 : }
2422 20 : PG_CATCH();
2423 : {
/* Copy the error data in the context that was current on entry. */
2424 20 : MemoryContext ecxt = MemoryContextSwitchTo(ccxt);
2425 20 : ErrorData *errdata = CopyErrorData();
2426 :
2427 : /* TODO: Encapsulate cleanup from the PG_TRY and PG_CATCH blocks */
2428 20 : if (iterstate)
2429 20 : ReorderBufferIterTXNFinish(rb, iterstate);
2430 :
2431 20 : TeardownHistoricSnapshot(true);
2432 :
2433 : /*
2434 : * Force cache invalidation to happen outside of a valid transaction
2435 : * to prevent catalog access as we just caught an error.
2436 : */
2437 20 : AbortCurrentTransaction();
2438 :
2439 : /* make sure there's no cache pollution */
2440 20 : ReorderBufferExecuteInvalidations(txn->ninvalidations,
2441 : txn->invalidations);
2442 :
2443 20 : if (using_subtxn)
2444 12 : RollbackAndReleaseCurrentSubTransaction();
2445 :
2446 : /*
2447 : * The error code ERRCODE_TRANSACTION_ROLLBACK indicates a concurrent
2448 : * abort of the (sub)transaction we are streaming or preparing. We
2449 : * need to do the cleanup and return gracefully on this error, see
2450 : * SetupCheckXidLive.
2451 : */
2452 20 : if (errdata->sqlerrcode == ERRCODE_TRANSACTION_ROLLBACK)
2453 : {
2454 : /*
2455 : * This error can occur either when we are sending the data in
2456 : * streaming mode and the streaming is not finished yet or when we
2457 : * are sending the data out on a PREPARE during a two-phase
2458 : * commit.
2459 : */
2460 20 : Assert(streaming || rbtxn_prepared(txn));
2461 20 : Assert(stream_started || rbtxn_prepared(txn));
2462 :
2463 : /* Cleanup the temporary error state. */
2464 20 : FlushErrorState();
2465 20 : FreeErrorData(errdata);
2466 20 : errdata = NULL;
/*
 * NOTE(review): curtxn is only assigned in the loop above, when
 * streaming or when change->txn is flagged as prepared; presumably
 * it is always non-NULL whenever this error code is raised --
 * confirm.
 */
2467 20 : curtxn->concurrent_abort = true;
2468 :
2469 : /*
2470 : * If streaming, reset the TXN so that it is allowed to stream
2471 : * remaining data. Streaming can also be on a prepared txn, handle
2472 : * it the same way.
2473 : */
2474 20 : if (streaming)
2475 : {
2476 16 : ReorderBufferResetTXN(rb, txn, snapshot_now,
2477 : command_id, prev_lsn,
2478 : specinsert);
2479 : }
2480 : else
2481 : {
2482 4 : elog(LOG, "stopping decoding of %s (%u)",
2483 : txn->gid[0] != '\0' ? txn->gid : "", txn->xid);
2484 4 : ReorderBufferTruncateTXN(rb, txn, true);
2485 : }
2486 : }
2487 : else
2488 : {
/* Any other error: drop the transaction and propagate the error. */
2489 0 : ReorderBufferCleanupTXN(rb, txn);
2490 0 : MemoryContextSwitchTo(ecxt);
2491 0 : PG_RE_THROW();
2492 : }
2493 : }
2494 1906 : PG_END_TRY();
2495 1906 : }
2496 :
2497 : /*
2498 : * Perform the replay of a transaction and its non-aborted subtransactions.
2499 : *
2500 : * Subtransactions previously have to be processed by
2501 : * ReorderBufferCommitChild(), even if previously assigned to the toplevel
2502 : * transaction with ReorderBufferAssignChild.
2503 : *
2504 : * This interface is called once a toplevel commit is read for both streamed
2505 : * as well as non-streamed transactions.
2506 : */
2507 : static void
2508 1144 : ReorderBufferCommitInternal(ReorderBufferTXN *txn,
2509 : ReorderBuffer *rb, TransactionId xid,
2510 : XLogRecPtr commit_lsn, XLogRecPtr end_lsn,
2511 : TimestampTz commit_time,
2512 : RepOriginId origin_id, XLogRecPtr origin_lsn)
2513 : {
2514 : Snapshot snapshot_now;
2515 1144 : CommandId command_id = FirstCommandId;
2516 :
2517 1144 : txn->final_lsn = commit_lsn;
2518 1144 : txn->end_lsn = end_lsn;
2519 1144 : txn->commit_time = commit_time;
2520 1144 : txn->origin_id = origin_id;
2521 1144 : txn->origin_lsn = origin_lsn;
2522 :
2523 : /*
2524 : * If the transaction was (partially) streamed, we need to commit it in a
2525 : * 'streamed' way. That is, we first stream the remaining part of the
2526 : * transaction, and then invoke stream_commit message.
2527 : *
2528 : * Called after everything (origin ID, LSN, ...) is stored in the
2529 : * transaction to avoid passing that information directly.
2530 : */
2531 1144 : if (rbtxn_is_streamed(txn))
2532 : {
2533 56 : ReorderBufferStreamCommit(rb, txn);
2534 56 : return;
2535 : }
2536 :
2537 : /*
2538 : * If this transaction has no snapshot, it didn't make any changes to the
2539 : * database, so there's nothing to decode. Note that
2540 : * ReorderBufferCommitChild will have transferred any snapshots from
2541 : * subtransactions if there were any.
2542 : */
2543 1088 : if (txn->base_snapshot == NULL)
2544 : {
2545 2 : Assert(txn->ninvalidations == 0);
2546 2 : ReorderBufferCleanupTXN(rb, txn);
2547 2 : return;
2548 : }
2549 :
2550 1086 : snapshot_now = txn->base_snapshot;
2551 :
2552 : /* Process and send the changes to output plugin. */
2553 1086 : ReorderBufferProcessTXN(rb, txn, commit_lsn, snapshot_now,
2554 : command_id, false);
2555 : }
2556 :
2557 : /*
2558 : * Ask output plugin whether we want to skip this PREPARE and send
2559 : * this transaction as a regular commit later.
2560 : */
2561 : bool
2562 190 : ReorderBufferPrepareNeedSkip(ReorderBuffer *rb, TransactionId xid, const char *gid)
2563 : {
2564 : ReorderBufferTXN *txn;
2565 :
2566 190 : txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr, false);
2567 :
2568 190 : return rb->filter_prepare(rb, txn, xid, gid);
2569 : }
2570 :
2571 :
2572 : /*
2573 : * Commit a transaction.
2574 : *
2575 : * See comments for ReorderBufferCommitInternal()
2576 : */
2577 : void
2578 1094 : ReorderBufferCommit(ReorderBuffer *rb, TransactionId xid,
2579 : XLogRecPtr commit_lsn, XLogRecPtr end_lsn,
2580 : TimestampTz commit_time,
2581 : RepOriginId origin_id, XLogRecPtr origin_lsn)
2582 : {
2583 : ReorderBufferTXN *txn;
2584 :
2585 1094 : txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
2586 : false);
2587 :
2588 : /* unknown transaction, nothing to replay */
2589 1094 : if (txn == NULL)
2590 1096 : return;
2591 :
2592 1092 : ReorderBufferCommitInternal(txn, rb, xid, commit_lsn, end_lsn,
2593 : commit_time, origin_id, origin_lsn);
2594 : }
2595 :
2596 : /*
2597 : * Prepare a two-phase transaction. It calls ReorderBufferCommitInternal()
2598 : * since all prepared transactions need to be decoded at PREPARE time.
2599 : */
2600 : void
2601 52 : ReorderBufferPrepare(ReorderBuffer *rb, TransactionId xid,
2602 : XLogRecPtr commit_lsn, XLogRecPtr end_lsn,
2603 : TimestampTz commit_time,
2604 : RepOriginId origin_id, XLogRecPtr origin_lsn,
2605 : char *gid)
2606 : {
2607 : ReorderBufferTXN *txn;
2608 :
2609 52 : txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
2610 : false);
2611 :
2612 : /* unknown transaction, nothing to replay */
2613 52 : if (txn == NULL)
2614 52 : return;
2615 :
2616 52 : txn->txn_flags |= RBTXN_PREPARE;
2617 52 : txn->gid = palloc(strlen(gid) + 1); /* trailing '\0' */
2618 52 : strcpy(txn->gid, gid);
2619 :
2620 52 : ReorderBufferCommitInternal(txn, rb, xid, commit_lsn, end_lsn,
2621 : commit_time, origin_id, origin_lsn);
2622 : }
2623 :
2624 : /*
2625 : * Send standalone xact event. This is used to handle COMMIT/ROLLBACK PREPARED.
2626 : */
2627 : void
2628 56 : ReorderBufferFinishPrepared(ReorderBuffer *rb, TransactionId xid,
2629 : XLogRecPtr commit_lsn, XLogRecPtr end_lsn,
2630 : TimestampTz commit_time,
2631 : RepOriginId origin_id, XLogRecPtr origin_lsn,
2632 : char *gid, bool is_commit)
2633 : {
2634 : ReorderBufferTXN *txn;
2635 :
2636 : /*
2637 : * The transaction may or may not exist (during restarts for example).
2638 : * Anyway, two-phase transactions do not contain any reorderbuffers. So
2639 : * allow it to be created below.
2640 : */
2641 56 : txn = ReorderBufferTXNByXid(rb, xid, true, NULL, commit_lsn,
2642 : true);
2643 :
2644 56 : txn->final_lsn = commit_lsn;
2645 56 : txn->end_lsn = end_lsn;
2646 56 : txn->commit_time = commit_time;
2647 56 : txn->origin_id = origin_id;
2648 56 : txn->origin_lsn = origin_lsn;
2649 : /* this txn is obviously prepared */
2650 56 : txn->txn_flags |= RBTXN_PREPARE;
2651 56 : txn->gid = palloc(strlen(gid) + 1); /* trailing '\0' */
2652 56 : strcpy(txn->gid, gid);
2653 :
2654 56 : if (is_commit)
2655 32 : txn->txn_flags |= RBTXN_COMMIT_PREPARED;
2656 : else
2657 24 : txn->txn_flags |= RBTXN_ROLLBACK_PREPARED;
2658 :
2659 56 : if (rbtxn_commit_prepared(txn))
2660 32 : rb->commit_prepared(rb, txn, commit_lsn);
2661 24 : else if (rbtxn_rollback_prepared(txn))
2662 24 : rb->rollback_prepared(rb, txn, commit_lsn);
2663 :
2664 :
2665 : /* cleanup: make sure there's no cache pollution */
2666 56 : ReorderBufferExecuteInvalidations(txn->ninvalidations,
2667 : txn->invalidations);
2668 56 : ReorderBufferCleanupTXN(rb, txn);
2669 56 : }
2670 :
2671 : /*
2672 : * Abort a transaction that possibly has previous changes. Needs to be first
2673 : * called for subtransactions and then for the toplevel xid.
2674 : *
2675 : * NB: Transactions handled here have to have actively aborted (i.e. have
2676 : * produced an abort record). Implicitly aborted transactions are handled via
2677 : * ReorderBufferAbortOld(); transactions we're just not interested in, but
2678 : * which have committed are handled in ReorderBufferForget().
2679 : *
2680 : * This function purges this transaction and its contents from memory and
2681 : * disk.
2682 : */
2683 : void
2684 160 : ReorderBufferAbort(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn)
2685 : {
2686 : ReorderBufferTXN *txn;
2687 :
2688 160 : txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
2689 : false);
2690 :
2691 : /* unknown, nothing to remove */
2692 160 : if (txn == NULL)
2693 160 : return;
2694 :
2695 : /* For streamed transactions notify the remote node about the abort. */
2696 160 : if (rbtxn_is_streamed(txn))
2697 : {
2698 34 : rb->stream_abort(rb, txn, lsn);
2699 :
2700 : /*
2701 : * We might have decoded changes for this transaction that could load
2702 : * the cache as per the current transaction's view (consider DDL's
2703 : * happened in this transaction). We don't want the decoding of future
2704 : * transactions to use those cache entries so execute invalidations.
2705 : */
2706 34 : if (txn->ninvalidations > 0)
2707 0 : ReorderBufferImmediateInvalidation(rb, txn->ninvalidations,
2708 : txn->invalidations);
2709 : }
2710 :
2711 : /* cosmetic... */
2712 160 : txn->final_lsn = lsn;
2713 :
2714 : /*
2715 : * remove potential on-disk data, and deallocate.
2716 : *
2717 : * We remove it even for prepared transactions (GID is enough to
2718 : * commit/abort those later).
2719 : */
2720 160 : ReorderBufferCleanupTXN(rb, txn);
2721 : }
2722 :
2723 : /*
2724 : * Abort all transactions that aren't actually running anymore because the
2725 : * server restarted.
2726 : *
2727 : * NB: These really have to be transactions that have aborted due to a server
2728 : * crash/immediate restart, as we don't deal with invalidations here.
2729 : */
2730 : void
2731 984 : ReorderBufferAbortOld(ReorderBuffer *rb, TransactionId oldestRunningXid)
2732 : {
2733 : dlist_mutable_iter it;
2734 :
2735 : /*
2736 : * Iterate through all (potential) toplevel TXNs and abort all that are
2737 : * older than what possibly can be running. Once we've found the first
2738 : * that is alive we stop, there might be some that acquired an xid earlier
2739 : * but started writing later, but it's unlikely and they will be cleaned
2740 : * up in a later call to this function.
2741 : */
2742 988 : dlist_foreach_modify(it, &rb->toplevel_by_lsn)
2743 : {
2744 : ReorderBufferTXN *txn;
2745 :
2746 18 : txn = dlist_container(ReorderBufferTXN, node, it.cur);
2747 :
2748 18 : if (TransactionIdPrecedes(txn->xid, oldestRunningXid))
2749 : {
2750 4 : elog(DEBUG2, "aborting old transaction %u", txn->xid);
2751 :
2752 : /* remove potential on-disk data, and deallocate this tx */
2753 4 : ReorderBufferCleanupTXN(rb, txn);
2754 : }
2755 : else
2756 28 : return;
2757 : }
2758 : }
2759 :
2760 : /*
2761 : * Forget the contents of a transaction if we aren't interested in its
2762 : * contents. Needs to be first called for subtransactions and then for the
2763 : * toplevel xid.
2764 : *
2765 : * This is significantly different to ReorderBufferAbort() because
2766 : * transactions that have committed need to be treated differently from aborted
2767 : * ones since they may have modified the catalog.
2768 : *
2769 : * Note that this is only allowed to be called in the moment a transaction
2770 : * commit has just been read, not earlier; otherwise later records referring
2771 : * to this xid might re-create the transaction incompletely.
2772 : */
2773 : void
2774 3976 : ReorderBufferForget(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn)
2775 : {
2776 : ReorderBufferTXN *txn;
2777 :
2778 3976 : txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
2779 : false);
2780 :
2781 : /* unknown, nothing to forget */
2782 3976 : if (txn == NULL)
2783 5090 : return;
2784 :
2785 : /* For streamed transactions notify the remote node about the abort. */
2786 2862 : if (rbtxn_is_streamed(txn))
2787 0 : rb->stream_abort(rb, txn, lsn);
2788 :
2789 : /* cosmetic... */
2790 2862 : txn->final_lsn = lsn;
2791 :
2792 : /*
2793 : * Process cache invalidation messages if there are any. Even if we're not
2794 : * interested in the transaction's contents, it could have manipulated the
2795 : * catalog and we need to update the caches according to that.
2796 : */
2797 2862 : if (txn->base_snapshot != NULL && txn->ninvalidations > 0)
2798 740 : ReorderBufferImmediateInvalidation(rb, txn->ninvalidations,
2799 : txn->invalidations);
2800 : else
2801 2122 : Assert(txn->ninvalidations == 0);
2802 :
2803 : /* remove potential on-disk data, and deallocate */
2804 2862 : ReorderBufferCleanupTXN(rb, txn);
2805 : }
2806 :
2807 : /*
2808 : * Invalidate cache for those transactions that need to be skipped just in case
2809 : * catalogs were manipulated as part of the transaction.
2810 : * Note that this is only allowed to be called when a transaction prepare
2811 : * has just been read, not otherwise.
2812 : */
2813 : void
2814 156 : ReorderBufferInvalidate(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn)
2815 : {
2816 : ReorderBufferTXN *txn;
2817 :
2818 156 : txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
2819 : false);
2820 :
2821 : /* unknown, nothing to do */
2822 156 : if (txn == NULL)
2823 156 : return;
2824 :
2825 : /*
2826 : * Process cache invalidation messages if there are any. Even if we're not
2827 : * interested in the transaction's contents, it could have manipulated the
2828 : * catalog and we need to update the caches according to that.
2829 : */
2830 156 : if (txn->base_snapshot != NULL && txn->ninvalidations > 0)
2831 58 : ReorderBufferImmediateInvalidation(rb, txn->ninvalidations,
2832 : txn->invalidations);
2833 : else
2834 98 : Assert(txn->ninvalidations == 0);
2835 : }
2836 :
2837 :
2838 : /*
2839 : * Execute invalidations happening outside the context of a decoded
2840 : * transaction. That currently happens either for xid-less commits
2841 : * (cf. RecordTransactionCommit()) or for invalidations in uninteresting
2842 : * transactions (via ReorderBufferForget()).
2843 : */
2844 : void
2845 798 : ReorderBufferImmediateInvalidation(ReorderBuffer *rb, uint32 ninvalidations,
2846 : SharedInvalidationMessage *invalidations)
2847 : {
2848 798 : bool use_subtxn = IsTransactionOrTransactionBlock();
2849 : int i;
2850 :
2851 798 : if (use_subtxn)
2852 760 : BeginInternalSubTransaction("replay");
2853 :
2854 : /*
2855 : * Force invalidations to happen outside of a valid transaction - that way
2856 : * entries will just be marked as invalid without accessing the catalog.
2857 : * That's advantageous because we don't need to setup the full state
2858 : * necessary for catalog access.
2859 : */
2860 798 : if (use_subtxn)
2861 760 : AbortCurrentTransaction();
2862 :
2863 37928 : for (i = 0; i < ninvalidations; i++)
2864 37130 : LocalExecuteInvalidationMessage(&invalidations[i]);
2865 :
2866 798 : if (use_subtxn)
2867 760 : RollbackAndReleaseCurrentSubTransaction();
2868 798 : }
2869 :
2870 : /*
2871 : * Tell reorderbuffer about an xid seen in the WAL stream. Has to be called at
2872 : * least once for every xid in XLogRecord->xl_xid (other places in records
2873 : * may, but do not have to be passed through here).
2874 : *
2875 : * Reorderbuffer keeps some datastructures about transactions in LSN order,
2876 : * for efficiency. To do that it has to know about when transactions are seen
2877 : * first in the WAL. As many types of records are not actually interesting for
2878 : * logical decoding, they do not necessarily pass though here.
2879 : */
2880 : void
2881 4365474 : ReorderBufferProcessXid(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn)
2882 : {
2883 : /* many records won't have an xid assigned, centralize check here */
2884 4365474 : if (xid != InvalidTransactionId)
2885 4363566 : ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
2886 4365474 : }
2887 :
2888 : /*
2889 : * Add a new snapshot to this transaction that may only used after lsn 'lsn'
2890 : * because the previous snapshot doesn't describe the catalog correctly for
2891 : * following rows.
2892 : */
2893 : void
2894 1220 : ReorderBufferAddSnapshot(ReorderBuffer *rb, TransactionId xid,
2895 : XLogRecPtr lsn, Snapshot snap)
2896 : {
2897 1220 : ReorderBufferChange *change = ReorderBufferGetChange(rb);
2898 :
2899 1220 : change->data.snapshot = snap;
2900 1220 : change->action = REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT;
2901 :
2902 1220 : ReorderBufferQueueChange(rb, xid, lsn, change, false);
2903 1220 : }
2904 :
2905 : /*
2906 : * Set up the transaction's base snapshot.
2907 : *
2908 : * If we know that xid is a subtransaction, set the base snapshot on the
2909 : * top-level transaction instead.
2910 : */
2911 : void
2912 3260 : ReorderBufferSetBaseSnapshot(ReorderBuffer *rb, TransactionId xid,
2913 : XLogRecPtr lsn, Snapshot snap)
2914 : {
2915 : ReorderBufferTXN *txn;
2916 : bool is_new;
2917 :
2918 3260 : AssertArg(snap != NULL);
2919 :
2920 : /*
2921 : * Fetch the transaction to operate on. If we know it's a subtransaction,
2922 : * operate on its top-level transaction instead.
2923 : */
2924 3260 : txn = ReorderBufferTXNByXid(rb, xid, true, &is_new, lsn, true);
2925 3260 : if (rbtxn_is_known_subxact(txn))
2926 242 : txn = ReorderBufferTXNByXid(rb, txn->toplevel_xid, false,
2927 : NULL, InvalidXLogRecPtr, false);
2928 3260 : Assert(txn->base_snapshot == NULL);
2929 :
2930 3260 : txn->base_snapshot = snap;
2931 3260 : txn->base_snapshot_lsn = lsn;
2932 3260 : dlist_push_tail(&rb->txns_by_base_snapshot_lsn, &txn->base_snapshot_node);
2933 :
2934 3260 : AssertTXNLsnOrder(rb);
2935 3260 : }
2936 :
2937 : /*
2938 : * Access the catalog with this CommandId at this point in the changestream.
2939 : *
2940 : * May only be called for command ids > 1
2941 : */
2942 : void
2943 31518 : ReorderBufferAddNewCommandId(ReorderBuffer *rb, TransactionId xid,
2944 : XLogRecPtr lsn, CommandId cid)
2945 : {
2946 31518 : ReorderBufferChange *change = ReorderBufferGetChange(rb);
2947 :
2948 31518 : change->data.command_id = cid;
2949 31518 : change->action = REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID;
2950 :
2951 31518 : ReorderBufferQueueChange(rb, xid, lsn, change, false);
2952 31518 : }
2953 :
2954 : /*
2955 : * Update memory counters to account for the new or removed change.
2956 : *
2957 : * We update two counters - in the reorder buffer, and in the transaction
2958 : * containing the change. The reorder buffer counter allows us to quickly
2959 : * decide if we reached the memory limit, the transaction counter allows
2960 : * us to quickly pick the largest transaction for eviction.
2961 : *
2962 : * When streaming is enabled, we need to update the toplevel transaction
2963 : * counters instead - we don't really care about subtransactions as we
2964 : * can't stream them individually anyway, and we only pick toplevel
2965 : * transactions for eviction. So only toplevel transactions matter.
2966 : */
2967 : static void
2968 6652728 : ReorderBufferChangeMemoryUpdate(ReorderBuffer *rb,
2969 : ReorderBufferChange *change,
2970 : bool addition)
2971 : {
2972 : Size sz;
2973 : ReorderBufferTXN *txn;
2974 6652728 : ReorderBufferTXN *toptxn = NULL;
2975 :
2976 6652728 : Assert(change->txn);
2977 :
2978 : /*
2979 : * Ignore tuple CID changes, because those are not evicted when reaching
2980 : * memory limit. So we just don't count them, because it might easily
2981 : * trigger a pointless attempt to spill.
2982 : */
2983 6652728 : if (change->action == REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID)
2984 6684126 : return;
2985 :
2986 6621330 : txn = change->txn;
2987 :
2988 : /* If streaming supported, update the total size in top level as well. */
2989 6621330 : if (ReorderBufferCanStream(rb))
2990 : {
2991 1127726 : if (txn->toptxn != NULL)
2992 61024 : toptxn = txn->toptxn;
2993 : else
2994 1066702 : toptxn = txn;
2995 : }
2996 :
2997 6621330 : sz = ReorderBufferChangeSize(change);
2998 :
2999 6621330 : if (addition)
3000 : {
3001 3312210 : txn->size += sz;
3002 3312210 : rb->size += sz;
3003 :
3004 : /* Update the total size in the top transaction. */
3005 3312210 : if (toptxn)
3006 565370 : toptxn->total_size += sz;
3007 : }
3008 : else
3009 : {
3010 3309120 : Assert((rb->size >= sz) && (txn->size >= sz));
3011 3309120 : txn->size -= sz;
3012 3309120 : rb->size -= sz;
3013 :
3014 : /* Update the total size in the top transaction. */
3015 3309120 : if (toptxn)
3016 562356 : toptxn->total_size -= sz;
3017 : }
3018 :
3019 6621330 : Assert(txn->size <= rb->size);
3020 : }
3021 :
3022 : /*
3023 : * Add new (relfilenode, tid) -> (cmin, cmax) mappings.
3024 : *
3025 : * We do not include this change type in memory accounting, because we
3026 : * keep CIDs in a separate list and do not evict them when reaching
3027 : * the memory limit.
3028 : */
3029 : void
3030 31518 : ReorderBufferAddNewTupleCids(ReorderBuffer *rb, TransactionId xid,
3031 : XLogRecPtr lsn, RelFileNode node,
3032 : ItemPointerData tid, CommandId cmin,
3033 : CommandId cmax, CommandId combocid)
3034 : {
3035 31518 : ReorderBufferChange *change = ReorderBufferGetChange(rb);
3036 : ReorderBufferTXN *txn;
3037 :
3038 31518 : txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
3039 :
3040 31518 : change->data.tuplecid.node = node;
3041 31518 : change->data.tuplecid.tid = tid;
3042 31518 : change->data.tuplecid.cmin = cmin;
3043 31518 : change->data.tuplecid.cmax = cmax;
3044 31518 : change->data.tuplecid.combocid = combocid;
3045 31518 : change->lsn = lsn;
3046 31518 : change->txn = txn;
3047 31518 : change->action = REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID;
3048 :
3049 31518 : dlist_push_tail(&txn->tuplecids, &change->node);
3050 31518 : txn->ntuplecids++;
3051 31518 : }
3052 :
3053 : /*
3054 : * Setup the invalidation of the toplevel transaction.
3055 : *
3056 : * This needs to be called for each XLOG_XACT_INVALIDATIONS message and
3057 : * accumulates all the invalidation messages in the toplevel transaction as
3058 : * well as in the form of change in reorder buffer. We require to record it in
3059 : * form of the change so that we can execute only the required invalidations
3060 : * instead of executing all the invalidations on each CommandId increment. We
3061 : * also need to accumulate these in the toplevel transaction because in some
3062 : * cases we skip processing the transaction (see ReorderBufferForget), we need
3063 : * to execute all the invalidations together.
3064 : */
3065 : void
3066 5938 : ReorderBufferAddInvalidations(ReorderBuffer *rb, TransactionId xid,
3067 : XLogRecPtr lsn, Size nmsgs,
3068 : SharedInvalidationMessage *msgs)
3069 : {
3070 : ReorderBufferTXN *txn;
3071 : MemoryContext oldcontext;
3072 : ReorderBufferChange *change;
3073 :
3074 5938 : txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
3075 :
3076 5938 : oldcontext = MemoryContextSwitchTo(rb->context);
3077 :
3078 : /*
3079 : * Collect all the invalidations under the top transaction so that we can
3080 : * execute them all together. See comment atop this function
3081 : */
3082 5938 : if (txn->toptxn)
3083 356 : txn = txn->toptxn;
3084 :
3085 5938 : Assert(nmsgs > 0);
3086 :
3087 : /* Accumulate invalidations. */
3088 5938 : if (txn->ninvalidations == 0)
3089 : {
3090 1204 : txn->ninvalidations = nmsgs;
3091 1204 : txn->invalidations = (SharedInvalidationMessage *)
3092 1204 : palloc(sizeof(SharedInvalidationMessage) * nmsgs);
3093 1204 : memcpy(txn->invalidations, msgs,
3094 : sizeof(SharedInvalidationMessage) * nmsgs);
3095 : }
3096 : else
3097 : {
3098 4734 : txn->invalidations = (SharedInvalidationMessage *)
3099 4734 : repalloc(txn->invalidations, sizeof(SharedInvalidationMessage) *
3100 4734 : (txn->ninvalidations + nmsgs));
3101 :
3102 4734 : memcpy(txn->invalidations + txn->ninvalidations, msgs,
3103 : nmsgs * sizeof(SharedInvalidationMessage));
3104 4734 : txn->ninvalidations += nmsgs;
3105 : }
3106 :
3107 5938 : change = ReorderBufferGetChange(rb);
3108 5938 : change->action = REORDER_BUFFER_CHANGE_INVALIDATION;
3109 5938 : change->data.inval.ninvalidations = nmsgs;
3110 5938 : change->data.inval.invalidations = (SharedInvalidationMessage *)
3111 5938 : palloc(sizeof(SharedInvalidationMessage) * nmsgs);
3112 5938 : memcpy(change->data.inval.invalidations, msgs,
3113 : sizeof(SharedInvalidationMessage) * nmsgs);
3114 :
3115 5938 : ReorderBufferQueueChange(rb, xid, lsn, change, false);
3116 :
3117 5938 : MemoryContextSwitchTo(oldcontext);
3118 5938 : }
3119 :
3120 : /*
3121 : * Apply all invalidations we know. Possibly we only need parts at this point
3122 : * in the changestream but we don't know which those are.
3123 : */
3124 : static void
3125 4136 : ReorderBufferExecuteInvalidations(uint32 nmsgs, SharedInvalidationMessage *msgs)
3126 : {
3127 : int i;
3128 :
3129 49776 : for (i = 0; i < nmsgs; i++)
3130 45640 : LocalExecuteInvalidationMessage(&msgs[i]);
3131 4136 : }
3132 :
3133 : /*
3134 : * Mark a transaction as containing catalog changes
3135 : */
3136 : void
3137 38950 : ReorderBufferXidSetCatalogChanges(ReorderBuffer *rb, TransactionId xid,
3138 : XLogRecPtr lsn)
3139 : {
3140 : ReorderBufferTXN *txn;
3141 :
3142 38950 : txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
3143 :
3144 38950 : txn->txn_flags |= RBTXN_HAS_CATALOG_CHANGES;
3145 :
3146 : /*
3147 : * Mark top-level transaction as having catalog changes too if one of its
3148 : * children has so that the ReorderBufferBuildTupleCidHash can
3149 : * conveniently check just top-level transaction and decide whether to
3150 : * build the hash table or not.
3151 : */
3152 38950 : if (txn->toptxn != NULL)
3153 1862 : txn->toptxn->txn_flags |= RBTXN_HAS_CATALOG_CHANGES;
3154 38950 : }
3155 :
3156 : /*
3157 : * Query whether a transaction is already *known* to contain catalog
3158 : * changes. This can be wrong until directly before the commit!
3159 : */
3160 : bool
3161 5616 : ReorderBufferXidHasCatalogChanges(ReorderBuffer *rb, TransactionId xid)
3162 : {
3163 : ReorderBufferTXN *txn;
3164 :
3165 5616 : txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
3166 : false);
3167 5616 : if (txn == NULL)
3168 1278 : return false;
3169 :
3170 4338 : return rbtxn_has_catalog_changes(txn);
3171 : }
3172 :
3173 : /*
3174 : * ReorderBufferXidHasBaseSnapshot
3175 : * Have we already set the base snapshot for the given txn/subtxn?
3176 : */
3177 : bool
3178 2989474 : ReorderBufferXidHasBaseSnapshot(ReorderBuffer *rb, TransactionId xid)
3179 : {
3180 : ReorderBufferTXN *txn;
3181 :
3182 2989474 : txn = ReorderBufferTXNByXid(rb, xid, false,
3183 : NULL, InvalidXLogRecPtr, false);
3184 :
3185 : /* transaction isn't known yet, ergo no snapshot */
3186 2989474 : if (txn == NULL)
3187 0 : return false;
3188 :
3189 : /* a known subtxn? operate on top-level txn instead */
3190 2989474 : if (rbtxn_is_known_subxact(txn))
3191 993490 : txn = ReorderBufferTXNByXid(rb, txn->toplevel_xid, false,
3192 : NULL, InvalidXLogRecPtr, false);
3193 :
3194 2989474 : return txn->base_snapshot != NULL;
3195 : }
3196 :
3197 :
3198 : /*
3199 : * ---------------------------------------
3200 : * Disk serialization support
3201 : * ---------------------------------------
3202 : */
3203 :
3204 : /*
3205 : * Ensure the IO buffer is >= sz.
3206 : */
3207 : static void
3208 5683130 : ReorderBufferSerializeReserve(ReorderBuffer *rb, Size sz)
3209 : {
3210 5683130 : if (!rb->outbufsize)
3211 : {
3212 86 : rb->outbuf = MemoryContextAlloc(rb->context, sz);
3213 86 : rb->outbufsize = sz;
3214 : }
3215 5683044 : else if (rb->outbufsize < sz)
3216 : {
3217 590 : rb->outbuf = repalloc(rb->outbuf, sz);
3218 590 : rb->outbufsize = sz;
3219 : }
3220 5683130 : }
3221 :
3222 : /*
3223 : * Find the largest transaction (toplevel or subxact) to evict (spill to disk).
3224 : *
3225 : * XXX With many subtransactions this might be quite slow, because we'll have
3226 : * to walk through all of them. There are some options how we could improve
3227 : * that: (a) maintain some secondary structure with transactions sorted by
3228 : * amount of changes, (b) not looking for the entirely largest transaction,
3229 : * but e.g. for transaction using at least some fraction of the memory limit,
3230 : * and (c) evicting multiple transactions at once, e.g. to free a given portion
3231 : * of the memory limit (e.g. 50%).
3232 : */
3233 : static ReorderBufferTXN *
3234 5932 : ReorderBufferLargestTXN(ReorderBuffer *rb)
3235 : {
3236 : HASH_SEQ_STATUS hash_seq;
3237 : ReorderBufferTXNByIdEnt *ent;
3238 5932 : ReorderBufferTXN *largest = NULL;
3239 :
3240 5932 : hash_seq_init(&hash_seq, rb->by_txn);
3241 21314 : while ((ent = hash_seq_search(&hash_seq)) != NULL)
3242 : {
3243 9450 : ReorderBufferTXN *txn = ent->txn;
3244 :
3245 : /* if the current transaction is larger, remember it */
3246 9450 : if ((!largest) || (txn->size > largest->size))
3247 7788 : largest = txn;
3248 : }
3249 :
3250 5932 : Assert(largest);
3251 5932 : Assert(largest->size > 0);
3252 5932 : Assert(largest->size <= rb->size);
3253 :
3254 5932 : return largest;
3255 : }
3256 :
3257 : /*
3258 : * Find the largest toplevel transaction to evict (by streaming).
3259 : *
3260 : * This can be seen as an optimized version of ReorderBufferLargestTXN, which
3261 : * should give us the same transaction (because we don't update memory account
3262 : * for subtransaction with streaming, so it's always 0). But we can simply
3263 : * iterate over the limited number of toplevel transactions.
3264 : *
3265 : * Note that, we skip transactions that contains incomplete changes. There
3266 : * is a scope of optimization here such that we can select the largest transaction
3267 : * which has complete changes. But that will make the code and design quite complex
3268 : * and that might not be worth the benefit. If we plan to stream the transactions
3269 : * that contains incomplete changes then we need to find a way to partially
3270 : * stream/truncate the transaction changes in-memory and build a mechanism to
3271 : * partially truncate the spilled files. Additionally, whenever we partially
3272 : * stream the transaction we need to maintain the last streamed lsn and next time
3273 : * we need to restore from that segment and the offset in WAL. As we stream the
3274 : * changes from the top transaction and restore them subtransaction wise, we need
3275 : * to even remember the subxact from where we streamed the last change.
3276 : */
3277 : static ReorderBufferTXN *
3278 850 : ReorderBufferLargestTopTXN(ReorderBuffer *rb)
3279 : {
3280 : dlist_iter iter;
3281 850 : Size largest_size = 0;
3282 850 : ReorderBufferTXN *largest = NULL;
3283 :
3284 : /* Find the largest top-level transaction. */
3285 1786 : dlist_foreach(iter, &rb->toplevel_by_lsn)
3286 : {
3287 : ReorderBufferTXN *txn;
3288 :
3289 936 : txn = dlist_container(ReorderBufferTXN, node, iter.cur);
3290 :
3291 1788 : if ((largest != NULL || txn->total_size > largest_size) &&
3292 1704 : (txn->total_size > 0) && !(rbtxn_has_incomplete_tuple(txn)))
3293 : {
3294 764 : largest = txn;
3295 764 : largest_size = txn->total_size;
3296 : }
3297 : }
3298 :
3299 850 : return largest;
3300 : }
3301 :
3302 : /*
3303 : * Check whether the logical_decoding_work_mem limit was reached, and if yes
3304 : * pick the largest (sub)transaction at-a-time to evict and spill its changes to
3305 : * disk until we reach under the memory limit.
3306 : *
3307 : * XXX At this point we select the transactions until we reach under the memory
3308 : * limit, but we might also adapt a more elaborate eviction strategy - for example
3309 : * evicting enough transactions to free certain fraction (e.g. 50%) of the memory
3310 : * limit.
3311 : */
3312 : static void
3313 2994836 : ReorderBufferCheckMemoryLimit(ReorderBuffer *rb)
3314 : {
3315 : ReorderBufferTXN *txn;
3316 :
3317 : /* bail out if we haven't exceeded the memory limit */
3318 2994836 : if (rb->size < logical_decoding_work_mem * 1024L)
3319 5982972 : return;
3320 :
3321 : /*
3322 : * Loop until we reach under the memory limit. One might think that just
3323 : * by evicting the largest (sub)transaction we will come under the memory
3324 : * limit based on assumption that the selected transaction is at least as
3325 : * large as the most recent change (which caused us to go over the memory
3326 : * limit). However, that is not true because a user can reduce the
3327 : * logical_decoding_work_mem to a smaller value before the most recent
3328 : * change.
3329 : */
3330 20084 : while (rb->size >= logical_decoding_work_mem * 1024L)
3331 : {
3332 : /*
3333 : * Pick the largest transaction (or subtransaction) and evict it from
3334 : * memory by streaming, if possible. Otherwise, spill to disk.
3335 : */
3336 6696 : if (ReorderBufferCanStartStreaming(rb) &&
3337 : (txn = ReorderBufferLargestTopTXN(rb)) != NULL)
3338 : {
3339 : /* we know there has to be one, because the size is not zero */
3340 764 : Assert(txn && !txn->toptxn);
3341 764 : Assert(txn->total_size > 0);
3342 764 : Assert(rb->size >= txn->total_size);
3343 :
3344 764 : ReorderBufferStreamTXN(rb, txn);
3345 : }
3346 : else
3347 : {
3348 : /*
3349 : * Pick the largest transaction (or subtransaction) and evict it
3350 : * from memory by serializing it to disk.
3351 : */
3352 5932 : txn = ReorderBufferLargestTXN(rb);
3353 :
3354 : /* we know there has to be one, because the size is not zero */
3355 5932 : Assert(txn);
3356 5932 : Assert(txn->size > 0);
3357 5932 : Assert(rb->size >= txn->size);
3358 :
3359 5932 : ReorderBufferSerializeTXN(rb, txn);
3360 : }
3361 :
3362 : /*
3363 : * After eviction, the transaction should have no entries in memory,
3364 : * and should use 0 bytes for changes.
3365 : */
3366 6692 : Assert(txn->size == 0);
3367 6692 : Assert(txn->nentries_mem == 0);
3368 : }
3369 :
3370 : /* We must be under the memory limit now. */
3371 6692 : Assert(rb->size < logical_decoding_work_mem * 1024L);
3372 : }
3373 :
3374 : /*
3375 : * Spill data of a large transaction (and its subtransactions) to disk.
3376 : */
3377 : static void
3378 6534 : ReorderBufferSerializeTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
3379 : {
3380 : dlist_iter subtxn_i;
3381 : dlist_mutable_iter change_i;
3382 6534 : int fd = -1;
3383 6534 : XLogSegNo curOpenSegNo = 0;
3384 6534 : Size spilled = 0;
3385 6534 : Size size = txn->size;
3386 :
3387 6534 : elog(DEBUG2, "spill %u changes in XID %u to disk",
3388 : (uint32) txn->nentries_mem, txn->xid);
3389 :
3390 : /* do the same to all child TXs */
3391 7070 : dlist_foreach(subtxn_i, &txn->subtxns)
3392 : {
3393 : ReorderBufferTXN *subtxn;
3394 :
3395 536 : subtxn = dlist_container(ReorderBufferTXN, node, subtxn_i.cur);
3396 536 : ReorderBufferSerializeTXN(rb, subtxn);
3397 : }
3398 :
3399 : /* serialize changestream */
3400 2548268 : dlist_foreach_modify(change_i, &txn->changes)
3401 : {
3402 : ReorderBufferChange *change;
3403 :
3404 2541734 : change = dlist_container(ReorderBufferChange, node, change_i.cur);
3405 :
3406 : /*
3407 : * store in segment in which it belongs by start lsn, don't split over
3408 : * multiple segments tho
3409 : */
3410 5077436 : if (fd == -1 ||
3411 2535702 : !XLByteInSeg(change->lsn, curOpenSegNo, wal_segment_size))
3412 : {
3413 : char path[MAXPGPATH];
3414 :
3415 6032 : if (fd != -1)
3416 0 : CloseTransientFile(fd);
3417 :
3418 6032 : XLByteToSeg(change->lsn, curOpenSegNo, wal_segment_size);
3419 :
3420 : /*
3421 : * No need to care about TLIs here, only used during a single run,
3422 : * so each LSN only maps to a specific WAL record.
3423 : */
3424 6032 : ReorderBufferSerializedPath(path, MyReplicationSlot, txn->xid,
3425 : curOpenSegNo);
3426 :
3427 : /* open segment, create it if necessary */
3428 6032 : fd = OpenTransientFile(path,
3429 : O_CREAT | O_WRONLY | O_APPEND | PG_BINARY);
3430 :
3431 6032 : if (fd < 0)
3432 0 : ereport(ERROR,
3433 : (errcode_for_file_access(),
3434 : errmsg("could not open file \"%s\": %m", path)));
3435 : }
3436 :
3437 2541734 : ReorderBufferSerializeChange(rb, txn, fd, change);
3438 2541734 : dlist_delete(&change->node);
3439 2541734 : ReorderBufferReturnChange(rb, change, true);
3440 :
3441 2541734 : spilled++;
3442 : }
3443 :
3444 : /* update the statistics iff we have spilled anything */
3445 6534 : if (spilled)
3446 : {
3447 6032 : rb->spillCount += 1;
3448 6032 : rb->spillBytes += size;
3449 :
3450 : /* don't consider already serialized transactions */
3451 6032 : rb->spillTxns += (rbtxn_is_serialized(txn) || rbtxn_is_serialized_clear(txn)) ? 0 : 1;
3452 : }
3453 :
3454 6534 : Assert(spilled == txn->nentries_mem);
3455 6534 : Assert(dlist_is_empty(&txn->changes));
3456 6534 : txn->nentries_mem = 0;
3457 6534 : txn->txn_flags |= RBTXN_IS_SERIALIZED;
3458 :
3459 6534 : if (fd != -1)
3460 6032 : CloseTransientFile(fd);
3461 6534 : }
3462 :
3463 : /*
3464 : * Serialize individual change to disk.
3465 : */
3466 : static void
3467 2541734 : ReorderBufferSerializeChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
3468 : int fd, ReorderBufferChange *change)
3469 : {
3470 : ReorderBufferDiskChange *ondisk;
3471 2541734 : Size sz = sizeof(ReorderBufferDiskChange);
3472 :
3473 2541734 : ReorderBufferSerializeReserve(rb, sz);
3474 :
3475 2541734 : ondisk = (ReorderBufferDiskChange *) rb->outbuf;
3476 2541734 : memcpy(&ondisk->change, change, sizeof(ReorderBufferChange));
3477 :
3478 2541734 : switch (change->action)
3479 : {
3480 : /* fall through these, they're all similar enough */
3481 : case REORDER_BUFFER_CHANGE_INSERT:
3482 : case REORDER_BUFFER_CHANGE_UPDATE:
3483 : case REORDER_BUFFER_CHANGE_DELETE:
3484 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT:
3485 : {
3486 : char *data;
3487 : ReorderBufferTupleBuf *oldtup,
3488 : *newtup;
3489 2507250 : Size oldlen = 0;
3490 2507250 : Size newlen = 0;
3491 :
3492 2507250 : oldtup = change->data.tp.oldtuple;
3493 2507250 : newtup = change->data.tp.newtuple;
3494 :
3495 2507250 : if (oldtup)
3496 : {
3497 184414 : sz += sizeof(HeapTupleData);
3498 184414 : oldlen = oldtup->tuple.t_len;
3499 184414 : sz += oldlen;
3500 : }
3501 :
3502 2507250 : if (newtup)
3503 : {
3504 2215528 : sz += sizeof(HeapTupleData);
3505 2215528 : newlen = newtup->tuple.t_len;
3506 2215528 : sz += newlen;
3507 : }
3508 :
3509 : /* make sure we have enough space */
3510 2507250 : ReorderBufferSerializeReserve(rb, sz);
3511 :
3512 2507250 : data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
3513 : /* might have been reallocated above */
3514 2507250 : ondisk = (ReorderBufferDiskChange *) rb->outbuf;
3515 :
3516 2507250 : if (oldlen)
3517 : {
3518 184414 : memcpy(data, &oldtup->tuple, sizeof(HeapTupleData));
3519 184414 : data += sizeof(HeapTupleData);
3520 :
3521 184414 : memcpy(data, oldtup->tuple.t_data, oldlen);
3522 184414 : data += oldlen;
3523 : }
3524 :
3525 2507250 : if (newlen)
3526 : {
3527 2215528 : memcpy(data, &newtup->tuple, sizeof(HeapTupleData));
3528 2215528 : data += sizeof(HeapTupleData);
3529 :
3530 2215528 : memcpy(data, newtup->tuple.t_data, newlen);
3531 2215528 : data += newlen;
3532 : }
3533 2507250 : break;
3534 : }
3535 : case REORDER_BUFFER_CHANGE_MESSAGE:
3536 : {
3537 : char *data;
3538 46 : Size prefix_size = strlen(change->data.msg.prefix) + 1;
3539 :
3540 46 : sz += prefix_size + change->data.msg.message_size +
3541 : sizeof(Size) + sizeof(Size);
3542 46 : ReorderBufferSerializeReserve(rb, sz);
3543 :
3544 46 : data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
3545 :
3546 : /* might have been reallocated above */
3547 46 : ondisk = (ReorderBufferDiskChange *) rb->outbuf;
3548 :
3549 : /* write the prefix including the size */
3550 46 : memcpy(data, &prefix_size, sizeof(Size));
3551 46 : data += sizeof(Size);
3552 46 : memcpy(data, change->data.msg.prefix,
3553 : prefix_size);
3554 46 : data += prefix_size;
3555 :
3556 : /* write the message including the size */
3557 46 : memcpy(data, &change->data.msg.message_size, sizeof(Size));
3558 46 : data += sizeof(Size);
3559 46 : memcpy(data, change->data.msg.message,
3560 : change->data.msg.message_size);
3561 46 : data += change->data.msg.message_size;
3562 :
3563 46 : break;
3564 : }
3565 : case REORDER_BUFFER_CHANGE_INVALIDATION:
3566 : {
3567 : char *data;
3568 210 : Size inval_size = sizeof(SharedInvalidationMessage) *
3569 210 : change->data.inval.ninvalidations;
3570 :
3571 210 : sz += inval_size;
3572 :
3573 210 : ReorderBufferSerializeReserve(rb, sz);
3574 210 : data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
3575 :
3576 : /* might have been reallocated above */
3577 210 : ondisk = (ReorderBufferDiskChange *) rb->outbuf;
3578 210 : memcpy(data, change->data.inval.invalidations, inval_size);
3579 210 : data += inval_size;
3580 :
3581 210 : break;
3582 : }
3583 : case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT:
3584 : {
3585 : Snapshot snap;
3586 : char *data;
3587 :
3588 4 : snap = change->data.snapshot;
3589 :
3590 4 : sz += sizeof(SnapshotData) +
3591 8 : sizeof(TransactionId) * snap->xcnt +
3592 4 : sizeof(TransactionId) * snap->subxcnt;
3593 :
3594 : /* make sure we have enough space */
3595 4 : ReorderBufferSerializeReserve(rb, sz);
3596 4 : data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
3597 : /* might have been reallocated above */
3598 4 : ondisk = (ReorderBufferDiskChange *) rb->outbuf;
3599 :
3600 4 : memcpy(data, snap, sizeof(SnapshotData));
3601 4 : data += sizeof(SnapshotData);
3602 :
3603 4 : if (snap->xcnt)
3604 : {
3605 4 : memcpy(data, snap->xip,
3606 4 : sizeof(TransactionId) * snap->xcnt);
3607 4 : data += sizeof(TransactionId) * snap->xcnt;
3608 : }
3609 :
3610 4 : if (snap->subxcnt)
3611 : {
3612 0 : memcpy(data, snap->subxip,
3613 0 : sizeof(TransactionId) * snap->subxcnt);
3614 0 : data += sizeof(TransactionId) * snap->subxcnt;
3615 : }
3616 4 : break;
3617 : }
3618 : case REORDER_BUFFER_CHANGE_TRUNCATE:
3619 : {
3620 : Size size;
3621 : char *data;
3622 :
3623 : /* account for the OIDs of truncated relations */
3624 0 : size = sizeof(Oid) * change->data.truncate.nrelids;
3625 0 : sz += size;
3626 :
3627 : /* make sure we have enough space */
3628 0 : ReorderBufferSerializeReserve(rb, sz);
3629 :
3630 0 : data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
3631 : /* might have been reallocated above */
3632 0 : ondisk = (ReorderBufferDiskChange *) rb->outbuf;
3633 :
3634 0 : memcpy(data, change->data.truncate.relids, size);
3635 0 : data += size;
3636 :
3637 0 : break;
3638 : }
3639 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM:
3640 : case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID:
3641 : case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID:
3642 : /* ReorderBufferChange contains everything important */
3643 34224 : break;
3644 : }
3645 :
3646 2541734 : ondisk->size = sz;
3647 :
3648 2541734 : errno = 0;
3649 2541734 : pgstat_report_wait_start(WAIT_EVENT_REORDER_BUFFER_WRITE);
3650 2541734 : if (write(fd, rb->outbuf, ondisk->size) != ondisk->size)
3651 : {
3652 0 : int save_errno = errno;
3653 :
3654 0 : CloseTransientFile(fd);
3655 :
3656 : /* if write didn't set errno, assume problem is no disk space */
3657 0 : errno = save_errno ? save_errno : ENOSPC;
3658 0 : ereport(ERROR,
3659 : (errcode_for_file_access(),
3660 : errmsg("could not write to data file for XID %u: %m",
3661 : txn->xid)));
3662 : }
3663 2541734 : pgstat_report_wait_end();
3664 :
3665 : /*
3666 : * Keep the transaction's final_lsn up to date with each change we send to
3667 : * disk, so that ReorderBufferRestoreCleanup works correctly. (We used to
3668 : * only do this on commit and abort records, but that doesn't work if a
3669 : * system crash leaves a transaction without its abort record).
3670 : *
3671 : * Make sure not to move it backwards.
3672 : */
3673 2541734 : if (txn->final_lsn < change->lsn)
3674 2532310 : txn->final_lsn = change->lsn;
3675 :
3676 2541734 : Assert(ondisk->change.action == change->action);
3677 2541734 : }
3678 :
3679 : /* Returns true, if the output plugin supports streaming, false, otherwise. */
3680 : static inline bool
3681 10185462 : ReorderBufferCanStream(ReorderBuffer *rb)
3682 : {
3683 10185462 : LogicalDecodingContext *ctx = rb->private_data;
3684 :
3685 10185462 : return ctx->streaming;
3686 : }
3687 :
3688 : /* Returns true, if the streaming can be started now, false, otherwise. */
3689 : static inline bool
3690 569296 : ReorderBufferCanStartStreaming(ReorderBuffer *rb)
3691 : {
3692 569296 : LogicalDecodingContext *ctx = rb->private_data;
3693 569296 : SnapBuild *builder = ctx->snapshot_builder;
3694 :
3695 : /*
3696 : * We can't start streaming immediately even if the streaming is enabled
3697 : * because we previously decoded this transaction and now just are
3698 : * restarting.
3699 : */
3700 1133310 : if (ReorderBufferCanStream(rb) &&
3701 564014 : !SnapBuildXactNeedsSkip(builder, ctx->reader->EndRecPtr))
3702 : {
3703 : /* We must have a consistent snapshot by this time */
3704 325704 : Assert(SnapBuildCurrentState(builder) == SNAPBUILD_CONSISTENT);
3705 325704 : return true;
3706 : }
3707 :
3708 243592 : return false;
3709 : }
3710 :
3711 : /*
3712 : * Send data of a large transaction (and its subtransactions) to the
3713 : * output plugin, but using the stream API.
3714 : */
3715 : static void
3716 824 : ReorderBufferStreamTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
3717 : {
3718 : Snapshot snapshot_now;
3719 : CommandId command_id;
3720 : Size stream_bytes;
3721 : bool txn_is_streamed;
3722 :
3723 : /* We can never reach here for a subtransaction. */
3724 824 : Assert(txn->toptxn == NULL);
3725 :
3726 : /*
3727 : * We can't make any assumptions about base snapshot here, similar to what
3728 : * ReorderBufferCommit() does. That relies on base_snapshot getting
3729 : * transferred from subxact in ReorderBufferCommitChild(), but that was
3730 : * not yet called as the transaction is in-progress.
3731 : *
3732 : * So just walk the subxacts and use the same logic here. But we only need
3733 : * to do that once, when the transaction is streamed for the first time.
3734 : * After that we need to reuse the snapshot from the previous run.
3735 : *
3736 : * Unlike DecodeCommit which adds xids of all the subtransactions in
3737 : * snapshot's xip array via SnapBuildCommittedTxn, we can't do that here
3738 : * but we do add them to subxip array instead via ReorderBufferCopySnap.
3739 : * This allows the catalog changes made in subtransactions decoded till
3740 : * now to be visible.
3741 : */
3742 824 : if (txn->snapshot_now == NULL)
3743 : {
3744 : dlist_iter subxact_i;
3745 :
3746 : /* make sure this transaction is streamed for the first time */
3747 68 : Assert(!rbtxn_is_streamed(txn));
3748 :
3749 : /* at the beginning we should have invalid command ID */
3750 68 : Assert(txn->command_id == InvalidCommandId);
3751 :
3752 76 : dlist_foreach(subxact_i, &txn->subtxns)
3753 : {
3754 : ReorderBufferTXN *subtxn;
3755 :
3756 8 : subtxn = dlist_container(ReorderBufferTXN, node, subxact_i.cur);
3757 8 : ReorderBufferTransferSnapToParent(txn, subtxn);
3758 : }
3759 :
3760 : /*
3761 : * If this transaction has no snapshot, it didn't make any changes to
3762 : * the database till now, so there's nothing to decode.
3763 : */
3764 68 : if (txn->base_snapshot == NULL)
3765 : {
3766 0 : Assert(txn->ninvalidations == 0);
3767 820 : return;
3768 : }
3769 :
3770 68 : command_id = FirstCommandId;
3771 68 : snapshot_now = ReorderBufferCopySnap(rb, txn->base_snapshot,
3772 : txn, command_id);
3773 : }
3774 : else
3775 : {
3776 : /* the transaction must have been already streamed */
3777 756 : Assert(rbtxn_is_streamed(txn));
3778 :
3779 : /*
3780 : * Nah, we already have snapshot from the previous streaming run. We
3781 : * assume new subxacts can't move the LSN backwards, and so can't beat
3782 : * the LSN condition in the previous branch (so no need to walk
3783 : * through subxacts again). In fact, we must not do that as we may be
3784 : * using snapshot half-way through the subxact.
3785 : */
3786 756 : command_id = txn->command_id;
3787 :
3788 : /*
3789 : * We can't use txn->snapshot_now directly because after the last
3790 : * streaming run, we might have got some new sub-transactions. So we
3791 : * need to add them to the snapshot.
3792 : */
3793 756 : snapshot_now = ReorderBufferCopySnap(rb, txn->snapshot_now,
3794 : txn, command_id);
3795 :
3796 : /* Free the previously copied snapshot. */
3797 756 : Assert(txn->snapshot_now->copied);
3798 756 : ReorderBufferFreeSnap(rb, txn->snapshot_now);
3799 756 : txn->snapshot_now = NULL;
3800 : }
3801 :
3802 : /*
3803 : * Remember this information to be used later to update stats. We can't
3804 : * update the stats here as an error while processing the changes would
3805 : * lead to the accumulation of stats even though we haven't streamed all
3806 : * the changes.
3807 : */
3808 824 : txn_is_streamed = rbtxn_is_streamed(txn);
3809 824 : stream_bytes = txn->total_size;
3810 :
3811 : /* Process and send the changes to output plugin. */
3812 824 : ReorderBufferProcessTXN(rb, txn, InvalidXLogRecPtr, snapshot_now,
3813 : command_id, true);
3814 :
3815 820 : rb->streamCount += 1;
3816 820 : rb->streamBytes += stream_bytes;
3817 :
3818 : /* Don't consider already streamed transaction. */
3819 820 : rb->streamTxns += (txn_is_streamed) ? 0 : 1;
3820 :
3821 820 : Assert(dlist_is_empty(&txn->changes));
3822 820 : Assert(txn->nentries == 0);
3823 820 : Assert(txn->nentries_mem == 0);
3824 : }
3825 :
3826 : /*
3827 : * Size of a change in memory.
3828 : */
3829 : static Size
3830 6621330 : ReorderBufferChangeSize(ReorderBufferChange *change)
3831 : {
3832 6621330 : Size sz = sizeof(ReorderBufferChange);
3833 :
3834 6621330 : switch (change->action)
3835 : {
3836 : /* fall through these, they're all similar enough */
3837 : case REORDER_BUFFER_CHANGE_INSERT:
3838 : case REORDER_BUFFER_CHANGE_UPDATE:
3839 : case REORDER_BUFFER_CHANGE_DELETE:
3840 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT:
3841 : {
3842 : ReorderBufferTupleBuf *oldtup,
3843 : *newtup;
3844 6464838 : Size oldlen = 0;
3845 6464838 : Size newlen = 0;
3846 :
3847 6464838 : oldtup = change->data.tp.oldtuple;
3848 6464838 : newtup = change->data.tp.newtuple;
3849 :
3850 6464838 : if (oldtup)
3851 : {
3852 547312 : sz += sizeof(HeapTupleData);
3853 547312 : oldlen = oldtup->tuple.t_len;
3854 547312 : sz += oldlen;
3855 : }
3856 :
3857 6464838 : if (newtup)
3858 : {
3859 5640338 : sz += sizeof(HeapTupleData);
3860 5640338 : newlen = newtup->tuple.t_len;
3861 5640338 : sz += newlen;
3862 : }
3863 :
3864 6464838 : break;
3865 : }
3866 : case REORDER_BUFFER_CHANGE_MESSAGE:
3867 : {
3868 152 : Size prefix_size = strlen(change->data.msg.prefix) + 1;
3869 :
3870 152 : sz += prefix_size + change->data.msg.message_size +
3871 : sizeof(Size) + sizeof(Size);
3872 :
3873 152 : break;
3874 : }
3875 : case REORDER_BUFFER_CHANGE_INVALIDATION:
3876 : {
3877 11862 : sz += sizeof(SharedInvalidationMessage) *
3878 11862 : change->data.inval.ninvalidations;
3879 11862 : break;
3880 : }
3881 : case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT:
3882 : {
3883 : Snapshot snap;
3884 :
3885 2442 : snap = change->data.snapshot;
3886 :
3887 2442 : sz += sizeof(SnapshotData) +
3888 4884 : sizeof(TransactionId) * snap->xcnt +
3889 2442 : sizeof(TransactionId) * snap->subxcnt;
3890 :
3891 2442 : break;
3892 : }
3893 : case REORDER_BUFFER_CHANGE_TRUNCATE:
3894 : {
3895 84 : sz += sizeof(Oid) * change->data.truncate.nrelids;
3896 :
3897 84 : break;
3898 : }
3899 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM:
3900 : case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID:
3901 : case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID:
3902 : /* ReorderBufferChange contains everything important */
3903 141952 : break;
3904 : }
3905 :
3906 6621330 : return sz;
3907 : }
3908 :
3909 :
3910 : /*
3911 : * Restore a number of changes spilled to disk back into memory.
3912 : */
3913 : static Size
3914 182 : ReorderBufferRestoreChanges(ReorderBuffer *rb, ReorderBufferTXN *txn,
3915 : TXNEntryFile *file, XLogSegNo *segno)
3916 : {
3917 182 : Size restored = 0;
3918 : XLogSegNo last_segno;
3919 : dlist_mutable_iter cleanup_iter;
3920 182 : File *fd = &file->vfd;
3921 :
3922 182 : Assert(txn->first_lsn != InvalidXLogRecPtr);
3923 182 : Assert(txn->final_lsn != InvalidXLogRecPtr);
3924 :
3925 : /* free current entries, so we have memory for more */
3926 310178 : dlist_foreach_modify(cleanup_iter, &txn->changes)
3927 : {
3928 309996 : ReorderBufferChange *cleanup =
3929 309996 : dlist_container(ReorderBufferChange, node, cleanup_iter.cur);
3930 :
3931 309996 : dlist_delete(&cleanup->node);
3932 309996 : ReorderBufferReturnChange(rb, cleanup, true);
3933 : }
3934 182 : txn->nentries_mem = 0;
3935 182 : Assert(dlist_is_empty(&txn->changes));
3936 :
3937 182 : XLByteToSeg(txn->final_lsn, last_segno, wal_segment_size);
3938 :
3939 317340 : while (restored < max_changes_in_memory && *segno <= last_segno)
3940 : {
3941 : int readBytes;
3942 : ReorderBufferDiskChange *ondisk;
3943 :
3944 316976 : if (*fd == -1)
3945 : {
3946 : char path[MAXPGPATH];
3947 :
3948 : /* first time in */
3949 66 : if (*segno == 0)
3950 66 : XLByteToSeg(txn->first_lsn, *segno, wal_segment_size);
3951 :
3952 66 : Assert(*segno != 0 || dlist_is_empty(&txn->changes));
3953 :
3954 : /*
3955 : * No need to care about TLIs here, only used during a single run,
3956 : * so each LSN only maps to a specific WAL record.
3957 : */
3958 66 : ReorderBufferSerializedPath(path, MyReplicationSlot, txn->xid,
3959 : *segno);
3960 :
3961 66 : *fd = PathNameOpenFile(path, O_RDONLY | PG_BINARY);
3962 :
3963 : /* No harm in resetting the offset even in case of failure */
3964 66 : file->curOffset = 0;
3965 :
3966 66 : if (*fd < 0 && errno == ENOENT)
3967 : {
3968 0 : *fd = -1;
3969 0 : (*segno)++;
3970 0 : continue;
3971 : }
3972 66 : else if (*fd < 0)
3973 0 : ereport(ERROR,
3974 : (errcode_for_file_access(),
3975 : errmsg("could not open file \"%s\": %m",
3976 : path)));
3977 : }
3978 :
3979 : /*
3980 : * Read the statically sized part of a change which has information
3981 : * about the total size. If we couldn't read a record, we're at the
3982 : * end of this file.
3983 : */
3984 316976 : ReorderBufferSerializeReserve(rb, sizeof(ReorderBufferDiskChange));
3985 316976 : readBytes = FileRead(file->vfd, rb->outbuf,
3986 : sizeof(ReorderBufferDiskChange),
3987 : file->curOffset, WAIT_EVENT_REORDER_BUFFER_READ);
3988 :
3989 : /* eof */
3990 316976 : if (readBytes == 0)
3991 : {
3992 66 : FileClose(*fd);
3993 66 : *fd = -1;
3994 66 : (*segno)++;
3995 66 : continue;
3996 : }
3997 316910 : else if (readBytes < 0)
3998 0 : ereport(ERROR,
3999 : (errcode_for_file_access(),
4000 : errmsg("could not read from reorderbuffer spill file: %m")));
4001 316910 : else if (readBytes != sizeof(ReorderBufferDiskChange))
4002 0 : ereport(ERROR,
4003 : (errcode_for_file_access(),
4004 : errmsg("could not read from reorderbuffer spill file: read %d instead of %u bytes",
4005 : readBytes,
4006 : (uint32) sizeof(ReorderBufferDiskChange))));
4007 :
4008 316910 : file->curOffset += readBytes;
4009 :
4010 316910 : ondisk = (ReorderBufferDiskChange *) rb->outbuf;
4011 :
4012 316910 : ReorderBufferSerializeReserve(rb,
4013 316910 : sizeof(ReorderBufferDiskChange) + ondisk->size);
4014 316910 : ondisk = (ReorderBufferDiskChange *) rb->outbuf;
4015 :
4016 950730 : readBytes = FileRead(file->vfd,
4017 316910 : rb->outbuf + sizeof(ReorderBufferDiskChange),
4018 316910 : ondisk->size - sizeof(ReorderBufferDiskChange),
4019 : file->curOffset,
4020 : WAIT_EVENT_REORDER_BUFFER_READ);
4021 :
4022 316910 : if (readBytes < 0)
4023 0 : ereport(ERROR,
4024 : (errcode_for_file_access(),
4025 : errmsg("could not read from reorderbuffer spill file: %m")));
4026 316910 : else if (readBytes != ondisk->size - sizeof(ReorderBufferDiskChange))
4027 0 : ereport(ERROR,
4028 : (errcode_for_file_access(),
4029 : errmsg("could not read from reorderbuffer spill file: read %d instead of %u bytes",
4030 : readBytes,
4031 : (uint32) (ondisk->size - sizeof(ReorderBufferDiskChange)))));
4032 :
4033 316910 : file->curOffset += readBytes;
4034 :
4035 : /*
4036 : * ok, read a full change from disk, now restore it into proper
4037 : * in-memory format
4038 : */
4039 316910 : ReorderBufferRestoreChange(rb, txn, rb->outbuf);
4040 316910 : restored++;
4041 : }
4042 :
4043 182 : return restored;
4044 : }
4045 :
4046 : /*
4047 : * Convert change from its on-disk format to in-memory format and queue it onto
4048 : * the TXN's ->changes list.
4049 : *
4050 : * Note: although "data" is declared char*, at entry it points to a
4051 : * maxalign'd buffer, making it safe in most of this function to assume
4052 : * that the pointed-to data is suitably aligned for direct access.
4053 : */
4054 : static void
4055 316910 : ReorderBufferRestoreChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
4056 : char *data)
4057 : {
4058 : ReorderBufferDiskChange *ondisk;
4059 : ReorderBufferChange *change;
4060 :
4061 316910 : ondisk = (ReorderBufferDiskChange *) data;
4062 :
4063 316910 : change = ReorderBufferGetChange(rb);
4064 :
4065 : /* copy static part */
4066 316910 : memcpy(change, &ondisk->change, sizeof(ReorderBufferChange));
4067 :
4068 316910 : data += sizeof(ReorderBufferDiskChange);
4069 :
4070 : /* restore individual stuff */
4071 316910 : switch (change->action)
4072 : {
4073 : /* fall through these, they're all similar enough */
4074 : case REORDER_BUFFER_CHANGE_INSERT:
4075 : case REORDER_BUFFER_CHANGE_UPDATE:
4076 : case REORDER_BUFFER_CHANGE_DELETE:
4077 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT:
4078 313124 : if (change->data.tp.oldtuple)
4079 : {
4080 10012 : uint32 tuplelen = ((HeapTuple) data)->t_len;
4081 :
4082 10012 : change->data.tp.oldtuple =
4083 10012 : ReorderBufferGetTupleBuf(rb, tuplelen - SizeofHeapTupleHeader);
4084 :
4085 : /* restore ->tuple */
4086 10012 : memcpy(&change->data.tp.oldtuple->tuple, data,
4087 : sizeof(HeapTupleData));
4088 10012 : data += sizeof(HeapTupleData);
4089 :
4090 : /* reset t_data pointer into the new tuplebuf */
4091 20024 : change->data.tp.oldtuple->tuple.t_data =
4092 10012 : ReorderBufferTupleBufData(change->data.tp.oldtuple);
4093 :
4094 : /* restore tuple data itself */
4095 10012 : memcpy(change->data.tp.oldtuple->tuple.t_data, data, tuplelen);
4096 10012 : data += tuplelen;
4097 : }
4098 :
4099 313124 : if (change->data.tp.newtuple)
4100 : {
4101 : /* here, data might not be suitably aligned! */
4102 : uint32 tuplelen;
4103 :
4104 292684 : memcpy(&tuplelen, data + offsetof(HeapTupleData, t_len),
4105 : sizeof(uint32));
4106 :
4107 292684 : change->data.tp.newtuple =
4108 292684 : ReorderBufferGetTupleBuf(rb, tuplelen - SizeofHeapTupleHeader);
4109 :
4110 : /* restore ->tuple */
4111 292684 : memcpy(&change->data.tp.newtuple->tuple, data,
4112 : sizeof(HeapTupleData));
4113 292684 : data += sizeof(HeapTupleData);
4114 :
4115 : /* reset t_data pointer into the new tuplebuf */
4116 585368 : change->data.tp.newtuple->tuple.t_data =
4117 292684 : ReorderBufferTupleBufData(change->data.tp.newtuple);
4118 :
4119 : /* restore tuple data itself */
4120 292684 : memcpy(change->data.tp.newtuple->tuple.t_data, data, tuplelen);
4121 292684 : data += tuplelen;
4122 : }
4123 :
4124 313124 : break;
4125 : case REORDER_BUFFER_CHANGE_MESSAGE:
4126 : {
4127 : Size prefix_size;
4128 :
4129 : /* read prefix */
4130 2 : memcpy(&prefix_size, data, sizeof(Size));
4131 2 : data += sizeof(Size);
4132 2 : change->data.msg.prefix = MemoryContextAlloc(rb->context,
4133 : prefix_size);
4134 2 : memcpy(change->data.msg.prefix, data, prefix_size);
4135 2 : Assert(change->data.msg.prefix[prefix_size - 1] == '\0');
4136 2 : data += prefix_size;
4137 :
4138 : /* read the message */
4139 2 : memcpy(&change->data.msg.message_size, data, sizeof(Size));
4140 2 : data += sizeof(Size);
4141 2 : change->data.msg.message = MemoryContextAlloc(rb->context,
4142 : change->data.msg.message_size);
4143 2 : memcpy(change->data.msg.message, data,
4144 : change->data.msg.message_size);
4145 2 : data += change->data.msg.message_size;
4146 :
4147 2 : break;
4148 : }
4149 : case REORDER_BUFFER_CHANGE_INVALIDATION:
4150 : {
4151 36 : Size inval_size = sizeof(SharedInvalidationMessage) *
4152 36 : change->data.inval.ninvalidations;
4153 :
4154 36 : change->data.inval.invalidations =
4155 36 : MemoryContextAlloc(rb->context, inval_size);
4156 :
4157 : /* read the message */
4158 36 : memcpy(change->data.inval.invalidations, data, inval_size);
4159 :
4160 36 : break;
4161 : }
4162 : case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT:
4163 : {
4164 : Snapshot oldsnap;
4165 : Snapshot newsnap;
4166 : Size size;
4167 :
4168 4 : oldsnap = (Snapshot) data;
4169 :
4170 4 : size = sizeof(SnapshotData) +
4171 8 : sizeof(TransactionId) * oldsnap->xcnt +
4172 4 : sizeof(TransactionId) * (oldsnap->subxcnt + 0);
4173 :
4174 4 : change->data.snapshot = MemoryContextAllocZero(rb->context, size);
4175 :
4176 4 : newsnap = change->data.snapshot;
4177 :
4178 4 : memcpy(newsnap, data, size);
4179 4 : newsnap->xip = (TransactionId *)
4180 : (((char *) newsnap) + sizeof(SnapshotData));
4181 4 : newsnap->subxip = newsnap->xip + newsnap->xcnt;
4182 4 : newsnap->copied = true;
4183 4 : break;
4184 : }
4185 : /* the base struct contains all the data, easy peasy */
4186 : case REORDER_BUFFER_CHANGE_TRUNCATE:
4187 : {
4188 : Oid *relids;
4189 :
4190 0 : relids = ReorderBufferGetRelids(rb,
4191 0 : change->data.truncate.nrelids);
4192 0 : memcpy(relids, data, change->data.truncate.nrelids * sizeof(Oid));
4193 0 : change->data.truncate.relids = relids;
4194 :
4195 0 : break;
4196 : }
4197 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM:
4198 : case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID:
4199 : case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID:
4200 3744 : break;
4201 : }
4202 :
4203 316910 : dlist_push_tail(&txn->changes, &change->node);
4204 316910 : txn->nentries_mem++;
4205 :
4206 : /*
4207 : * Update memory accounting for the restored change. We need to do this
4208 : * although we don't check the memory limit when restoring the changes in
4209 : * this branch (we only do that when initially queueing the changes after
4210 : * decoding), because we will release the changes later, and that will
4211 : * update the accounting too (subtracting the size from the counters). And
4212 : * we don't want to underflow there.
4213 : */
4214 316910 : ReorderBufferChangeMemoryUpdate(rb, change, true);
4215 316910 : }
4216 :
4217 : /*
4218 : * Remove all on-disk stored for the passed in transaction.
4219 : */
4220 : static void
4221 444 : ReorderBufferRestoreCleanup(ReorderBuffer *rb, ReorderBufferTXN *txn)
4222 : {
4223 : XLogSegNo first;
4224 : XLogSegNo cur;
4225 : XLogSegNo last;
4226 :
4227 444 : Assert(txn->first_lsn != InvalidXLogRecPtr);
4228 444 : Assert(txn->final_lsn != InvalidXLogRecPtr);
4229 :
4230 444 : XLByteToSeg(txn->first_lsn, first, wal_segment_size);
4231 444 : XLByteToSeg(txn->final_lsn, last, wal_segment_size);
4232 :
4233 : /* iterate over all possible filenames, and delete them */
4234 888 : for (cur = first; cur <= last; cur++)
4235 : {
4236 : char path[MAXPGPATH];
4237 :
4238 444 : ReorderBufferSerializedPath(path, MyReplicationSlot, txn->xid, cur);
4239 444 : if (unlink(path) != 0 && errno != ENOENT)
4240 0 : ereport(ERROR,
4241 : (errcode_for_file_access(),
4242 : errmsg("could not remove file \"%s\": %m", path)));
4243 : }
4244 444 : }
4245 :
4246 : /*
4247 : * Remove any leftover serialized reorder buffers from a slot directory after a
4248 : * prior crash or decoding session exit.
4249 : */
4250 : static void
4251 1604 : ReorderBufferCleanupSerializedTXNs(const char *slotname)
4252 : {
4253 : DIR *spill_dir;
4254 : struct dirent *spill_de;
4255 : struct stat statbuf;
4256 : char path[MAXPGPATH * 2 + 12];
4257 :
4258 1604 : sprintf(path, "pg_replslot/%s", slotname);
4259 :
4260 : /* we're only handling directories here, skip if it's not ours */
4261 1604 : if (lstat(path, &statbuf) == 0 && !S_ISDIR(statbuf.st_mode))
4262 1604 : return;
4263 :
4264 1604 : spill_dir = AllocateDir(path);
4265 1604 : while ((spill_de = ReadDirExtended(spill_dir, path, INFO)) != NULL)
4266 : {
4267 : /* only look at names that can be ours */
4268 4812 : if (strncmp(spill_de->d_name, "xid", 3) == 0)
4269 : {
4270 0 : snprintf(path, sizeof(path),
4271 : "pg_replslot/%s/%s", slotname,
4272 0 : spill_de->d_name);
4273 :
4274 0 : if (unlink(path) != 0)
4275 0 : ereport(ERROR,
4276 : (errcode_for_file_access(),
4277 : errmsg("could not remove file \"%s\" during removal of pg_replslot/%s/xid*: %m",
4278 : path, slotname)));
4279 : }
4280 : }
4281 1604 : FreeDir(spill_dir);
4282 : }
4283 :
4284 : /*
4285 : * Given a replication slot, transaction ID and segment number, fill in the
4286 : * corresponding spill file into 'path', which is a caller-owned buffer of size
4287 : * at least MAXPGPATH.
4288 : */
4289 : static void
4290 6542 : ReorderBufferSerializedPath(char *path, ReplicationSlot *slot, TransactionId xid,
4291 : XLogSegNo segno)
4292 : {
4293 : XLogRecPtr recptr;
4294 :
4295 6542 : XLogSegNoOffsetToRecPtr(segno, 0, wal_segment_size, recptr);
4296 :
4297 19626 : snprintf(path, MAXPGPATH, "pg_replslot/%s/xid-%u-lsn-%X-%X.spill",
4298 6542 : NameStr(MyReplicationSlot->data.name),
4299 : xid,
4300 6542 : (uint32) (recptr >> 32), (uint32) recptr);
4301 6542 : }
4302 :
4303 : /*
4304 : * Delete all data spilled to disk after we've restarted/crashed. It will be
4305 : * recreated when the respective slots are reused.
4306 : */
4307 : void
4308 322 : StartupReorderBuffer(void)
4309 : {
4310 : DIR *logical_dir;
4311 : struct dirent *logical_de;
4312 :
4313 322 : logical_dir = AllocateDir("pg_replslot");
4314 1304 : while ((logical_de = ReadDir(logical_dir, "pg_replslot")) != NULL)
4315 : {
4316 998 : if (strcmp(logical_de->d_name, ".") == 0 ||
4317 338 : strcmp(logical_de->d_name, "..") == 0)
4318 644 : continue;
4319 :
4320 : /* if it cannot be a slot, skip the directory */
4321 16 : if (!ReplicationSlotValidateName(logical_de->d_name, DEBUG2))
4322 0 : continue;
4323 :
4324 : /*
4325 : * ok, has to be a surviving logical slot, iterate and delete
4326 : * everything starting with xid-*
4327 : */
4328 16 : ReorderBufferCleanupSerializedTXNs(logical_de->d_name);
4329 : }
4330 322 : FreeDir(logical_dir);
4331 322 : }
4332 :
4333 : /* ---------------------------------------
4334 : * toast reassembly support
4335 : * ---------------------------------------
4336 : */
4337 :
4338 : /*
4339 : * Initialize per tuple toast reconstruction support.
4340 : */
4341 : static void
4342 60 : ReorderBufferToastInitHash(ReorderBuffer *rb, ReorderBufferTXN *txn)
4343 : {
4344 : HASHCTL hash_ctl;
4345 :
4346 60 : Assert(txn->toast_hash == NULL);
4347 :
4348 60 : memset(&hash_ctl, 0, sizeof(hash_ctl));
4349 60 : hash_ctl.keysize = sizeof(Oid);
4350 60 : hash_ctl.entrysize = sizeof(ReorderBufferToastEnt);
4351 60 : hash_ctl.hcxt = rb->context;
4352 60 : txn->toast_hash = hash_create("ReorderBufferToastHash", 5, &hash_ctl,
4353 : HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
4354 60 : }
4355 :
4356 : /*
4357 : * Per toast-chunk handling for toast reconstruction
4358 : *
4359 : * Appends a toast chunk so we can reconstruct it when the tuple "owning" the
4360 : * toasted Datum comes along.
4361 : */
4362 : static void
4363 3338 : ReorderBufferToastAppendChunk(ReorderBuffer *rb, ReorderBufferTXN *txn,
4364 : Relation relation, ReorderBufferChange *change)
4365 : {
4366 : ReorderBufferToastEnt *ent;
4367 : ReorderBufferTupleBuf *newtup;
4368 : bool found;
4369 : int32 chunksize;
4370 : bool isnull;
4371 : Pointer chunk;
4372 3338 : TupleDesc desc = RelationGetDescr(relation);
4373 : Oid chunk_id;
4374 : int32 chunk_seq;
4375 :
4376 3338 : if (txn->toast_hash == NULL)
4377 60 : ReorderBufferToastInitHash(rb, txn);
4378 :
4379 3338 : Assert(IsToastRelation(relation));
4380 :
4381 3338 : newtup = change->data.tp.newtuple;
4382 3338 : chunk_id = DatumGetObjectId(fastgetattr(&newtup->tuple, 1, desc, &isnull));
4383 3338 : Assert(!isnull);
4384 3338 : chunk_seq = DatumGetInt32(fastgetattr(&newtup->tuple, 2, desc, &isnull));
4385 3338 : Assert(!isnull);
4386 :
4387 3338 : ent = (ReorderBufferToastEnt *)
4388 3338 : hash_search(txn->toast_hash,
4389 : (void *) &chunk_id,
4390 : HASH_ENTER,
4391 : &found);
4392 :
4393 3338 : if (!found)
4394 : {
4395 68 : Assert(ent->chunk_id == chunk_id);
4396 68 : ent->num_chunks = 0;
4397 68 : ent->last_chunk_seq = 0;
4398 68 : ent->size = 0;
4399 68 : ent->reconstructed = NULL;
4400 68 : dlist_init(&ent->chunks);
4401 :
4402 68 : if (chunk_seq != 0)
4403 0 : elog(ERROR, "got sequence entry %d for toast chunk %u instead of seq 0",
4404 : chunk_seq, chunk_id);
4405 : }
4406 3270 : else if (found && chunk_seq != ent->last_chunk_seq + 1)
4407 0 : elog(ERROR, "got sequence entry %d for toast chunk %u instead of seq %d",
4408 : chunk_seq, chunk_id, ent->last_chunk_seq + 1);
4409 :
4410 3338 : chunk = DatumGetPointer(fastgetattr(&newtup->tuple, 3, desc, &isnull));
4411 3338 : Assert(!isnull);
4412 :
4413 : /* calculate size so we can allocate the right size at once later */
4414 3338 : if (!VARATT_IS_EXTENDED(chunk))
4415 3338 : chunksize = VARSIZE(chunk) - VARHDRSZ;
4416 0 : else if (VARATT_IS_SHORT(chunk))
4417 : /* could happen due to heap_form_tuple doing its thing */
4418 0 : chunksize = VARSIZE_SHORT(chunk) - VARHDRSZ_SHORT;
4419 : else
4420 0 : elog(ERROR, "unexpected type of toast chunk");
4421 :
4422 3338 : ent->size += chunksize;
4423 3338 : ent->last_chunk_seq = chunk_seq;
4424 3338 : ent->num_chunks++;
4425 3338 : dlist_push_tail(&ent->chunks, &change->node);
4426 3338 : }
4427 :
4428 : /*
4429 : * Rejigger change->newtuple to point to in-memory toast tuples instead of
4430 : * on-disk toast tuples that may no longer exist (think DROP TABLE or VACUUM).
4431 : *
4432 : * We cannot replace unchanged toast tuples though, so those will still point
4433 : * to on-disk toast data.
4434 : *
4435 : * While updating the existing change with detoasted tuple data, we need to
4436 : * update the memory accounting info, because the change size will differ.
4437 : * Otherwise the accounting may get out of sync, triggering serialization
4438 : * at unexpected times.
4439 : *
4440 : * We simply subtract size of the change before rejiggering the tuple, and
4441 : * then adding the new size. This makes it look like the change was removed
4442 : * and then added back, except it only tweaks the accounting info.
4443 : *
4444 : * In particular it can't trigger serialization, which would be pointless
4445 : * anyway as it happens during commit processing right before handing
4446 : * the change to the output plugin.
4447 : */
4448 : static void
4449 637388 : ReorderBufferToastReplace(ReorderBuffer *rb, ReorderBufferTXN *txn,
4450 : Relation relation, ReorderBufferChange *change)
4451 : {
4452 : TupleDesc desc;
4453 : int natt;
4454 : Datum *attrs;
4455 : bool *isnull;
4456 : bool *free;
4457 : HeapTuple tmphtup;
4458 : Relation toast_rel;
4459 : TupleDesc toast_desc;
4460 : MemoryContext oldcontext;
4461 : ReorderBufferTupleBuf *newtup;
4462 :
4463 : /* no toast tuples changed */
4464 637388 : if (txn->toast_hash == NULL)
4465 1274312 : return;
4466 :
4467 : /*
4468 : * We're going to modify the size of the change, so to make sure the
4469 : * accounting is correct we'll make it look like we're removing the change
4470 : * now (with the old size), and then re-add it at the end.
4471 : */
4472 464 : ReorderBufferChangeMemoryUpdate(rb, change, false);
4473 :
4474 464 : oldcontext = MemoryContextSwitchTo(rb->context);
4475 :
4476 : /* we should only have toast tuples in an INSERT or UPDATE */
4477 464 : Assert(change->data.tp.newtuple);
4478 :
4479 464 : desc = RelationGetDescr(relation);
4480 :
4481 464 : toast_rel = RelationIdGetRelation(relation->rd_rel->reltoastrelid);
4482 464 : if (!RelationIsValid(toast_rel))
4483 0 : elog(ERROR, "could not open relation with OID %u",
4484 : relation->rd_rel->reltoastrelid);
4485 :
4486 464 : toast_desc = RelationGetDescr(toast_rel);
4487 :
4488 : /* should we allocate from stack instead? */
4489 464 : attrs = palloc0(sizeof(Datum) * desc->natts);
4490 464 : isnull = palloc0(sizeof(bool) * desc->natts);
4491 464 : free = palloc0(sizeof(bool) * desc->natts);
4492 :
4493 464 : newtup = change->data.tp.newtuple;
4494 :
4495 464 : heap_deform_tuple(&newtup->tuple, desc, attrs, isnull);
4496 :
4497 1456 : for (natt = 0; natt < desc->natts; natt++)
4498 : {
4499 992 : Form_pg_attribute attr = TupleDescAttr(desc, natt);
4500 : ReorderBufferToastEnt *ent;
4501 : struct varlena *varlena;
4502 :
4503 : /* va_rawsize is the size of the original datum -- including header */
4504 : struct varatt_external toast_pointer;
4505 : struct varatt_indirect redirect_pointer;
4506 992 : struct varlena *new_datum = NULL;
4507 : struct varlena *reconstructed;
4508 : dlist_iter it;
4509 992 : Size data_done = 0;
4510 :
4511 : /* system columns aren't toasted */
4512 992 : if (attr->attnum < 0)
4513 924 : continue;
4514 :
4515 992 : if (attr->attisdropped)
4516 0 : continue;
4517 :
4518 : /* not a varlena datatype */
4519 992 : if (attr->attlen != -1)
4520 480 : continue;
4521 :
4522 : /* no data */
4523 512 : if (isnull[natt])
4524 24 : continue;
4525 :
4526 : /* ok, we know we have a toast datum */
4527 488 : varlena = (struct varlena *) DatumGetPointer(attrs[natt]);
4528 :
4529 : /* no need to do anything if the tuple isn't external */
4530 488 : if (!VARATT_IS_EXTERNAL(varlena))
4531 404 : continue;
4532 :
4533 84 : VARATT_EXTERNAL_GET_POINTER(toast_pointer, varlena);
4534 :
4535 : /*
4536 : * Check whether the toast tuple changed, replace if so.
4537 : */
4538 84 : ent = (ReorderBufferToastEnt *)
4539 84 : hash_search(txn->toast_hash,
4540 : (void *) &toast_pointer.va_valueid,
4541 : HASH_FIND,
4542 : NULL);
4543 84 : if (ent == NULL)
4544 16 : continue;
4545 :
4546 68 : new_datum =
4547 : (struct varlena *) palloc0(INDIRECT_POINTER_SIZE);
4548 :
4549 68 : free[natt] = true;
4550 :
4551 68 : reconstructed = palloc0(toast_pointer.va_rawsize);
4552 :
4553 68 : ent->reconstructed = reconstructed;
4554 :
4555 : /* stitch toast tuple back together from its parts */
4556 3406 : dlist_foreach(it, &ent->chunks)
4557 : {
4558 : bool isnull;
4559 : ReorderBufferChange *cchange;
4560 : ReorderBufferTupleBuf *ctup;
4561 : Pointer chunk;
4562 :
4563 3338 : cchange = dlist_container(ReorderBufferChange, node, it.cur);
4564 3338 : ctup = cchange->data.tp.newtuple;
4565 3338 : chunk = DatumGetPointer(fastgetattr(&ctup->tuple, 3, toast_desc, &isnull));
4566 :
4567 3338 : Assert(!isnull);
4568 3338 : Assert(!VARATT_IS_EXTERNAL(chunk));
4569 3338 : Assert(!VARATT_IS_SHORT(chunk));
4570 :
4571 6676 : memcpy(VARDATA(reconstructed) + data_done,
4572 3338 : VARDATA(chunk),
4573 3338 : VARSIZE(chunk) - VARHDRSZ);
4574 3338 : data_done += VARSIZE(chunk) - VARHDRSZ;
4575 : }
4576 68 : Assert(data_done == toast_pointer.va_extsize);
4577 :
4578 : /* make sure its marked as compressed or not */
4579 68 : if (VARATT_EXTERNAL_IS_COMPRESSED(toast_pointer))
4580 10 : SET_VARSIZE_COMPRESSED(reconstructed, data_done + VARHDRSZ);
4581 : else
4582 58 : SET_VARSIZE(reconstructed, data_done + VARHDRSZ);
4583 :
4584 68 : memset(&redirect_pointer, 0, sizeof(redirect_pointer));
4585 68 : redirect_pointer.pointer = reconstructed;
4586 :
4587 68 : SET_VARTAG_EXTERNAL(new_datum, VARTAG_INDIRECT);
4588 68 : memcpy(VARDATA_EXTERNAL(new_datum), &redirect_pointer,
4589 : sizeof(redirect_pointer));
4590 :
4591 68 : attrs[natt] = PointerGetDatum(new_datum);
4592 : }
4593 :
4594 : /*
4595 : * Build tuple in separate memory & copy tuple back into the tuplebuf
4596 : * passed to the output plugin. We can't directly heap_fill_tuple() into
4597 : * the tuplebuf because attrs[] will point back into the current content.
4598 : */
4599 464 : tmphtup = heap_form_tuple(desc, attrs, isnull);
4600 464 : Assert(newtup->tuple.t_len <= MaxHeapTupleSize);
4601 464 : Assert(ReorderBufferTupleBufData(newtup) == newtup->tuple.t_data);
4602 :
4603 464 : memcpy(newtup->tuple.t_data, tmphtup->t_data, tmphtup->t_len);
4604 464 : newtup->tuple.t_len = tmphtup->t_len;
4605 :
4606 : /*
4607 : * free resources we won't further need, more persistent stuff will be
4608 : * free'd in ReorderBufferToastReset().
4609 : */
4610 464 : RelationClose(toast_rel);
4611 464 : pfree(tmphtup);
4612 1456 : for (natt = 0; natt < desc->natts; natt++)
4613 : {
4614 992 : if (free[natt])
4615 68 : pfree(DatumGetPointer(attrs[natt]));
4616 : }
4617 464 : pfree(attrs);
4618 464 : pfree(free);
4619 464 : pfree(isnull);
4620 :
4621 464 : MemoryContextSwitchTo(oldcontext);
4622 :
4623 : /* now add the change back, with the correct size */
4624 464 : ReorderBufferChangeMemoryUpdate(rb, change, true);
4625 : }
4626 :
4627 : /*
4628 : * Free all resources allocated for toast reconstruction.
4629 : */
4630 : static void
4631 636994 : ReorderBufferToastReset(ReorderBuffer *rb, ReorderBufferTXN *txn)
4632 : {
4633 : HASH_SEQ_STATUS hstat;
4634 : ReorderBufferToastEnt *ent;
4635 :
4636 636994 : if (txn->toast_hash == NULL)
4637 1273928 : return;
4638 :
4639 : /* sequentially walk over the hash and free everything */
4640 60 : hash_seq_init(&hstat, txn->toast_hash);
4641 188 : while ((ent = (ReorderBufferToastEnt *) hash_seq_search(&hstat)) != NULL)
4642 : {
4643 : dlist_mutable_iter it;
4644 :
4645 68 : if (ent->reconstructed != NULL)
4646 68 : pfree(ent->reconstructed);
4647 :
4648 3406 : dlist_foreach_modify(it, &ent->chunks)
4649 : {
4650 3338 : ReorderBufferChange *change =
4651 3338 : dlist_container(ReorderBufferChange, node, it.cur);
4652 :
4653 3338 : dlist_delete(&change->node);
4654 3338 : ReorderBufferReturnChange(rb, change, true);
4655 : }
4656 : }
4657 :
4658 60 : hash_destroy(txn->toast_hash);
4659 60 : txn->toast_hash = NULL;
4660 : }
4661 :
4662 :
4663 : /* ---------------------------------------
4664 : * Visibility support for logical decoding
4665 : *
4666 : *
4667 : * Lookup actual cmin/cmax values when using decoding snapshot. We can't
4668 : * always rely on stored cmin/cmax values because of two scenarios:
4669 : *
4670 : * * A tuple got changed multiple times during a single transaction and thus
4671 : * has got a combocid. Combocids are only valid for the duration of a
4672 : * single transaction.
4673 : * * A tuple with a cmin but no cmax (and thus no combocid) got
4674 : * deleted/updated in another transaction than the one which created it
4675 : * which we are looking at right now. As only one of cmin, cmax or combocid
4676 : * is actually stored in the heap we don't have access to the value we
4677 : * need anymore.
4678 : *
4679 : * To resolve those problems we have a per-transaction hash of (cmin,
4680 : * cmax) tuples keyed by (relfilenode, ctid) which contains the actual
4681 : * (cmin, cmax) values. That also takes care of combocids by simply
4682 : * not caring about them at all. As we have the real cmin/cmax values
4683 : * combocids aren't interesting.
4684 : *
4685 : * As we only care about catalog tuples here the overhead of this
4686 : * hashtable should be acceptable.
4687 : *
4688 : * Heap rewrites complicate this a bit, check rewriteheap.c for
4689 : * details.
4690 : * -------------------------------------------------------------------------
4691 : */
4692 :
4693 : /* struct for sorting mapping files by LSN efficiently: one entry per mapping file queued for application */
4694 : typedef struct RewriteMappingFile
4695 : {
4696 : XLogRecPtr lsn; /* LSN parsed from the mapping file's name; the sort key */
4697 : char fname[MAXPGPATH]; /* file name as found in pg_logical/mappings */
4698 : } RewriteMappingFile;
4699 :
#ifdef NOT_USED
/*
 * Debugging aid: emit one DEBUG3 log line for every entry currently stored
 * in tuplecid_data, showing its key (relfilenode and tid) together with the
 * recorded cmin and cmax.
 */
static void
DisplayMapping(HTAB *tuplecid_data)
{
	HASH_SEQ_STATUS scan;
	ReorderBufferTupleCidEnt *entry;

	hash_seq_init(&scan, tuplecid_data);
	for (;;)
	{
		entry = (ReorderBufferTupleCidEnt *) hash_seq_search(&scan);
		if (entry == NULL)
			break;

		elog(DEBUG3, "mapping: node: %u/%u/%u tid: %u/%u cmin: %u, cmax: %u",
			 entry->key.relnode.dbNode,
			 entry->key.relnode.spcNode,
			 entry->key.relnode.relNode,
			 ItemPointerGetBlockNumber(&entry->key.tid),
			 ItemPointerGetOffsetNumber(&entry->key.tid),
			 entry->cmin,
			 entry->cmax);
	}
}
#endif
4722 :
4723 : /*
4724 : * Apply a single mapping file to tuplecid_data.
4725 : *
4726 : * The mapping file has to have been verified to be a) committed b) for our
4727 : * transaction c) applied in LSN order.
4728 : */
4729 : static void
4730 44 : ApplyLogicalMappingFile(HTAB *tuplecid_data, Oid relid, const char *fname)
4731 : {
4732 : char path[MAXPGPATH];
4733 : int fd;
4734 : int readBytes;
4735 : LogicalRewriteMappingData map;
4736 :
4737 44 : sprintf(path, "pg_logical/mappings/%s", fname);
4738 44 : fd = OpenTransientFile(path, O_RDONLY | PG_BINARY);
4739 44 : if (fd < 0)
4740 0 : ereport(ERROR,
4741 : (errcode_for_file_access(),
4742 : errmsg("could not open file \"%s\": %m", path)));
4743 :
4744 : while (true)
4745 : {
4746 : ReorderBufferTupleCidKey key;
4747 : ReorderBufferTupleCidEnt *ent;
4748 : ReorderBufferTupleCidEnt *new_ent;
4749 : bool found;
4750 :
4751 : /* be careful about padding */
4752 282 : memset(&key, 0, sizeof(ReorderBufferTupleCidKey));
4753 :
4754 : /* read all mappings till the end of the file */
4755 282 : pgstat_report_wait_start(WAIT_EVENT_REORDER_LOGICAL_MAPPING_READ);
4756 282 : readBytes = read(fd, &map, sizeof(LogicalRewriteMappingData));
4757 282 : pgstat_report_wait_end();
4758 :
4759 282 : if (readBytes < 0)
4760 0 : ereport(ERROR,
4761 : (errcode_for_file_access(),
4762 : errmsg("could not read file \"%s\": %m",
4763 : path)));
4764 282 : else if (readBytes == 0) /* EOF */
4765 44 : break;
4766 238 : else if (readBytes != sizeof(LogicalRewriteMappingData))
4767 0 : ereport(ERROR,
4768 : (errcode_for_file_access(),
4769 : errmsg("could not read from file \"%s\": read %d instead of %d bytes",
4770 : path, readBytes,
4771 : (int32) sizeof(LogicalRewriteMappingData))));
4772 :
4773 238 : key.relnode = map.old_node;
4774 238 : ItemPointerCopy(&map.old_tid,
4775 : &key.tid);
4776 :
4777 :
4778 238 : ent = (ReorderBufferTupleCidEnt *)
4779 : hash_search(tuplecid_data,
4780 : (void *) &key,
4781 : HASH_FIND,
4782 : NULL);
4783 :
4784 : /* no existing mapping, no need to update */
4785 238 : if (!ent)
4786 0 : continue;
4787 :
4788 238 : key.relnode = map.new_node;
4789 238 : ItemPointerCopy(&map.new_tid,
4790 : &key.tid);
4791 :
4792 238 : new_ent = (ReorderBufferTupleCidEnt *)
4793 : hash_search(tuplecid_data,
4794 : (void *) &key,
4795 : HASH_ENTER,
4796 : &found);
4797 :
4798 238 : if (found)
4799 : {
4800 : /*
4801 : * Make sure the existing mapping makes sense. We sometime update
4802 : * old records that did not yet have a cmax (e.g. pg_class' own
4803 : * entry while rewriting it) during rewrites, so allow that.
4804 : */
4805 12 : Assert(ent->cmin == InvalidCommandId || ent->cmin == new_ent->cmin);
4806 12 : Assert(ent->cmax == InvalidCommandId || ent->cmax == new_ent->cmax);
4807 : }
4808 : else
4809 : {
4810 : /* update mapping */
4811 226 : new_ent->cmin = ent->cmin;
4812 226 : new_ent->cmax = ent->cmax;
4813 226 : new_ent->combocid = ent->combocid;
4814 : }
4815 238 : }
4816 :
4817 44 : if (CloseTransientFile(fd) != 0)
4818 0 : ereport(ERROR,
4819 : (errcode_for_file_access(),
4820 : errmsg("could not close file \"%s\": %m", path)));
4821 44 : }
4822 :
4823 :
4824 : /*
4825 : * Check whether the TransactionId 'xid' is in the pre-sorted array 'xip'.
4826 : */
4827 : static bool
4828 580 : TransactionIdInArray(TransactionId xid, TransactionId *xip, Size num)
4829 : {
4830 580 : return bsearch(&xid, xip, num,
4831 : sizeof(TransactionId), xidComparator) != NULL;
4832 : }
4833 :
4834 : /*
4835 : * list_sort() comparator for sorting RewriteMappingFiles in LSN order.
4836 : */
4837 : static int
4838 34 : file_sort_by_lsn(const ListCell *a_p, const ListCell *b_p)
4839 : {
4840 34 : RewriteMappingFile *a = (RewriteMappingFile *) lfirst(a_p);
4841 34 : RewriteMappingFile *b = (RewriteMappingFile *) lfirst(b_p);
4842 :
4843 34 : if (a->lsn < b->lsn)
4844 34 : return -1;
4845 0 : else if (a->lsn > b->lsn)
4846 0 : return 1;
4847 0 : return 0;
4848 : }
4849 :
4850 : /*
4851 : * Apply any existing logical remapping files if there are any targeted at our
4852 : * transaction for relid.
4853 : */
4854 : static void
4855 10 : UpdateLogicalMappings(HTAB *tuplecid_data, Oid relid, Snapshot snapshot)
4856 : {
4857 : DIR *mapping_dir;
4858 : struct dirent *mapping_de;
4859 10 : List *files = NIL;
4860 : ListCell *file;
4861 10 : Oid dboid = IsSharedRelation(relid) ? InvalidOid : MyDatabaseId;
4862 :
4863 10 : mapping_dir = AllocateDir("pg_logical/mappings");
4864 930 : while ((mapping_de = ReadDir(mapping_dir, "pg_logical/mappings")) != NULL)
4865 : {
4866 : Oid f_dboid;
4867 : Oid f_relid;
4868 : TransactionId f_mapped_xid;
4869 : TransactionId f_create_xid;
4870 : XLogRecPtr f_lsn;
4871 : uint32 f_hi,
4872 : f_lo;
4873 : RewriteMappingFile *f;
4874 :
4875 1810 : if (strcmp(mapping_de->d_name, ".") == 0 ||
4876 900 : strcmp(mapping_de->d_name, "..") == 0)
4877 886 : continue;
4878 :
4879 : /* Ignore files that aren't ours */
4880 890 : if (strncmp(mapping_de->d_name, "map-", 4) != 0)
4881 0 : continue;
4882 :
4883 890 : if (sscanf(mapping_de->d_name, LOGICAL_REWRITE_FORMAT,
4884 : &f_dboid, &f_relid, &f_hi, &f_lo,
4885 : &f_mapped_xid, &f_create_xid) != 6)
4886 0 : elog(ERROR, "could not parse filename \"%s\"", mapping_de->d_name);
4887 :
4888 890 : f_lsn = ((uint64) f_hi) << 32 | f_lo;
4889 :
4890 : /* mapping for another database */
4891 890 : if (f_dboid != dboid)
4892 0 : continue;
4893 :
4894 : /* mapping for another relation */
4895 890 : if (f_relid != relid)
4896 90 : continue;
4897 :
4898 : /* did the creating transaction abort? */
4899 800 : if (!TransactionIdDidCommit(f_create_xid))
4900 220 : continue;
4901 :
4902 : /* not for our transaction */
4903 580 : if (!TransactionIdInArray(f_mapped_xid, snapshot->subxip, snapshot->subxcnt))
4904 536 : continue;
4905 :
4906 : /* ok, relevant, queue for apply */
4907 44 : f = palloc(sizeof(RewriteMappingFile));
4908 44 : f->lsn = f_lsn;
4909 44 : strcpy(f->fname, mapping_de->d_name);
4910 44 : files = lappend(files, f);
4911 : }
4912 10 : FreeDir(mapping_dir);
4913 :
4914 : /* sort files so we apply them in LSN order */
4915 10 : list_sort(files, file_sort_by_lsn);
4916 :
4917 54 : foreach(file, files)
4918 : {
4919 44 : RewriteMappingFile *f = (RewriteMappingFile *) lfirst(file);
4920 :
4921 44 : elog(DEBUG1, "applying mapping: \"%s\" in %u", f->fname,
4922 : snapshot->subxip[0]);
4923 44 : ApplyLogicalMappingFile(tuplecid_data, relid, f->fname);
4924 44 : pfree(f);
4925 : }
4926 10 : }
4927 :
4928 : /*
4929 : * Lookup cmin/cmax of a tuple, during logical decoding where we can't rely on
4930 : * combocids.
4931 : */
4932 : bool
4933 998 : ResolveCminCmaxDuringDecoding(HTAB *tuplecid_data,
4934 : Snapshot snapshot,
4935 : HeapTuple htup, Buffer buffer,
4936 : CommandId *cmin, CommandId *cmax)
4937 : {
4938 : ReorderBufferTupleCidKey key;
4939 : ReorderBufferTupleCidEnt *ent;
4940 : ForkNumber forkno;
4941 : BlockNumber blockno;
4942 998 : bool updated_mapping = false;
4943 :
4944 : /*
4945 : * Return unresolved if tuplecid_data is not valid. That's because when
4946 : * streaming in-progress transactions we may run into tuples with the CID
4947 : * before actually decoding them. Think e.g. about INSERT followed by
4948 : * TRUNCATE, where the TRUNCATE may not be decoded yet when applying the
4949 : * INSERT. So in such cases, we assume the CID is from the future
4950 : * command.
4951 : */
4952 998 : if (tuplecid_data == NULL)
4953 18 : return false;
4954 :
4955 : /* be careful about padding */
4956 980 : memset(&key, 0, sizeof(key));
4957 :
4958 980 : Assert(!BufferIsLocal(buffer));
4959 :
4960 : /*
4961 : * get relfilenode from the buffer, no convenient way to access it other
4962 : * than that.
4963 : */
4964 980 : BufferGetTag(buffer, &key.relnode, &forkno, &blockno);
4965 :
4966 : /* tuples can only be in the main fork */
4967 980 : Assert(forkno == MAIN_FORKNUM);
4968 980 : Assert(blockno == ItemPointerGetBlockNumber(&htup->t_self));
4969 :
4970 980 : ItemPointerCopy(&htup->t_self,
4971 : &key.tid);
4972 :
4973 : restart:
4974 990 : ent = (ReorderBufferTupleCidEnt *)
4975 : hash_search(tuplecid_data,
4976 : (void *) &key,
4977 : HASH_FIND,
4978 : NULL);
4979 :
4980 : /*
4981 : * failed to find a mapping, check whether the table was rewritten and
4982 : * apply mapping if so, but only do that once - there can be no new
4983 : * mappings while we are in here since we have to hold a lock on the
4984 : * relation.
4985 : */
4986 990 : if (ent == NULL && !updated_mapping)
4987 : {
4988 10 : UpdateLogicalMappings(tuplecid_data, htup->t_tableOid, snapshot);
4989 : /* now check but don't update for a mapping again */
4990 10 : updated_mapping = true;
4991 10 : goto restart;
4992 : }
4993 980 : else if (ent == NULL)
4994 0 : return false;
4995 :
4996 980 : if (cmin)
4997 980 : *cmin = ent->cmin;
4998 980 : if (cmax)
4999 980 : *cmax = ent->cmax;
5000 980 : return true;
5001 : }
|