Index: src/backend/storage/buffer/bufmgr.c =================================================================== RCS file: /cvsroot/pgsql/src/backend/storage/buffer/bufmgr.c,v retrieving revision 1.228 diff -c -r1.228 bufmgr.c *** src/backend/storage/buffer/bufmgr.c 1 Jan 2008 19:45:51 -0000 1.228 --- src/backend/storage/buffer/bufmgr.c 4 May 2008 01:11:08 -0000 *************** *** 56,61 **** --- 56,68 ---- #define BUF_WRITTEN 0x01 #define BUF_REUSABLE 0x02 + /* Struct for BufferSync's internal to-do list */ + typedef struct BufAndTag + { + int buf_id; + BufferTag tag; + } BufAndTag; + /* GUC variables */ bool zero_damaged_pages = false; *************** *** 986,991 **** --- 993,1025 ---- } /* + * qsort comparator for BufferSync + */ + static int + bufandtagcmp(const void *a, const void *b) + { + const BufAndTag *lhs = (const BufAndTag *) a; + const BufAndTag *rhs = (const BufAndTag *) b; + int r; + + /* + * We don't much care about the order in which different relations get + * written, so memcmp is enough for comparing the relfilenodes, + * even though its behavior will be platform-dependent. + */ + r = memcmp(&lhs->tag.rnode, &rhs->tag.rnode, sizeof(lhs->tag.rnode)); + if (r != 0) + return r; + + /* We do want blocks within a relation to be ordered accurately */ + if (lhs->tag.blockNum < rhs->tag.blockNum) + return -1; + if (lhs->tag.blockNum > rhs->tag.blockNum) + return 1; + return 0; + } + + /* * BufferSync -- Write out all dirty buffers in the pool. * * This is called at checkpoint time to write out all dirty shared buffers. 
*************** *** 995,1004 **** static void BufferSync(int flags) { int buf_id; - int num_to_scan; int num_to_write; int num_written; /* Make sure we can handle the pin inside SyncOneBuffer */ ResourceOwnerEnlargeBuffers(CurrentResourceOwner); --- 1029,1056 ---- static void BufferSync(int flags) { + static BufAndTag *bufs_to_write = NULL; int buf_id; int num_to_write; int num_written; + int i; + + /* + * We allocate the bufs_to_write[] array on first call and keep it + * around for the life of the process. This is okay because in normal + * operation this function is only called within the bgwriter, so + * we won't have lots of large arrays floating around. We prefer this + * way because we don't want checkpoints to suddenly start failing + * when the system gets under memory pressure. + */ + if (bufs_to_write == NULL) + { + bufs_to_write = (BufAndTag *) malloc(NBuffers * sizeof(BufAndTag)); + if (bufs_to_write == NULL) + ereport(FATAL, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + } /* Make sure we can handle the pin inside SyncOneBuffer */ ResourceOwnerEnlargeBuffers(CurrentResourceOwner); *************** *** 1033,1038 **** --- 1085,1092 ---- if (bufHdr->flags & BM_DIRTY) { bufHdr->flags |= BM_CHECKPOINT_NEEDED; + bufs_to_write[num_to_write].buf_id = buf_id; + bufs_to_write[num_to_write].tag = bufHdr->tag; num_to_write++; } *************** *** 1043,1061 **** return; /* nothing to do */ /* ! * Loop over all buffers again, and write the ones (still) marked with ! * BM_CHECKPOINT_NEEDED. In this loop, we start at the clock sweep point ! * since we might as well dump soon-to-be-recycled buffers first. ! * ! * Note that we don't read the buffer alloc count here --- that should be ! * left untouched till the next BgBufferSync() call. */ - buf_id = StrategySyncStart(NULL, NULL); - num_to_scan = NBuffers; num_written = 0; ! while (num_to_scan-- > 0) { ! 
volatile BufferDesc *bufHdr = &BufferDescriptors[buf_id]; /* * We don't need to acquire the lock here, because we're only looking --- 1097,1120 ---- return; /* nothing to do */ /* ! * Sort the buffers-to-be-written into order by file and block number. ! * This improves sequentiality of access for the upcoming I/O. ! */ ! qsort(bufs_to_write, num_to_write, sizeof(BufAndTag), bufandtagcmp); ! ! /* ! * Loop over all buffers to be written, and write the ones (still) marked ! * with BM_CHECKPOINT_NEEDED. Note that we don't need to recheck the ! * buffer tag, because if the buffer has been reassigned it cannot have ! * BM_CHECKPOINT_NEEDED still set. */ num_written = 0; ! for (i = 0; i < num_to_write; i++) { ! volatile BufferDesc *bufHdr; ! ! buf_id = bufs_to_write[i].buf_id; ! bufHdr = &BufferDescriptors[buf_id]; /* * We don't need to acquire the lock here, because we're only looking *************** *** 1077,1096 **** num_written++; /* - * We know there are at most num_to_write buffers with - * BM_CHECKPOINT_NEEDED set; so we can stop scanning if - * num_written reaches num_to_write. - * - * Note that num_written doesn't include buffers written by - * other backends, or by the bgwriter cleaning scan. That - * means that the estimate of how much progress we've made is - * conservative, and also that this test will often fail to - * trigger. But it seems worth making anyway. - */ - if (num_written >= num_to_write) - break; - - /* * Perform normal bgwriter duties and sleep to throttle our * I/O rate. */ --- 1136,1141 ---- *************** *** 1098,1110 **** (double) num_written / num_to_write); } } - - if (++buf_id >= NBuffers) - buf_id = 0; } /* ! * Update checkpoint statistics. As noted above, this doesn't include * buffers written by other backends or bgwriter scan. */ CheckpointStats.ckpt_bufs_written += num_written; --- 1143,1152 ---- (double) num_written / num_to_write); } } } /* ! * Update checkpoint statistics. 
The num_written count doesn't include * buffers written by other backends or by the bgwriter cleaning scan.