*** src/backend/storage/smgr/md.c.orig Wed Jan 17 11:25:01 2007 --- src/backend/storage/smgr/md.c Tue Apr 10 16:38:27 2007 *************** *** 122,135 **** BlockNumber segno; /* which segment */ } PendingOperationTag; typedef struct { PendingOperationTag tag; /* hash table key (must be first!) */ ! int failures; /* number of failed attempts to fsync */ } PendingOperationEntry; static HTAB *pendingOpsTable = NULL; typedef enum /* behavior for mdopen & _mdfd_getseg */ { --- 122,140 ---- BlockNumber segno; /* which segment */ } PendingOperationTag; + typedef uint16 CycleCtr; /* can be any convenient integer size */ + typedef struct { PendingOperationTag tag; /* hash table key (must be first!) */ ! bool canceled; /* T => request canceled, not yet removed */ ! CycleCtr cycle_ctr; /* mdsync_cycle_ctr when request was made */ } PendingOperationEntry; static HTAB *pendingOpsTable = NULL; + static CycleCtr mdsync_cycle_ctr = 0; + typedef enum /* behavior for mdopen & _mdfd_getseg */ { *************** *** 856,926 **** /* * mdsync() -- Sync previous writes to stable storage. - * - * This is only called during checkpoints, and checkpoints should only - * occur in processes that have created a pendingOpsTable. */ void mdsync(void) { ! bool need_retry; if (!pendingOpsTable) elog(ERROR, "cannot sync without a pendingOpsTable"); /* ! * The fsync table could contain requests to fsync relations that have ! * been deleted (unlinked) by the time we get to them. Rather than ! * just hoping an ENOENT (or EACCES on Windows) error can be ignored, ! * what we will do is retry the whole process after absorbing fsync ! * request messages again. Since mdunlink() queues a "revoke" message ! * before actually unlinking, the fsync request is guaranteed to be gone ! * the second time if it really was this case. DROP DATABASE likewise ! * has to tell us to forget fsync requests before it starts deletions. */ ! do { ! HASH_SEQ_STATUS hstat; ! PendingOperationEntry *entry; ! int absorb_counter; ! need_retry = false; /* ! * If we are in the bgwriter, the sync had better include all fsync ! * requests that were queued by backends before the checkpoint REDO ! * point was determined. We go that a little better by accepting all ! * requests queued up to the point where we start fsync'ing. */ ! AbsorbFsyncRequests(); ! absorb_counter = FSYNCS_PER_ABSORB; ! hash_seq_init(&hstat, pendingOpsTable); ! while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL) { /* ! * If fsync is off then we don't have to bother opening the file ! * at all. (We delay checking until this point so that changing ! * fsync on the fly behaves sensibly.) */ ! if (enableFsync) { SMgrRelation reln; MdfdVec *seg; /* - * If in bgwriter, we want to absorb pending requests every so - * often to prevent overflow of the fsync request queue. This - * could result in deleting the current entry out from under - * our hashtable scan, so the procedure is to fall out of the - * scan and start over from the top of the function. - */ - if (--absorb_counter <= 0) - { - need_retry = true; - break; - } - - /* * Find or create an smgr hash entry for this relation. This * may seem a bit unclean -- md calling smgr? But it's really * the best solution. It ensures that the open file reference --- 861,985 ---- /* * mdsync() -- Sync previous writes to stable storage. */ void mdsync(void) { ! static bool mdsync_in_progress = false; ! ! HASH_SEQ_STATUS hstat; ! PendingOperationEntry *entry; ! int absorb_counter; + /* + * This is only called during checkpoints, and checkpoints should only + * occur in processes that have created a pendingOpsTable. + */ if (!pendingOpsTable) elog(ERROR, "cannot sync without a pendingOpsTable"); /* ! * If we are in the bgwriter, the sync had better include all fsync ! * requests that were queued by backends before the checkpoint REDO ! * point was determined. We go that a little better by accepting all ! * requests queued up to the point where we start fsync'ing. */ ! AbsorbFsyncRequests(); ! /* ! * To avoid excess fsync'ing, we want to ignore fsync requests that are ! * entered into the hashtable during the main mdsync loop (they should ! * be processed next time, instead). We use mdsync_cycle_ctr to tell ! * old entries apart from new ones: new ones will have cycle_ctr equal ! * to the incremented value of mdsync_cycle_ctr. ! * ! * In normal circumstances, all entries present in the table at this ! * point will have cycle_ctr exactly equal to the current (about to be old) ! * value of mdsync_cycle_ctr. However, if we fail partway through the ! * fsync'ing loop, then older values of cycle_ctr might remain when we ! * come back here to try again. Repeated checkpoint failures would ! * eventually wrap the counter around to the point where an old entry ! * might appear new, causing us to skip it, possibly allowing a checkpoint ! * to succeed that should not have. To forestall wraparound, any time ! * the previous mdsync() failed to complete, run through the table and ! * forcibly set cycle_ctr = mdsync_cycle_ctr. ! * ! * Think not to merge this loop with the main loop, as the problem is ! * exactly that that loop may fail before having visited all the entries. ! * From a performance point of view it doesn't matter anyway, as this ! * path will never be taken in a system that's functioning normally. ! */ ! if (mdsync_in_progress) ! { ! /* prior try failed, so update any stale cycle_ctr values */ ! hash_seq_init(&hstat, pendingOpsTable); ! while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL) ! { ! entry->cycle_ctr = mdsync_cycle_ctr; ! } ! } + /* Advance counter so that new hashtable entries are distinguishable */ + mdsync_cycle_ctr++; + + /* Set flag to detect failure if we don't reach the end of the loop */ + mdsync_in_progress = true; + + /* Now scan the hashtable for fsync requests to process */ + absorb_counter = FSYNCS_PER_ABSORB; + hash_seq_init(&hstat, pendingOpsTable); + while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL) + { /* ! * If the entry is new then don't process it this time. Note that ! * "continue" bypasses the hash-remove call at the bottom of the loop. */ ! if (entry->cycle_ctr == mdsync_cycle_ctr) ! continue; ! /* Else assert we haven't missed it */ ! Assert((CycleCtr) (entry->cycle_ctr + 1) == mdsync_cycle_ctr); ! ! /* ! * If fsync is off then we don't have to bother opening the file ! * at all. (We delay checking until this point so that changing ! * fsync on the fly behaves sensibly.) Also, if the entry is ! * marked canceled, fall through to delete it. ! */ ! if (enableFsync && !entry->canceled) { + int failures; + /* ! * If in bgwriter, we want to absorb pending requests every so ! * often to prevent overflow of the fsync request queue. It is ! * unspecified whether newly-added entries will be visited by ! * hash_seq_search, but we don't care since we don't need to ! * process them anyway. */ ! if (--absorb_counter <= 0) ! { ! AbsorbFsyncRequests(); ! absorb_counter = FSYNCS_PER_ABSORB; ! } ! ! /* ! * The fsync table could contain requests to fsync segments that ! * have been deleted (unlinked) by the time we get to them. ! * Rather than just hoping an ENOENT (or EACCES on Windows) error ! * can be ignored, what we do on error is absorb pending requests ! * and then retry. Since mdunlink() queues a "revoke" message ! * before actually unlinking, the fsync request is guaranteed to ! * be marked canceled after the absorb if it really was this case. ! * DROP DATABASE likewise has to tell us to forget fsync requests ! * before it starts deletions. ! */ ! for (failures = 0; ; failures++) /* loop exits at "break" */ { SMgrRelation reln; MdfdVec *seg; /* * Find or create an smgr hash entry for this relation. This * may seem a bit unclean -- md calling smgr? But it's really * the best solution. It ensures that the open file reference *************** *** 940,946 **** /* * It is possible that the relation has been dropped or * truncated since the fsync request was entered. Therefore, ! * allow ENOENT, but only if we didn't fail once already on * this file. This applies both during _mdfd_getseg() and * during FileSync, since fd.c might have closed the file * behind our back. --- 999,1005 ---- /* * It is possible that the relation has been dropped or * truncated since the fsync request was entered. Therefore, ! * allow ENOENT, but only if we didn't fail already on * this file. This applies both during _mdfd_getseg() and * during FileSync, since fd.c might have closed the file * behind our back. *************** *** 948,989 **** seg = _mdfd_getseg(reln, entry->tag.segno * ((BlockNumber) RELSEG_SIZE), false, EXTENSION_RETURN_NULL); ! if (seg == NULL || ! FileSync(seg->mdfd_vfd) < 0) ! { ! /* ! * XXX is there any point in allowing more than one try? ! * Don't see one at the moment, but easy to change the ! * test here if so. ! */ ! if (!FILE_POSSIBLY_DELETED(errno) || ! ++(entry->failures) > 1) ! ereport(ERROR, ! (errcode_for_file_access(), ! errmsg("could not fsync segment %u of relation %u/%u/%u: %m", ! entry->tag.segno, ! entry->tag.rnode.spcNode, ! entry->tag.rnode.dbNode, ! entry->tag.rnode.relNode))); ! else ! ereport(DEBUG1, ! (errcode_for_file_access(), ! errmsg("could not fsync segment %u of relation %u/%u/%u, but retrying: %m", ! entry->tag.segno, ! entry->tag.rnode.spcNode, ! entry->tag.rnode.dbNode, ! entry->tag.rnode.relNode))); ! need_retry = true; ! continue; /* don't delete the hashtable entry */ ! } ! } ! /* Okay, delete this entry */ ! if (hash_search(pendingOpsTable, &entry->tag, ! HASH_REMOVE, NULL) == NULL) ! elog(ERROR, "pendingOpsTable corrupted"); } ! } while (need_retry); } /* --- 1007,1062 ---- seg = _mdfd_getseg(reln, entry->tag.segno * ((BlockNumber) RELSEG_SIZE), false, EXTENSION_RETURN_NULL); ! if (seg != NULL && ! FileSync(seg->mdfd_vfd) >= 0) ! break; /* success; break out of retry loop */ ! /* ! * XXX is there any point in allowing more than one retry? ! * Don't see one at the moment, but easy to change the ! * test here if so. ! */ ! if (!FILE_POSSIBLY_DELETED(errno) || ! failures > 0) ! ereport(ERROR, ! (errcode_for_file_access(), ! errmsg("could not fsync segment %u of relation %u/%u/%u: %m", ! entry->tag.segno, ! entry->tag.rnode.spcNode, ! entry->tag.rnode.dbNode, ! entry->tag.rnode.relNode))); ! else ! ereport(DEBUG1, ! (errcode_for_file_access(), ! errmsg("could not fsync segment %u of relation %u/%u/%u, but retrying: %m", ! entry->tag.segno, ! entry->tag.rnode.spcNode, ! entry->tag.rnode.dbNode, ! entry->tag.rnode.relNode))); ! ! /* ! * Absorb incoming requests and check to see if canceled. ! */ ! AbsorbFsyncRequests(); ! absorb_counter = FSYNCS_PER_ABSORB; /* might as well... */ ! ! if (entry->canceled) ! break; ! } /* end retry loop */ } ! ! /* ! * If we get here, either we fsync'd successfully, we don't have ! * to because enableFsync is off, or the entry is (now) marked ! * canceled. Okay to delete it. ! */ ! if (hash_search(pendingOpsTable, &entry->tag, ! HASH_REMOVE, NULL) == NULL) ! elog(ERROR, "pendingOpsTable corrupted"); ! } /* end loop over hashtable entries */ ! ! /* Flag successful completion of mdsync */ ! mdsync_in_progress = false; } /* *************** *** 1027,1034 **** * * The range of possible segment numbers is way less than the range of * BlockNumber, so we can reserve high values of segno for special purposes. ! * We define two: FORGET_RELATION_FSYNC means to drop pending fsyncs for ! * a relation, and FORGET_DATABASE_FSYNC means to drop pending fsyncs for * a whole database. (These are a tad slow because the hash table has to be * searched linearly, but it doesn't seem worth rethinking the table structure * for them.) --- 1100,1107 ---- * * The range of possible segment numbers is way less than the range of * BlockNumber, so we can reserve high values of segno for special purposes. ! * We define two: FORGET_RELATION_FSYNC means to cancel pending fsyncs for ! * a relation, and FORGET_DATABASE_FSYNC means to cancel pending fsyncs for * a whole database. (These are a tad slow because the hash table has to be * searched linearly, but it doesn't seem worth rethinking the table structure * for them.) *************** *** 1049,1058 **** { if (RelFileNodeEquals(entry->tag.rnode, rnode)) { ! /* Okay, delete this entry */ ! if (hash_search(pendingOpsTable, &entry->tag, ! HASH_REMOVE, NULL) == NULL) ! elog(ERROR, "pendingOpsTable corrupted"); } } } --- 1122,1129 ---- { if (RelFileNodeEquals(entry->tag.rnode, rnode)) { ! /* Okay, cancel this entry */ ! entry->canceled = true; } } } *************** *** 1067,1076 **** { if (entry->tag.rnode.dbNode == rnode.dbNode) { ! /* Okay, delete this entry */ ! if (hash_search(pendingOpsTable, &entry->tag, ! HASH_REMOVE, NULL) == NULL) ! elog(ERROR, "pendingOpsTable corrupted"); } } } --- 1138,1145 ---- { if (entry->tag.rnode.dbNode == rnode.dbNode) { ! /* Okay, cancel this entry */ ! entry->canceled = true; } } } *************** *** 1091,1097 **** HASH_ENTER, &found); if (!found) /* new entry, so initialize it */ ! entry->failures = 0; } } --- 1160,1175 ---- HASH_ENTER, &found); if (!found) /* new entry, so initialize it */ ! { ! entry->canceled = false; ! entry->cycle_ctr = mdsync_cycle_ctr; ! } ! /* ! * NB: it's intentional that we don't change cycle_ctr if the entry ! * already exists. The fsync request must be treated as old, even ! * though the new request will be satisfied too by our subsequent ! * fsync. ! */ } }