diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 938c554..0f2e4e0 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -1217,6 +1217,12 @@ BufferSync(int flags) int num_to_write; int num_written; int mask = BM_DIRTY; + RelFileNode target_rnode = { 0, 0, 0 }; + ForkNumber target_forkNum = InvalidForkNumber; + int target_segno = 0; + bool has_target = false; + int outer_num_to_scan = 0; + int outer_buf_id = 0; /* Make sure we can handle the pin inside SyncOneBuffer */ ResourceOwnerEnlargeBuffers(CurrentResourceOwner); @@ -1281,10 +1287,30 @@ BufferSync(int flags) buf_id = StrategySyncStart(NULL, NULL); num_to_scan = NBuffers; num_written = 0; - while (num_to_scan-- > 0) + + for (;;) { volatile BufferDesc *bufHdr = &BufferDescriptors[buf_id]; + if (num_to_scan == 0) + { + if (has_target) + { + /* + * We have finished writing out buffers belonging to this + * relation segment. fsync it now. + */ + mdsync_seg(target_rnode, target_forkNum, target_segno); + + /* continue the outer scan where we left */ + num_to_scan = outer_num_to_scan; + buf_id = outer_buf_id; + has_target = false; + } + else + break; /* all done! */ + } + /* * We don't need to acquire the lock here, because we're only looking * at a single bit. It's possible that someone else writes the buffer @@ -1299,7 +1325,36 @@ BufferSync(int flags) */ if (bufHdr->flags & BM_CHECKPOINT_NEEDED) { - if (SyncOneBuffer(buf_id, false) & BUF_WRITTEN) + RelFileNode this_rnode = bufHdr->tag.rnode; + ForkNumber this_forkNum = bufHdr->tag.forkNum; + int this_segno = bufHdr->tag.blockNum / (RELSEG_SIZE / BLCKSZ); + bool skip = false; + + if (has_target) + { + if (!RelFileNodeEquals(this_rnode, target_rnode) || + this_forkNum != target_forkNum || + this_segno != target_segno) + { + /* + * This buffer belongs to another relation than the one + * we're targeting right now. We'll revisit it later. + */ + skip = true; + } + } + else + { + target_rnode = this_rnode; + target_forkNum = this_forkNum; + target_segno = this_segno; + has_target = true; + /* remember where we left the outer scan */ + outer_buf_id = buf_id; + outer_num_to_scan = num_to_scan; + } + + if (!skip && SyncOneBuffer(buf_id, false) & BUF_WRITTEN) { TRACE_POSTGRESQL_BUFFER_SYNC_WRITTEN(buf_id); BgWriterStats.m_buf_written_checkpoints++; @@ -1328,6 +1383,7 @@ BufferSync(int flags) if (++buf_id >= NBuffers) buf_id = 0; + num_to_scan--; } /* diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c index 167d61c..9dedef9 100644 --- a/src/backend/storage/smgr/md.c +++ b/src/backend/storage/smgr/md.c @@ -1257,14 +1257,192 @@ mdsync(void) } /* end loop over hashtable entries */ /* Return sync performance metrics for report at checkpoint end */ - CheckpointStats.ckpt_sync_rels = processed; - CheckpointStats.ckpt_longest_sync = longest; - CheckpointStats.ckpt_agg_sync_time = total_elapsed; + CheckpointStats.ckpt_sync_rels += processed; + if (longest > CheckpointStats.ckpt_longest_sync) + CheckpointStats.ckpt_longest_sync = longest; + CheckpointStats.ckpt_agg_sync_time += total_elapsed; /* Flag successful completion of mdsync */ mdsync_in_progress = false; } + +/* + * mdsync_seg() -- Sync previous writes to a particular relation segment to stable storage. + */ +void +mdsync_seg(RelFileNode rnode, ForkNumber forknum, int segno) +{ + PendingOperationEntry *entry; + + /* Statistics on sync times */ + int processed = 0; + instr_time sync_start, + sync_end, + sync_diff; + uint64 elapsed; + uint64 longest = 0; + uint64 total_elapsed = 0; + + /* + * This is only called during checkpoints, and checkpoints should only + * occur in processes that have created a pendingOpsTable. + */ + if (!pendingOpsTable) + elog(ERROR, "cannot sync without a pendingOpsTable"); + + /* + * If fsync is off, do nothing. (We'll clear the pendingOpsTable later, + * in mdsync) + */ + if (!enableFsync) + return; + + /* + * If we are in the checkpointer, the sync had better include all fsync + * requests that were queued by backends up to this point. The tightest + * race condition that could occur is that a buffer that must be written + * and fsync'd for the checkpoint could have been dumped by a backend just + * before it was visited by BufferSync(). We know the backend will have + * queued an fsync request before clearing the buffer's dirtybit, so we + * are safe as long as we do an Absorb after completing BufferSync(). + */ + AbsorbFsyncRequests(); + + /* Now scan the hashtable for fsync requests to process */ + entry = (PendingOperationEntry *) hash_search(pendingOpsTable, + &rnode, + HASH_FIND, + NULL); + if (entry) + { + if (bms_is_member(segno, entry->requests[forknum])) + { + int failures; + + /* + * The fsync table could contain requests to fsync segments + * that have been deleted (unlinked) by the time we get to + * them. Rather than just hoping an ENOENT (or EACCES on + * Windows) error can be ignored, what we do on error is + * absorb pending requests and then retry. Since mdunlink() + * queues a "cancel" message before actually unlinking, the + * fsync request is guaranteed to be marked canceled after the + * absorb if it really was this case. DROP DATABASE likewise + * has to tell us to forget fsync requests before it starts + * deletions. + */ + for (failures = 0;; failures++) /* loop exits at "break" */ + { + SMgrRelation reln; + MdfdVec *seg; + char *path; + int save_errno; + + /* + * Find or create an smgr hash entry for this relation. + * This may seem a bit unclean -- md calling smgr? But + * it's really the best solution. It ensures that the + * open file reference isn't permanently leaked if we get + * an error here. (You may say "but an unreferenced + * SMgrRelation is still a leak!" Not really, because the + * only case in which a checkpoint is done by a process + * that isn't about to shut down is in the checkpointer, + * and it will periodically do smgrcloseall(). This fact + * justifies our not closing the reln in the success path + * either, which is a good thing since in non-checkpointer + * cases we couldn't safely do that.) + */ + reln = smgropen(entry->rnode, InvalidBackendId); + + /* Attempt to open and fsync the target segment */ + seg = _mdfd_getseg(reln, forknum, + (BlockNumber) segno * (BlockNumber) RELSEG_SIZE, + false, EXTENSION_RETURN_NULL); + + INSTR_TIME_SET_CURRENT(sync_start); + + if (seg != NULL && + FileSync(seg->mdfd_vfd) >= 0) + { + /* Success; update statistics about sync timing */ + INSTR_TIME_SET_CURRENT(sync_end); + sync_diff = sync_end; + INSTR_TIME_SUBTRACT(sync_diff, sync_start); + elapsed = INSTR_TIME_GET_MICROSEC(sync_diff); + if (elapsed > longest) + longest = elapsed; + total_elapsed += elapsed; + processed++; + if (log_checkpoints) + elog(LOG, "checkpoint sync: number=%d file=%s time=%.3f msec", + processed, + FilePathName(seg->mdfd_vfd), + (double) elapsed / 1000); + + entry->requests[forknum] = + bms_del_member(entry->requests[forknum], segno); + + break; /* out of retry loop */ + } + + /* Compute file name for use in message */ + save_errno = errno; + path = _mdfd_segpath(reln, forknum, (BlockNumber) segno); + errno = save_errno; + + /* + * It is possible that the relation has been dropped or + * truncated since the fsync request was entered. + * Therefore, allow ENOENT, but only if we didn't fail + * already on this file. This applies both for + * _mdfd_getseg() and for FileSync, since fd.c might have + * closed the file behind our back. + * + * XXX is there any point in allowing more than one retry? + * Don't see one at the moment, but easy to change the + * test here if so. + */ + if (!FILE_POSSIBLY_DELETED(errno) || + failures > 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not fsync file \"%s\": %m", + path))); + else + ereport(DEBUG1, + (errcode_for_file_access(), + errmsg("could not fsync file \"%s\" but retrying: %m", + path))); + pfree(path); + + /* + * Absorb incoming requests and check to see if a cancel + * arrived for this relation fork. + */ + AbsorbFsyncRequests(); + + if (entry->canceled[forknum]) + break; + } /* end retry loop */ + } + + /* + * XXX: We don't bother removing the pendingOpsTable entry here, if + * it's now empty. The mdsync() call later in the checkpoint will do + * that. + */ + } /* end loop over hashtable entries */ + + + /* Return sync performance metrics for report at checkpoint end */ + CheckpointStats.ckpt_sync_rels += processed; + if (longest > CheckpointStats.ckpt_longest_sync) + CheckpointStats.ckpt_longest_sync = longest; + CheckpointStats.ckpt_agg_sync_time += total_elapsed; +} + + /* * mdpreckpt() -- Do pre-checkpoint work * diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h index ba7c909..ebfa218 100644 --- a/src/include/storage/smgr.h +++ b/src/include/storage/smgr.h @@ -128,6 +128,7 @@ extern void mdtruncate(SMgrRelation reln, ForkNumber forknum, extern void mdimmedsync(SMgrRelation reln, ForkNumber forknum); extern void mdpreckpt(void); extern void mdsync(void); +extern void mdsync_seg(RelFileNode rnode, ForkNumber forknum, int segno); extern void mdpostckpt(void); extern void SetForwardFsyncRequests(void);