From 1fb3944166a0d588d07df6a6e7950415b1c8039e Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Fri, 15 Dec 2023 16:12:02 +0200 Subject: [PATCH 1/4] On demand downloading of SLRU segments --- src/backend/access/transam/slru.c | 112 ++++++++++++++++++++++++++---- src/backend/storage/smgr/smgr.c | 7 ++ src/include/access/slru.h | 2 + src/include/storage/smgr.h | 10 +++ 4 files changed, 118 insertions(+), 13 deletions(-) diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c index af57fe9e53a..05a2a58046d 100644 --- a/src/backend/access/transam/slru.c +++ b/src/backend/access/transam/slru.c @@ -59,6 +59,7 @@ #include "pgstat.h" #include "storage/fd.h" #include "storage/shmem.h" +#include "storage/smgr.h" #define SlruFileName(ctl, path, seg) \ snprintf(path, MAXPGPATH, "%s/%04X", (ctl)->Dir, seg) @@ -267,6 +268,15 @@ SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns, ctl->shared = shared; ctl->sync_handler = sync_handler; strlcpy(ctl->Dir, subdir, sizeof(ctl->Dir)); + + if (strcmp(subdir, "pg_xact") == 0) + ctl->kind = SLRU_CLOG; + else if (strcmp(subdir, "pg_multixact/members") == 0) + ctl->kind = SLRU_MULTIXACT_MEMBERS; + else if (strcmp(subdir, "pg_multixact/offsets") == 0) + ctl->kind = SLRU_MULTIXACT_OFFSETS; + else + ctl->kind = SLRU_OTHER; } /* @@ -617,6 +627,64 @@ SimpleLruWritePage(SlruCtl ctl, int slotno) SlruInternalWritePage(ctl, slotno, NULL); } + +static int +SimpleLruDownloadSegment(SlruCtl ctl, int pageno, char const* path) +{ + int segno; + int fd = -1; + int n_blocks; + char* buffer; + + static SMgrRelationData dummy_smgr_rel = {0}; + + if (ctl->kind == SLRU_OTHER) /* Only CLOG/multixact can be downloaded from page server */ + return -1; + + /* If page is beyond latest written page, then do not try to download segment from server */ + if (pageno > ctl->shared->latest_page_number) + return -1; + + if (!dummy_smgr_rel.smgr) + { + RelFileNode rnode = {0}; + dummy_smgr_rel.smgr = smgr(InvalidBackendId, 
rnode); + } + segno = pageno / SLRU_PAGES_PER_SEGMENT; + + buffer = palloc(BLCKSZ * SLRU_PAGES_PER_SEGMENT); + n_blocks = smgr_read_slru_segment(&dummy_smgr_rel, ctl->kind, segno, buffer); + if (n_blocks > 0) + { + fd = OpenTransientFile(path, O_RDWR | O_CREAT | PG_BINARY); + if (fd < 0) + { + slru_errcause = SLRU_OPEN_FAILED; + slru_errno = errno; + pfree(buffer); + return -1; + } + errno = 0; + pgstat_report_wait_start(WAIT_EVENT_SLRU_WRITE); + if (pg_pwrite(fd, buffer, n_blocks*BLCKSZ, 0) != n_blocks*BLCKSZ) + { + pgstat_report_wait_end(); + /* if write didn't set errno, assume problem is no disk space */ + if (errno == 0) + errno = ENOSPC; + slru_errcause = SLRU_WRITE_FAILED; + slru_errno = errno; + + CloseTransientFile(fd); + pfree(buffer); + return -1; + } + pgstat_report_wait_end(); + } + pfree(buffer); + return fd; +} + /* * Return whether the given page exists on disk. * @@ -644,12 +712,18 @@ SimpleLruDoesPhysicalPageExist(SlruCtl ctl, int pageno) { /* expected: file doesn't exist */ if (errno == ENOENT) - return false; - - /* report error normally */ - slru_errcause = SLRU_OPEN_FAILED; - slru_errno = errno; - SlruReportIOError(ctl, pageno, 0); + { + fd = SimpleLruDownloadSegment(ctl, pageno, path); + if (fd < 0) + return false; + } + else + { + /* report error normally */ + slru_errcause = SLRU_OPEN_FAILED; + slru_errno = errno; + SlruReportIOError(ctl, pageno, 0); + } } if ((endpos = lseek(fd, 0, SEEK_END)) < 0) @@ -703,18 +777,30 @@ SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno) fd = OpenTransientFile(path, O_RDONLY | PG_BINARY); if (fd < 0) { - if (errno != ENOENT || !InRecovery) + if (errno != ENOENT) { slru_errcause = SLRU_OPEN_FAILED; slru_errno = errno; return false; } - - ereport(LOG, - (errmsg("file \"%s\" doesn't exist, reading as zeroes", - path))); - MemSet(shared->page_buffer[slotno], 0, BLCKSZ); - return true; + fd = SimpleLruDownloadSegment(ctl, pageno, path); + if (fd < 0) + { + if (!InRecovery) + { + slru_errcause = 
SLRU_OPEN_FAILED; + slru_errno = errno; + return false; + } + else + { + ereport(LOG, + (errmsg("file \"%s\" doesn't exist, reading as zeroes", + path))); + MemSet(shared->page_buffer[slotno], 0, BLCKSZ); + return true; + } + } } errno = 0; diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c index b8679b73700..fa3b33681fc 100644 --- a/src/backend/storage/smgr/smgr.c +++ b/src/backend/storage/smgr/smgr.c @@ -570,6 +570,13 @@ smgrwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, buffer, skipFsync); } +int +smgr_read_slru_segment(SMgrRelation reln, SlruKind kind, int segno, void* buffer) +{ + return (*reln->smgr).smgr_read_slru_segment && (*reln->smgr).smgr_read_slru_segment(reln, kind, segno, buffer); +} + + /* * smgrwriteback() -- Trigger kernel writeback for the supplied range of diff --git a/src/include/access/slru.h b/src/include/access/slru.h index 130c41c8632..a59ff52bd03 100644 --- a/src/include/access/slru.h +++ b/src/include/access/slru.h @@ -16,6 +16,7 @@ #include "access/xlogdefs.h" #include "storage/lwlock.h" #include "storage/sync.h" +#include "storage/smgr.h" /* @@ -134,6 +135,7 @@ typedef struct SlruCtlData * it's always the same, it doesn't need to be in shared memory. 
*/ char Dir[64]; + SlruKind kind; } SlruCtlData; typedef SlruCtlData *SlruCtl; diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h index 2a29dcd194b..5bea39de393 100644 --- a/src/include/storage/smgr.h +++ b/src/include/storage/smgr.h @@ -88,6 +88,12 @@ typedef SMgrRelationData *SMgrRelation; #define SmgrIsTemp(smgr) \ RelFileNodeBackendIsTemp((smgr)->smgr_rnode) +typedef enum { + SLRU_CLOG, + SLRU_MULTIXACT_MEMBERS, + SLRU_MULTIXACT_OFFSETS, + SLRU_OTHER +} SlruKind; /* * This struct of function pointers defines the API between smgr.c and @@ -129,6 +135,8 @@ typedef struct f_smgr void (*smgr_start_unlogged_build) (SMgrRelation reln); void (*smgr_finish_unlogged_build_phase_1) (SMgrRelation reln); void (*smgr_end_unlogged_build) (SMgrRelation reln); + + int (*smgr_read_slru_segment) (SMgrRelation reln, SlruKind kind, int segno, void* buffer); } f_smgr; typedef void (*smgr_init_hook_type) (void); @@ -183,4 +191,6 @@ extern void smgr_start_unlogged_build(SMgrRelation reln); extern void smgr_finish_unlogged_build_phase_1(SMgrRelation reln); extern void smgr_end_unlogged_build(SMgrRelation reln); +extern int smgr_read_slru_segment(SMgrRelation reln, SlruKind kind, int segno, void* buffer); + #endif /* SMGR_H */ From 2e45c5ef1b46c5345bc7fad4d8d10d0d9c725cfd Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Sat, 16 Dec 2023 22:20:48 +0200 Subject: [PATCH 2/4] Fix smgr_read_slru_segment --- src/backend/storage/smgr/smgr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c index fa3b33681fc..15f157912e8 100644 --- a/src/backend/storage/smgr/smgr.c +++ b/src/backend/storage/smgr/smgr.c @@ -573,7 +573,7 @@ smgrwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int smgr_read_slru_segment(SMgrRelation reln, SlruKind kind, int segno, void* buffer) { - return (*reln->smgr).smgr_read_slru_segment && (*reln->smgr).smgr_read_slru_segment(reln, kind, segno, 
buffer); + return (*reln->smgr).smgr_read_slru_segment ? (*reln->smgr).smgr_read_slru_segment(reln, kind, segno, buffer) : 0; } From aa0e745b144eec92c7e89af5e8f6f97fc56c5267 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Wed, 27 Dec 2023 18:28:56 +0200 Subject: [PATCH 3/4] Determine SLRU kind in extension --- src/backend/access/transam/slru.c | 14 +------------- src/backend/storage/smgr/smgr.c | 4 ++-- src/include/access/slru.h | 2 -- src/include/storage/smgr.h | 11 ++--------- 4 files changed, 5 insertions(+), 26 deletions(-) diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c index 05a2a58046d..bf054302e1f 100644 --- a/src/backend/access/transam/slru.c +++ b/src/backend/access/transam/slru.c @@ -268,15 +268,6 @@ SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns, ctl->shared = shared; ctl->sync_handler = sync_handler; strlcpy(ctl->Dir, subdir, sizeof(ctl->Dir)); - - if (strcmp(subdir, "pg_xact") == 0) - ctl->kind = SLRU_CLOG; - else if (strcmp(subdir, "pg_multixact/members") == 0) - ctl->kind = SLRU_MULTIXACT_MEMBERS; - else if (strcmp(subdir, "pg_multixact/offsets") == 0) - ctl->kind = SLRU_MULTIXACT_OFFSETS; - else - ctl->kind = SLRU_OTHER; } /* @@ -638,9 +629,6 @@ SimpleLruDownloadSegment(SlruCtl ctl, int pageno, char const* path) static SMgrRelationData dummy_smgr_rel = {0}; - if (ctl->kind == SLRU_OTHER) /* Only CLOG/multixact can be downloaded from page server */ - return -1; - /* If page is beyond latest written page, then do not try to download segment from server */ if (pageno > ctl->shared->latest_page_number) return -1; @@ -653,7 +641,7 @@ SimpleLruDownloadSegment(SlruCtl ctl, int pageno, char const* path) segno = pageno / SLRU_PAGES_PER_SEGMENT; buffer = palloc(BLCKSZ * SLRU_PAGES_PER_SEGMENT); - n_blocks = smgr_read_slru_segment(&dummy_smgr_rel, ctl->kind, segno, buffer); + n_blocks = smgr_read_slru_segment(&dummy_smgr_rel, path, segno, buffer); if (n_blocks > 0) { fd = OpenTransientFile(path, 
O_RDWR | O_CREAT | PG_BINARY); diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c index 15f157912e8..fa1c1465c32 100644 --- a/src/backend/storage/smgr/smgr.c +++ b/src/backend/storage/smgr/smgr.c @@ -571,9 +571,9 @@ smgrwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, } int -smgr_read_slru_segment(SMgrRelation reln, SlruKind kind, int segno, void* buffer) +smgr_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buffer) { - return (*reln->smgr).smgr_read_slru_segment ? (*reln->smgr).smgr_read_slru_segment(reln, kind, segno, buffer) : 0; + return (*reln->smgr).smgr_read_slru_segment ? (*reln->smgr).smgr_read_slru_segment(reln, path, segno, buffer) : 0; } diff --git a/src/include/access/slru.h b/src/include/access/slru.h index a59ff52bd03..130c41c8632 100644 --- a/src/include/access/slru.h +++ b/src/include/access/slru.h @@ -16,7 +16,6 @@ #include "access/xlogdefs.h" #include "storage/lwlock.h" #include "storage/sync.h" -#include "storage/smgr.h" /* @@ -135,7 +134,6 @@ typedef struct SlruCtlData * it's always the same, it doesn't need to be in shared memory. */ char Dir[64]; - SlruKind kind; } SlruCtlData; typedef SlruCtlData *SlruCtl; diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h index 5bea39de393..ab6a961d471 100644 --- a/src/include/storage/smgr.h +++ b/src/include/storage/smgr.h @@ -88,13 +88,6 @@ typedef SMgrRelationData *SMgrRelation; #define SmgrIsTemp(smgr) \ RelFileNodeBackendIsTemp((smgr)->smgr_rnode) -typedef enum { - SLRU_CLOG, - SLRU_MULTIXACT_MEMBERS, - SLRU_MULTIXACT_OFFSETS, - SLRU_OTHER -} SlruKind; - /* * This struct of function pointers defines the API between smgr.c and * any individual storage manager module. 
Note that smgr subfunctions are @@ -136,7 +129,7 @@ typedef struct f_smgr void (*smgr_finish_unlogged_build_phase_1) (SMgrRelation reln); void (*smgr_end_unlogged_build) (SMgrRelation reln); - int (*smgr_read_slru_segment) (SMgrRelation reln, SlruKind kind, int segno, void* buffer); + int (*smgr_read_slru_segment) (SMgrRelation reln, const char *path, int segno, void* buffer); } f_smgr; typedef void (*smgr_init_hook_type) (void); @@ -191,6 +184,6 @@ extern void smgr_start_unlogged_build(SMgrRelation reln); extern void smgr_finish_unlogged_build_phase_1(SMgrRelation reln); extern void smgr_end_unlogged_build(SMgrRelation reln); -extern int smgr_read_slru_segment(SMgrRelation reln, SlruKind kind, int segno, void* buffer); +extern int smgr_read_slru_segment(SMgrRelation reln, const char *path, int segno, void* buffer); #endif /* SMGR_H */ From 07550c76af0f840bbede65cd3a18f8584972ae5e Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 11 Jan 2024 17:29:20 +0200 Subject: [PATCH 4/4] Use ctl->PagePrecedes for SLRU page comparison in SimpleLruDownloadSegment to address wraparound --- src/backend/access/transam/slru.c | 9 +++++++-- src/backend/storage/smgr/smgr.c | 9 +++++++++ 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c index bf054302e1f..0b88bd03ebc 100644 --- a/src/backend/access/transam/slru.c +++ b/src/backend/access/transam/slru.c @@ -619,6 +619,11 @@ SimpleLruWritePage(SlruCtl ctl, int slotno) } +/* + * NEON: we do not want to include large pg_xact/multixact files in basebackup and prefer + * to download them on demand to reduce startup time. 
+ * If SLRU segment is not found, we try to download it from page server + */ static int SimpleLruDownloadSegment(SlruCtl ctl, int pageno, char const* path) { @@ -629,8 +634,8 @@ SimpleLruDownloadSegment(SlruCtl ctl, int pageno, char const* path) static SMgrRelationData dummy_smgr_rel = {0}; - /* If page is beyond latest written page, then do not try to download segment from server */ - if (pageno > ctl->shared->latest_page_number) + /* If page is greater than latest written page, then do not try to download segment from server */ + if (ctl->PagePrecedes(ctl->shared->latest_page_number, pageno)) return -1; if (!dummy_smgr_rel.smgr) diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c index fa1c1465c32..8ba58a4d9a7 100644 --- a/src/backend/storage/smgr/smgr.c +++ b/src/backend/storage/smgr/smgr.c @@ -570,6 +570,15 @@ smgrwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, buffer, skipFsync); } +/* + * NEON: we do not want to include large pg_xact/multixact files in basebackup and prefer + * to download them on demand to reduce startup time. + * If SLRU segment is not found, we try to download it from page server + * + * This function returns number of blocks in segment. Usually it should be SLRU_PAGES_PER_SEGMENT but in case + * of partial segment, it can be smaller. Zero value means that segment doesn't exist. + * From Postgres point of view empty segment is the same as absent segment. + */ int smgr_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buffer) {