From 8b8625704e48146c106f9a88c4c43ba0be24f458 Mon Sep 17 00:00:00 2001 From: Zsolt Parragi Date: Mon, 17 Jun 2024 07:09:57 +0100 Subject: [PATCH 01/13] Added a Percona specific definition to pg_config, allowing our extension to see if custom features can be used --- src/include/pg_config_manual.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/include/pg_config_manual.h b/src/include/pg_config_manual.h index f941ee2faf86b..9dc31cb64a632 100644 --- a/src/include/pg_config_manual.h +++ b/src/include/pg_config_manual.h @@ -383,3 +383,8 @@ * Enable tracing of syncscan operations (see also the trace_syncscan GUC var). */ /* #define TRACE_SYNCSCAN */ + +/* + * Enable Percona specific features, should always be defined in this fork + */ +#define PERCONA_FORK 1 From 5b0d92a0dbc6fd772d9dbce27f21fddd947d5107 Mon Sep 17 00:00:00 2001 From: Andrew Pogrebnoy Date: Wed, 24 Apr 2024 19:59:14 +0300 Subject: [PATCH 02/13] Make XLog storage extensible and allow extensions to override it For now, it extends on `pread` and `pwrite` from/into segment files. This is the minimum we need for full XLog encryption with pg_tde. 
--- src/backend/access/transam/xlog.c | 3 ++- src/backend/access/transam/xlogreader.c | 18 ++++++++++++++++- src/backend/access/transam/xlogrecovery.c | 3 ++- src/backend/replication/walreceiver.c | 3 ++- src/include/access/xlog_internal.h | 4 +++- src/include/access/xlog_smgr.h | 24 +++++++++++++++++++++++ 6 files changed, 50 insertions(+), 5 deletions(-) create mode 100644 src/include/access/xlog_smgr.h diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 7f1360262776b..c225e778bde2c 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -61,6 +61,7 @@ #include "access/xloginsert.h" #include "access/xlogreader.h" #include "access/xlogrecovery.h" +#include "access/xlog_smgr.h" #include "access/xlogutils.h" #include "backup/basebackup.h" #include "catalog/catversion.h" @@ -2442,7 +2443,7 @@ XLogWrite(XLogwrtRqst WriteRqst, TimeLineID tli, bool flexible) INSTR_TIME_SET_ZERO(start); pgstat_report_wait_start(WAIT_EVENT_WAL_WRITE); - written = pg_pwrite(openLogFile, from, nleft, startoffset); + written = xlog_smgr->seg_write(openLogFile, from, nleft, startoffset); pgstat_report_wait_end(); /* diff --git a/src/backend/access/transam/xlogreader.c b/src/backend/access/transam/xlogreader.c index 37d2a57961d0d..5920c70a88dba 100644 --- a/src/backend/access/transam/xlogreader.c +++ b/src/backend/access/transam/xlogreader.c @@ -29,6 +29,7 @@ #include "access/xlog_internal.h" #include "access/xlogreader.h" #include "access/xlogrecord.h" +#include "access/xlog_smgr.h" #include "catalog/pg_control.h" #include "common/pg_lzcompress.h" #include "replication/origin.h" @@ -63,6 +64,21 @@ static void WALOpenSegmentInit(WALOpenSegment *seg, WALSegmentContext *segcxt, */ #define DEFAULT_DECODE_BUFFER_SIZE (64 * 1024) +/* + * XLog storage manager + * + * TODO: should be in xlog.c or new xlog_smgr.c ? 
+ * Now it's here because pg_rewind and other tools compile only + * w/ xlogreader.c + */ +const XLogSmgr *xlog_smgr = &xlog_smgr_standard; + +void +SetXLogSmgr(const XLogSmgr *xlsmgr) +{ + xlog_smgr = xlsmgr; +} + /* * Construct a string in state->errormsg_buf explaining what's wrong with * the current record being read. @@ -1557,7 +1573,7 @@ WALRead(XLogReaderState *state, /* Reset errno first; eases reporting non-errno-affecting errors */ errno = 0; - readbytes = pg_pread(state->seg.ws_file, p, segbytes, (off_t) startoff); + readbytes = xlog_smgr->seg_read(state->seg.ws_file, p, segbytes, (off_t) startoff); #ifndef FRONTEND pgstat_report_wait_end(); diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c index b45b833172005..11050da260795 100644 --- a/src/backend/access/transam/xlogrecovery.c +++ b/src/backend/access/transam/xlogrecovery.c @@ -39,6 +39,7 @@ #include "access/xlogprefetcher.h" #include "access/xlogreader.h" #include "access/xlogrecovery.h" +#include "access/xlog_smgr.h" #include "access/xlogutils.h" #include "backup/basebackup.h" #include "catalog/pg_control.h" @@ -3397,7 +3398,7 @@ XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen, readOff = targetPageOff; pgstat_report_wait_start(WAIT_EVENT_WAL_READ); - r = pg_pread(readFile, readBuf, XLOG_BLCKSZ, (off_t) readOff); + r = xlog_smgr->seg_read(readFile, readBuf, XLOG_BLCKSZ, (off_t) readOff); if (r != XLOG_BLCKSZ) { char fname[MAXFNAMELEN]; diff --git a/src/backend/replication/walreceiver.c b/src/backend/replication/walreceiver.c index acda5f68d9a8b..fd347b129ff93 100644 --- a/src/backend/replication/walreceiver.c +++ b/src/backend/replication/walreceiver.c @@ -57,6 +57,7 @@ #include "access/xlog_internal.h" #include "access/xlogarchive.h" #include "access/xlogrecovery.h" +#include "access/xlog_smgr.h" #include "catalog/pg_authid.h" #include "funcapi.h" #include "libpq/pqformat.h" @@ -941,7 +942,7 @@ XLogWalRcvWrite(char *buf, Size nbytes, 
XLogRecPtr recptr, TimeLineID tli) /* OK to write the logs */ errno = 0; - byteswritten = pg_pwrite(recvFile, buf, segbytes, (off_t) startoff); + byteswritten = xlog_smgr->seg_write(recvFile, buf, segbytes, (off_t) startoff); if (byteswritten <= 0) { char xlogfname[MAXFNAMELEN]; diff --git a/src/include/access/xlog_internal.h b/src/include/access/xlog_internal.h index c6a91fb456055..1b164a3b5a774 100644 --- a/src/include/access/xlog_internal.h +++ b/src/include/access/xlog_internal.h @@ -78,8 +78,10 @@ typedef XLogLongPageHeaderData *XLogLongPageHeader; #define XLP_BKP_REMOVABLE 0x0004 /* Replaces a missing contrecord; see CreateOverwriteContrecordRecord */ #define XLP_FIRST_IS_OVERWRITE_CONTRECORD 0x0008 +/* The page is encrypted */ +#define XLP_ENCRYPTED 0x0010 /* All defined flag bits in xlp_info (used for validity checking of header) */ -#define XLP_ALL_FLAGS 0x000F +#define XLP_ALL_FLAGS 0x001F #define XLogPageHeaderSize(hdr) \ (((hdr)->xlp_info & XLP_LONG_HEADER) ? SizeOfXLogLongPHD : SizeOfXLogShortPHD) diff --git a/src/include/access/xlog_smgr.h b/src/include/access/xlog_smgr.h new file mode 100644 index 0000000000000..49554320a4b99 --- /dev/null +++ b/src/include/access/xlog_smgr.h @@ -0,0 +1,24 @@ +#ifndef XLOG_SMGR_H +#define XLOG_SMGR_H + +#include "postgres.h" + +#include <unistd.h> + +/* XLog storage manager interface: segment-file read/write callbacks */ +typedef struct XLogSmgr { + ssize_t (*seg_read) (int fd, void *buf, size_t count, off_t offset); + + ssize_t (*seg_write) (int fd, const void *buf, size_t count, off_t offset); +} XLogSmgr; + +/* Default (standard) XLog storage manager: plain pg_pread/pg_pwrite */ +static const XLogSmgr xlog_smgr_standard = { + .seg_read = pg_pread, + .seg_write = pg_pwrite, +}; + +extern const XLogSmgr *xlog_smgr; +extern void SetXLogSmgr(const XLogSmgr *xlsmgr); + +#endif /* XLOG_SMGR_H */ From d3b6712b840b1d637fde0c60250d37687d3cbc5f Mon Sep 17 00:00:00 2001 From: Zsolt Parragi Date: Mon, 15 Apr 2024 19:52:59 +0100 Subject: [PATCH 03/13] Added pg_tde as submodule following the smgr branch --- 
.gitmodules | 4 ++++ contrib/pg_tde | 1 + 2 files changed, 5 insertions(+) create mode 100644 .gitmodules create mode 160000 contrib/pg_tde diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000000000..21078cb7f88f3 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,4 @@ +[submodule "contrib/pg_tde"] + path = contrib/pg_tde + url = https://github.com/Percona-Lab/pg_tde.git + branch = smgr diff --git a/contrib/pg_tde b/contrib/pg_tde new file mode 160000 index 0000000000000..36f6d6bff8f67 --- /dev/null +++ b/contrib/pg_tde @@ -0,0 +1 @@ +Subproject commit 36f6d6bff8f67ffa9b3f84a5d548512f86f0d7b7 From 087cf92b30c8b7ed92314e40c8e71dd65e24b3be Mon Sep 17 00:00:00 2001 From: Zsolt Parragi Date: Mon, 8 Apr 2024 16:33:16 +0100 Subject: [PATCH 04/13] added pg_tde to meson build --- contrib/meson.build | 1 + 1 file changed, 1 insertion(+) diff --git a/contrib/meson.build b/contrib/meson.build index 14a8906865063..ce5630d64aee5 100644 --- a/contrib/meson.build +++ b/contrib/meson.build @@ -51,6 +51,7 @@ subdir('pgrowlocks') subdir('pg_stat_statements') subdir('pgstattuple') subdir('pg_surgery') +subdir('pg_tde') subdir('pg_trgm') subdir('pg_visibility') subdir('pg_walinspect') From 11271a15d14bf9ce7be738447ed03244d29e66f6 Mon Sep 17 00:00:00 2001 From: Zsolt Parragi Date: Sun, 7 Apr 2024 18:46:22 +0100 Subject: [PATCH 05/13] Applied patch --- contrib/Makefile | 1 + contrib/fsync_checker/fsync_checker.control | 5 + contrib/fsync_checker/fsync_checker_smgr.c | 250 ++++++++++++++++++++ contrib/fsync_checker/meson.build | 22 ++ contrib/meson.build | 1 + src/backend/access/transam/xlog.c | 5 + src/backend/postmaster/postmaster.c | 5 + src/backend/storage/smgr/md.c | 173 +++++++++----- src/backend/storage/smgr/smgr.c | 134 ++++++----- src/backend/utils/init/miscinit.c | 15 ++ src/include/access/xlog.h | 4 + src/include/miscadmin.h | 1 + src/include/storage/md.h | 6 + src/include/storage/smgr.h | 62 ++++- 14 files changed, 551 insertions(+), 133 deletions(-) create mode 
100644 contrib/fsync_checker/fsync_checker.control create mode 100644 contrib/fsync_checker/fsync_checker_smgr.c create mode 100644 contrib/fsync_checker/meson.build diff --git a/contrib/Makefile b/contrib/Makefile index abd780f277405..091dd9e33228a 100644 --- a/contrib/Makefile +++ b/contrib/Makefile @@ -19,6 +19,7 @@ SUBDIRS = \ dict_int \ dict_xsyn \ earthdistance \ file_fdw \ + fsync_checker \ fuzzystrmatch \ hstore \ diff --git a/contrib/fsync_checker/fsync_checker.control b/contrib/fsync_checker/fsync_checker.control new file mode 100644 index 0000000000000..7d0e36434bfaf --- /dev/null +++ b/contrib/fsync_checker/fsync_checker.control @@ -0,0 +1,5 @@ +# fsync_checker extension +comment = 'SMGR extension for checking volatile writes' +default_version = '1.0' +module_pathname = '$libdir/fsync_checker' +relocatable = true diff --git a/contrib/fsync_checker/fsync_checker_smgr.c b/contrib/fsync_checker/fsync_checker_smgr.c new file mode 100644 index 0000000000000..17d0accb1eeba --- /dev/null +++ b/contrib/fsync_checker/fsync_checker_smgr.c @@ -0,0 +1,250 @@ +#include "postgres.h" + +#include "access/xlog.h" +#include "fmgr.h" +#include "miscadmin.h" +#include "storage/ipc.h" +#include "storage/lwlock.h" +#include "storage/shmem.h" +#include "storage/smgr.h" +#include "storage/md.h" +#include "utils/hsearch.h" + +PG_MODULE_MAGIC; + +typedef struct volatileRelnKey +{ + RelFileLocator locator; + ForkNumber forknum; +} volatileRelnKey; + +typedef struct volatileRelnEntry +{ + volatileRelnKey key; + XLogRecPtr lsn; +} volatileRelnEntry; + +void _PG_init(void); + +static void fsync_checker_extend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + const void *buffer, bool skipFsync); +static void fsync_checker_immedsync(SMgrRelation reln, ForkNumber forknum); +static void fsync_checker_writev(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, const void **buffers, + BlockNumber nblocks, bool skipFsync); +static void 
fsync_checker_writeback(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, BlockNumber nblocks); +static void fsync_checker_zeroextend(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, int nblocks, bool skipFsync); + +static void fsync_checker_checkpoint_create(const CheckPoint *checkPoint); +static void fsync_checker_shmem_request(void); +static void fsync_checker_shmem_startup(void); + +static void add_reln(SMgrRelation reln, ForkNumber forknum); +static void remove_reln(SMgrRelation reln, ForkNumber forknum); + +static SMgrId fsync_checker_smgr_id; +static const struct f_smgr fsync_checker_smgr = { + .name = "fsync_checker", + .smgr_init = mdinit, + .smgr_shutdown = NULL, + .smgr_open = mdopen, + .smgr_close = mdclose, + .smgr_create = mdcreate, + .smgr_exists = mdexists, + .smgr_unlink = mdunlink, + .smgr_extend = fsync_checker_extend, + .smgr_zeroextend = fsync_checker_zeroextend, + .smgr_prefetch = mdprefetch, + .smgr_readv = mdreadv, + .smgr_writev = fsync_checker_writev, + .smgr_writeback = fsync_checker_writeback, + .smgr_nblocks = mdnblocks, + .smgr_truncate = mdtruncate, + .smgr_immedsync = fsync_checker_immedsync, + .smgr_registersync = mdregistersync, +}; + +static HTAB *volatile_relns; +static LWLock *volatile_relns_lock; +static shmem_request_hook_type prev_shmem_request_hook; +static shmem_startup_hook_type prev_shmem_startup_hook; +static checkpoint_create_hook_type prev_checkpoint_create_hook; + +void +_PG_init(void) +{ + prev_checkpoint_create_hook = checkpoint_create_hook; + checkpoint_create_hook = fsync_checker_checkpoint_create; + + prev_shmem_request_hook = shmem_request_hook; + shmem_request_hook = fsync_checker_shmem_request; + + prev_shmem_startup_hook = shmem_startup_hook; + shmem_startup_hook = fsync_checker_shmem_startup; + + /* + * Relation size of 0 means we can just defer to md, but it would be nice + * to just expose this functionality, so if I needed my own relation, I + * could use MdSmgrRelation as the 
parent. + */ + fsync_checker_smgr_id = smgr_register(&fsync_checker_smgr, 0); + + storage_manager_id = fsync_checker_smgr_id; +} + +static void +fsync_checker_checkpoint_create(const CheckPoint *checkPoint) +{ + long num_entries; + HASH_SEQ_STATUS status; + volatileRelnEntry *entry; + + if (prev_checkpoint_create_hook) + prev_checkpoint_create_hook(checkPoint); + + LWLockAcquire(volatile_relns_lock, LW_SHARED); + + hash_seq_init(&status, volatile_relns); + + num_entries = hash_get_num_entries(volatile_relns); + elog(INFO, "Analyzing %ld volatile relations", num_entries); + while ((entry = hash_seq_search(&status))) + { + if (entry->lsn < checkPoint->redo) + { + char *path; + + path = relpathperm(entry->key.locator, entry->key.forknum); + + elog(WARNING, "Relation not previously synced: %s", path); + + pfree(path); + } + } + + LWLockRelease(volatile_relns_lock); +} + +static void +fsync_checker_shmem_request(void) +{ + if (prev_shmem_request_hook) + prev_shmem_request_hook(); + + RequestAddinShmemSpace(hash_estimate_size(1024, sizeof(volatileRelnEntry))); + RequestNamedLWLockTranche("fsync_checker volatile relns lock", 1); +} + +static void +fsync_checker_shmem_startup(void) +{ + HASHCTL ctl; + + if (prev_shmem_startup_hook) + prev_shmem_startup_hook(); + + ctl.keysize = sizeof(volatileRelnKey); + ctl.entrysize = sizeof(volatileRelnEntry); + volatile_relns = NULL; + volatile_relns_lock = NULL; + + /* + * Create or attach to the shared memory state, including hash table + */ + LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE); + + volatile_relns = ShmemInitHash("fsync_checker volatile relns", + 1024, 1024, &ctl, HASH_BLOBS | HASH_ELEM); + volatile_relns_lock = &GetNamedLWLockTranche("fsync_checker volatile relns lock")->lock; + + LWLockRelease(AddinShmemInitLock); +} + +static void +add_reln(SMgrRelation reln, ForkNumber forknum) +{ + bool found; + XLogRecPtr lsn; + volatileRelnKey key; + volatileRelnEntry *entry; + + key.locator = reln->smgr_rlocator.locator; + 
key.forknum = forknum; + + lsn = GetXLogWriteRecPtr(); + + LWLockAcquire(volatile_relns_lock, LW_EXCLUSIVE); + + entry = hash_search(volatile_relns, &key, HASH_ENTER, &found); + if (!found) + entry->lsn = lsn; + + LWLockRelease(volatile_relns_lock); +} + +static void +remove_reln(SMgrRelation reln, ForkNumber forknum) +{ + volatileRelnKey key; + + key.locator = reln->smgr_rlocator.locator; + key.forknum = forknum; + + LWLockAcquire(volatile_relns_lock, LW_EXCLUSIVE); + + hash_search(volatile_relns, &key, HASH_REMOVE, NULL); + + LWLockRelease(volatile_relns_lock); +} + +static void +fsync_checker_extend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + const void *buffer, bool skipFsync) +{ + if (!SmgrIsTemp(reln) && !skipFsync) + add_reln(reln, forknum); + + mdextend(reln, forknum, blocknum, buffer, skipFsync); +} + +static void +fsync_checker_immedsync(SMgrRelation reln, ForkNumber forknum) +{ + if (!SmgrIsTemp(reln)) + remove_reln(reln, forknum); + + mdimmedsync(reln, forknum); +} + +static void +fsync_checker_writev(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, const void **buffers, + BlockNumber nblocks, bool skipFsync) +{ + if (!SmgrIsTemp(reln) && !skipFsync) + add_reln(reln, forknum); + + mdwritev(reln, forknum, blocknum, buffers, nblocks, skipFsync); +} + +static void +fsync_checker_writeback(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, BlockNumber nblocks) +{ + if (!SmgrIsTemp(reln)) + remove_reln(reln, forknum); + + mdwriteback(reln, forknum, blocknum, nblocks); +} + +static void +fsync_checker_zeroextend(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, int nblocks, bool skipFsync) +{ + if (!SmgrIsTemp(reln) && !skipFsync) + add_reln(reln, forknum); + + mdzeroextend(reln, forknum, blocknum, nblocks, skipFsync); +} diff --git a/contrib/fsync_checker/meson.build b/contrib/fsync_checker/meson.build new file mode 100644 index 0000000000000..ce6ed7fe90bbb --- /dev/null +++ 
b/contrib/fsync_checker/meson.build @@ -0,0 +1,22 @@ +# Copyright (c) 2023, PostgreSQL Global Development Group + +fsync_checker_sources = files( + 'fsync_checker_smgr.c', +) + +if host_system == 'windows' + fsync_checker_sources += rc_lib_gen.process(win32ver_rc, extra_args: [ + '--NAME', 'fsync_checker', + '--FILEDESC', 'fsync_checker - SMGR extension for checking volatile relations',]) +endif + +fsync_checker = shared_module('fsync_checker', + fsync_checker_sources, + kwargs: contrib_mod_args, +) +contrib_targets += fsync_checker + +install_data( + 'fsync_checker.control', + kwargs: contrib_data_args, +) diff --git a/contrib/meson.build b/contrib/meson.build index ce5630d64aee5..907c3f4fd18a8 100644 --- a/contrib/meson.build +++ b/contrib/meson.build @@ -28,6 +28,7 @@ subdir('dict_int') subdir('dict_xsyn') subdir('earthdistance') subdir('file_fdw') +subdir('fsync_checker') subdir('fuzzystrmatch') subdir('hstore') subdir('hstore_plperl') diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index c225e778bde2c..9d3ca4dc7e431 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -209,6 +209,8 @@ const struct config_enum_entry archive_mode_options[] = { */ CheckpointStatsData CheckpointStats; +checkpoint_create_hook_type checkpoint_create_hook = NULL; + /* * During recovery, lastFullPageWrites keeps track of full_page_writes that * the replayed WAL records indicate. It's initialized with full_page_writes @@ -7095,6 +7097,9 @@ CreateCheckPoint(int flags) */ END_CRIT_SECTION(); + if (checkpoint_create_hook != NULL) + checkpoint_create_hook(&checkPoint); + /* * In some cases there are groups of actions that must all occur on one * side or the other of a checkpoint record. 
Before flushing the diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index bf0241aed0ced..36f0bcc75a3a6 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -907,6 +907,11 @@ PostmasterMain(int argc, char *argv[]) */ ApplyLauncherRegister(); + /* + * Register built-in managers that are not part of static arrays + */ + register_builtin_dynamic_managers(); + /* * process any libraries that should be preloaded at postmaster start */ diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c index 6796756358f34..f3e52b2b15884 100644 --- a/src/backend/storage/smgr/md.c +++ b/src/backend/storage/smgr/md.c @@ -84,6 +84,21 @@ typedef struct _MdfdVec } MdfdVec; static MemoryContext MdCxt; /* context for all MdfdVec objects */ +SMgrId MdSMgrId; + +typedef struct MdSMgrRelationData +{ + /* parent data */ + SMgrRelationData reln; + /* + * for md.c; per-fork arrays of the number of open segments + * (md_num_open_segs) and the segments themselves (md_seg_fds). + */ + int md_num_open_segs[MAX_FORKNUM + 1]; + struct _MdfdVec *md_seg_fds[MAX_FORKNUM + 1]; +} MdSMgrRelationData; + +typedef MdSMgrRelationData *MdSMgrRelation; /* Populate a file tag describing an md.c segment file. 
*/ @@ -118,26 +133,53 @@ static MemoryContext MdCxt; /* context for all MdfdVec objects */ #define EXTENSION_DONT_OPEN (1 << 5) +void mdsmgr_register(void) +{ + /* magnetic disk */ + f_smgr md_smgr = (f_smgr) { + .name = MdSMgrName, + .smgr_init = mdinit, + .smgr_shutdown = NULL, + .smgr_open = mdopen, + .smgr_close = mdclose, + .smgr_create = mdcreate, + .smgr_exists = mdexists, + .smgr_unlink = mdunlink, + .smgr_extend = mdextend, + .smgr_zeroextend = mdzeroextend, + .smgr_prefetch = mdprefetch, + .smgr_readv = mdreadv, + .smgr_writev = mdwritev, + .smgr_writeback = mdwriteback, + .smgr_nblocks = mdnblocks, + .smgr_truncate = mdtruncate, + .smgr_immedsync = mdimmedsync, + .smgr_registersync = mdregistersync, + }; + + MdSMgrId = smgr_register(&md_smgr, sizeof(MdSMgrRelationData)); +} + /* local routines */ static void mdunlinkfork(RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo); -static MdfdVec *mdopenfork(SMgrRelation reln, ForkNumber forknum, int behavior); -static void register_dirty_segment(SMgrRelation reln, ForkNumber forknum, +static MdfdVec *mdopenfork(MdSMgrRelation reln, ForkNumber forknum, int behavior); +static void register_dirty_segment(MdSMgrRelation reln, ForkNumber forknum, MdfdVec *seg); static void register_unlink_segment(RelFileLocatorBackend rlocator, ForkNumber forknum, BlockNumber segno); static void register_forget_request(RelFileLocatorBackend rlocator, ForkNumber forknum, BlockNumber segno); -static void _fdvec_resize(SMgrRelation reln, +static void _fdvec_resize(MdSMgrRelation reln, ForkNumber forknum, int nseg); -static char *_mdfd_segpath(SMgrRelation reln, ForkNumber forknum, +static char *_mdfd_segpath(MdSMgrRelation reln, ForkNumber forknum, BlockNumber segno); -static MdfdVec *_mdfd_openseg(SMgrRelation reln, ForkNumber forknum, +static MdfdVec *_mdfd_openseg(MdSMgrRelation reln, ForkNumber forknum, BlockNumber segno, int oflags); -static MdfdVec *_mdfd_getseg(SMgrRelation reln, ForkNumber forknum, +static MdfdVec 
*_mdfd_getseg(MdSMgrRelation reln, ForkNumber forknum, BlockNumber blkno, bool skipFsync, int behavior); -static BlockNumber _mdnblocks(SMgrRelation reln, ForkNumber forknum, +static BlockNumber _mdnblocks(MdSMgrRelation reln, ForkNumber forknum, MdfdVec *seg); static inline int @@ -170,6 +212,8 @@ mdinit(void) bool mdexists(SMgrRelation reln, ForkNumber forknum) { + MdSMgrRelation mdreln = (MdSMgrRelation) reln; + /* * Close it first, to ensure that we notice if the fork has been unlinked * since we opened it. As an optimization, we can skip that in recovery, @@ -178,7 +222,7 @@ mdexists(SMgrRelation reln, ForkNumber forknum) if (!InRecovery) mdclose(reln, forknum); - return (mdopenfork(reln, forknum, EXTENSION_RETURN_NULL) != NULL); + return (mdopenfork(mdreln, forknum, EXTENSION_RETURN_NULL) != NULL); } /* @@ -192,11 +236,13 @@ mdcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo) MdfdVec *mdfd; char *path; File fd; + MdSMgrRelation mdreln = (MdSMgrRelation) reln; + // Assert(reln->smgr_which == MdSMgrId); - if (isRedo && reln->md_num_open_segs[forknum] > 0) + if (isRedo && mdreln->md_num_open_segs[forknum] > 0) return; /* created and opened already... 
*/ - Assert(reln->md_num_open_segs[forknum] == 0); + Assert(mdreln->md_num_open_segs[forknum] == 0); /* * We may be using the target table space for the first time in this @@ -233,13 +279,13 @@ mdcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo) pfree(path); - _fdvec_resize(reln, forknum, 1); - mdfd = &reln->md_seg_fds[forknum][0]; + _fdvec_resize(mdreln, forknum, 1); + mdfd = &mdreln->md_seg_fds[forknum][0]; mdfd->mdfd_vfd = fd; mdfd->mdfd_segno = 0; if (!SmgrIsTemp(reln)) - register_dirty_segment(reln, forknum, mdfd); + register_dirty_segment(mdreln, forknum, mdfd); } /* @@ -463,6 +509,7 @@ mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, off_t seekpos; int nbytes; MdfdVec *v; + MdSMgrRelation mdreln = (MdSMgrRelation) reln; /* If this build supports direct I/O, the buffer must be I/O aligned. */ if (PG_O_DIRECT != 0 && PG_IO_ALIGN_SIZE <= BLCKSZ) @@ -486,7 +533,7 @@ mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, relpath(reln->smgr_rlocator, forknum), InvalidBlockNumber))); - v = _mdfd_getseg(reln, forknum, blocknum, skipFsync, EXTENSION_CREATE); + v = _mdfd_getseg(mdreln, forknum, blocknum, skipFsync, EXTENSION_CREATE); seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); @@ -510,9 +557,9 @@ mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, } if (!skipFsync && !SmgrIsTemp(reln)) - register_dirty_segment(reln, forknum, v); + register_dirty_segment(mdreln, forknum, v); - Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE)); + Assert(_mdnblocks(mdreln, forknum, v) <= ((BlockNumber) RELSEG_SIZE)); } /* @@ -528,6 +575,7 @@ mdzeroextend(SMgrRelation reln, ForkNumber forknum, MdfdVec *v; BlockNumber curblocknum = blocknum; int remblocks = nblocks; + MdSMgrRelation mdreln = (MdSMgrRelation) reln; Assert(nblocks > 0); @@ -559,7 +607,7 @@ mdzeroextend(SMgrRelation reln, ForkNumber forknum, else numblocks = remblocks; - v = _mdfd_getseg(reln, forknum, curblocknum, 
skipFsync, EXTENSION_CREATE); + v = _mdfd_getseg(mdreln, forknum, curblocknum, skipFsync, EXTENSION_CREATE); Assert(segstartblock < RELSEG_SIZE); Assert(segstartblock + numblocks <= RELSEG_SIZE); @@ -614,9 +662,9 @@ mdzeroextend(SMgrRelation reln, ForkNumber forknum, } if (!skipFsync && !SmgrIsTemp(reln)) - register_dirty_segment(reln, forknum, v); + register_dirty_segment(mdreln, forknum, v); - Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE)); + Assert(_mdnblocks(mdreln, forknum, v) <= ((BlockNumber) RELSEG_SIZE)); remblocks -= numblocks; curblocknum += numblocks; @@ -634,7 +682,7 @@ mdzeroextend(SMgrRelation reln, ForkNumber forknum, * invent one out of whole cloth. */ static MdfdVec * -mdopenfork(SMgrRelation reln, ForkNumber forknum, int behavior) +mdopenfork(MdSMgrRelation reln, ForkNumber forknum, int behavior) { MdfdVec *mdfd; char *path; @@ -644,7 +692,7 @@ mdopenfork(SMgrRelation reln, ForkNumber forknum, int behavior) if (reln->md_num_open_segs[forknum] > 0) return &reln->md_seg_fds[forknum][0]; - path = relpath(reln->smgr_rlocator, forknum); + path = relpath(reln->reln.smgr_rlocator, forknum); fd = PathNameOpenFile(path, _mdfd_open_flags()); @@ -679,9 +727,10 @@ mdopenfork(SMgrRelation reln, ForkNumber forknum, int behavior) void mdopen(SMgrRelation reln) { + MdSMgrRelation mdreln = (MdSMgrRelation) reln; /* mark it not open */ for (int forknum = 0; forknum <= MAX_FORKNUM; forknum++) - reln->md_num_open_segs[forknum] = 0; + mdreln->md_num_open_segs[forknum] = 0; } /* @@ -690,7 +739,8 @@ mdopen(SMgrRelation reln) void mdclose(SMgrRelation reln, ForkNumber forknum) { - int nopensegs = reln->md_num_open_segs[forknum]; + MdSMgrRelation mdreln = (MdSMgrRelation) reln; + int nopensegs = mdreln->md_num_open_segs[forknum]; /* No work if already closed */ if (nopensegs == 0) @@ -699,10 +749,10 @@ mdclose(SMgrRelation reln, ForkNumber forknum) /* close segments starting from the end */ while (nopensegs > 0) { - MdfdVec *v = 
&reln->md_seg_fds[forknum][nopensegs - 1]; + MdfdVec *v = &mdreln->md_seg_fds[forknum][nopensegs - 1]; FileClose(v->mdfd_vfd); - _fdvec_resize(reln, forknum, nopensegs - 1); + _fdvec_resize(mdreln, forknum, nopensegs - 1); nopensegs--; } } @@ -715,6 +765,7 @@ mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nblocks) { #ifdef USE_PREFETCH + MdSMgrRelation mdreln = (MdSMgrRelation) reln; Assert((io_direct_flags & IO_DIRECT_DATA) == 0); @@ -727,7 +778,7 @@ mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, MdfdVec *v; int nblocks_this_segment; - v = _mdfd_getseg(reln, forknum, blocknum, false, + v = _mdfd_getseg(mdreln, forknum, blocknum, false, InRecovery ? EXTENSION_RETURN_NULL : EXTENSION_FAIL); if (v == NULL) return false; @@ -810,6 +861,8 @@ void mdreadv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, void **buffers, BlockNumber nblocks) { + MdSMgrRelation mdreln = (MdSMgrRelation) reln; + while (nblocks > 0) { struct iovec iov[PG_IOV_MAX]; @@ -821,7 +874,7 @@ mdreadv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, size_t transferred_this_segment; size_t size_this_segment; - v = _mdfd_getseg(reln, forknum, blocknum, false, + v = _mdfd_getseg(mdreln, forknum, blocknum, false, EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY); seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); @@ -928,6 +981,8 @@ void mdwritev(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void **buffers, BlockNumber nblocks, bool skipFsync) { + MdSMgrRelation mdreln = (MdSMgrRelation) reln; + /* This assert is too expensive to have on normally ... 
*/ #ifdef CHECK_WRITE_VS_EXTEND Assert((uint64) blocknum + (uint64) nblocks <= (uint64) mdnblocks(reln, forknum)); @@ -944,7 +999,7 @@ mdwritev(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, size_t transferred_this_segment; size_t size_this_segment; - v = _mdfd_getseg(reln, forknum, blocknum, skipFsync, + v = _mdfd_getseg(mdreln, forknum, blocknum, skipFsync, EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY); seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); @@ -1011,7 +1066,7 @@ mdwritev(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, } if (!skipFsync && !SmgrIsTemp(reln)) - register_dirty_segment(reln, forknum, v); + register_dirty_segment(mdreln, forknum, v); nblocks -= nblocks_this_segment; buffers += nblocks_this_segment; @@ -1030,6 +1085,7 @@ void mdwriteback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber nblocks) { + MdSMgrRelation mdreln = (MdSMgrRelation) reln; Assert((io_direct_flags & IO_DIRECT_DATA) == 0); /* @@ -1044,7 +1100,7 @@ mdwriteback(SMgrRelation reln, ForkNumber forknum, int segnum_start, segnum_end; - v = _mdfd_getseg(reln, forknum, blocknum, true /* not used */ , + v = _mdfd_getseg(mdreln, forknum, blocknum, true /* not used */ , EXTENSION_DONT_OPEN); /* @@ -1091,11 +1147,12 @@ mdnblocks(SMgrRelation reln, ForkNumber forknum) MdfdVec *v; BlockNumber nblocks; BlockNumber segno; + MdSMgrRelation mdreln = (MdSMgrRelation) reln; - mdopenfork(reln, forknum, EXTENSION_FAIL); + mdopenfork(mdreln, forknum, EXTENSION_FAIL); /* mdopen has opened the first segment */ - Assert(reln->md_num_open_segs[forknum] > 0); + Assert(mdreln->md_num_open_segs[forknum] > 0); /* * Start from the last open segments, to avoid redundant seeks. We have @@ -1110,12 +1167,12 @@ mdnblocks(SMgrRelation reln, ForkNumber forknum) * that's OK because the checkpointer never needs to compute relation * size.) 
*/ - segno = reln->md_num_open_segs[forknum] - 1; - v = &reln->md_seg_fds[forknum][segno]; + segno = mdreln->md_num_open_segs[forknum] - 1; + v = &mdreln->md_seg_fds[forknum][segno]; for (;;) { - nblocks = _mdnblocks(reln, forknum, v); + nblocks = _mdnblocks(mdreln, forknum, v); if (nblocks > ((BlockNumber) RELSEG_SIZE)) elog(FATAL, "segment too big"); if (nblocks < ((BlockNumber) RELSEG_SIZE)) @@ -1133,7 +1190,7 @@ mdnblocks(SMgrRelation reln, ForkNumber forknum) * undermines _mdfd_getseg's attempts to notice and report an error * upon access to a missing segment. */ - v = _mdfd_openseg(reln, forknum, segno, 0); + v = _mdfd_openseg(mdreln, forknum, segno, 0); if (v == NULL) return segno * ((BlockNumber) RELSEG_SIZE); } @@ -1148,6 +1205,7 @@ mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) BlockNumber curnblk; BlockNumber priorblocks; int curopensegs; + MdSMgrRelation mdreln = (MdSMgrRelation) reln; /* * NOTE: mdnblocks makes sure we have opened all active segments, so that @@ -1171,14 +1229,14 @@ mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) * Truncate segments, starting at the last one. Starting at the end makes * managing the memory for the fd array easier, should there be errors. 
*/ - curopensegs = reln->md_num_open_segs[forknum]; + curopensegs = mdreln->md_num_open_segs[forknum]; while (curopensegs > 0) { MdfdVec *v; priorblocks = (curopensegs - 1) * RELSEG_SIZE; - v = &reln->md_seg_fds[forknum][curopensegs - 1]; + v = &mdreln->md_seg_fds[forknum][curopensegs - 1]; if (priorblocks > nblocks) { @@ -1193,13 +1251,13 @@ mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) FilePathName(v->mdfd_vfd)))); if (!SmgrIsTemp(reln)) - register_dirty_segment(reln, forknum, v); + register_dirty_segment(mdreln, forknum, v); /* we never drop the 1st segment */ - Assert(v != &reln->md_seg_fds[forknum][0]); + Assert(v != &mdreln->md_seg_fds[forknum][0]); FileClose(v->mdfd_vfd); - _fdvec_resize(reln, forknum, curopensegs - 1); + _fdvec_resize(mdreln, forknum, curopensegs - 1); } else if (priorblocks + ((BlockNumber) RELSEG_SIZE) > nblocks) { @@ -1219,7 +1277,7 @@ mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) FilePathName(v->mdfd_vfd), nblocks))); if (!SmgrIsTemp(reln)) - register_dirty_segment(reln, forknum, v); + register_dirty_segment(mdreln, forknum, v); } else { @@ -1292,6 +1350,7 @@ mdimmedsync(SMgrRelation reln, ForkNumber forknum) { int segno; int min_inactive_seg; + MdSMgrRelation mdreln = (MdSMgrRelation) reln; /* * NOTE: mdnblocks makes sure we have opened all active segments, so that @@ -1299,7 +1358,7 @@ mdimmedsync(SMgrRelation reln, ForkNumber forknum) */ mdnblocks(reln, forknum); - min_inactive_seg = segno = reln->md_num_open_segs[forknum]; + min_inactive_seg = segno = mdreln->md_num_open_segs[forknum]; /* * Temporarily open inactive segments, then close them after sync. There @@ -1307,12 +1366,12 @@ mdimmedsync(SMgrRelation reln, ForkNumber forknum) * is harmless. We don't bother to clean them up and take a risk of * further trouble. The next mdclose() will soon close them. 
*/ - while (_mdfd_openseg(reln, forknum, segno, 0) != NULL) + while (_mdfd_openseg(mdreln, forknum, segno, 0) != NULL) segno++; while (segno > 0) { - MdfdVec *v = &reln->md_seg_fds[forknum][segno - 1]; + MdfdVec *v = &mdreln->md_seg_fds[forknum][segno - 1]; /* * fsyncs done through mdimmedsync() should be tracked in a separate @@ -1333,7 +1392,7 @@ mdimmedsync(SMgrRelation reln, ForkNumber forknum) if (segno > min_inactive_seg) { FileClose(v->mdfd_vfd); - _fdvec_resize(reln, forknum, segno - 1); + _fdvec_resize(mdreln, forknum, segno - 1); } segno--; @@ -1350,14 +1409,14 @@ mdimmedsync(SMgrRelation reln, ForkNumber forknum) * enough to be a performance problem). */ static void -register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg) +register_dirty_segment(MdSMgrRelation reln, ForkNumber forknum, MdfdVec *seg) { FileTag tag; - INIT_MD_FILETAG(tag, reln->smgr_rlocator.locator, forknum, seg->mdfd_segno); + INIT_MD_FILETAG(tag, reln->reln.smgr_rlocator.locator, forknum, seg->mdfd_segno); /* Temp relations should never be fsync'd */ - Assert(!SmgrIsTemp(reln)); + Assert(!SmgrIsTemp(&reln->reln)); if (!RegisterSyncRequest(&tag, SYNC_REQUEST, false /* retryOnError */ )) { @@ -1475,7 +1534,7 @@ DropRelationFiles(RelFileLocator *delrels, int ndelrels, bool isRedo) * _fdvec_resize() -- Resize the fork's open segments array */ static void -_fdvec_resize(SMgrRelation reln, +_fdvec_resize(MdSMgrRelation reln, ForkNumber forknum, int nseg) { @@ -1513,12 +1572,12 @@ _fdvec_resize(SMgrRelation reln, * returned string is palloc'd. */ static char * -_mdfd_segpath(SMgrRelation reln, ForkNumber forknum, BlockNumber segno) +_mdfd_segpath(MdSMgrRelation reln, ForkNumber forknum, BlockNumber segno) { char *path, *fullpath; - path = relpath(reln->smgr_rlocator, forknum); + path = relpath(reln->reln.smgr_rlocator, forknum); if (segno > 0) { @@ -1536,7 +1595,7 @@ _mdfd_segpath(SMgrRelation reln, ForkNumber forknum, BlockNumber segno) * and make a MdfdVec object for it. 
Returns NULL on failure. */ static MdfdVec * -_mdfd_openseg(SMgrRelation reln, ForkNumber forknum, BlockNumber segno, +_mdfd_openseg(MdSMgrRelation reln, ForkNumber forknum, BlockNumber segno, int oflags) { MdfdVec *v; @@ -1581,7 +1640,7 @@ _mdfd_openseg(SMgrRelation reln, ForkNumber forknum, BlockNumber segno, * EXTENSION_CREATE case. */ static MdfdVec * -_mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, +_mdfd_getseg(MdSMgrRelation reln, ForkNumber forknum, BlockNumber blkno, bool skipFsync, int behavior) { MdfdVec *v; @@ -1655,7 +1714,7 @@ _mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, char *zerobuf = palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, MCXT_ALLOC_ZERO); - mdextend(reln, forknum, + mdextend((SMgrRelation) reln, forknum, nextsegno * ((BlockNumber) RELSEG_SIZE) - 1, zerobuf, skipFsync); pfree(zerobuf); @@ -1712,7 +1771,7 @@ _mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, * Get number of blocks present in a single disk file */ static BlockNumber -_mdnblocks(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg) +_mdnblocks(MdSMgrRelation reln, ForkNumber forknum, MdfdVec *seg) { off_t len; @@ -1735,7 +1794,7 @@ _mdnblocks(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg) int mdsyncfiletag(const FileTag *ftag, char *path) { - SMgrRelation reln = smgropen(ftag->rlocator, INVALID_PROC_NUMBER); + MdSMgrRelation reln = (MdSMgrRelation) smgropen(ftag->rlocator, INVALID_PROC_NUMBER); File file; instr_time io_start; bool need_to_close; diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c index a691aed1f405f..42576d266f717 100644 --- a/src/backend/storage/smgr/smgr.c +++ b/src/backend/storage/smgr/smgr.c @@ -53,81 +53,25 @@ #include "access/xlogutils.h" #include "lib/ilist.h" +#include "miscadmin.h" #include "storage/bufmgr.h" #include "storage/ipc.h" #include "storage/md.h" #include "storage/smgr.h" +#include "port/atomics.h" #include "utils/hsearch.h" #include 
"utils/inval.h" +#include "utils/memutils.h" -/* - * This struct of function pointers defines the API between smgr.c and - * any individual storage manager module. Note that smgr subfunctions are - * generally expected to report problems via elog(ERROR). An exception is - * that smgr_unlink should use elog(WARNING), rather than erroring out, - * because we normally unlink relations during post-commit/abort cleanup, - * and so it's too late to raise an error. Also, various conditions that - * would normally be errors should be allowed during bootstrap and/or WAL - * recovery --- see comments in md.c for details. - */ -typedef struct f_smgr -{ - void (*smgr_init) (void); /* may be NULL */ - void (*smgr_shutdown) (void); /* may be NULL */ - void (*smgr_open) (SMgrRelation reln); - void (*smgr_close) (SMgrRelation reln, ForkNumber forknum); - void (*smgr_create) (SMgrRelation reln, ForkNumber forknum, - bool isRedo); - bool (*smgr_exists) (SMgrRelation reln, ForkNumber forknum); - void (*smgr_unlink) (RelFileLocatorBackend rlocator, ForkNumber forknum, - bool isRedo); - void (*smgr_extend) (SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, const void *buffer, bool skipFsync); - void (*smgr_zeroextend) (SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, int nblocks, bool skipFsync); - bool (*smgr_prefetch) (SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, int nblocks); - void (*smgr_readv) (SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, - void **buffers, BlockNumber nblocks); - void (*smgr_writev) (SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, - const void **buffers, BlockNumber nblocks, - bool skipFsync); - void (*smgr_writeback) (SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, BlockNumber nblocks); - BlockNumber (*smgr_nblocks) (SMgrRelation reln, ForkNumber forknum); - void (*smgr_truncate) (SMgrRelation reln, ForkNumber forknum, - BlockNumber nblocks); - void (*smgr_immedsync) 
(SMgrRelation reln, ForkNumber forknum); - void (*smgr_registersync) (SMgrRelation reln, ForkNumber forknum); -} f_smgr; - -static const f_smgr smgrsw[] = { - /* magnetic disk */ - { - .smgr_init = mdinit, - .smgr_shutdown = NULL, - .smgr_open = mdopen, - .smgr_close = mdclose, - .smgr_create = mdcreate, - .smgr_exists = mdexists, - .smgr_unlink = mdunlink, - .smgr_extend = mdextend, - .smgr_zeroextend = mdzeroextend, - .smgr_prefetch = mdprefetch, - .smgr_readv = mdreadv, - .smgr_writev = mdwritev, - .smgr_writeback = mdwriteback, - .smgr_nblocks = mdnblocks, - .smgr_truncate = mdtruncate, - .smgr_immedsync = mdimmedsync, - .smgr_registersync = mdregistersync, - } -}; +static f_smgr *smgrsw; -static const int NSmgr = lengthof(smgrsw); +static int NSmgr = 0; + +static Size LargestSMgrRelationSize = 0; + +char *storage_manager_string; +SMgrId storage_manager_id; /* * Each backend has a hashtable that stores all extant SMgrRelation objects. @@ -141,6 +85,57 @@ static dlist_head unpinned_relns; static void smgrshutdown(int code, Datum arg); static void smgrdestroy(SMgrRelation reln); +SMgrId +smgr_register(const f_smgr *smgr, Size smgrrelation_size) +{ + SMgrId my_id; + MemoryContext old; + + if (process_shared_preload_libraries_done) + elog(FATAL, "SMgrs must be registered in the shared_preload_libraries phase"); + if (NSmgr == MaxSMgrId) + elog(FATAL, "Too many smgrs registered"); + if (smgr->name == NULL || *smgr->name == 0) + elog(FATAL, "smgr registered with invalid name"); + + Assert(smgr->smgr_open != NULL); + Assert(smgr->smgr_close != NULL); + Assert(smgr->smgr_create != NULL); + Assert(smgr->smgr_exists != NULL); + Assert(smgr->smgr_unlink != NULL); + Assert(smgr->smgr_extend != NULL); + Assert(smgr->smgr_zeroextend != NULL); + Assert(smgr->smgr_prefetch != NULL); + Assert(smgr->smgr_readv != NULL); + Assert(smgr->smgr_writev != NULL); + Assert(smgr->smgr_writeback != NULL); + Assert(smgr->smgr_nblocks != NULL); + Assert(smgr->smgr_truncate != NULL); + 
Assert(smgr->smgr_immedsync != NULL); + old = MemoryContextSwitchTo(TopMemoryContext); + + my_id = NSmgr++; + if (my_id == 0) + smgrsw = palloc(sizeof(f_smgr)); + else + smgrsw = repalloc(smgrsw, sizeof(f_smgr) * NSmgr); + + MemoryContextSwitchTo(old); + + pg_compiler_barrier(); + + if (!smgrsw) + { + NSmgr--; + elog(FATAL, "Failed to extend smgr array"); + } + + memcpy(&smgrsw[my_id], smgr, sizeof(f_smgr)); + + LargestSMgrRelationSize = Max(LargestSMgrRelationSize, smgrrelation_size); + + return my_id; +} /* * smgrinit(), smgrshutdown() -- Initialize or shut down storage @@ -207,9 +202,11 @@ smgropen(RelFileLocator rlocator, ProcNumber backend) { /* First time through: initialize the hash table */ HASHCTL ctl; + LargestSMgrRelationSize = MAXALIGN(LargestSMgrRelationSize); + Assert(NSmgr > 0); ctl.keysize = sizeof(RelFileLocatorBackend); - ctl.entrysize = sizeof(SMgrRelationData); + ctl.entrysize = LargestSMgrRelationSize; SMgrRelationHash = hash_create("smgr relation table", 400, &ctl, HASH_ELEM | HASH_BLOBS); dlist_init(&unpinned_relns); @@ -229,7 +226,8 @@ smgropen(RelFileLocator rlocator, ProcNumber backend) reln->smgr_targblock = InvalidBlockNumber; for (int i = 0; i <= MAX_FORKNUM; ++i) reln->smgr_cached_nblocks[i] = InvalidBlockNumber; - reln->smgr_which = 0; /* we only have md.c at present */ + + reln->smgr_which = storage_manager_id; /* implementation-specific initialization */ smgrsw[reln->smgr_which].smgr_open(reln); diff --git a/src/backend/utils/init/miscinit.c b/src/backend/utils/init/miscinit.c index 537d92c0cfde2..7b52f104c8754 100644 --- a/src/backend/utils/init/miscinit.c +++ b/src/backend/utils/init/miscinit.c @@ -42,6 +42,7 @@ #include "replication/slotsync.h" #include "storage/fd.h" #include "storage/ipc.h" +#include "storage/md.h" #include "storage/latch.h" #include "storage/pg_shmem.h" #include "storage/pmsignal.h" @@ -198,6 +199,9 @@ InitStandaloneProcess(const char *argv0) InitProcessLocalLatch(); InitializeLatchWaitSet(); + /* Initialize 
smgrs */ + register_builtin_dynamic_managers(); + /* * For consistency with InitPostmasterChild, initialize signal mask here. * But we don't unblock SIGQUIT or provide a default handler for it. @@ -1864,6 +1868,17 @@ process_session_preload_libraries(void) true); } +/* + * Register any internal managers. + */ +void +register_builtin_dynamic_managers(void) +{ + mdsmgr_register(); + + storage_manager_id = MdSMgrId; +} + /* * process any shared memory requests from preloaded libraries */ diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index 2c507ea618c37..4df3d76ad8a1d 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -13,6 +13,7 @@ #include "access/xlogbackup.h" #include "access/xlogdefs.h" +#include "catalog/pg_control.h" #include "datatype/timestamp.h" #include "lib/stringinfo.h" #include "nodes/pg_list.h" @@ -57,6 +58,9 @@ extern PGDLLIMPORT int wal_decode_buffer_size; extern PGDLLIMPORT int CheckPointSegments; +typedef void (*checkpoint_create_hook_type)(const CheckPoint *); +extern PGDLLIMPORT checkpoint_create_hook_type checkpoint_create_hook; + /* Archive modes */ typedef enum ArchiveMode { diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h index 90f9b21b2584d..7cc4a33e1786c 100644 --- a/src/include/miscadmin.h +++ b/src/include/miscadmin.h @@ -500,6 +500,7 @@ extern void TouchSocketLockFiles(void); extern void AddToDataDirLockFile(int target_line, const char *str); extern bool RecheckDataDirLockFile(void); extern void ValidatePgVersion(const char *path); +extern void register_builtin_dynamic_managers(void); extern void process_shared_preload_libraries(void); extern void process_session_preload_libraries(void); extern void process_shmem_requests(void); diff --git a/src/include/storage/md.h b/src/include/storage/md.h index 620f10abdeb58..d36eaf6451f37 100644 --- a/src/include/storage/md.h +++ b/src/include/storage/md.h @@ -19,6 +19,12 @@ #include "storage/smgr.h" #include "storage/sync.h" +#define 
MdSMgrName "md" + +/* registration function for md storage manager */ +extern void mdsmgr_register(void); +extern SMgrId MdSMgrId; + /* md storage manager functionality */ extern void mdinit(void); extern void mdopen(SMgrRelation reln); diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h index e15b20a566a0d..2df94be088fe7 100644 --- a/src/include/storage/smgr.h +++ b/src/include/storage/smgr.h @@ -18,6 +18,12 @@ #include "storage/block.h" #include "storage/relfilelocator.h" +typedef uint8 SMgrId; + +#define MaxSMgrId UINT8_MAX + +extern PGDLLIMPORT SMgrId storage_manager_id; + /* * smgr.c maintains a table of SMgrRelation objects, which are essentially * cached file handles. An SMgrRelation is created (if not already present) @@ -51,14 +57,8 @@ typedef struct SMgrRelationData * Fields below here are intended to be private to smgr.c and its * submodules. Do not touch them from elsewhere. */ - int smgr_which; /* storage manager selector */ - - /* - * for md.c; per-fork arrays of the number of open segments - * (md_num_open_segs) and the segments themselves (md_seg_fds). - */ - int md_num_open_segs[MAX_FORKNUM + 1]; - struct _MdfdVec *md_seg_fds[MAX_FORKNUM + 1]; + SMgrId smgr_which; /* storage manager selector */ + int smgrrelation_size; /* size of this struct, incl. smgr-specific data */ /* * Pinning support. If unpinned (ie. pincount == 0), 'node' is a list @@ -73,6 +73,52 @@ typedef SMgrRelationData *SMgrRelation; #define SmgrIsTemp(smgr) \ RelFileLocatorBackendIsTemp((smgr)->smgr_rlocator) +/* + * This struct of function pointers defines the API between smgr.c and + * any individual storage manager module. Note that smgr subfunctions are + * generally expected to report problems via elog(ERROR). An exception is + * that smgr_unlink should use elog(WARNING), rather than erroring out, + * because we normally unlink relations during post-commit/abort cleanup, + * and so it's too late to raise an error. 
Also, various conditions that + * would normally be errors should be allowed during bootstrap and/or WAL + * recovery --- see comments in md.c for details. + */ +typedef struct f_smgr +{ + const char *name; + void (*smgr_init) (void); /* may be NULL */ + void (*smgr_shutdown) (void); /* may be NULL */ + void (*smgr_open) (SMgrRelation reln); + void (*smgr_close) (SMgrRelation reln, ForkNumber forknum); + void (*smgr_create) (SMgrRelation reln, ForkNumber forknum, + bool isRedo); + bool (*smgr_exists) (SMgrRelation reln, ForkNumber forknum); + void (*smgr_unlink) (RelFileLocatorBackend rlocator, ForkNumber forknum, + bool isRedo); + void (*smgr_extend) (SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, const void *buffer, bool skipFsync); + void (*smgr_zeroextend) (SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, int nblocks, bool skipFsync); + bool (*smgr_prefetch) (SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, int nblocks); + void (*smgr_readv) (SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, + void **buffers, BlockNumber nblocks); + void (*smgr_writev) (SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, + const void **buffers, BlockNumber nblocks, + bool skipFsync); + void (*smgr_writeback) (SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, BlockNumber nblocks); + BlockNumber (*smgr_nblocks) (SMgrRelation reln, ForkNumber forknum); + void (*smgr_truncate) (SMgrRelation reln, ForkNumber forknum, + BlockNumber nblocks); + void (*smgr_immedsync) (SMgrRelation reln, ForkNumber forknum); + void (*smgr_registersync) (SMgrRelation reln, ForkNumber forknum); +} f_smgr; + +extern SMgrId smgr_register(const f_smgr *smgr, Size smgrrelation_size); + extern void smgrinit(void); extern SMgrRelation smgropen(RelFileLocator rlocator, ProcNumber backend); extern bool smgrexists(SMgrRelation reln, ForkNumber forknum); From 5dad05c271b05824dc424f9a02d7e276db0da38c Mon Sep 17 00:00:00 2001 From: 
Zsolt Parragi Date: Sun, 7 Apr 2024 18:45:18 +0100 Subject: [PATCH 06/13] Downloaded smgr patch --- ...-to-extensions-for-manual-implementa.patch | 911 ++++++++++++++++++ ...ons-to-override-the-global-storage-m.patch | 93 ++ .../v1-0003-Add-checkpoint_create_hook.patch | 60 ++ .../v1-0004-Add-contrib-fsync_checker.patch | 341 +++++++ 4 files changed, 1405 insertions(+) create mode 100644 smgr_patch/v1-0001-Expose-f_smgr-to-extensions-for-manual-implementa.patch create mode 100644 smgr_patch/v1-0002-Allow-extensions-to-override-the-global-storage-m.patch create mode 100644 smgr_patch/v1-0003-Add-checkpoint_create_hook.patch create mode 100644 smgr_patch/v1-0004-Add-contrib-fsync_checker.patch diff --git a/smgr_patch/v1-0001-Expose-f_smgr-to-extensions-for-manual-implementa.patch b/smgr_patch/v1-0001-Expose-f_smgr-to-extensions-for-manual-implementa.patch new file mode 100644 index 0000000000000..c68d36b26236f --- /dev/null +++ b/smgr_patch/v1-0001-Expose-f_smgr-to-extensions-for-manual-implementa.patch @@ -0,0 +1,911 @@ +From 5ffbc7c35bb3248501b2517d26f99afe02fb53d6 Mon Sep 17 00:00:00 2001 +From: Matthias van de Meent +Date: Tue, 27 Jun 2023 15:59:23 +0200 +Subject: [PATCH v1 1/5] Expose f_smgr to extensions for manual implementation + +There are various reasons why one would want to create their own +implementation of a storage manager, among which are block-level compression, +encryption and offloading to cold storage. This patch is a first patch that +allows extensions to register their own SMgr. + +Note, however, that this SMgr is not yet used - only the first SMgr to register +is used, and this is currently the md.c smgr. Future commits will include +facilities to select an SMgr for each tablespace. 
+--- + src/backend/postmaster/postmaster.c | 5 + + src/backend/storage/smgr/md.c | 172 +++++++++++++++++++--------- + src/backend/storage/smgr/smgr.c | 129 ++++++++++----------- + src/backend/utils/init/miscinit.c | 13 +++ + src/include/miscadmin.h | 1 + + src/include/storage/md.h | 4 + + src/include/storage/smgr.h | 59 ++++++++-- + 7 files changed, 252 insertions(+), 131 deletions(-) + +diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c +index feb471dd1d..a0e46fe1f2 100644 +--- a/src/backend/postmaster/postmaster.c ++++ b/src/backend/postmaster/postmaster.c +@@ -1010,6 +1010,11 @@ PostmasterMain(int argc, char *argv[]) + */ + ApplyLauncherRegister(); + ++ /* ++ * Register built-in managers that are not part of static arrays ++ */ ++ register_builtin_dynamic_managers(); ++ + /* + * process any libraries that should be preloaded at postmaster start + */ +diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c +index b1e9932a29..66a93101ab 100644 +--- a/src/backend/storage/smgr/md.c ++++ b/src/backend/storage/smgr/md.c +@@ -87,6 +87,21 @@ typedef struct _MdfdVec + } MdfdVec; + + static MemoryContext MdCxt; /* context for all MdfdVec objects */ ++SMgrId MdSMgrId; ++ ++typedef struct MdSMgrRelationData ++{ ++ /* parent data */ ++ SMgrRelationData reln; ++ /* ++ * for md.c; per-fork arrays of the number of open segments ++ * (md_num_open_segs) and the segments themselves (md_seg_fds). ++ */ ++ int md_num_open_segs[MAX_FORKNUM + 1]; ++ struct _MdfdVec *md_seg_fds[MAX_FORKNUM + 1]; ++} MdSMgrRelationData; ++ ++typedef MdSMgrRelationData *MdSMgrRelation; + + + /* Populate a file tag describing an md.c segment file. 
*/ +@@ -121,26 +136,52 @@ static MemoryContext MdCxt; /* context for all MdfdVec objects */ + #define EXTENSION_DONT_OPEN (1 << 5) + + ++void mdsmgr_register(void) ++{ ++ /* magnetic disk */ ++ f_smgr md_smgr = (f_smgr) { ++ .name = "md", ++ .smgr_init = mdinit, ++ .smgr_shutdown = NULL, ++ .smgr_open = mdopen, ++ .smgr_close = mdclose, ++ .smgr_create = mdcreate, ++ .smgr_exists = mdexists, ++ .smgr_unlink = mdunlink, ++ .smgr_extend = mdextend, ++ .smgr_zeroextend = mdzeroextend, ++ .smgr_prefetch = mdprefetch, ++ .smgr_readv = mdreadv, ++ .smgr_writev = mdwritev, ++ .smgr_writeback = mdwriteback, ++ .smgr_nblocks = mdnblocks, ++ .smgr_truncate = mdtruncate, ++ .smgr_immedsync = mdimmedsync, ++ }; ++ ++ MdSMgrId = smgr_register(&md_smgr, sizeof(MdSMgrRelationData)); ++} ++ + /* local routines */ + static void mdunlinkfork(RelFileLocatorBackend rlocator, ForkNumber forknum, + bool isRedo); +-static MdfdVec *mdopenfork(SMgrRelation reln, ForkNumber forknum, int behavior); +-static void register_dirty_segment(SMgrRelation reln, ForkNumber forknum, ++static MdfdVec *mdopenfork(MdSMgrRelation reln, ForkNumber forknum, int behavior); ++static void register_dirty_segment(MdSMgrRelation reln, ForkNumber forknum, + MdfdVec *seg); + static void register_unlink_segment(RelFileLocatorBackend rlocator, ForkNumber forknum, + BlockNumber segno); + static void register_forget_request(RelFileLocatorBackend rlocator, ForkNumber forknum, + BlockNumber segno); +-static void _fdvec_resize(SMgrRelation reln, ++static void _fdvec_resize(MdSMgrRelation reln, + ForkNumber forknum, + int nseg); +-static char *_mdfd_segpath(SMgrRelation reln, ForkNumber forknum, ++static char *_mdfd_segpath(MdSMgrRelation reln, ForkNumber forknum, + BlockNumber segno); +-static MdfdVec *_mdfd_openseg(SMgrRelation reln, ForkNumber forknum, ++static MdfdVec *_mdfd_openseg(MdSMgrRelation reln, ForkNumber forknum, + BlockNumber segno, int oflags); +-static MdfdVec *_mdfd_getseg(SMgrRelation reln, ForkNumber 
forknum, ++static MdfdVec *_mdfd_getseg(MdSMgrRelation reln, ForkNumber forknum, + BlockNumber blkno, bool skipFsync, int behavior); +-static BlockNumber _mdnblocks(SMgrRelation reln, ForkNumber forknum, ++static BlockNumber _mdnblocks(MdSMgrRelation reln, ForkNumber forknum, + MdfdVec *seg); + + static inline int +@@ -173,6 +214,8 @@ mdinit(void) + bool + mdexists(SMgrRelation reln, ForkNumber forknum) + { ++ MdSMgrRelation mdreln = (MdSMgrRelation) reln; ++ + /* + * Close it first, to ensure that we notice if the fork has been unlinked + * since we opened it. As an optimization, we can skip that in recovery, +@@ -181,7 +224,7 @@ mdexists(SMgrRelation reln, ForkNumber forknum) + if (!InRecovery) + mdclose(reln, forknum); + +- return (mdopenfork(reln, forknum, EXTENSION_RETURN_NULL) != NULL); ++ return (mdopenfork(mdreln, forknum, EXTENSION_RETURN_NULL) != NULL); + } + + /* +@@ -195,11 +238,13 @@ mdcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo) + MdfdVec *mdfd; + char *path; + File fd; ++ MdSMgrRelation mdreln = (MdSMgrRelation) reln; ++ // Assert(reln->smgr_which == MdSMgrId); + +- if (isRedo && reln->md_num_open_segs[forknum] > 0) ++ if (isRedo && mdreln->md_num_open_segs[forknum] > 0) + return; /* created and opened already... 
*/ + +- Assert(reln->md_num_open_segs[forknum] == 0); ++ Assert(mdreln->md_num_open_segs[forknum] == 0); + + /* + * We may be using the target table space for the first time in this +@@ -236,13 +281,13 @@ mdcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo) + + pfree(path); + +- _fdvec_resize(reln, forknum, 1); +- mdfd = &reln->md_seg_fds[forknum][0]; ++ _fdvec_resize(mdreln, forknum, 1); ++ mdfd = &mdreln->md_seg_fds[forknum][0]; + mdfd->mdfd_vfd = fd; + mdfd->mdfd_segno = 0; + + if (!SmgrIsTemp(reln)) +- register_dirty_segment(reln, forknum, mdfd); ++ register_dirty_segment(mdreln, forknum, mdfd); + } + + /* +@@ -466,6 +511,7 @@ mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + off_t seekpos; + int nbytes; + MdfdVec *v; ++ MdSMgrRelation mdreln = (MdSMgrRelation) reln; + + /* If this build supports direct I/O, the buffer must be I/O aligned. */ + if (PG_O_DIRECT != 0 && PG_IO_ALIGN_SIZE <= BLCKSZ) +@@ -489,7 +535,7 @@ mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + relpath(reln->smgr_rlocator, forknum), + InvalidBlockNumber))); + +- v = _mdfd_getseg(reln, forknum, blocknum, skipFsync, EXTENSION_CREATE); ++ v = _mdfd_getseg(mdreln, forknum, blocknum, skipFsync, EXTENSION_CREATE); + + seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); + +@@ -513,9 +559,9 @@ mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + } + + if (!skipFsync && !SmgrIsTemp(reln)) +- register_dirty_segment(reln, forknum, v); ++ register_dirty_segment(mdreln, forknum, v); + +- Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE)); ++ Assert(_mdnblocks(mdreln, forknum, v) <= ((BlockNumber) RELSEG_SIZE)); + } + + /* +@@ -531,6 +577,7 @@ mdzeroextend(SMgrRelation reln, ForkNumber forknum, + MdfdVec *v; + BlockNumber curblocknum = blocknum; + int remblocks = nblocks; ++ MdSMgrRelation mdreln = (MdSMgrRelation) reln; + + Assert(nblocks > 0); + +@@ -562,7 +609,7 @@ mdzeroextend(SMgrRelation reln, 
ForkNumber forknum, + else + numblocks = remblocks; + +- v = _mdfd_getseg(reln, forknum, curblocknum, skipFsync, EXTENSION_CREATE); ++ v = _mdfd_getseg(mdreln, forknum, curblocknum, skipFsync, EXTENSION_CREATE); + + Assert(segstartblock < RELSEG_SIZE); + Assert(segstartblock + numblocks <= RELSEG_SIZE); +@@ -617,9 +664,9 @@ mdzeroextend(SMgrRelation reln, ForkNumber forknum, + } + + if (!skipFsync && !SmgrIsTemp(reln)) +- register_dirty_segment(reln, forknum, v); ++ register_dirty_segment(mdreln, forknum, v); + +- Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE)); ++ Assert(_mdnblocks(mdreln, forknum, v) <= ((BlockNumber) RELSEG_SIZE)); + + remblocks -= numblocks; + curblocknum += numblocks; +@@ -637,7 +684,7 @@ mdzeroextend(SMgrRelation reln, ForkNumber forknum, + * invent one out of whole cloth. + */ + static MdfdVec * +-mdopenfork(SMgrRelation reln, ForkNumber forknum, int behavior) ++mdopenfork(MdSMgrRelation reln, ForkNumber forknum, int behavior) + { + MdfdVec *mdfd; + char *path; +@@ -647,7 +694,7 @@ mdopenfork(SMgrRelation reln, ForkNumber forknum, int behavior) + if (reln->md_num_open_segs[forknum] > 0) + return &reln->md_seg_fds[forknum][0]; + +- path = relpath(reln->smgr_rlocator, forknum); ++ path = relpath(reln->reln.smgr_rlocator, forknum); + + fd = PathNameOpenFile(path, _mdfd_open_flags()); + +@@ -682,9 +729,10 @@ mdopenfork(SMgrRelation reln, ForkNumber forknum, int behavior) + void + mdopen(SMgrRelation reln) + { ++ MdSMgrRelation mdreln = (MdSMgrRelation) reln; + /* mark it not open */ + for (int forknum = 0; forknum <= MAX_FORKNUM; forknum++) +- reln->md_num_open_segs[forknum] = 0; ++ mdreln->md_num_open_segs[forknum] = 0; + } + + /* +@@ -693,7 +741,8 @@ mdopen(SMgrRelation reln) + void + mdclose(SMgrRelation reln, ForkNumber forknum) + { +- int nopensegs = reln->md_num_open_segs[forknum]; ++ MdSMgrRelation mdreln = (MdSMgrRelation) reln; ++ int nopensegs = mdreln->md_num_open_segs[forknum]; + + /* No work if already closed */ 
+ if (nopensegs == 0) +@@ -702,10 +751,10 @@ mdclose(SMgrRelation reln, ForkNumber forknum) + /* close segments starting from the end */ + while (nopensegs > 0) + { +- MdfdVec *v = &reln->md_seg_fds[forknum][nopensegs - 1]; ++ MdfdVec *v = &mdreln->md_seg_fds[forknum][nopensegs - 1]; + + FileClose(v->mdfd_vfd); +- _fdvec_resize(reln, forknum, nopensegs - 1); ++ _fdvec_resize(mdreln, forknum, nopensegs - 1); + nopensegs--; + } + } +@@ -718,6 +767,7 @@ mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + int nblocks) + { + #ifdef USE_PREFETCH ++ MdSMgrRelation mdreln = (MdSMgrRelation) reln; + + Assert((io_direct_flags & IO_DIRECT_DATA) == 0); + +@@ -730,7 +780,7 @@ mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + MdfdVec *v; + int nblocks_this_segment; + +- v = _mdfd_getseg(reln, forknum, blocknum, false, ++ v = _mdfd_getseg(mdreln, forknum, blocknum, false, + InRecovery ? EXTENSION_RETURN_NULL : EXTENSION_FAIL); + if (v == NULL) + return false; +@@ -813,6 +863,8 @@ void + mdreadv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + void **buffers, BlockNumber nblocks) + { ++ MdSMgrRelation mdreln = (MdSMgrRelation) reln; ++ + while (nblocks > 0) + { + struct iovec iov[PG_IOV_MAX]; +@@ -824,7 +876,7 @@ mdreadv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + size_t transferred_this_segment; + size_t size_this_segment; + +- v = _mdfd_getseg(reln, forknum, blocknum, false, ++ v = _mdfd_getseg(mdreln, forknum, blocknum, false, + EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY); + + seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); +@@ -931,6 +983,8 @@ void + mdwritev(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + const void **buffers, BlockNumber nblocks, bool skipFsync) + { ++ MdSMgrRelation mdreln = (MdSMgrRelation) reln; ++ + /* This assert is too expensive to have on normally ... 
*/ + #ifdef CHECK_WRITE_VS_EXTEND + Assert(blocknum < mdnblocks(reln, forknum)); +@@ -947,7 +1001,7 @@ mdwritev(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + size_t transferred_this_segment; + size_t size_this_segment; + +- v = _mdfd_getseg(reln, forknum, blocknum, skipFsync, ++ v = _mdfd_getseg(mdreln, forknum, blocknum, skipFsync, + EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY); + + seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); +@@ -1014,7 +1068,7 @@ mdwritev(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + } + + if (!skipFsync && !SmgrIsTemp(reln)) +- register_dirty_segment(reln, forknum, v); ++ register_dirty_segment(mdreln, forknum, v); + + nblocks -= nblocks_this_segment; + buffers += nblocks_this_segment; +@@ -1033,6 +1087,7 @@ void + mdwriteback(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, BlockNumber nblocks) + { ++ MdSMgrRelation mdreln = (MdSMgrRelation) reln; + Assert((io_direct_flags & IO_DIRECT_DATA) == 0); + + /* +@@ -1047,7 +1102,7 @@ mdwriteback(SMgrRelation reln, ForkNumber forknum, + int segnum_start, + segnum_end; + +- v = _mdfd_getseg(reln, forknum, blocknum, true /* not used */ , ++ v = _mdfd_getseg(mdreln, forknum, blocknum, true /* not used */ , + EXTENSION_DONT_OPEN); + + /* +@@ -1094,11 +1149,12 @@ mdnblocks(SMgrRelation reln, ForkNumber forknum) + MdfdVec *v; + BlockNumber nblocks; + BlockNumber segno; ++ MdSMgrRelation mdreln = (MdSMgrRelation) reln; + +- mdopenfork(reln, forknum, EXTENSION_FAIL); ++ mdopenfork(mdreln, forknum, EXTENSION_FAIL); + + /* mdopen has opened the first segment */ +- Assert(reln->md_num_open_segs[forknum] > 0); ++ Assert(mdreln->md_num_open_segs[forknum] > 0); + + /* + * Start from the last open segments, to avoid redundant seeks. We have +@@ -1113,12 +1169,12 @@ mdnblocks(SMgrRelation reln, ForkNumber forknum) + * that's OK because the checkpointer never needs to compute relation + * size.) 
+ */ +- segno = reln->md_num_open_segs[forknum] - 1; +- v = &reln->md_seg_fds[forknum][segno]; ++ segno = mdreln->md_num_open_segs[forknum] - 1; ++ v = &mdreln->md_seg_fds[forknum][segno]; + + for (;;) + { +- nblocks = _mdnblocks(reln, forknum, v); ++ nblocks = _mdnblocks(mdreln, forknum, v); + if (nblocks > ((BlockNumber) RELSEG_SIZE)) + elog(FATAL, "segment too big"); + if (nblocks < ((BlockNumber) RELSEG_SIZE)) +@@ -1136,7 +1192,7 @@ mdnblocks(SMgrRelation reln, ForkNumber forknum) + * undermines _mdfd_getseg's attempts to notice and report an error + * upon access to a missing segment. + */ +- v = _mdfd_openseg(reln, forknum, segno, 0); ++ v = _mdfd_openseg(mdreln, forknum, segno, 0); + if (v == NULL) + return segno * ((BlockNumber) RELSEG_SIZE); + } +@@ -1151,6 +1207,7 @@ mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) + BlockNumber curnblk; + BlockNumber priorblocks; + int curopensegs; ++ MdSMgrRelation mdreln = (MdSMgrRelation) reln; + + /* + * NOTE: mdnblocks makes sure we have opened all active segments, so that +@@ -1174,14 +1231,14 @@ mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) + * Truncate segments, starting at the last one. Starting at the end makes + * managing the memory for the fd array easier, should there be errors. 
+ */ +- curopensegs = reln->md_num_open_segs[forknum]; ++ curopensegs = mdreln->md_num_open_segs[forknum]; + while (curopensegs > 0) + { + MdfdVec *v; + + priorblocks = (curopensegs - 1) * RELSEG_SIZE; + +- v = &reln->md_seg_fds[forknum][curopensegs - 1]; ++ v = &mdreln->md_seg_fds[forknum][curopensegs - 1]; + + if (priorblocks > nblocks) + { +@@ -1196,13 +1253,13 @@ mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) + FilePathName(v->mdfd_vfd)))); + + if (!SmgrIsTemp(reln)) +- register_dirty_segment(reln, forknum, v); ++ register_dirty_segment(mdreln, forknum, v); + + /* we never drop the 1st segment */ +- Assert(v != &reln->md_seg_fds[forknum][0]); ++ Assert(v != &mdreln->md_seg_fds[forknum][0]); + + FileClose(v->mdfd_vfd); +- _fdvec_resize(reln, forknum, curopensegs - 1); ++ _fdvec_resize(mdreln, forknum, curopensegs - 1); + } + else if (priorblocks + ((BlockNumber) RELSEG_SIZE) > nblocks) + { +@@ -1222,7 +1279,7 @@ mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) + FilePathName(v->mdfd_vfd), + nblocks))); + if (!SmgrIsTemp(reln)) +- register_dirty_segment(reln, forknum, v); ++ register_dirty_segment(mdreln, forknum, v); + } + else + { +@@ -1252,6 +1309,7 @@ mdimmedsync(SMgrRelation reln, ForkNumber forknum) + { + int segno; + int min_inactive_seg; ++ MdSMgrRelation mdreln = (MdSMgrRelation) reln; + + /* + * NOTE: mdnblocks makes sure we have opened all active segments, so that +@@ -1259,7 +1317,7 @@ mdimmedsync(SMgrRelation reln, ForkNumber forknum) + */ + mdnblocks(reln, forknum); + +- min_inactive_seg = segno = reln->md_num_open_segs[forknum]; ++ min_inactive_seg = segno = mdreln->md_num_open_segs[forknum]; + + /* + * Temporarily open inactive segments, then close them after sync. There +@@ -1267,12 +1325,12 @@ mdimmedsync(SMgrRelation reln, ForkNumber forknum) + * is harmless. We don't bother to clean them up and take a risk of + * further trouble. The next mdclose() will soon close them. 
+ */ +- while (_mdfd_openseg(reln, forknum, segno, 0) != NULL) ++ while (_mdfd_openseg(mdreln, forknum, segno, 0) != NULL) + segno++; + + while (segno > 0) + { +- MdfdVec *v = &reln->md_seg_fds[forknum][segno - 1]; ++ MdfdVec *v = &mdreln->md_seg_fds[forknum][segno - 1]; + + /* + * fsyncs done through mdimmedsync() should be tracked in a separate +@@ -1293,7 +1351,7 @@ mdimmedsync(SMgrRelation reln, ForkNumber forknum) + if (segno > min_inactive_seg) + { + FileClose(v->mdfd_vfd); +- _fdvec_resize(reln, forknum, segno - 1); ++ _fdvec_resize(mdreln, forknum, segno - 1); + } + + segno--; +@@ -1310,14 +1368,14 @@ mdimmedsync(SMgrRelation reln, ForkNumber forknum) + * enough to be a performance problem). + */ + static void +-register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg) ++register_dirty_segment(MdSMgrRelation reln, ForkNumber forknum, MdfdVec *seg) + { + FileTag tag; + +- INIT_MD_FILETAG(tag, reln->smgr_rlocator.locator, forknum, seg->mdfd_segno); ++ INIT_MD_FILETAG(tag, reln->reln.smgr_rlocator.locator, forknum, seg->mdfd_segno); + + /* Temp relations should never be fsync'd */ +- Assert(!SmgrIsTemp(reln)); ++ Assert(!SmgrIsTemp(&reln->reln)); + + if (!RegisterSyncRequest(&tag, SYNC_REQUEST, false /* retryOnError */ )) + { +@@ -1435,7 +1493,7 @@ DropRelationFiles(RelFileLocator *delrels, int ndelrels, bool isRedo) + * _fdvec_resize() -- Resize the fork's open segments array + */ + static void +-_fdvec_resize(SMgrRelation reln, ++_fdvec_resize(MdSMgrRelation reln, + ForkNumber forknum, + int nseg) + { +@@ -1473,12 +1531,12 @@ _fdvec_resize(SMgrRelation reln, + * returned string is palloc'd. 
+ */ + static char * +-_mdfd_segpath(SMgrRelation reln, ForkNumber forknum, BlockNumber segno) ++_mdfd_segpath(MdSMgrRelation reln, ForkNumber forknum, BlockNumber segno) + { + char *path, + *fullpath; + +- path = relpath(reln->smgr_rlocator, forknum); ++ path = relpath(reln->reln.smgr_rlocator, forknum); + + if (segno > 0) + { +@@ -1496,7 +1554,7 @@ _mdfd_segpath(SMgrRelation reln, ForkNumber forknum, BlockNumber segno) + * and make a MdfdVec object for it. Returns NULL on failure. + */ + static MdfdVec * +-_mdfd_openseg(SMgrRelation reln, ForkNumber forknum, BlockNumber segno, ++_mdfd_openseg(MdSMgrRelation reln, ForkNumber forknum, BlockNumber segno, + int oflags) + { + MdfdVec *v; +@@ -1541,7 +1599,7 @@ _mdfd_openseg(SMgrRelation reln, ForkNumber forknum, BlockNumber segno, + * EXTENSION_CREATE case. + */ + static MdfdVec * +-_mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, ++_mdfd_getseg(MdSMgrRelation reln, ForkNumber forknum, BlockNumber blkno, + bool skipFsync, int behavior) + { + MdfdVec *v; +@@ -1615,7 +1673,7 @@ _mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, + char *zerobuf = palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, + MCXT_ALLOC_ZERO); + +- mdextend(reln, forknum, ++ mdextend((SMgrRelation) reln, forknum, + nextsegno * ((BlockNumber) RELSEG_SIZE) - 1, + zerobuf, skipFsync); + pfree(zerobuf); +@@ -1672,7 +1730,7 @@ _mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, + * Get number of blocks present in a single disk file + */ + static BlockNumber +-_mdnblocks(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg) ++_mdnblocks(MdSMgrRelation reln, ForkNumber forknum, MdfdVec *seg) + { + off_t len; + +@@ -1695,7 +1753,7 @@ _mdnblocks(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg) + int + mdsyncfiletag(const FileTag *ftag, char *path) + { +- SMgrRelation reln = smgropen(ftag->rlocator, InvalidBackendId); ++ MdSMgrRelation reln = (MdSMgrRelation) smgropen(ftag->rlocator, InvalidBackendId); + 
File file; + instr_time io_start; + bool need_to_close; +diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c +index 563a0be5c7..b586e6e25a 100644 +--- a/src/backend/storage/smgr/smgr.c ++++ b/src/backend/storage/smgr/smgr.c +@@ -19,80 +19,23 @@ + + #include "access/xlogutils.h" + #include "lib/ilist.h" ++#include "miscadmin.h" + #include "storage/bufmgr.h" + #include "storage/fd.h" + #include "storage/ipc.h" + #include "storage/md.h" + #include "storage/smgr.h" ++#include "port/atomics.h" + #include "utils/hsearch.h" + #include "utils/inval.h" ++#include "utils/memutils.h" + + +-/* +- * This struct of function pointers defines the API between smgr.c and +- * any individual storage manager module. Note that smgr subfunctions are +- * generally expected to report problems via elog(ERROR). An exception is +- * that smgr_unlink should use elog(WARNING), rather than erroring out, +- * because we normally unlink relations during post-commit/abort cleanup, +- * and so it's too late to raise an error. Also, various conditions that +- * would normally be errors should be allowed during bootstrap and/or WAL +- * recovery --- see comments in md.c for details. 
+- */ +-typedef struct f_smgr +-{ +- void (*smgr_init) (void); /* may be NULL */ +- void (*smgr_shutdown) (void); /* may be NULL */ +- void (*smgr_open) (SMgrRelation reln); +- void (*smgr_close) (SMgrRelation reln, ForkNumber forknum); +- void (*smgr_create) (SMgrRelation reln, ForkNumber forknum, +- bool isRedo); +- bool (*smgr_exists) (SMgrRelation reln, ForkNumber forknum); +- void (*smgr_unlink) (RelFileLocatorBackend rlocator, ForkNumber forknum, +- bool isRedo); +- void (*smgr_extend) (SMgrRelation reln, ForkNumber forknum, +- BlockNumber blocknum, const void *buffer, bool skipFsync); +- void (*smgr_zeroextend) (SMgrRelation reln, ForkNumber forknum, +- BlockNumber blocknum, int nblocks, bool skipFsync); +- bool (*smgr_prefetch) (SMgrRelation reln, ForkNumber forknum, +- BlockNumber blocknum, int nblocks); +- void (*smgr_readv) (SMgrRelation reln, ForkNumber forknum, +- BlockNumber blocknum, +- void **buffers, BlockNumber nblocks); +- void (*smgr_writev) (SMgrRelation reln, ForkNumber forknum, +- BlockNumber blocknum, +- const void **buffers, BlockNumber nblocks, +- bool skipFsync); +- void (*smgr_writeback) (SMgrRelation reln, ForkNumber forknum, +- BlockNumber blocknum, BlockNumber nblocks); +- BlockNumber (*smgr_nblocks) (SMgrRelation reln, ForkNumber forknum); +- void (*smgr_truncate) (SMgrRelation reln, ForkNumber forknum, +- BlockNumber nblocks); +- void (*smgr_immedsync) (SMgrRelation reln, ForkNumber forknum); +-} f_smgr; +- +-static const f_smgr smgrsw[] = { +- /* magnetic disk */ +- { +- .smgr_init = mdinit, +- .smgr_shutdown = NULL, +- .smgr_open = mdopen, +- .smgr_close = mdclose, +- .smgr_create = mdcreate, +- .smgr_exists = mdexists, +- .smgr_unlink = mdunlink, +- .smgr_extend = mdextend, +- .smgr_zeroextend = mdzeroextend, +- .smgr_prefetch = mdprefetch, +- .smgr_readv = mdreadv, +- .smgr_writev = mdwritev, +- .smgr_writeback = mdwriteback, +- .smgr_nblocks = mdnblocks, +- .smgr_truncate = mdtruncate, +- .smgr_immedsync = mdimmedsync, +- } 
+-}; ++static f_smgr *smgrsw; + +-static const int NSmgr = lengthof(smgrsw); ++static int NSmgr = 0; ++ ++static Size LargestSMgrRelationSize = 0; + + /* + * Each backend has a hashtable that stores all extant SMgrRelation objects. +@@ -105,6 +48,57 @@ static dlist_head unowned_relns; + /* local function prototypes */ + static void smgrshutdown(int code, Datum arg); + ++SMgrId ++smgr_register(const f_smgr *smgr, Size smgrrelation_size) ++{ ++ SMgrId my_id; ++ MemoryContext old; ++ ++ if (process_shared_preload_libraries_done) ++ elog(FATAL, "SMgrs must be registered in the shared_preload_libraries phase"); ++ if (NSmgr == MaxSMgrId) ++ elog(FATAL, "Too many smgrs registered"); ++ if (smgr->name == NULL || *smgr->name == 0) ++ elog(FATAL, "smgr registered with invalid name"); ++ ++ Assert(smgr->smgr_open != NULL); ++ Assert(smgr->smgr_close != NULL); ++ Assert(smgr->smgr_create != NULL); ++ Assert(smgr->smgr_exists != NULL); ++ Assert(smgr->smgr_unlink != NULL); ++ Assert(smgr->smgr_extend != NULL); ++ Assert(smgr->smgr_zeroextend != NULL); ++ Assert(smgr->smgr_prefetch != NULL); ++ Assert(smgr->smgr_readv != NULL); ++ Assert(smgr->smgr_writev != NULL); ++ Assert(smgr->smgr_writeback != NULL); ++ Assert(smgr->smgr_nblocks != NULL); ++ Assert(smgr->smgr_truncate != NULL); ++ Assert(smgr->smgr_immedsync != NULL); ++ old = MemoryContextSwitchTo(TopMemoryContext); ++ ++ my_id = NSmgr++; ++ if (my_id == 0) ++ smgrsw = palloc(sizeof(f_smgr)); ++ else ++ smgrsw = repalloc(smgrsw, sizeof(f_smgr) * NSmgr); ++ ++ MemoryContextSwitchTo(old); ++ ++ pg_compiler_barrier(); ++ ++ if (!smgrsw) ++ { ++ NSmgr--; ++ elog(FATAL, "Failed to extend smgr array"); ++ } ++ ++ memcpy(&smgrsw[my_id], smgr, sizeof(f_smgr)); ++ ++ LargestSMgrRelationSize = Max(LargestSMgrRelationSize, smgrrelation_size); ++ ++ return my_id; ++} + + /* + * smgrinit(), smgrshutdown() -- Initialize or shut down storage +@@ -162,9 +156,11 @@ smgropen(RelFileLocator rlocator, BackendId backend) + { + /* First time 
through: initialize the hash table */ + HASHCTL ctl; ++ LargestSMgrRelationSize = MAXALIGN(LargestSMgrRelationSize); ++ Assert(NSmgr > 0); + + ctl.keysize = sizeof(RelFileLocatorBackend); +- ctl.entrysize = sizeof(SMgrRelationData); ++ ctl.entrysize = LargestSMgrRelationSize; + SMgrRelationHash = hash_create("smgr relation table", 400, + &ctl, HASH_ELEM | HASH_BLOBS); + dlist_init(&unowned_relns); +@@ -185,7 +181,8 @@ smgropen(RelFileLocator rlocator, BackendId backend) + reln->smgr_targblock = InvalidBlockNumber; + for (int i = 0; i <= MAX_FORKNUM; ++i) + reln->smgr_cached_nblocks[i] = InvalidBlockNumber; +- reln->smgr_which = 0; /* we only have md.c at present */ ++ ++ reln->smgr_which = MdSMgrId; /* we only have md.c at present */ + + /* implementation-specific initialization */ + smgrsw[reln->smgr_which].smgr_open(reln); +diff --git a/src/backend/utils/init/miscinit.c b/src/backend/utils/init/miscinit.c +index 23f77a59e5..4ec7619302 100644 +--- a/src/backend/utils/init/miscinit.c ++++ b/src/backend/utils/init/miscinit.c +@@ -42,6 +42,7 @@ + #include "postmaster/postmaster.h" + #include "storage/fd.h" + #include "storage/ipc.h" ++#include "storage/md.h" + #include "storage/latch.h" + #include "storage/pg_shmem.h" + #include "storage/pmsignal.h" +@@ -198,6 +199,9 @@ InitStandaloneProcess(const char *argv0) + InitProcessLocalLatch(); + InitializeLatchWaitSet(); + ++ /* Initialize smgrs */ ++ register_builtin_dynamic_managers(); ++ + /* + * For consistency with InitPostmasterChild, initialize signal mask here. + * But we don't unblock SIGQUIT or provide a default handler for it. +@@ -1860,6 +1864,15 @@ process_session_preload_libraries(void) + true); + } + ++/* ++ * Register any internal managers. 
++ */ ++void ++register_builtin_dynamic_managers(void) ++{ ++ mdsmgr_register(); ++} ++ + /* + * process any shared memory requests from preloaded libraries + */ +diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h +index 0b01c1f093..d0d4ba38ef 100644 +--- a/src/include/miscadmin.h ++++ b/src/include/miscadmin.h +@@ -493,6 +493,7 @@ extern void TouchSocketLockFiles(void); + extern void AddToDataDirLockFile(int target_line, const char *str); + extern bool RecheckDataDirLockFile(void); + extern void ValidatePgVersion(const char *path); ++extern void register_builtin_dynamic_managers(void); + extern void process_shared_preload_libraries(void); + extern void process_session_preload_libraries(void); + extern void process_shmem_requests(void); +diff --git a/src/include/storage/md.h b/src/include/storage/md.h +index 7c181e5a17..734bae07e1 100644 +--- a/src/include/storage/md.h ++++ b/src/include/storage/md.h +@@ -19,6 +19,10 @@ + #include "storage/smgr.h" + #include "storage/sync.h" + ++/* registration function for md storage manager */ ++extern void mdsmgr_register(void); ++extern SMgrId MdSMgrId; ++ + /* md storage manager functionality */ + extern void mdinit(void); + extern void mdopen(SMgrRelation reln); +diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h +index 527cd2a056..95927b8bdd 100644 +--- a/src/include/storage/smgr.h ++++ b/src/include/storage/smgr.h +@@ -18,6 +18,10 @@ + #include "storage/block.h" + #include "storage/relfilelocator.h" + ++typedef uint8 SMgrId; ++ ++#define MaxSMgrId UINT8_MAX ++ + /* + * smgr.c maintains a table of SMgrRelation objects, which are essentially + * cached file handles. An SMgrRelation is created (if not already present) +@@ -59,14 +63,8 @@ typedef struct SMgrRelationData + * Fields below here are intended to be private to smgr.c and its + * submodules. Do not touch them from elsewhere. 
+ */ +- int smgr_which; /* storage manager selector */ +- +- /* +- * for md.c; per-fork arrays of the number of open segments +- * (md_num_open_segs) and the segments themselves (md_seg_fds). +- */ +- int md_num_open_segs[MAX_FORKNUM + 1]; +- struct _MdfdVec *md_seg_fds[MAX_FORKNUM + 1]; ++ SMgrId smgr_which; /* storage manager selector */ ++ int smgrrelation_size; /* size of this struct, incl. smgr-specific data */ + + /* if unowned, list link in list of all unowned SMgrRelations */ + dlist_node node; +@@ -77,6 +75,51 @@ typedef SMgrRelationData *SMgrRelation; + #define SmgrIsTemp(smgr) \ + RelFileLocatorBackendIsTemp((smgr)->smgr_rlocator) + ++/* ++ * This struct of function pointers defines the API between smgr.c and ++ * any individual storage manager module. Note that smgr subfunctions are ++ * generally expected to report problems via elog(ERROR). An exception is ++ * that smgr_unlink should use elog(WARNING), rather than erroring out, ++ * because we normally unlink relations during post-commit/abort cleanup, ++ * and so it's too late to raise an error. Also, various conditions that ++ * would normally be errors should be allowed during bootstrap and/or WAL ++ * recovery --- see comments in md.c for details. 
++ */ ++typedef struct f_smgr ++{ ++ const char *name; ++ void (*smgr_init) (void); /* may be NULL */ ++ void (*smgr_shutdown) (void); /* may be NULL */ ++ void (*smgr_open) (SMgrRelation reln); ++ void (*smgr_close) (SMgrRelation reln, ForkNumber forknum); ++ void (*smgr_create) (SMgrRelation reln, ForkNumber forknum, ++ bool isRedo); ++ bool (*smgr_exists) (SMgrRelation reln, ForkNumber forknum); ++ void (*smgr_unlink) (RelFileLocatorBackend rlocator, ForkNumber forknum, ++ bool isRedo); ++ void (*smgr_extend) (SMgrRelation reln, ForkNumber forknum, ++ BlockNumber blocknum, const void *buffer, bool skipFsync); ++ void (*smgr_zeroextend) (SMgrRelation reln, ForkNumber forknum, ++ BlockNumber blocknum, int nblocks, bool skipFsync); ++ bool (*smgr_prefetch) (SMgrRelation reln, ForkNumber forknum, ++ BlockNumber blocknum, int nblocks); ++ void (*smgr_readv) (SMgrRelation reln, ForkNumber forknum, ++ BlockNumber blocknum, ++ void **buffers, BlockNumber nblocks); ++ void (*smgr_writev) (SMgrRelation reln, ForkNumber forknum, ++ BlockNumber blocknum, ++ const void **buffers, BlockNumber nblocks, ++ bool skipFsync); ++ void (*smgr_writeback) (SMgrRelation reln, ForkNumber forknum, ++ BlockNumber blocknum, BlockNumber nblocks); ++ BlockNumber (*smgr_nblocks) (SMgrRelation reln, ForkNumber forknum); ++ void (*smgr_truncate) (SMgrRelation reln, ForkNumber forknum, ++ BlockNumber nblocks); ++ void (*smgr_immedsync) (SMgrRelation reln, ForkNumber forknum); ++} f_smgr; ++ ++extern SMgrId smgr_register(const f_smgr *smgr, Size smgrrelation_size); ++ + extern void smgrinit(void); + extern SMgrRelation smgropen(RelFileLocator rlocator, BackendId backend); + extern bool smgrexists(SMgrRelation reln, ForkNumber forknum); +-- +Tristan Partin +Neon (https://neon.tech) + diff --git a/smgr_patch/v1-0002-Allow-extensions-to-override-the-global-storage-m.patch b/smgr_patch/v1-0002-Allow-extensions-to-override-the-global-storage-m.patch new file mode 100644 index 
0000000000000..b5557178b3d43 --- /dev/null +++ b/smgr_patch/v1-0002-Allow-extensions-to-override-the-global-storage-m.patch @@ -0,0 +1,93 @@ +From 59a667f079c9b040c23921e4c43fae94b88776f2 Mon Sep 17 00:00:00 2001 +From: Tristan Partin +Date: Fri, 13 Oct 2023 14:00:44 -0500 +Subject: [PATCH v1 2/5] Allow extensions to override the global storage + manager + +--- + src/backend/storage/smgr/md.c | 2 +- + src/backend/storage/smgr/smgr.c | 5 ++++- + src/backend/utils/init/miscinit.c | 2 ++ + src/include/storage/md.h | 2 ++ + src/include/storage/smgr.h | 2 ++ + 5 files changed, 11 insertions(+), 2 deletions(-) + +diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c +index 66a93101ab..13ec9da236 100644 +--- a/src/backend/storage/smgr/md.c ++++ b/src/backend/storage/smgr/md.c +@@ -140,7 +140,7 @@ void mdsmgr_register(void) + { + /* magnetic disk */ + f_smgr md_smgr = (f_smgr) { +- .name = "md", ++ .name = MdSMgrName, + .smgr_init = mdinit, + .smgr_shutdown = NULL, + .smgr_open = mdopen, +diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c +index b586e6e25a..0814330b8a 100644 +--- a/src/backend/storage/smgr/smgr.c ++++ b/src/backend/storage/smgr/smgr.c +@@ -37,6 +37,9 @@ static int NSmgr = 0; + + static Size LargestSMgrRelationSize = 0; + ++char *storage_manager_string; ++SMgrId storage_manager_id; ++ + /* + * Each backend has a hashtable that stores all extant SMgrRelation objects. + * In addition, "unowned" SMgrRelation objects are chained together in a list. 
+@@ -182,7 +185,7 @@ smgropen(RelFileLocator rlocator, BackendId backend) + for (int i = 0; i <= MAX_FORKNUM; ++i) + reln->smgr_cached_nblocks[i] = InvalidBlockNumber; + +- reln->smgr_which = MdSMgrId; /* we only have md.c at present */ ++ reln->smgr_which = storage_manager_id; + + /* implementation-specific initialization */ + smgrsw[reln->smgr_which].smgr_open(reln); +diff --git a/src/backend/utils/init/miscinit.c b/src/backend/utils/init/miscinit.c +index 4ec7619302..f44f511f69 100644 +--- a/src/backend/utils/init/miscinit.c ++++ b/src/backend/utils/init/miscinit.c +@@ -1871,6 +1871,8 @@ void + register_builtin_dynamic_managers(void) + { + mdsmgr_register(); ++ ++ storage_manager_id = MdSMgrId; + } + + /* +diff --git a/src/include/storage/md.h b/src/include/storage/md.h +index 734bae07e1..fdafb2c8e3 100644 +--- a/src/include/storage/md.h ++++ b/src/include/storage/md.h +@@ -19,6 +19,8 @@ + #include "storage/smgr.h" + #include "storage/sync.h" + ++#define MdSMgrName "md" ++ + /* registration function for md storage manager */ + extern void mdsmgr_register(void); + extern SMgrId MdSMgrId; +diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h +index 95927b8bdd..ee4fc27265 100644 +--- a/src/include/storage/smgr.h ++++ b/src/include/storage/smgr.h +@@ -22,6 +22,8 @@ typedef uint8 SMgrId; + + #define MaxSMgrId UINT8_MAX + ++extern PGDLLIMPORT SMgrId storage_manager_id; ++ + /* + * smgr.c maintains a table of SMgrRelation objects, which are essentially + * cached file handles. 
An SMgrRelation is created (if not already present) +-- +Tristan Partin +Neon (https://neon.tech) + diff --git a/smgr_patch/v1-0003-Add-checkpoint_create_hook.patch b/smgr_patch/v1-0003-Add-checkpoint_create_hook.patch new file mode 100644 index 0000000000000..99eb31a0b5ae1 --- /dev/null +++ b/smgr_patch/v1-0003-Add-checkpoint_create_hook.patch @@ -0,0 +1,60 @@ +From 9ed9b8ca36cdb75b44deccdfea619c7494fcc6ef Mon Sep 17 00:00:00 2001 +From: Tristan Partin +Date: Fri, 13 Oct 2023 13:57:18 -0500 +Subject: [PATCH v1 3/5] Add checkpoint_create_hook + +Allows an extension to hook into CreateCheckPoint(). +--- + src/backend/access/transam/xlog.c | 5 +++++ + src/include/access/xlog.h | 4 ++++ + 2 files changed, 9 insertions(+) + +diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c +index 478377c4a2..61ae5b63b8 100644 +--- a/src/backend/access/transam/xlog.c ++++ b/src/backend/access/transam/xlog.c +@@ -212,6 +212,8 @@ const struct config_enum_entry archive_mode_options[] = { + */ + CheckpointStatsData CheckpointStats; + ++checkpoint_create_hook_type checkpoint_create_hook = NULL; ++ + /* + * During recovery, lastFullPageWrites keeps track of full_page_writes that + * the replayed WAL records indicate. It's initialized with full_page_writes +@@ -6875,6 +6877,9 @@ CreateCheckPoint(int flags) + */ + END_CRIT_SECTION(); + ++ if (checkpoint_create_hook != NULL) ++ checkpoint_create_hook(&checkPoint); ++ + /* + * In some cases there are groups of actions that must all occur on one + * side or the other of a checkpoint record. 
Before flushing the +diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h +index 301c5fa11f..437f2a994b 100644 +--- a/src/include/access/xlog.h ++++ b/src/include/access/xlog.h +@@ -13,6 +13,7 @@ + + #include "access/xlogbackup.h" + #include "access/xlogdefs.h" ++#include "catalog/pg_control.h" + #include "datatype/timestamp.h" + #include "lib/stringinfo.h" + #include "nodes/pg_list.h" +@@ -57,6 +58,9 @@ extern PGDLLIMPORT int wal_decode_buffer_size; + + extern PGDLLIMPORT int CheckPointSegments; + ++typedef void (*checkpoint_create_hook_type)(const CheckPoint *); ++extern PGDLLIMPORT checkpoint_create_hook_type checkpoint_create_hook; ++ + /* Archive modes */ + typedef enum ArchiveMode + { +-- +Tristan Partin +Neon (https://neon.tech) + diff --git a/smgr_patch/v1-0004-Add-contrib-fsync_checker.patch b/smgr_patch/v1-0004-Add-contrib-fsync_checker.patch new file mode 100644 index 0000000000000..579d6efdf5b97 --- /dev/null +++ b/smgr_patch/v1-0004-Add-contrib-fsync_checker.patch @@ -0,0 +1,341 @@ +From d46b41d7c89deb23a6a1afec9d7fe3544b9a3327 Mon Sep 17 00:00:00 2001 +From: Tristan Partin +Date: Wed, 20 Sep 2023 14:23:38 -0500 +Subject: [PATCH v1 4/5] Add contrib/fsync_checker + +fsync_checker is an extension which overrides the global storage manager +to check for volatile relations, those which have been written but not +synced to disk. 
+--- + contrib/Makefile | 1 + + contrib/fsync_checker/fsync_checker.control | 5 + + contrib/fsync_checker/fsync_checker_smgr.c | 249 ++++++++++++++++++++ + contrib/fsync_checker/meson.build | 22 ++ + contrib/meson.build | 1 + + 5 files changed, 278 insertions(+) + create mode 100644 contrib/fsync_checker/fsync_checker.control + create mode 100644 contrib/fsync_checker/fsync_checker_smgr.c + create mode 100644 contrib/fsync_checker/meson.build + +diff --git a/contrib/Makefile b/contrib/Makefile +index da4e2316a3..c55ced6ec0 100644 +--- a/contrib/Makefile ++++ b/contrib/Makefile +@@ -20,6 +20,7 @@ SUBDIRS = \ + dict_int \ + dict_xsyn \ + earthdistance \ ++ fsync_checker \ + file_fdw \ + fuzzystrmatch \ + hstore \ +diff --git a/contrib/fsync_checker/fsync_checker.control b/contrib/fsync_checker/fsync_checker.control +new file mode 100644 +index 0000000000..7d0e36434b +--- /dev/null ++++ b/contrib/fsync_checker/fsync_checker.control +@@ -0,0 +1,5 @@ ++# fsync_checker extension ++comment = 'SMGR extension for checking volatile writes' ++default_version = '1.0' ++module_pathname = '$libdir/fsync_checker' ++relocatable = true +diff --git a/contrib/fsync_checker/fsync_checker_smgr.c b/contrib/fsync_checker/fsync_checker_smgr.c +new file mode 100644 +index 0000000000..feef2f7d3e +--- /dev/null ++++ b/contrib/fsync_checker/fsync_checker_smgr.c +@@ -0,0 +1,249 @@ ++#include "postgres.h" ++ ++#include "access/xlog.h" ++#include "fmgr.h" ++#include "miscadmin.h" ++#include "storage/ipc.h" ++#include "storage/lwlock.h" ++#include "storage/shmem.h" ++#include "storage/smgr.h" ++#include "storage/md.h" ++#include "utils/hsearch.h" ++ ++PG_MODULE_MAGIC; ++ ++typedef struct volatileRelnKey ++{ ++ RelFileLocator locator; ++ ForkNumber forknum; ++} volatileRelnKey; ++ ++typedef struct volatileRelnEntry ++{ ++ volatileRelnKey key; ++ XLogRecPtr lsn; ++} volatileRelnEntry; ++ ++void _PG_init(void); ++ ++static void fsync_checker_extend(SMgrRelation reln, ForkNumber forknum, BlockNumber 
blocknum, ++ const void *buffer, bool skipFsync); ++static void fsync_checker_immedsync(SMgrRelation reln, ForkNumber forknum); ++static void fsync_checker_writev(SMgrRelation reln, ForkNumber forknum, ++ BlockNumber blocknum, const void **buffers, ++ BlockNumber nblocks, bool skipFsync); ++static void fsync_checker_writeback(SMgrRelation reln, ForkNumber forknum, ++ BlockNumber blocknum, BlockNumber nblocks); ++static void fsync_checker_zeroextend(SMgrRelation reln, ForkNumber forknum, ++ BlockNumber blocknum, int nblocks, bool skipFsync); ++ ++static void fsync_checker_checkpoint_create(const CheckPoint *checkPoint); ++static void fsync_checker_shmem_request(void); ++static void fsync_checker_shmem_startup(void); ++ ++static void add_reln(SMgrRelation reln, ForkNumber forknum); ++static void remove_reln(SMgrRelation reln, ForkNumber forknum); ++ ++static SMgrId fsync_checker_smgr_id; ++static const struct f_smgr fsync_checker_smgr = { ++ .name = "fsync_checker", ++ .smgr_init = mdinit, ++ .smgr_shutdown = NULL, ++ .smgr_open = mdopen, ++ .smgr_close = mdclose, ++ .smgr_create = mdcreate, ++ .smgr_exists = mdexists, ++ .smgr_unlink = mdunlink, ++ .smgr_extend = fsync_checker_extend, ++ .smgr_zeroextend = fsync_checker_zeroextend, ++ .smgr_prefetch = mdprefetch, ++ .smgr_readv = mdreadv, ++ .smgr_writev = fsync_checker_writev, ++ .smgr_writeback = fsync_checker_writeback, ++ .smgr_nblocks = mdnblocks, ++ .smgr_truncate = mdtruncate, ++ .smgr_immedsync = fsync_checker_immedsync, ++}; ++ ++static HTAB *volatile_relns; ++static LWLock *volatile_relns_lock; ++static shmem_request_hook_type prev_shmem_request_hook; ++static shmem_startup_hook_type prev_shmem_startup_hook; ++static checkpoint_create_hook_type prev_checkpoint_create_hook; ++ ++void ++_PG_init(void) ++{ ++ prev_checkpoint_create_hook = checkpoint_create_hook; ++ checkpoint_create_hook = fsync_checker_checkpoint_create; ++ ++ prev_shmem_request_hook = shmem_request_hook; ++ shmem_request_hook = 
fsync_checker_shmem_request; ++ ++ prev_shmem_startup_hook = shmem_startup_hook; ++ shmem_startup_hook = fsync_checker_shmem_startup; ++ ++ /* ++ * Relation size of 0 means we can just defer to md, but it would be nice ++ * to just expose this functionality, so if I needed my own relation, I ++ * could use MdSmgrRelation as the parent. ++ */ ++ fsync_checker_smgr_id = smgr_register(&fsync_checker_smgr, 0); ++ ++ storage_manager_id = fsync_checker_smgr_id; ++} ++ ++static void ++fsync_checker_checkpoint_create(const CheckPoint *checkPoint) ++{ ++ long num_entries; ++ HASH_SEQ_STATUS status; ++ volatileRelnEntry *entry; ++ ++ if (prev_checkpoint_create_hook) ++ prev_checkpoint_create_hook(checkPoint); ++ ++ LWLockAcquire(volatile_relns_lock, LW_EXCLUSIVE); ++ ++ hash_seq_init(&status, volatile_relns); ++ ++ num_entries = hash_get_num_entries(volatile_relns); ++ elog(INFO, "Analyzing %ld volatile relations", num_entries); ++ while ((entry = hash_seq_search(&status))) ++ { ++ if (entry->lsn < checkPoint->redo) ++ { ++ char *path; ++ ++ path = relpathperm(entry->key.locator, entry->key.forknum); ++ ++ elog(WARNING, "Relation not previously synced: %s", path); ++ ++ pfree(path); ++ } ++ } ++ ++ LWLockRelease(volatile_relns_lock); ++} ++ ++static void ++fsync_checker_shmem_request(void) ++{ ++ if (prev_shmem_request_hook) ++ prev_shmem_request_hook(); ++ ++ RequestAddinShmemSpace(hash_estimate_size(1024, sizeof(volatileRelnEntry))); ++ RequestNamedLWLockTranche("fsync_checker volatile relns lock", 1); ++} ++ ++static void ++fsync_checker_shmem_startup(void) ++{ ++ HASHCTL ctl; ++ ++ if (prev_shmem_startup_hook) ++ prev_shmem_startup_hook(); ++ ++ ctl.keysize = sizeof(volatileRelnKey); ++ ctl.entrysize = sizeof(volatileRelnEntry); ++ volatile_relns = NULL; ++ volatile_relns_lock = NULL; ++ ++ /* ++ * Create or attach to the shared memory state, including hash table ++ */ ++ LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE); ++ ++ volatile_relns = ShmemInitHash("fsync_checker 
volatile relns", ++ 1024, 1024, &ctl, HASH_BLOBS | HASH_ELEM); ++ volatile_relns_lock = &GetNamedLWLockTranche("fsync_checker volatile relns lock")->lock; ++ ++ LWLockRelease(AddinShmemInitLock); ++} ++ ++static void ++add_reln(SMgrRelation reln, ForkNumber forknum) ++{ ++ bool found; ++ XLogRecPtr lsn; ++ volatileRelnKey key; ++ volatileRelnEntry *entry; ++ ++ key.locator = reln->smgr_rlocator.locator; ++ key.forknum = forknum; ++ ++ lsn = GetXLogWriteRecPtr(); ++ ++ LWLockAcquire(volatile_relns_lock, LW_EXCLUSIVE); ++ ++ entry = hash_search(volatile_relns, &key, HASH_ENTER, &found); ++ if (!found) ++ entry->lsn = lsn; ++ ++ LWLockRelease(volatile_relns_lock); ++} ++ ++static void ++remove_reln(SMgrRelation reln, ForkNumber forknum) ++{ ++ volatileRelnKey key; ++ ++ key.locator = reln->smgr_rlocator.locator; ++ key.forknum = forknum; ++ ++ LWLockAcquire(volatile_relns_lock, LW_EXCLUSIVE); ++ ++ hash_search(volatile_relns, &key, HASH_REMOVE, NULL); ++ ++ LWLockRelease(volatile_relns_lock); ++} ++ ++static void ++fsync_checker_extend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, ++ const void *buffer, bool skipFsync) ++{ ++ if (!SmgrIsTemp(reln) && !skipFsync) ++ add_reln(reln, forknum); ++ ++ mdextend(reln, forknum, blocknum, buffer, skipFsync); ++} ++ ++static void ++fsync_checker_immedsync(SMgrRelation reln, ForkNumber forknum) ++{ ++ if (!SmgrIsTemp(reln)) ++ remove_reln(reln, forknum); ++ ++ mdimmedsync(reln, forknum); ++} ++ ++static void ++fsync_checker_writev(SMgrRelation reln, ForkNumber forknum, ++ BlockNumber blocknum, const void **buffers, ++ BlockNumber nblocks, bool skipFsync) ++{ ++ if (!SmgrIsTemp(reln) && !skipFsync) ++ add_reln(reln, forknum); ++ ++ mdwritev(reln, forknum, blocknum, buffers, nblocks, skipFsync); ++} ++ ++static void ++fsync_checker_writeback(SMgrRelation reln, ForkNumber forknum, ++ BlockNumber blocknum, BlockNumber nblocks) ++{ ++ if (!SmgrIsTemp(reln)) ++ remove_reln(reln, forknum); ++ ++ mdwriteback(reln, forknum, 
blocknum, nblocks); ++} ++ ++static void ++fsync_checker_zeroextend(SMgrRelation reln, ForkNumber forknum, ++ BlockNumber blocknum, int nblocks, bool skipFsync) ++{ ++ if (!SmgrIsTemp(reln) && !skipFsync) ++ add_reln(reln, forknum); ++ ++ mdzeroextend(reln, forknum, blocknum, nblocks, skipFsync); ++} +diff --git a/contrib/fsync_checker/meson.build b/contrib/fsync_checker/meson.build +new file mode 100644 +index 0000000000..ce6ed7fe90 +--- /dev/null ++++ b/contrib/fsync_checker/meson.build +@@ -0,0 +1,22 @@ ++# Copyright (c) 2023, PostgreSQL Global Development Group ++ ++fsync_checker_sources = files( ++ 'fsync_checker_smgr.c', ++) ++ ++if host_system == 'windows' ++ fsync_checker_sources += rc_lib_gen.process(win32ver_rc, extra_args: [ ++ '--NAME', 'fsync_checker', ++ '--FILEDESC', 'fsync_checker - SMGR extension for checking volatile relations',]) ++endif ++ ++fsync_checker = shared_module('fsync_checker', ++ fsync_checker_sources, ++ kwargs: contrib_mod_args, ++) ++contrib_targets += fsync_checker ++ ++install_data( ++ 'fsync_checker.control', ++ kwargs: contrib_data_args, ++) +diff --git a/contrib/meson.build b/contrib/meson.build +index c12dc906ca..e5d872494a 100644 +--- a/contrib/meson.build ++++ b/contrib/meson.build +@@ -29,6 +29,7 @@ subdir('dict_int') + subdir('dict_xsyn') + subdir('earthdistance') + subdir('file_fdw') ++subdir('fsync_checker') + subdir('fuzzystrmatch') + subdir('hstore') + subdir('hstore_plperl') +-- +Tristan Partin +Neon (https://neon.tech) + From c3c1ae6295b9f69c60586f199e641930e938db06 Mon Sep 17 00:00:00 2001 From: Zsolt Parragi Date: Wed, 31 Jul 2024 20:50:15 +0100 Subject: [PATCH 07/13] fixing rebase bug --- src/backend/storage/smgr/md.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c index f3e52b2b15884..a2dcd2068c959 100644 --- a/src/backend/storage/smgr/md.c +++ b/src/backend/storage/smgr/md.c @@ -1299,6 +1299,7 @@ 
mdregistersync(SMgrRelation reln, ForkNumber forknum) { int segno; int min_inactive_seg; + MdSMgrRelation mdreln = (MdSMgrRelation) reln; /* * NOTE: mdnblocks makes sure we have opened all active segments, so that @@ -1306,7 +1307,7 @@ mdregistersync(SMgrRelation reln, ForkNumber forknum) */ mdnblocks(reln, forknum); - min_inactive_seg = segno = reln->md_num_open_segs[forknum]; + min_inactive_seg = segno = mdreln->md_num_open_segs[forknum]; /* * Temporarily open inactive segments, then close them after sync. There @@ -1314,20 +1315,20 @@ mdregistersync(SMgrRelation reln, ForkNumber forknum) * harmless. We don't bother to clean them up and take a risk of further * trouble. The next mdclose() will soon close them. */ - while (_mdfd_openseg(reln, forknum, segno, 0) != NULL) + while (_mdfd_openseg(mdreln, forknum, segno, 0) != NULL) segno++; while (segno > 0) { - MdfdVec *v = &reln->md_seg_fds[forknum][segno - 1]; + MdfdVec *v = &mdreln->md_seg_fds[forknum][segno - 1]; - register_dirty_segment(reln, forknum, v); + register_dirty_segment(mdreln, forknum, v); /* Close inactive segments immediately */ if (segno > min_inactive_seg) { FileClose(v->mdfd_vfd); - _fdvec_resize(reln, forknum, segno - 1); + _fdvec_resize(mdreln, forknum, segno - 1); } segno--; From c239a8c111bf71f6f0df89486119de9ca7146c4a Mon Sep 17 00:00:00 2001 From: Zsolt Parragi Date: Wed, 28 Aug 2024 20:24:26 +0100 Subject: [PATCH 08/13] Removed fsync_checker extension from patch as we do not want to include it --- contrib/Makefile | 1 - contrib/fsync_checker/fsync_checker.control | 5 - contrib/fsync_checker/fsync_checker_smgr.c | 250 -------------------- contrib/fsync_checker/meson.build | 22 -- contrib/meson.build | 1 - 5 files changed, 279 deletions(-) delete mode 100644 contrib/fsync_checker/fsync_checker.control delete mode 100644 contrib/fsync_checker/fsync_checker_smgr.c delete mode 100644 contrib/fsync_checker/meson.build diff --git a/contrib/Makefile b/contrib/Makefile index 
091dd9e33228a..abd780f277405 100644 --- a/contrib/Makefile +++ b/contrib/Makefile @@ -19,7 +19,6 @@ SUBDIRS = \ dict_int \ dict_xsyn \ earthdistance \ - fsync_checker \ file_fdw \ fuzzystrmatch \ hstore \ diff --git a/contrib/fsync_checker/fsync_checker.control b/contrib/fsync_checker/fsync_checker.control deleted file mode 100644 index 7d0e36434bfaf..0000000000000 --- a/contrib/fsync_checker/fsync_checker.control +++ /dev/null @@ -1,5 +0,0 @@ -# fsync_checker extension -comment = 'SMGR extension for checking volatile writes' -default_version = '1.0' -module_pathname = '$libdir/fsync_checker' -relocatable = true diff --git a/contrib/fsync_checker/fsync_checker_smgr.c b/contrib/fsync_checker/fsync_checker_smgr.c deleted file mode 100644 index 17d0accb1eeba..0000000000000 --- a/contrib/fsync_checker/fsync_checker_smgr.c +++ /dev/null @@ -1,250 +0,0 @@ -#include "postgres.h" - -#include "access/xlog.h" -#include "fmgr.h" -#include "miscadmin.h" -#include "storage/ipc.h" -#include "storage/lwlock.h" -#include "storage/shmem.h" -#include "storage/smgr.h" -#include "storage/md.h" -#include "utils/hsearch.h" - -PG_MODULE_MAGIC; - -typedef struct volatileRelnKey -{ - RelFileLocator locator; - ForkNumber forknum; -} volatileRelnKey; - -typedef struct volatileRelnEntry -{ - volatileRelnKey key; - XLogRecPtr lsn; -} volatileRelnEntry; - -void _PG_init(void); - -static void fsync_checker_extend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, - const void *buffer, bool skipFsync); -static void fsync_checker_immedsync(SMgrRelation reln, ForkNumber forknum); -static void fsync_checker_writev(SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, const void **buffers, - BlockNumber nblocks, bool skipFsync); -static void fsync_checker_writeback(SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, BlockNumber nblocks); -static void fsync_checker_zeroextend(SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, int nblocks, bool skipFsync); - 
-static void fsync_checker_checkpoint_create(const CheckPoint *checkPoint); -static void fsync_checker_shmem_request(void); -static void fsync_checker_shmem_startup(void); - -static void add_reln(SMgrRelation reln, ForkNumber forknum); -static void remove_reln(SMgrRelation reln, ForkNumber forknum); - -static SMgrId fsync_checker_smgr_id; -static const struct f_smgr fsync_checker_smgr = { - .name = "fsync_checker", - .smgr_init = mdinit, - .smgr_shutdown = NULL, - .smgr_open = mdopen, - .smgr_close = mdclose, - .smgr_create = mdcreate, - .smgr_exists = mdexists, - .smgr_unlink = mdunlink, - .smgr_extend = fsync_checker_extend, - .smgr_zeroextend = fsync_checker_zeroextend, - .smgr_prefetch = mdprefetch, - .smgr_readv = mdreadv, - .smgr_writev = fsync_checker_writev, - .smgr_writeback = fsync_checker_writeback, - .smgr_nblocks = mdnblocks, - .smgr_truncate = mdtruncate, - .smgr_immedsync = fsync_checker_immedsync, - .smgr_registersync = mdregistersync, -}; - -static HTAB *volatile_relns; -static LWLock *volatile_relns_lock; -static shmem_request_hook_type prev_shmem_request_hook; -static shmem_startup_hook_type prev_shmem_startup_hook; -static checkpoint_create_hook_type prev_checkpoint_create_hook; - -void -_PG_init(void) -{ - prev_checkpoint_create_hook = checkpoint_create_hook; - checkpoint_create_hook = fsync_checker_checkpoint_create; - - prev_shmem_request_hook = shmem_request_hook; - shmem_request_hook = fsync_checker_shmem_request; - - prev_shmem_startup_hook = shmem_startup_hook; - shmem_startup_hook = fsync_checker_shmem_startup; - - /* - * Relation size of 0 means we can just defer to md, but it would be nice - * to just expose this functionality, so if I needed my own relation, I - * could use MdSmgrRelation as the parent. 
- */ - fsync_checker_smgr_id = smgr_register(&fsync_checker_smgr, 0); - - storage_manager_id = fsync_checker_smgr_id; -} - -static void -fsync_checker_checkpoint_create(const CheckPoint *checkPoint) -{ - long num_entries; - HASH_SEQ_STATUS status; - volatileRelnEntry *entry; - - if (prev_checkpoint_create_hook) - prev_checkpoint_create_hook(checkPoint); - - LWLockAcquire(volatile_relns_lock, LW_EXCLUSIVE); - - hash_seq_init(&status, volatile_relns); - - num_entries = hash_get_num_entries(volatile_relns); - elog(INFO, "Analyzing %ld volatile relations", num_entries); - while ((entry = hash_seq_search(&status))) - { - if (entry->lsn < checkPoint->redo) - { - char *path; - - path = relpathperm(entry->key.locator, entry->key.forknum); - - elog(WARNING, "Relation not previously synced: %s", path); - - pfree(path); - } - } - - LWLockRelease(volatile_relns_lock); -} - -static void -fsync_checker_shmem_request(void) -{ - if (prev_shmem_request_hook) - prev_shmem_request_hook(); - - RequestAddinShmemSpace(hash_estimate_size(1024, sizeof(volatileRelnEntry))); - RequestNamedLWLockTranche("fsync_checker volatile relns lock", 1); -} - -static void -fsync_checker_shmem_startup(void) -{ - HASHCTL ctl; - - if (prev_shmem_startup_hook) - prev_shmem_startup_hook(); - - ctl.keysize = sizeof(volatileRelnKey); - ctl.entrysize = sizeof(volatileRelnEntry); - volatile_relns = NULL; - volatile_relns_lock = NULL; - - /* - * Create or attach to the shared memory state, including hash table - */ - LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE); - - volatile_relns = ShmemInitHash("fsync_checker volatile relns", - 1024, 1024, &ctl, HASH_BLOBS | HASH_ELEM); - volatile_relns_lock = &GetNamedLWLockTranche("fsync_checker volatile relns lock")->lock; - - LWLockRelease(AddinShmemInitLock); -} - -static void -add_reln(SMgrRelation reln, ForkNumber forknum) -{ - bool found; - XLogRecPtr lsn; - volatileRelnKey key; - volatileRelnEntry *entry; - - key.locator = reln->smgr_rlocator.locator; - key.forknum 
= forknum; - - lsn = GetXLogWriteRecPtr(); - - LWLockAcquire(volatile_relns_lock, LW_EXCLUSIVE); - - entry = hash_search(volatile_relns, &key, HASH_ENTER, &found); - if (!found) - entry->lsn = lsn; - - LWLockRelease(volatile_relns_lock); -} - -static void -remove_reln(SMgrRelation reln, ForkNumber forknum) -{ - volatileRelnKey key; - - key.locator = reln->smgr_rlocator.locator; - key.forknum = forknum; - - LWLockAcquire(volatile_relns_lock, LW_EXCLUSIVE); - - hash_search(volatile_relns, &key, HASH_REMOVE, NULL); - - LWLockRelease(volatile_relns_lock); -} - -static void -fsync_checker_extend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, - const void *buffer, bool skipFsync) -{ - if (!SmgrIsTemp(reln) && !skipFsync) - add_reln(reln, forknum); - - mdextend(reln, forknum, blocknum, buffer, skipFsync); -} - -static void -fsync_checker_immedsync(SMgrRelation reln, ForkNumber forknum) -{ - if (!SmgrIsTemp(reln)) - remove_reln(reln, forknum); - - mdimmedsync(reln, forknum); -} - -static void -fsync_checker_writev(SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, const void **buffers, - BlockNumber nblocks, bool skipFsync) -{ - if (!SmgrIsTemp(reln) && !skipFsync) - add_reln(reln, forknum); - - mdwritev(reln, forknum, blocknum, buffers, nblocks, skipFsync); -} - -static void -fsync_checker_writeback(SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, BlockNumber nblocks) -{ - if (!SmgrIsTemp(reln)) - remove_reln(reln, forknum); - - mdwriteback(reln, forknum, blocknum, nblocks); -} - -static void -fsync_checker_zeroextend(SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, int nblocks, bool skipFsync) -{ - if (!SmgrIsTemp(reln) && !skipFsync) - add_reln(reln, forknum); - - mdzeroextend(reln, forknum, blocknum, nblocks, skipFsync); -} diff --git a/contrib/fsync_checker/meson.build b/contrib/fsync_checker/meson.build deleted file mode 100644 index ce6ed7fe90bbb..0000000000000 --- a/contrib/fsync_checker/meson.build +++ 
/dev/null @@ -1,22 +0,0 @@ -# Copyright (c) 2023, PostgreSQL Global Development Group - -fsync_checker_sources = files( - 'fsync_checker_smgr.c', -) - -if host_system == 'windows' - fsync_checker_sources += rc_lib_gen.process(win32ver_rc, extra_args: [ - '--NAME', 'fsync_checker', - '--FILEDESC', 'fsync_checker - SMGR extension for checking volatile relations',]) -endif - -fsync_checker = shared_module('fsync_checker', - fsync_checker_sources, - kwargs: contrib_mod_args, -) -contrib_targets += fsync_checker - -install_data( - 'fsync_checker.control', - kwargs: contrib_data_args, -) diff --git a/contrib/meson.build b/contrib/meson.build index 907c3f4fd18a8..ce5630d64aee5 100644 --- a/contrib/meson.build +++ b/contrib/meson.build @@ -28,7 +28,6 @@ subdir('dict_int') subdir('dict_xsyn') subdir('earthdistance') subdir('file_fdw') -subdir('fsync_checker') subdir('fuzzystrmatch') subdir('hstore') subdir('hstore_plperl') From 5786e24b21bc8f160eeabff169d23d68ce0c9465 Mon Sep 17 00:00:00 2001 From: Zsolt Parragi Date: Wed, 28 Aug 2024 20:26:58 +0100 Subject: [PATCH 09/13] Updated pg_tde submodule reference --- .gitmodules | 2 +- contrib/pg_tde | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitmodules b/.gitmodules index 21078cb7f88f3..da75b1982f3ee 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,4 +1,4 @@ [submodule "contrib/pg_tde"] path = contrib/pg_tde url = https://github.com/Percona-Lab/pg_tde.git - branch = smgr + branch = main diff --git a/contrib/pg_tde b/contrib/pg_tde index 36f6d6bff8f67..058666090e0c3 160000 --- a/contrib/pg_tde +++ b/contrib/pg_tde @@ -1 +1 @@ -Subproject commit 36f6d6bff8f67ffa9b3f84a5d548512f86f0d7b7 +Subproject commit 058666090e0c31d676049d020a8a10df94ed2fe0 From 0388294643712341030294c7d1008d6c0339365c Mon Sep 17 00:00:00 2001 From: Zsolt Parragi Date: Thu, 29 Aug 2024 18:47:48 +0100 Subject: [PATCH 10/13] Added pg_tde to makefile --- contrib/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/contrib/Makefile 
b/contrib/Makefile index abd780f277405..272c59fb7613a 100644 --- a/contrib/Makefile +++ b/contrib/Makefile @@ -35,6 +35,7 @@ SUBDIRS = \ pg_prewarm \ pg_stat_statements \ pg_surgery \ + pg_tde \ pg_trgm \ pgrowlocks \ pgstattuple \ From 7573d1b59f4fe58c1a4664a5055b947ed9310979 Mon Sep 17 00:00:00 2001 From: Zsolt Parragi Date: Sun, 1 Sep 2024 17:32:47 +0100 Subject: [PATCH 11/13] PG-981: Renamed PERCONA_FORK to PERCONA_EXT --- src/include/pg_config_manual.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/include/pg_config_manual.h b/src/include/pg_config_manual.h index 9dc31cb64a632..6406a599fdabf 100644 --- a/src/include/pg_config_manual.h +++ b/src/include/pg_config_manual.h @@ -387,4 +387,4 @@ /* * Enable Percona specific features, should always be defined in this fork */ -#define PERCONA_FORK 1 +#define PERCONA_EXT 1 From 3b0c25990dc1f3dc7bb8add2b207b40b9fadb7d6 Mon Sep 17 00:00:00 2001 From: Andrew Pogrebnoy Date: Mon, 2 Sep 2024 18:30:09 +0300 Subject: [PATCH 12/13] Make pg_waldump read encrypted WAL --- src/bin/Makefile.tde | 17 +++++++++++++++++ src/bin/meson.build | 20 ++++++++++++++++++++ src/bin/pg_waldump/Makefile | 7 ++++--- src/bin/pg_waldump/meson.build | 4 +++- src/bin/pg_waldump/pg_waldump.c | 22 +++++++++++++++++++++- 5 files changed, 65 insertions(+), 5 deletions(-) create mode 100644 src/bin/Makefile.tde diff --git a/src/bin/Makefile.tde b/src/bin/Makefile.tde new file mode 100644 index 0000000000000..57f2a5ff40f34 --- /dev/null +++ b/src/bin/Makefile.tde @@ -0,0 +1,17 @@ +TDE_OBJS = \ + $(top_srcdir)/src/fe_utils/simple_list.o \ + $(top_srcdir)/contrib/pg_tde/src/access/pg_tde_tdemap.o \ + $(top_srcdir)/contrib/pg_tde/src/access/pg_tde_xlog_encrypt.o \ + $(top_srcdir)/contrib/pg_tde/src/catalog/tde_global_space.o \ + $(top_srcdir)/contrib/pg_tde/src/catalog/tde_keyring.o \ + $(top_srcdir)/contrib/pg_tde/src/catalog/tde_keyring_parse_opts.o \ + $(top_srcdir)/contrib/pg_tde/src/catalog/tde_principal_key.o \ + 
$(top_srcdir)/contrib/pg_tde/src/common/pg_tde_utils.o \ + $(top_srcdir)/contrib/pg_tde/src/encryption/enc_aes.o \ + $(top_srcdir)/contrib/pg_tde/src/encryption/enc_tde.o \ + $(top_srcdir)/contrib/pg_tde/src/keyring/keyring_api.o \ + $(top_srcdir)/contrib/pg_tde/src/keyring/keyring_curl.o \ + $(top_srcdir)/contrib/pg_tde/src/keyring/keyring_file.o \ + $(top_srcdir)/contrib/pg_tde/src/keyring/keyring_vault.o + +TDE_LIBS = -lcurl diff --git a/src/bin/meson.build b/src/bin/meson.build index aa60ebaa3026f..e050732d6406d 100644 --- a/src/bin/meson.build +++ b/src/bin/meson.build @@ -1,5 +1,25 @@ # Copyright (c) 2022-2024, PostgreSQL Global Development Group +tde_decrypt_sources = files( + '../../contrib/pg_tde/src/access/pg_tde_tdemap.c', + '../../contrib/pg_tde/src/access/pg_tde_xlog_encrypt.c', + '../../contrib/pg_tde/src/catalog/tde_global_space.c', + '../../contrib/pg_tde/src/catalog/tde_keyring.c', + '../../contrib/pg_tde/src/catalog/tde_keyring_parse_opts.c', + '../../contrib/pg_tde/src/catalog/tde_principal_key.c', + '../../contrib/pg_tde/src/common/pg_tde_utils.c', + '../../contrib/pg_tde/src/encryption/enc_aes.c', + '../../contrib/pg_tde/src/encryption/enc_tde.c', + '../../contrib/pg_tde/src/keyring/keyring_api.c', + '../../contrib/pg_tde/src/keyring/keyring_curl.c', + '../../contrib/pg_tde/src/keyring/keyring_file.c', + '../../contrib/pg_tde/src/keyring/keyring_vault.c', +) +tde_include = include_directories('../../contrib/pg_tde/src/include') + +curldep = dependency('libcurl') +tde_deps = [curldep] + subdir('initdb') subdir('pg_amcheck') subdir('pg_archivecleanup') diff --git a/src/bin/pg_waldump/Makefile b/src/bin/pg_waldump/Makefile index 4c1ee649501f4..b437d9c96e513 100644 --- a/src/bin/pg_waldump/Makefile +++ b/src/bin/pg_waldump/Makefile @@ -6,26 +6,27 @@ PGAPPICON=win32 subdir = src/bin/pg_waldump top_builddir = ../../.. 
include $(top_builddir)/src/Makefile.global +include ../Makefile.tde OBJS = \ $(RMGRDESCOBJS) \ $(WIN32RES) \ + $(TDE_OBJS) \ compat.o \ pg_waldump.o \ rmgrdesc.o \ xlogreader.o \ xlogstats.o -override CPPFLAGS := -DFRONTEND $(CPPFLAGS) +override CPPFLAGS := -I$(top_srcdir)/contrib/pg_tde/src/include -DFRONTEND $(CPPFLAGS) RMGRDESCSOURCES = $(sort $(notdir $(wildcard $(top_srcdir)/src/backend/access/rmgrdesc/*desc*.c))) RMGRDESCOBJS = $(patsubst %.c,%.o,$(RMGRDESCSOURCES)) - all: pg_waldump pg_waldump: $(OBJS) | submake-libpgport - $(CC) $(CFLAGS) $^ $(LDFLAGS) $(LDFLAGS_EX) $(LIBS) -o $@$(X) + $(CC) $(CFLAGS) $^ $(LDFLAGS) $(LDFLAGS_EX) $(LIBS) $(TDE_LIBS) -o $@$(X) xlogreader.c: % : $(top_srcdir)/src/backend/access/transam/% rm -f $@ && $(LN_S) $< . diff --git a/src/bin/pg_waldump/meson.build b/src/bin/pg_waldump/meson.build index bb30f0fe08e87..a75ad071bf0d6 100644 --- a/src/bin/pg_waldump/meson.build +++ b/src/bin/pg_waldump/meson.build @@ -9,6 +9,7 @@ pg_waldump_sources = files( pg_waldump_sources += rmgr_desc_sources pg_waldump_sources += xlogreader_sources pg_waldump_sources += files('../../backend/access/transam/xlogstats.c') +pg_waldump_sources += tde_decrypt_sources if host_system == 'windows' pg_waldump_sources += rc_bin_gen.process(win32ver_rc, extra_args: [ @@ -18,9 +19,10 @@ endif pg_waldump = executable('pg_waldump', pg_waldump_sources, - dependencies: [frontend_code, lz4, zstd], + dependencies: [frontend_code, lz4, zstd, tde_deps], c_args: ['-DFRONTEND'], # needed for xlogreader et al kwargs: default_bin_args, + include_directories: [postgres_inc, tde_include], ) bin_targets += pg_waldump diff --git a/src/bin/pg_waldump/pg_waldump.c b/src/bin/pg_waldump/pg_waldump.c index 1f9403fc5cf4b..cfb179805d18b 100644 --- a/src/bin/pg_waldump/pg_waldump.c +++ b/src/bin/pg_waldump/pg_waldump.c @@ -19,6 +19,7 @@ #include #include "access/transam.h" +#include "access/pg_tde_xlog_encrypt.h" #include "access/xlog_internal.h" #include "access/xlogreader.h" #include 
"access/xlogrecord.h" @@ -770,6 +771,9 @@ usage(void) printf(_(" -p, --path=PATH directory in which to find WAL segment files or a\n" " directory with a ./pg_wal that contains such files\n" " (default: current directory, ./pg_wal, $PGDATA/pg_wal)\n")); + printf(_(" -k, keyring-path=PATH directory in which to find keyring config files for WAL\n" + " such files are pg_tde.map, pg_tde.dat, and pg_tde_keyrings\n" + " (default: ./global)\n")); printf(_(" -q, --quiet do not print any output, except for errors\n")); printf(_(" -r, --rmgr=RMGR only show records generated by resource manager RMGR;\n" " use --rmgr=list to list valid resource manager names\n")); @@ -800,6 +804,7 @@ main(int argc, char **argv) XLogRecord *record; XLogRecPtr first_record; char *waldir = NULL; + char *kringdir = NULL; char *errormsg; static struct option long_options[] = { @@ -812,6 +817,7 @@ main(int argc, char **argv) {"help", no_argument, NULL, '?'}, {"limit", required_argument, NULL, 'n'}, {"path", required_argument, NULL, 'p'}, + {"keyring-path", optional_argument, NULL, 'k'}, {"quiet", no_argument, NULL, 'q'}, {"relation", required_argument, NULL, 'R'}, {"rmgr", required_argument, NULL, 'r'}, @@ -885,7 +891,7 @@ main(int argc, char **argv) goto bad_argument; } - while ((option = getopt_long(argc, argv, "bB:e:fF:n:p:qr:R:s:t:wx:z", + while ((option = getopt_long(argc, argv, "bB:e:fF:n:p:k:qr:R:s:t:wx:z", long_options, &optindex)) != -1) { switch (option) @@ -934,6 +940,9 @@ main(int argc, char **argv) case 'p': waldir = pg_strdup(optarg); break; + case 'k': + kringdir = pg_strdup(optarg); + break; case 'q': config.quiet = true; break; @@ -1106,6 +1115,17 @@ main(int argc, char **argv) } } + /* + * TDE routines init + * -------- + */ + AesInit(); + InstallFileKeyring(); + InstallVaultV2Keyring(); + TDEInitGlobalKeys(kringdir); + TDEXLogSmgrInit(); + + if (config.save_fullpage_path != NULL) create_fullpage_directory(config.save_fullpage_path); From bd789bd7f02e1a25f219a6407d7e1b4b5d9837dc Mon 
Sep 17 00:00:00 2001 From: Andrew Pogrebnoy Date: Wed, 4 Sep 2024 15:15:39 +0300 Subject: [PATCH 13/13] Add tests --- src/bin/pg_waldump/meson.build | 2 + src/bin/pg_waldump/pg_waldump.c | 70 +++-- src/bin/pg_waldump/t/003_basic_encrypted.pl | 247 ++++++++++++++++++ .../t/004_save_fullpage_encrypted.pl | 117 +++++++++ 4 files changed, 400 insertions(+), 36 deletions(-) create mode 100644 src/bin/pg_waldump/t/003_basic_encrypted.pl create mode 100644 src/bin/pg_waldump/t/004_save_fullpage_encrypted.pl diff --git a/src/bin/pg_waldump/meson.build b/src/bin/pg_waldump/meson.build index a75ad071bf0d6..24130f1f281fb 100644 --- a/src/bin/pg_waldump/meson.build +++ b/src/bin/pg_waldump/meson.build @@ -34,6 +34,8 @@ tests += { 'tests': [ 't/001_basic.pl', 't/002_save_fullpage.pl', + 't/003_basic_encrypted.pl', + 't/004_save_fullpage_encrypted.pl', ], }, } diff --git a/src/bin/pg_waldump/pg_waldump.c b/src/bin/pg_waldump/pg_waldump.c index cfb179805d18b..073ca5d550311 100644 --- a/src/bin/pg_waldump/pg_waldump.c +++ b/src/bin/pg_waldump/pg_waldump.c @@ -19,7 +19,6 @@ #include #include "access/transam.h" -#include "access/pg_tde_xlog_encrypt.h" #include "access/xlog_internal.h" #include "access/xlogreader.h" #include "access/xlogrecord.h" @@ -33,6 +32,8 @@ #include "rmgrdesc.h" #include "storage/bufpage.h" +#include "access/pg_tde_xlog_encrypt_fe.h" + /* * NOTE: For any code change or issue fix here, it is highly recommended to * give a thought about doing the same in pg_walinspect contrib module as well. @@ -761,33 +762,33 @@ usage(void) printf(_("Usage:\n")); printf(_(" %s [OPTION]... 
[STARTSEG [ENDSEG]]\n"), progname); printf(_("\nOptions:\n")); - printf(_(" -b, --bkp-details output detailed information about backup blocks\n")); - printf(_(" -B, --block=N with --relation, only show records that modify block N\n")); - printf(_(" -e, --end=RECPTR stop reading at WAL location RECPTR\n")); - printf(_(" -f, --follow keep retrying after reaching end of WAL\n")); - printf(_(" -F, --fork=FORK only show records that modify blocks in fork FORK;\n" - " valid names are main, fsm, vm, init\n")); - printf(_(" -n, --limit=N number of records to display\n")); - printf(_(" -p, --path=PATH directory in which to find WAL segment files or a\n" - " directory with a ./pg_wal that contains such files\n" - " (default: current directory, ./pg_wal, $PGDATA/pg_wal)\n")); - printf(_(" -k, keyring-path=PATH directory in which to find keyring config files for WAL\n" - " such files are pg_tde.map, pg_tde.dat, and pg_tde_keyrings\n" - " (default: ./global)\n")); - printf(_(" -q, --quiet do not print any output, except for errors\n")); - printf(_(" -r, --rmgr=RMGR only show records generated by resource manager RMGR;\n" - " use --rmgr=list to list valid resource manager names\n")); - printf(_(" -R, --relation=T/D/R only show records that modify blocks in relation T/D/R\n")); - printf(_(" -s, --start=RECPTR start reading at WAL location RECPTR\n")); - printf(_(" -t, --timeline=TLI timeline from which to read WAL records\n" - " (default: 1 or the value used in STARTSEG)\n")); - printf(_(" -V, --version output version information, then exit\n")); - printf(_(" -w, --fullpage only show records with a full page write\n")); - printf(_(" -x, --xid=XID only show records with transaction ID XID\n")); - printf(_(" -z, --stats[=record] show statistics instead of records\n" - " (optionally, show per-record statistics)\n")); - printf(_(" --save-fullpage=DIR save full page images to DIR\n")); - printf(_(" -?, --help show this help, then exit\n")); + printf(_(" -b, --bkp-details output 
detailed information about backup blocks\n")); + printf(_(" -B, --block=N with --relation, only show records that modify block N\n")); + printf(_(" -e, --end=RECPTR stop reading at WAL location RECPTR\n")); + printf(_(" -f, --follow keep retrying after reaching end of WAL\n")); + printf(_(" -F, --fork=FORK only show records that modify blocks in fork FORK;\n" + " valid names are main, fsm, vm, init\n")); + printf(_(" -n, --limit=N number of records to display\n")); + printf(_(" -p, --path=PATH directory in which to find WAL segment files or a\n" + " directory with a ./pg_wal that contains such files\n" + " (default: current directory, ./pg_wal, $PGDATA/pg_wal)\n")); + printf(_(" -k, --keyring-path=PATH directory in which to find keyring config files for WAL\n" + " such files are pg_tde.map, pg_tde.dat, and pg_tde_keyrings\n" + " (it will not try to decrypt WAL if not set)\n")); + printf(_(" -q, --quiet do not print any output, except for errors\n")); + printf(_(" -r, --rmgr=RMGR only show records generated by resource manager RMGR;\n" + " use --rmgr=list to list valid resource manager names\n")); + printf(_(" -R, --relation=T/D/R only show records that modify blocks in relation T/D/R\n")); + printf(_(" -s, --start=RECPTR start reading at WAL location RECPTR\n")); + printf(_(" -t, --timeline=TLI timeline from which to read WAL records\n" + " (default: 1 or the value used in STARTSEG)\n")); + printf(_(" -V, --version output version information, then exit\n")); + printf(_(" -w, --fullpage only show records with a full page write\n")); + printf(_(" -x, --xid=XID only show records with transaction ID XID\n")); + printf(_(" -z, --stats[=record] show statistics instead of records\n" + " (optionally, show per-record statistics)\n")); + printf(_(" --save-fullpage=DIR save full page images to DIR\n")); + printf(_(" -?, --help show this help, then exit\n")); printf(_("\nReport bugs to <%s>.\n"), PACKAGE_BUGREPORT); printf(_("%s home page: <%s>\n"), PACKAGE_NAME, PACKAGE_URL); 
} @@ -1116,15 +1117,12 @@ main(int argc, char **argv) } /* - * TDE routines init - * -------- + * Make it possible to read encrypted WAL */ - AesInit(); - InstallFileKeyring(); - InstallVaultV2Keyring(); - TDEInitGlobalKeys(kringdir); - TDEXLogSmgrInit(); - + if (kringdir != NULL) + { + TDE_XLOG_INIT(kringdir); + } if (config.save_fullpage_path != NULL) create_fullpage_directory(config.save_fullpage_path); diff --git a/src/bin/pg_waldump/t/003_basic_encrypted.pl b/src/bin/pg_waldump/t/003_basic_encrypted.pl new file mode 100644 index 0000000000000..837452cd7e541 --- /dev/null +++ b/src/bin/pg_waldump/t/003_basic_encrypted.pl @@ -0,0 +1,247 @@ + +# Copyright (c) 2021-2024, PostgreSQL Global Development Group + +use strict; +use warnings FATAL => 'all'; +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; + +my $node = PostgreSQL::Test::Cluster->new('main'); +$node->init; +$node->append_conf( + 'postgresql.conf', q{ +autovacuum = off +checkpoint_timeout = 1h + +# for standbydesc +archive_mode=on +archive_command='' + +# for XLOG_HEAP_TRUNCATE +wal_level=logical + +# WAL Encryption +shared_preload_libraries = 'pg_tde' +pg_tde.wal_encrypt = on +}); +$node->start; + +my ($start_lsn, $start_walfile) = split /\|/, + $node->safe_psql('postgres', + q{SELECT pg_current_wal_insert_lsn(), pg_walfile_name(pg_current_wal_insert_lsn())} + ); + +$node->safe_psql( + 'postgres', q{ +-- heap, btree, hash, sequence +CREATE TABLE t1 (a int GENERATED ALWAYS AS IDENTITY, b text); +CREATE INDEX i1a ON t1 USING btree (a); +CREATE INDEX i1b ON t1 USING hash (b); +INSERT INTO t1 VALUES (default, 'one'), (default, 'two'); +DELETE FROM t1 WHERE b = 'one'; +TRUNCATE t1; + +-- abort +START TRANSACTION; +INSERT INTO t1 VALUES (default, 'three'); +ROLLBACK; + +-- unlogged/init fork +CREATE UNLOGGED TABLE t2 (x int); +CREATE INDEX i2 ON t2 USING btree (x); +INSERT INTO t2 SELECT generate_series(1, 10); + +-- gin +CREATE TABLE gin_idx_tbl (id bigserial PRIMARY KEY, data jsonb);
+CREATE INDEX gin_idx ON gin_idx_tbl USING gin (data); +INSERT INTO gin_idx_tbl + WITH random_json AS ( + SELECT json_object_agg(key, trunc(random() * 10)) as json_data + FROM unnest(array['a', 'b', 'c']) as u(key)) + SELECT generate_series(1,500), json_data FROM random_json; + +-- gist, spgist +CREATE TABLE gist_idx_tbl (p point); +CREATE INDEX gist_idx ON gist_idx_tbl USING gist (p); +CREATE INDEX spgist_idx ON gist_idx_tbl USING spgist (p); +INSERT INTO gist_idx_tbl (p) VALUES (point '(1, 1)'), (point '(3, 2)'), (point '(6, 3)'); + +-- brin +CREATE TABLE brin_idx_tbl (col1 int, col2 text, col3 text ); +CREATE INDEX brin_idx ON brin_idx_tbl USING brin (col1, col2, col3) WITH (autosummarize=on); +INSERT INTO brin_idx_tbl SELECT generate_series(1, 10000), 'dummy', 'dummy'; +UPDATE brin_idx_tbl SET col2 = 'updated' WHERE col1 BETWEEN 1 AND 5000; +SELECT brin_summarize_range('brin_idx', 0); +SELECT brin_desummarize_range('brin_idx', 0); + +VACUUM; + +-- logical message +SELECT pg_logical_emit_message(true, 'foo', 'bar'); + +-- relmap +VACUUM FULL pg_authid; + +-- database +CREATE DATABASE d1; +DROP DATABASE d1; +}); + +my $tblspc_path = PostgreSQL::Test::Utils::tempdir_short(); + +$node->safe_psql( + 'postgres', qq{ +CREATE TABLESPACE ts1 LOCATION '$tblspc_path'; +DROP TABLESPACE ts1; +}); + +my ($end_lsn, $end_walfile) = split /\|/, + $node->safe_psql('postgres', + q{SELECT pg_current_wal_insert_lsn(), pg_walfile_name(pg_current_wal_insert_lsn())} + ); + +my $default_ts_oid = $node->safe_psql('postgres', + q{SELECT oid FROM pg_tablespace WHERE spcname = 'pg_default'}); +my $postgres_db_oid = $node->safe_psql('postgres', + q{SELECT oid FROM pg_database WHERE datname = 'postgres'}); +my $rel_t1_oid = $node->safe_psql('postgres', + q{SELECT oid FROM pg_class WHERE relname = 't1'}); +my $rel_i1a_oid = $node->safe_psql('postgres', + q{SELECT oid FROM pg_class WHERE relname = 'i1a'}); + +$node->stop; + + +# various ways of specifying WAL range +command_fails_like( + [ 
'pg_waldump', 'foo', 'bar' ], + qr/error: could not locate WAL file "foo"/, + 'start file not found'); +command_like([ 'pg_waldump', '-k', $node->data_dir. '/global', $node->data_dir . '/pg_wal/' . $start_walfile ], + qr/./, 'runs with start segment specified'); +command_fails_like( + [ 'pg_waldump', $node->data_dir . '/pg_wal/' . $start_walfile, 'bar' ], + qr/error: could not open file "bar"/, + 'end file not found'); +command_like( + [ + 'pg_waldump', + '-k', $node->data_dir. '/global', + $node->data_dir . '/pg_wal/' . $start_walfile, + $node->data_dir . '/pg_wal/' . $end_walfile + ], + qr/./, + 'runs with start and end segment specified'); +command_fails_like( + [ 'pg_waldump', '-p', $node->data_dir ], + qr/error: no start WAL location given/, + 'path option requires start location'); +command_like( + [ + 'pg_waldump', '-p', $node->data_dir, '--start', + $start_lsn, '--end', $end_lsn, + '-k', $node->data_dir. '/global' + ], + qr/./, + 'runs with path option and start and end locations'); +command_fails_like( + [ 'pg_waldump', '-k', $node->data_dir. '/global', '-p', $node->data_dir, '--start', $start_lsn ], + qr/error: error in WAL record at/, + 'falling off the end of the WAL results in an error'); + +command_like( + [ + 'pg_waldump', '--quiet', + '-k', $node->data_dir. '/global', + $node->data_dir . '/pg_wal/' . $start_walfile + ], + qr/^$/, + 'no output with --quiet option'); +command_fails_like( + [ 'pg_waldump', '--quiet', '-k', $node->data_dir. '/global', '-p', $node->data_dir, '--start', $start_lsn ], + qr/error: error in WAL record at/, + 'errors are shown with --quiet'); + + +# Test for: Display a message that we're skipping data if `from` +# wasn't a pointer to the start of a record. +{ + # Construct a new LSN that is one byte past the original + # start_lsn. 
+ my ($part1, $part2) = split qr{/}, $start_lsn; + my $lsn2 = hex $part2; + $lsn2++; + my $new_start = sprintf("%s/%X", $part1, $lsn2); + + my (@cmd, $stdout, $stderr, $result); + + @cmd = ( + 'pg_waldump', '-k', $node->data_dir. '/global', + '--start', $new_start, + $node->data_dir . '/pg_wal/' . $start_walfile); + $result = IPC::Run::run \@cmd, '>', \$stdout, '2>', \$stderr; + ok($result, "runs with start segment and start LSN specified"); + like($stderr, qr/first record is after/, 'info message printed'); +} + + +# Helper function to test various options. Pass options as arguments. +# Output lines are returned as array. +sub test_pg_waldump +{ + local $Test::Builder::Level = $Test::Builder::Level + 1; + my @opts = @_; + + my (@cmd, $stdout, $stderr, $result, @lines); + + @cmd = ( + 'pg_waldump', '-k', $node->data_dir. '/global', '-p', $node->data_dir, + '--start', $start_lsn, '--end', $end_lsn); + push @cmd, @opts; + $result = IPC::Run::run \@cmd, '>', \$stdout, '2>', \$stderr; + ok($result, "pg_waldump @opts: runs ok"); + is($stderr, '', "pg_waldump @opts: no stderr"); + @lines = split /\n/, $stdout; + ok(@lines > 0, "pg_waldump @opts: some lines are output"); + return @lines; +} + +my @lines; + +@lines = test_pg_waldump; +is(grep(!/^rmgr: \w/, @lines), 0, 'all output lines are rmgr lines'); + +@lines = test_pg_waldump('--limit', 6); +is(@lines, 6, 'limit option observed'); + +@lines = test_pg_waldump('--fullpage'); +is(grep(!/^rmgr:.*\bFPW\b/, @lines), 0, 'all output lines are FPW'); + +@lines = test_pg_waldump('--stats'); +like($lines[0], qr/WAL statistics/, "statistics on stdout"); +is(grep(/^rmgr:/, @lines), 0, 'no rmgr lines output'); + +@lines = test_pg_waldump('--stats=record'); +like($lines[0], qr/WAL statistics/, "statistics on stdout"); +is(grep(/^rmgr:/, @lines), 0, 'no rmgr lines output'); + +@lines = test_pg_waldump('--rmgr', 'Btree'); +is(grep(!/^rmgr: Btree/, @lines), 0, 'only Btree lines'); + +@lines = test_pg_waldump('--fork', 'init'); 
+is(grep(!/fork init/, @lines), 0, 'only init fork lines'); + +@lines = test_pg_waldump('--relation', + "$default_ts_oid/$postgres_db_oid/$rel_t1_oid"); +is(grep(!/rel $default_ts_oid\/$postgres_db_oid\/$rel_t1_oid/, @lines), + 0, 'only lines for selected relation'); + +@lines = + test_pg_waldump('--relation', + "$default_ts_oid/$postgres_db_oid/$rel_i1a_oid", + '--block', 1); +is(grep(!/\bblk 1\b/, @lines), 0, 'only lines for selected block'); + + +done_testing(); diff --git a/src/bin/pg_waldump/t/004_save_fullpage_encrypted.pl b/src/bin/pg_waldump/t/004_save_fullpage_encrypted.pl new file mode 100644 index 0000000000000..8f3592b13ab6f --- /dev/null +++ b/src/bin/pg_waldump/t/004_save_fullpage_encrypted.pl @@ -0,0 +1,117 @@ + +# Copyright (c) 2022-2024, PostgreSQL Global Development Group + +use strict; +use warnings FATAL => 'all'; +use File::Basename; +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::RecursiveCopy; +use PostgreSQL::Test::Utils; +use Test::More; + +my ($blocksize, $walfile_name); + +# Function to extract the LSN from the given block structure +sub get_block_lsn +{ + my $path = shift; + my $blocksize = shift; + my $block; + + open my $fh, '<', $path or die "couldn't open file: $path\n"; + die "could not read block\n" + if $blocksize != read($fh, $block, $blocksize); + my ($lsn_hi, $lsn_lo) = unpack('LL', $block); + + $lsn_hi = sprintf('%08X', $lsn_hi); + $lsn_lo = sprintf('%08X', $lsn_lo); + + return ($lsn_hi, $lsn_lo); +} + +my $node = PostgreSQL::Test::Cluster->new('main'); +$node->init; +$node->append_conf( + 'postgresql.conf', q{ +wal_level = 'replica' +max_wal_senders = 4 + +shared_preload_libraries = 'pg_tde' +pg_tde.wal_encrypt = on +}); +$node->start; + +# Generate data/WAL to examine that will have full pages in them. 
+$node->safe_psql(
+	'postgres',
+	"SELECT 'init' FROM pg_create_physical_replication_slot('regress_pg_waldump_slot', true, false);
+CREATE TABLE test_table AS SELECT generate_series(1,100) a;
+-- Force FPWs on the next writes.
+CHECKPOINT;
+UPDATE test_table SET a = a + 1;
+");
+
+# One query returns "walfile|block_size"; split the two fields apart.
+my $switch_row = $node->safe_psql('postgres',
+	"SELECT pg_walfile_name(pg_switch_wal()), current_setting('block_size')");
+($walfile_name, $blocksize) = split /\|/, $switch_row;
+
+# Get the "tablespace/database/relfilenode" triple identifying test_table.
+my $rel_spec = $node->safe_psql(
+	'postgres',
+	q{SELECT format(
+        '%s/%s/%s',
+        CASE WHEN reltablespace = 0 THEN dattablespace ELSE reltablespace END,
+        pg_database.oid,
+        pg_relation_filenode(pg_class.oid))
+    FROM pg_class, pg_database
+    WHERE relname = 'test_table' AND
+        datname = current_database()}
+);
+
+my $wal_path = join('/', $node->data_dir, 'pg_wal', $walfile_name);
+my $tmp_folder = PostgreSQL::Test::Utils::tempdir;
+ok(-f $wal_path, "Got a WAL file");
+
+# Save the full-page images logged for that relation under $tmp_folder/raw.
+my @save_fullpage_cmd = (
+	'pg_waldump', '--quiet',
+	'-k', $node->data_dir . '/global',
+	'--save-fullpage', "$tmp_folder/raw",
+	'--relation', $rel_spec,
+	$wal_path);
+$node->command_ok(\@save_fullpage_cmd,
+	'pg_waldump with --save-fullpage runs');
+
+# Expected name of each saved file (the two captures are LSNh and LSNl):
+#   TLI-LSNh-LSNl.TBLSPCOID.DBOID.NODEOID.dd_fork
+# - TLI: timeline ID in hex format.
+# - LSNh, LSNl: WAL LSN in hex format, as two 8-character numbers.
+# - TBLSPCOID: tablespace OID (0 for global).
+# - DBOID: database OID.
+# - NODEOID: relfilenode.
+# - dd: block number.
+# - fork: fork this block came from (vm, init, fsm, or main).
+my $file_re =
+  qr/^[0-9A-F]{8}-([0-9A-F]{8})-([0-9A-F]{8})[.][0-9]+[.][0-9]+[.][0-9]+[.][0-9]+(?:_vm|_init|_fsm|_main)?$/;
+my $file_count = 0;
+
+# Verify filename format matches --save-fullpage.
+for my $fullpath (glob "$tmp_folder/raw/*")
+{
+	my $file = File::Basename::basename($fullpath);
+	$file_count++;
+
+	# A misnamed file would leave the captures below undefined, and FATAL
+	# warnings would then abort the script; report the failure and move on.
+	like($file, $file_re, "verify filename format for file $file") or next;
+	my ($hi_lsn_fn, $lo_lsn_fn) = ($file =~ $file_re);
+	my ($hi_lsn_bk, $lo_lsn_bk) = get_block_lsn($fullpath, $blocksize);
+
+	# The block's LSN comes before the file name's (fixed-width hex, so gt).
+	ok( $hi_lsn_fn . $lo_lsn_fn gt $hi_lsn_bk . $lo_lsn_bk,
+		'LSN stored in the block precedes the one in the file name');
+}
+
+ok($file_count > 0, 'verify that at least one block has been saved');
+done_testing();