diff --git a/contrib/bloom/blinsert.c b/contrib/bloom/blinsert.c index 805e8810ecf..a56a699a8d8 100644 --- a/contrib/bloom/blinsert.c +++ b/contrib/bloom/blinsert.c @@ -166,7 +166,7 @@ blbuildempty(Relation index) Page metapage; /* Construct metapage. */ - metapage = (Page) palloc_io_aligned(BLCKSZ, 0); + metapage = (Page) palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0); BloomFillMetapage(index, metapage); /* @@ -178,7 +178,7 @@ */ PageSetChecksumInplace(metapage, BLOOM_METAPAGE_BLKNO); smgrwrite(RelationGetSmgr(index), INIT_FORKNUM, BLOOM_METAPAGE_BLKNO, - (char *) metapage, true); + metapage, true); log_newpage(&(RelationGetSmgr(index))->smgr_rnode.node, INIT_FORKNUM, BLOOM_METAPAGE_BLKNO, metapage, true); diff --git a/contrib/bloom/blutils.c b/contrib/bloom/blutils.c index a434cf93efd..f091fd58166 100644 --- a/contrib/bloom/blutils.c +++ b/contrib/bloom/blutils.c @@ -15,6 +15,7 @@ #include "access/amapi.h" #include "access/generic_xlog.h" +#include "access/hio.h" #include "access/reloptions.h" #include "bloom.h" #include "catalog/index.h" @@ -391,7 +392,7 @@ BloomNewBuffer(Relation index) if (needLock) LockRelationForExtension(index, ExclusiveLock); - buffer = ReadBuffer(index, P_NEW); + buffer = polar_index_add_blocks(index); LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); if (needLock) diff --git a/contrib/pageinspect/Makefile b/contrib/pageinspect/Makefile index ad5a3ac5112..0e290ced337 100644 --- a/contrib/pageinspect/Makefile +++ b/contrib/pageinspect/Makefile @@ -22,7 +22,7 @@ DATA = pageinspect--1.10--1.11.sql \ pageinspect--1.0--1.1.sql PGFILEDESC = "pageinspect - functions to inspect contents of database pages" -REGRESS = page btree brin gin gist hash checksum oldextversions +REGRESS = page btree brin gin gist hash checksum oldextversions index_bulk_extend ifdef USE_PGXS PG_CONFIG = pg_config diff --git a/contrib/pageinspect/btreefuncs.c b/contrib/pageinspect/btreefuncs.c index 62f2c1b3159..2211a8ab23f 100644 --- a/contrib/pageinspect/btreefuncs.c +++ b/contrib/pageinspect/btreefuncs.c @@ -520,6 +520,18 @@ bt_page_items_internal(PG_FUNCTION_ARGS, enum pageinspect_version ext_version) uargs->page = palloc(BLCKSZ); memcpy(uargs->page, BufferGetPage(buffer), BLCKSZ); + /* + * POLAR: During bulk extend, pageinspect may access a zero page, which + * makes PageGetSpecialPointer fail an assertion. To handle this + * situation, we initialize these zero pages. + */ + if (PageIsNew(uargs->page)) + { + _bt_pageinit(uargs->page, BufferGetPageSize(buffer)); + elog(DEBUG1, "page from block " INT64_FORMAT " is new in index bulk extend", blkno); + } + /* POLAR end */ + UnlockReleaseBuffer(buffer); relation_close(rel, AccessShareLock); diff --git a/contrib/pageinspect/expected/gist.out b/contrib/pageinspect/expected/gist.out index d1adbab8ae2..2b1d54a6279 100644 --- a/contrib/pageinspect/expected/gist.out +++ b/contrib/pageinspect/expected/gist.out @@ -1,13 +1,6 @@ --- The gist_page_opaque_info() function prints the page's LSN. Normally, --- that's constant 1 (GistBuildLSN) on every page of a freshly built GiST --- index. But with wal_level=minimal, the whole relation is dumped to WAL at --- the end of the transaction if it's smaller than wal_skip_threshold, which --- updates the LSNs. Wrap the tests on gist_page_opaque_info() in the --- same transaction with the CREATE INDEX so that we see the LSNs before --- they are possibly overwritten at end of transaction. -BEGIN; --- Create a test table and GiST index. 
-CREATE TABLE test_gist AS SELECT point(i,i) p, i::text t FROM +-- The gist_page_opaque_info() function prints the page's LSN. +-- Use an unlogged index, so that the LSN is predictable. +CREATE UNLOGGED TABLE test_gist AS SELECT point(i,i) p, i::text t FROM generate_series(1,1000) i; CREATE INDEX test_gist_idx ON test_gist USING gist (p); -- Page 0 is the root, the rest are leaf pages @@ -29,7 +22,6 @@ SELECT * FROM gist_page_opaque_info(get_raw_page('test_gist_idx', 2)); 0/1 | 0/0 | 1 | {leaf} (1 row) -COMMIT; SELECT * FROM gist_page_items(get_raw_page('test_gist_idx', 0), 'test_gist_idx'); itemoffset | ctid | itemlen | dead | keys ------------+-----------+---------+------+------------------------------- diff --git a/contrib/pageinspect/expected/index_bulk_extend.out b/contrib/pageinspect/expected/index_bulk_extend.out new file mode 100644 index 00000000000..3fd298e8a6d --- /dev/null +++ b/contrib/pageinspect/expected/index_bulk_extend.out @@ -0,0 +1,45 @@ +create schema test_index_bulk_extend; +set search_path to test_index_bulk_extend; +set client_min_messages to error; +create extension if not exists pageinspect; +show polar_index_bulk_extend_size; + polar_index_bulk_extend_size +------------------------------ + 1MB +(1 row) + +drop table if exists p; +create table p(a int, b varchar, c numeric,d int8); +-- INSERT DATA +insert into p select +i, +md5(i::text), +i / 982.0, +i * -1 +from +generate_series(0,100000 - 1)i; +-- INIT INDEX +create index p_c_d_idx on p(c,d); +-- DML +insert into p select +i, +i::text || 'sdha&$#*&', +i / 160.0, +i / 32 +from +generate_series(0,100000 - 1)i; +-- For index bulk extend expansion +select pg_relation_size('p_c_d_idx') < 3 * pg_relation_size('p'); + ?column? +---------- + t +(1 row) + +-- For index bulk extend core +select count(*) from generate_series(1, pg_relation_size('p_c_d_idx') / current_setting('block_size')::bigint - 1) AS blkno,bt_page_items('p_c_d_idx', blkno); + count +-------- + 202716 +(1 row) + +drop schema test_index_bulk_extend cascade; diff --git a/contrib/pageinspect/sql/gist.sql b/contrib/pageinspect/sql/gist.sql index d263542ba15..85bc44b8000 100644 --- a/contrib/pageinspect/sql/gist.sql +++ b/contrib/pageinspect/sql/gist.sql @@ -1,14 +1,6 @@ --- The gist_page_opaque_info() function prints the page's LSN. Normally, --- that's constant 1 (GistBuildLSN) on every page of a freshly built GiST --- index. But with wal_level=minimal, the whole relation is dumped to WAL at --- the end of the transaction if it's smaller than wal_skip_threshold, which --- updates the LSNs. Wrap the tests on gist_page_opaque_info() in the --- same transaction with the CREATE INDEX so that we see the LSNs before --- they are possibly overwritten at end of transaction. -BEGIN; - --- Create a test table and GiST index. -CREATE TABLE test_gist AS SELECT point(i,i) p, i::text t FROM +-- The gist_page_opaque_info() function prints the page's LSN. +-- Use an unlogged index, so that the LSN is predictable. 
+CREATE UNLOGGED TABLE test_gist AS SELECT point(i,i) p, i::text t FROM generate_series(1,1000) i; CREATE INDEX test_gist_idx ON test_gist USING gist (p); @@ -17,8 +9,6 @@ SELECT * FROM gist_page_opaque_info(get_raw_page('test_gist_idx', 0)); SELECT * FROM gist_page_opaque_info(get_raw_page('test_gist_idx', 1)); SELECT * FROM gist_page_opaque_info(get_raw_page('test_gist_idx', 2)); -COMMIT; - SELECT * FROM gist_page_items(get_raw_page('test_gist_idx', 0), 'test_gist_idx'); SELECT * FROM gist_page_items(get_raw_page('test_gist_idx', 1), 'test_gist_idx') LIMIT 5; diff --git a/contrib/pageinspect/sql/index_bulk_extend.sql b/contrib/pageinspect/sql/index_bulk_extend.sql new file mode 100644 index 00000000000..f0d473391a6 --- /dev/null +++ b/contrib/pageinspect/sql/index_bulk_extend.sql @@ -0,0 +1,38 @@ +create schema test_index_bulk_extend; +set search_path to test_index_bulk_extend; +set client_min_messages to error; + +create extension if not exists pageinspect; + +show polar_index_bulk_extend_size; + +drop table if exists p; +create table p(a int, b varchar, c numeric,d int8); + + +-- INSERT DATA +insert into p select +i, +md5(i::text), +i / 982.0, +i * -1 +from +generate_series(0,100000 - 1)i; +-- INIT INDEX +create index p_c_d_idx on p(c,d); + +-- DML +insert into p select +i, +i::text || 'sdha&$#*&', +i / 160.0, +i / 32 +from +generate_series(0,100000 - 1)i; + +-- For index bulk extend expansion +select pg_relation_size('p_c_d_idx') < 3 * pg_relation_size('p'); +-- For index bulk extend core +select count(*) from generate_series(1, pg_relation_size('p_c_d_idx') / current_setting('block_size')::bigint - 1) AS blkno,bt_page_items('p_c_d_idx', blkno); + +drop schema test_index_bulk_extend cascade; \ No newline at end of file diff --git a/contrib/pg_freespacemap/expected/pg_freespacemap.out b/contrib/pg_freespacemap/expected/pg_freespacemap.out index 5c6d50ef82b..eb574c23736 100644 --- a/contrib/pg_freespacemap/expected/pg_freespacemap.out +++ b/contrib/pg_freespacemap/expected/pg_freespacemap.out @@ -54,17 +54,12 @@ WITH rel AS (SELECT oid::regclass AS id FROM pg_class WHERE relname ~ 'freespace DELETE FROM freespace_tab; VACUUM freespace_tab; --- In bulk extend, we will pre-extend pages. --- And these pages will not be expected to vacuum truncated to avoid --- repeating bulk extenion and truncating. --- So the relation will exist in free space map. WITH rel AS (SELECT oid::regclass AS id FROM pg_class WHERE relname ~ 'freespace') SELECT rel.id, fsm.blkno, (fsm.avail > 0) AS is_avail FROM rel, LATERAL pg_freespace(rel.id) AS fsm ORDER BY 1, 2; id | blkno | is_avail -----------------+-------+---------- - freespace_tab | 0 | t freespace_brin | 0 | f freespace_brin | 1 | f freespace_brin | 2 | t @@ -80,7 +75,7 @@ WITH rel AS (SELECT oid::regclass AS id FROM pg_class WHERE relname ~ 'freespace freespace_hash | 7 | f freespace_hash | 8 | f freespace_hash | 9 | f -(16 rows) +(15 rows) -- failures with incorrect block number SELECT * FROM pg_freespace('freespace_tab', -1); diff --git a/contrib/pg_freespacemap/sql/pg_freespacemap.sql b/contrib/pg_freespacemap/sql/pg_freespacemap.sql index 5fc3ee38948..06275d8fac8 100644 --- a/contrib/pg_freespacemap/sql/pg_freespacemap.sql +++ b/contrib/pg_freespacemap/sql/pg_freespacemap.sql @@ -20,10 +20,6 @@ WITH rel AS (SELECT oid::regclass AS id FROM pg_class WHERE relname ~ 'freespace DELETE FROM freespace_tab; VACUUM freespace_tab; --- In bulk extend, we will pre-extend pages. 
--- And these pages will not be expected to vacuum truncated to avoid --- repeating bulk extenion and truncating. --- So the relation will exist in free space map. WITH rel AS (SELECT oid::regclass AS id FROM pg_class WHERE relname ~ 'freespace') SELECT rel.id, fsm.blkno, (fsm.avail > 0) AS is_avail FROM rel, LATERAL pg_freespace(rel.id) AS fsm diff --git a/contrib/pg_prewarm/pg_prewarm.c b/contrib/pg_prewarm/pg_prewarm.c index caff5c4a80f..f50aa69eb2e 100644 --- a/contrib/pg_prewarm/pg_prewarm.c +++ b/contrib/pg_prewarm/pg_prewarm.c @@ -36,7 +36,7 @@ typedef enum PREWARM_BUFFER } PrewarmType; -static PGAlignedBlock blockbuffer; +static PGIOAlignedBlock blockbuffer; /* * pg_prewarm(regclass, mode text, fork text, diff --git a/external/Makefile b/external/Makefile index 629d1f918e9..8d0467dda55 100644 --- a/external/Makefile +++ b/external/Makefile @@ -4,7 +4,7 @@ subdir = external top_builddir = .. include $(top_builddir)/src/Makefile.global -SUBDIRS = +SUBDIRS = # multi-arch/0-dependency/fast-compile extensions can be added here # sort extention by names, less git conflict @@ -15,6 +15,7 @@ SUBDIRS += polar_monitor_preload SUBDIRS += polar_parameter_manager SUBDIRS += polar_proxy_utils SUBDIRS += polar_resource_manager +SUBDIRS += polar_smgrperf SUBDIRS += polar_stat_env SUBDIRS += polar_worker diff --git a/external/polar_monitor/polar_monitor--1.0.sql b/external/polar_monitor/polar_monitor--1.0.sql index ea75e515f12..1e4a70d803a 100644 --- a/external/polar_monitor/polar_monitor--1.0.sql +++ b/external/polar_monitor/polar_monitor--1.0.sql @@ -674,32 +674,6 @@ LANGUAGE C PARALLEL SAFE; REVOKE ALL ON FUNCTION polar_xlog_buffer_stat_reset() FROM PUBLIC; -/* Per Index */ -CREATE FUNCTION polar_pg_stat_get_bulk_create_index_extend_times( - IN oid, - OUT int8 -) -AS 'MODULE_PATHNAME', 'polar_pg_stat_get_bulk_create_index_extend_times' -LANGUAGE C PARALLEL SAFE; - --- Create View for create index extend stats -CREATE VIEW polar_pg_stat_all_index_extend_stats AS - SELECT - C.oid AS relid, - N.nspname AS schemaname, - C.relname AS relname, - polar_pg_stat_get_bulk_create_index_extend_times(C.oid) AS idx_create_extend_times - FROM pg_class C LEFT JOIN - pg_index I ON C.oid = I.indrelid LEFT JOIN - pg_class T ON C.reltoastrelid = T.oid LEFT JOIN - pg_index X ON T.oid = X.indrelid - LEFT JOIN pg_namespace N ON (N.oid = C.relnamespace) - WHERE C.relkind IN ('r', 't', 'm') - GROUP BY C.oid, N.nspname, C.relname, T.oid, X.indrelid; - -REVOKE ALL ON FUNCTION polar_pg_stat_get_bulk_create_index_extend_times(IN oid,OUT int8) FROM PUBLIC; -/* POLAR end */ - CREATE FUNCTION polar_get_slot_node_type(slot_name text) RETURNS text AS 'MODULE_PATHNAME', 'polar_get_slot_node_type' diff --git a/external/polar_monitor/polar_monitor.c b/external/polar_monitor/polar_monitor.c index 55476b92aed..4161a30d344 100644 --- a/external/polar_monitor/polar_monitor.c +++ b/external/polar_monitor/polar_monitor.c @@ -220,24 +220,6 @@ polar_pg_stat_get_bulk_read_blocks_IO(PG_FUNCTION_ARGS) PG_RETURN_INT64(result); } -/* POLAR: Bulk create index extend stats */ -/* Per table (or index) */ -PG_FUNCTION_INFO_V1(polar_pg_stat_get_bulk_create_index_extend_times); -Datum -polar_pg_stat_get_bulk_create_index_extend_times(PG_FUNCTION_ARGS) -{ - Oid relid = PG_GETARG_OID(0); - int64 result; - PgStat_StatTabEntry *tabentry; - - if ((tabentry = pgstat_fetch_stat_tabentry(relid)) == NULL) - result = 0; - else - result = (int64) (tabentry->polar_bulk_create_index_extends_times); - - PG_RETURN_INT64(result); -} - 
PG_FUNCTION_INFO_V1(polar_get_slot_node_type); Datum polar_get_slot_node_type(PG_FUNCTION_ARGS) diff --git a/external/polar_smgrperf/.gitignore b/external/polar_smgrperf/.gitignore new file mode 100644 index 00000000000..5dcb3ff9723 --- /dev/null +++ b/external/polar_smgrperf/.gitignore @@ -0,0 +1,4 @@ +# Generated subdirectories +/log/ +/results/ +/tmp_check/ diff --git a/external/polar_smgrperf/Makefile b/external/polar_smgrperf/Makefile new file mode 100644 index 00000000000..9312373f9f8 --- /dev/null +++ b/external/polar_smgrperf/Makefile @@ -0,0 +1,21 @@ +# external/polar_smgrperf/Makefile + +MODULE_big = polar_smgrperf +OBJS = polar_smgrperf.o $(WIN32RES) + +EXTENSION = polar_smgrperf +DATA = polar_smgrperf--1.0.sql +PGFILEDESC = "polar_smgrperf - perf test on smgr" + +TAP_TESTS = 1 + +ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +subdir = external/polar_smgrperf +top_builddir = ../.. +include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif diff --git a/external/polar_smgrperf/polar_smgrperf--1.0.sql b/external/polar_smgrperf/polar_smgrperf--1.0.sql new file mode 100644 index 00000000000..ca4ca055201 --- /dev/null +++ b/external/polar_smgrperf/polar_smgrperf--1.0.sql @@ -0,0 +1,45 @@ +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "CREATE EXTENSION polar_smgrperf" to load this file. \quit + +CREATE FUNCTION polar_smgrperf_prepare( + nblocks INT DEFAULT 131072) +RETURNS VOID +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT; + +CREATE FUNCTION polar_smgrperf_cleanup() +RETURNS VOID +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT; + +CREATE FUNCTION polar_smgrperf_read( + bs INT DEFAULT 1, + begin_blkno INT DEFAULT 0, + end_blkno INT DEFAULT 131072, + sequential BOOLEAN DEFAULT TRUE) +RETURNS VOID +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT; + +CREATE FUNCTION polar_smgrperf_write( + bs INT DEFAULT 1, + begin_blkno INT DEFAULT 0, + end_blkno INT DEFAULT 131072, + sequential BOOLEAN DEFAULT TRUE) +RETURNS VOID +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT; + +CREATE FUNCTION polar_smgrperf_extend( + bs INT DEFAULT 1) +RETURNS VOID +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT; + +CREATE FUNCTION polar_smgrperf_nblocks( + relnumber OID DEFAULT 1, + nblocks_cached BOOLEAN DEFAULT FALSE, + fd_cached BOOLEAN DEFAULT TRUE +) RETURNS VOID +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT; diff --git a/external/polar_smgrperf/polar_smgrperf.c b/external/polar_smgrperf/polar_smgrperf.c new file mode 100644 index 00000000000..913c205137a --- /dev/null +++ b/external/polar_smgrperf/polar_smgrperf.c @@ -0,0 +1,385 @@ +/*------------------------------------------------------------------------- + * + * polar_smgrperf.c + * + * Copyright (c) 2024, Alibaba Group Holding Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * IDENTIFICATION + * external/polar_smgrperf/polar_smgrperf.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "common/file_utils.h" +#include "funcapi.h" +#include "miscadmin.h" +#include "storage/smgr.h" +#include "utils/timeout.h" + +PG_MODULE_MAGIC; + +#define INVALID_PROC_NUMBER InvalidBackendId +#define RelFileLocator RelFileNode + +#define MAX_RELSEG (MaxBlockNumber / RELSEG_SIZE) +#define MAX_NBLOCKS (MAX_RELSEG * RELSEG_SIZE) + +#define PERF_REL_NUMBER 1 +#define PERF_RLOCATOR(relnumber) ((RelFileLocator) {MyDatabaseTableSpace, MyDatabaseId, relnumber}) +#define PERF_SMGROPEN(relnumber) smgropen(PERF_RLOCATOR(relnumber), INVALID_PROC_NUMBER) + +#define REPORT_PERF_STATS_PREPARE(with_bandwidth_option) \ + { \ + sigjmp_buf local_sigjmp_buf; \ + perf_exception_stack = PG_exception_stack; \ + with_bandwidth = with_bandwidth_option; \ + if (sigsetjmp(local_sigjmp_buf, 1) != 0) \ + { \ + PG_exception_stack = perf_exception_stack; \ + report_summary_perf_stats(); \ + if (perf_report_timerid != -1) \ + disable_timeout(perf_report_timerid, false); \ + pg_re_throw(); \ + } \ + PG_exception_stack = &local_sigjmp_buf; \ + MemSet(&stats, 0, sizeof(perf_stats)); \ + MemSet(&accum_stats, 0, sizeof(perf_stats)); \ + if (perf_report_timerid == -1) \ + perf_report_timerid = RegisterTimeout(USER_TIMEOUT, report_perf_stats_timeout_handler); \ + enable_timeout_after(perf_report_timerid, 1000); \ + } + +typedef struct perf_stats +{ + uint64 count; + uint64 blocks; + uint64 time; +} perf_stats; + +static perf_stats stats; +static perf_stats accum_stats; + +static ForkNumber forknum = MAIN_FORKNUM; +static BlockNumber zero_blkno = 0; +static void *zero_buffer = NULL; +static int max_bs = 0; +static int perf_report_timerid = -1; +static bool with_bandwidth = true; +static sigjmp_buf *perf_exception_stack = NULL; +static bool report_perf_stats_pending = false; +static instr_time start; + +static inline BlockNumber +select_next_blkno(BlockNumber current_blkno, BlockNumber begin_blkno, BlockNumber end_blkno, int bs, bool sequential) +{ + BlockNumber next_blkno = InvalidBlockNumber; + + if (sequential) + { + if (current_blkno == InvalidBlockNumber) + next_blkno = begin_blkno; + else + next_blkno = current_blkno + bs; + + if (next_blkno + bs > end_blkno) + next_blkno = begin_blkno; + } + else + next_blkno = begin_blkno + random() % (end_blkno - begin_blkno - bs + 1); + + return next_blkno; +} + +static void +report_perf_stats(perf_stats * stats, char *prefix) +{ + double iops, + bps, + mbps, + lat; +#define NANOPERSECOND ((uint64) 1000 * 1000 * 1000) + + if (stats->time == 0) + return; + + iops = (double) stats->count * NANOPERSECOND / stats->time; + lat = (double) stats->time / stats->count / 1000; /* to micro-second */ + + HOLD_INTERRUPTS(); + + if (with_bandwidth) + { + bps = (double) stats->blocks * NANOPERSECOND / stats->time; + mbps = (double) stats->blocks * NANOPERSECOND * BLCKSZ / 1024 / 1024 / stats->time; + + elog(INFO, "%siops=%.1f/s, lat=%.1fus, bps=%.1f/s, mbps=%.1fMB/s", + prefix, iops, lat, bps, mbps); + } + else + elog(INFO, "%siops=%.1f/s, lat=%.2fus", prefix, iops, lat); + + RESUME_INTERRUPTS(); + + MemSet(stats, 0, sizeof(perf_stats)); + +#undef NANOPERSECOND +} + +static void +report_perf_stats_timeout_handler(void) +{ + report_perf_stats_pending = true; +} + +static void +report_summary_perf_stats(void) +{ + report_perf_stats(&accum_stats, "Summary: "); +} + +static void +collect_perf_stats_begin(void) +{ + 
INSTR_TIME_SET_CURRENT(start); +} + +static void +collect_perf_stats_end(int blocks) +{ + instr_time duration; + + CHECK_FOR_INTERRUPTS(); + + INSTR_TIME_SET_CURRENT(duration); + INSTR_TIME_SUBTRACT(duration, start); + stats.time += INSTR_TIME_GET_NANOSEC(duration); + stats.blocks += blocks; + stats.count++; + + if (report_perf_stats_pending) + { + accum_stats.count += stats.count; + accum_stats.blocks += stats.blocks; + accum_stats.time += stats.time; + + report_perf_stats(&stats, ""); + + enable_timeout_after(perf_report_timerid, 1000); + + report_perf_stats_pending = false; + } +} + +static void +smgrperf_initialize() +{ + if (!superuser()) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + (errmsg("must be superuser to use smgrperf functions")))); + + /* initialize zero buffer */ + zero_buffer = polar_zero_buffer; + max_bs = polar_zero_buffer_size / BLCKSZ; +} + +PG_FUNCTION_INFO_V1(polar_smgrperf_prepare); +Datum +polar_smgrperf_prepare(PG_FUNCTION_ARGS) +{ + int nblocks = PG_GETARG_INT32(0); + SMgrRelation smgr = PERF_SMGROPEN(PERF_REL_NUMBER); + + if (nblocks < 0 || nblocks > MAX_NBLOCKS) + elog(ERROR, "nblocks should be in [1, %d], current %d", MAX_NBLOCKS, nblocks); + + smgrperf_initialize(); + + if (!smgrexists(smgr, forknum)) + smgrcreate(smgr, forknum, false); + + smgrtruncate(smgr, &forknum, 1, &zero_blkno); + + smgrzeroextend(smgr, forknum, 0, nblocks, true); + + PG_RETURN_VOID(); +} + +PG_FUNCTION_INFO_V1(polar_smgrperf_cleanup); +Datum +polar_smgrperf_cleanup(PG_FUNCTION_ARGS) +{ + SMgrRelation smgr = PERF_SMGROPEN(PERF_REL_NUMBER); + + smgrperf_initialize(); + + smgrdounlinkall(&smgr, 1, false); + smgrclose(smgr); + + PG_RETURN_VOID(); +} + +PG_FUNCTION_INFO_V1(polar_smgrperf_read); +Datum +polar_smgrperf_read(PG_FUNCTION_ARGS) +{ + int bs = PG_GETARG_INT32(0); + int begin_blkno = PG_GETARG_INT32(1); + int end_blkno = PG_GETARG_INT32(2); + bool sequential = PG_GETARG_BOOL(3); + BlockNumber current_blkno = InvalidBlockNumber; + SMgrRelation smgr = PERF_SMGROPEN(PERF_REL_NUMBER); + + smgrperf_initialize(); + + if (bs < 1 || bs > max_bs) + elog(ERROR, "bs should be in [1, %d], current %d", max_bs, bs); + + if (begin_blkno < 0 || begin_blkno >= end_blkno) + elog(ERROR, "\"begin_blkno\" should be in [0, %d), current %d", end_blkno, begin_blkno); + + if (end_blkno <= begin_blkno || end_blkno > MAX_NBLOCKS) + elog(ERROR, "\"end_blkno\" should be in (%d, %d], current %d", begin_blkno, MAX_NBLOCKS, end_blkno); + + REPORT_PERF_STATS_PREPARE(true); + + while (true) + { + current_blkno = select_next_blkno(current_blkno, begin_blkno, end_blkno, bs, sequential); + + collect_perf_stats_begin(); + polar_smgrbulkread(smgr, forknum, current_blkno, bs, zero_buffer); + collect_perf_stats_end(bs); + } + + PG_RETURN_VOID(); +} + +PG_FUNCTION_INFO_V1(polar_smgrperf_write); +Datum +polar_smgrperf_write(PG_FUNCTION_ARGS) +{ + int bs = PG_GETARG_INT32(0); + int begin_blkno = PG_GETARG_INT32(1); + int end_blkno = PG_GETARG_INT32(2); + bool sequential = PG_GETARG_BOOL(3); + BlockNumber current_blkno = InvalidBlockNumber; + SMgrRelation smgr = PERF_SMGROPEN(PERF_REL_NUMBER); + + smgrperf_initialize(); + + if (bs < 1 || bs > max_bs) + elog(ERROR, "bs should be in [1, %d], current %d", max_bs, bs); + + if (begin_blkno < 0 || begin_blkno >= end_blkno) + elog(ERROR, "\"begin_blkno\" should be in [0, %d), current %d", end_blkno, begin_blkno); + + if (end_blkno <= begin_blkno || end_blkno > MAX_NBLOCKS) + elog(ERROR, "\"end_blkno\" should be in (%d, %d], current %d", begin_blkno, MAX_NBLOCKS, 
end_blkno); + + REPORT_PERF_STATS_PREPARE(true); + + while (true) + { + current_blkno = select_next_blkno(current_blkno, begin_blkno, end_blkno, bs, sequential); + + collect_perf_stats_begin(); + polar_smgrbulkwrite(smgr, forknum, current_blkno, bs, zero_buffer, false); + collect_perf_stats_end(bs); + } + + PG_RETURN_VOID(); +} + +PG_FUNCTION_INFO_V1(polar_smgrperf_extend); +Datum +polar_smgrperf_extend(PG_FUNCTION_ARGS) +{ + int bs = PG_GETARG_INT32(0); + BlockNumber current_blkno = 0; + SMgrRelation smgr = PERF_SMGROPEN(PERF_REL_NUMBER); + + smgrperf_initialize(); + + if (bs < 1 || bs > max_bs) + elog(ERROR, "bs should be in [1, %d], current %d", max_bs, bs); + + REPORT_PERF_STATS_PREPARE(true); + + if (!smgrexists(smgr, forknum)) + smgrcreate(smgr, forknum, false); + + smgrtruncate(smgr, &forknum, 1, &zero_blkno); + + while (true) + { + if ((current_blkno + bs) >= RELSEG_SIZE) + { + smgrtruncate(smgr, &forknum, 1, &zero_blkno); + + current_blkno = 0; + } + + collect_perf_stats_begin(); + smgrzeroextend(smgr, forknum, current_blkno, bs, true); + collect_perf_stats_end(bs); + + current_blkno += bs; + } + + PG_RETURN_VOID(); +} + +PG_FUNCTION_INFO_V1(polar_smgrperf_nblocks); +Datum +polar_smgrperf_nblocks(PG_FUNCTION_ARGS) +{ + Oid relnumber = PG_GETARG_INT32(0); + bool nblocks_cached = PG_GETARG_BOOL(1); + bool fd_cached = PG_GETARG_BOOL(2); + + SMgrRelation smgr = smgropen(PERF_RLOCATOR(relnumber), INVALID_PROC_NUMBER); + + smgrperf_initialize(); + + if (relnumber == InvalidOid) + elog(ERROR, "relnumber cannot be %d", InvalidOid); + + REPORT_PERF_STATS_PREPARE(false); + + elog(INFO, "Testing smgrnblocks on file with %u blocks", smgrnblocks(smgr, forknum)); + + while (true) + { + if (!fd_cached) + { + smgrclose(smgr); + smgr = smgropen(PERF_RLOCATOR(relnumber), INVALID_PROC_NUMBER); + } + + collect_perf_stats_begin(); + + if (nblocks_cached) + smgrnblocks(smgr, forknum); + else + smgrnblocks_real(smgr, forknum); + + collect_perf_stats_end(0); + } + + PG_RETURN_VOID(); +} diff --git a/external/polar_smgrperf/polar_smgrperf.control b/external/polar_smgrperf/polar_smgrperf.control new file mode 100644 index 00000000000..c9db0f3db59 --- /dev/null +++ b/external/polar_smgrperf/polar_smgrperf.control @@ -0,0 +1,5 @@ +# polar_smgrperf extension +comment = 'smgr perf test extension' +default_version = '1.0' +module_pathname = '$libdir/polar_smgrperf' +schema = 'public' diff --git a/external/polar_smgrperf/t/001_smgrperf.pl b/external/polar_smgrperf/t/001_smgrperf.pl new file mode 100644 index 00000000000..3e6d1adcc1d --- /dev/null +++ b/external/polar_smgrperf/t/001_smgrperf.pl @@ -0,0 +1,114 @@ +#!/usr/bin/perl +# 001_smgrperf.pl +# Test smgrperf tool, for coverage. +# +# Copyright (c) 2024, Alibaba Group Holding Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# IDENTIFICATION +# external/polar_smgrperf/t/001_smgrperf.pl + +use strict; +use warnings; +use Config; +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; + +# Initialize primary node +my $node_primary = PostgreSQL::Test::Cluster->new('primary'); +$node_primary->init(); +$node_primary->append_conf('postgresql.conf', 'statement_timeout = 3s'); +$node_primary->start; + +$node_primary->safe_psql('postgres', 'CREATE EXTENSION polar_smgrperf'); + +my $stderr; + +# Run smgrperf tests +$node_primary->psql( + 'postgres', + qq[ + set polar_zero_extend_method to none; + select polar_smgrperf_extend(); + ], + stderr => \$stderr); +like( + $stderr, + qr/ERROR: canceling statement due to statement timeout/, + 'polar_smgrperf_extend canceled by statement timeout'); +like($stderr, qr/INFO: Summary:/, 'polar_smgrperf_extend (none) ok'); + +$node_primary->psql( + 'postgres', + qq[ + set polar_zero_extend_method to bulkwrite; + select polar_smgrperf_extend(); + ], + stderr => \$stderr); +like( + $stderr, + qr/ERROR: canceling statement due to statement timeout/, + 'polar_smgrperf_extend canceled by statement timeout'); +like($stderr, qr/INFO: Summary:/, 'polar_smgrperf_extend (bulkwrite) ok'); + +$node_primary->psql( + 'postgres', + qq[ + set polar_zero_extend_method to fallocate; + select polar_smgrperf_extend(); + ], + stderr => \$stderr); +like( + $stderr, + qr/ERROR: canceling statement due to statement timeout/, + 'polar_smgrperf_extend canceled by statement timeout'); +like($stderr, qr/INFO: Summary:/, 'polar_smgrperf_extend (fallocate) ok'); + +$node_primary->safe_psql('postgres', + 'set statement_timeout=0; select polar_smgrperf_prepare()'); + +$node_primary->psql( + 'postgres', + 'select polar_smgrperf_read()', + stderr => \$stderr); +like( + $stderr, + qr/ERROR: canceling statement due to statement timeout/, + 'polar_smgrperf_read canceled by statement timeout'); +like($stderr, qr/INFO: Summary:/, 'polar_smgrperf_read ok'); + +$node_primary->psql( + 'postgres', + 'select polar_smgrperf_write()', + stderr => \$stderr); +like( + $stderr, + qr/ERROR: canceling statement due to statement timeout/, + 'polar_smgrperf_write canceled by statement timeout'); +like($stderr, qr/INFO: Summary:/, 'polar_smgrperf_write ok'); + +$node_primary->psql( + 'postgres', + 'select polar_smgrperf_nblocks()', + stderr => \$stderr); +like( + $stderr, + qr/ERROR: canceling statement due to statement timeout/, + 'polar_smgrperf_nblocks canceled by statement timeout'); +like($stderr, qr/INFO: Summary:/, 'polar_smgrperf_nblocks ok'); + +$node_primary->safe_psql('postgres', 'select polar_smgrperf_cleanup()'); + +done_testing(); diff --git a/src/backend/access/gin/ginutil.c b/src/backend/access/gin/ginutil.c index 20f470648be..89ab52d4fa6 100644 --- a/src/backend/access/gin/ginutil.c +++ b/src/backend/access/gin/ginutil.c @@ -16,6 +16,7 @@ #include "access/gin_private.h" #include "access/ginxlog.h" +#include "access/hio.h" #include "access/reloptions.h" #include "access/xloginsert.h" #include "catalog/pg_collation.h" @@ -331,7 +332,7 @@ GinNewBuffer(Relation index) if (needLock) LockRelationForExtension(index, ExclusiveLock); - buffer = ReadBuffer(index, P_NEW); + buffer = polar_index_add_blocks(index); LockBuffer(buffer, GIN_EXCLUSIVE); if (needLock) diff --git a/src/backend/access/gist/gistbuild.c b/src/backend/access/gist/gistbuild.c index 6c48f56b3b2..798feee860c 100644 --- a/src/backend/access/gist/gistbuild.c +++ b/src/backend/access/gist/gistbuild.c @@ -43,7 +43,8 @@ #include "miscadmin.h" 
#include "optimizer/optimizer.h" #include "storage/bufmgr.h" -#include "storage/smgr.h" +#include "storage/bulk_write.h" + #include "utils/memutils.h" #include "utils/rel.h" #include "utils/tuplesort.h" @@ -106,11 +107,8 @@ typedef struct Tuplesortstate *sortstate; /* state data for tuplesort.c */ BlockNumber pages_allocated; - BlockNumber pages_written; - int ready_num_pages; - BlockNumber ready_blknos[XLR_MAX_BLOCK_ID]; - Page ready_pages[XLR_MAX_BLOCK_ID]; + BulkWriteState *bulkstate; } GISTBuildState; #define GIST_SORTED_BUILD_PAGE_NUM 4 @@ -142,7 +140,6 @@ static void gist_indexsortbuild_levelstate_add(GISTBuildState *state, IndexTuple itup); static void gist_indexsortbuild_levelstate_flush(GISTBuildState *state, GistSortedBuildLevelState *levelstate); -static void gist_indexsortbuild_flush_ready_pages(GISTBuildState *state); static void gistInitBuffering(GISTBuildState *buildstate); static int calculatePagesPerBuffer(GISTBuildState *buildstate, int levelStep); @@ -404,27 +401,18 @@ gist_indexsortbuild(GISTBuildState *state) { IndexTuple itup; GistSortedBuildLevelState *levelstate; - Page page; + BulkWriteBuffer rootbuf; - state->pages_allocated = 0; - state->pages_written = 0; - state->ready_num_pages = 0; + /* Reserve block 0 for the root page */ + state->pages_allocated = 1; - /* - * Write an empty page as a placeholder for the root page. It will be - * replaced with the real root page at the end. - */ - page = palloc_io_aligned(BLCKSZ, MCXT_ALLOC_ZERO); - smgrextend(RelationGetSmgr(state->indexrel), MAIN_FORKNUM, GIST_ROOT_BLKNO, - page, true); - state->pages_allocated++; - state->pages_written++; + state->bulkstate = smgr_bulk_start_rel(state->indexrel, MAIN_FORKNUM); /* Allocate a temporary buffer for the first leaf page batch. */ levelstate = palloc0(sizeof(GistSortedBuildLevelState)); - levelstate->pages[0] = page; + levelstate->pages[0] = palloc(BLCKSZ); levelstate->parent = NULL; - gistinitpage(page, F_LEAF); + gistinitpage(levelstate->pages[0], F_LEAF); /* * Fill index pages with tuples in the sorted order. @@ -454,31 +442,15 @@ gist_indexsortbuild(GISTBuildState *state) levelstate = parent; } - gist_indexsortbuild_flush_ready_pages(state); - /* Write out the root */ PageSetLSN(levelstate->pages[0], GistBuildLSN); - PageSetChecksumInplace(levelstate->pages[0], GIST_ROOT_BLKNO); - smgrwrite(RelationGetSmgr(state->indexrel), MAIN_FORKNUM, GIST_ROOT_BLKNO, - levelstate->pages[0], true); - if (RelationNeedsWAL(state->indexrel)) - log_newpage(&state->indexrel->rd_node, MAIN_FORKNUM, GIST_ROOT_BLKNO, - levelstate->pages[0], true); - - pfree(levelstate->pages[0]); + rootbuf = smgr_bulk_get_buf(state->bulkstate); + memcpy(rootbuf, levelstate->pages[0], BLCKSZ); + smgr_bulk_write(state->bulkstate, GIST_ROOT_BLKNO, rootbuf, true); + pfree(levelstate); - /* - * When we WAL-logged index pages, we must nonetheless fsync index files. - * Since we're building outside shared buffers, a CHECKPOINT occurring - * during the build has no way to flush the previously written data to - * disk (indeed it won't know the index even exists). A crash later on - * would replay WAL from the checkpoint, therefore it wouldn't replay our - * earlier WAL entries. If we do not fsync those pages here, they might - * still not be on disk when the crash occurs. 
- */ - if (RelationNeedsWAL(state->indexrel)) - smgrimmedsync(RelationGetSmgr(state->indexrel), MAIN_FORKNUM); + smgr_bulk_finish(state->bulkstate); } /* @@ -508,7 +480,7 @@ gist_indexsortbuild_levelstate_add(GISTBuildState *state, levelstate->current_page++; if (levelstate->pages[levelstate->current_page] == NULL) - levelstate->pages[levelstate->current_page] = palloc_io_aligned(BLCKSZ, 0); + levelstate->pages[levelstate->current_page] = palloc0(BLCKSZ); newPage = levelstate->pages[levelstate->current_page]; gistinitpage(newPage, old_page_flags); @@ -571,6 +543,7 @@ gist_indexsortbuild_levelstate_flush(GISTBuildState *state, for (; dist != NULL; dist = dist->next) { char *data; + BulkWriteBuffer buf; Page target; /* check once per page */ @@ -578,7 +551,8 @@ gist_indexsortbuild_levelstate_flush(GISTBuildState *state, /* Create page and copy data */ data = (char *) (dist->list); - target = palloc_io_aligned(BLCKSZ, MCXT_ALLOC_ZERO); + buf = smgr_bulk_get_buf(state->bulkstate); + target = (Page) buf; gistinitpage(target, isleaf ? F_LEAF : 0); for (int i = 0; i < dist->block.num; i++) { @@ -591,20 +565,6 @@ gist_indexsortbuild_levelstate_flush(GISTBuildState *state, } union_tuple = dist->itup; - if (state->ready_num_pages == XLR_MAX_BLOCK_ID) - gist_indexsortbuild_flush_ready_pages(state); - - /* - * The page is now complete. Assign a block number to it, and add it - * to the list of finished pages. (We don't write it out immediately, - * because we want to WAL-log the pages in batches.) - */ - blkno = state->pages_allocated++; - state->ready_blknos[state->ready_num_pages] = blkno; - state->ready_pages[state->ready_num_pages] = target; - state->ready_num_pages++; - ItemPointerSetBlockNumber(&(union_tuple->t_tid), blkno); - /* * Set the right link to point to the previous page. This is just for * debugging purposes: GiST only follows the right link if a page is @@ -619,6 +579,15 @@ gist_indexsortbuild_levelstate_flush(GISTBuildState *state, */ if (levelstate->last_blkno) GistPageGetOpaque(target)->rightlink = levelstate->last_blkno; + + /* + * The page is now complete. Assign a block number to it, and pass it + * to the bulk writer. + */ + blkno = state->pages_allocated++; + PageSetLSN(target, GistBuildLSN); + smgr_bulk_write(state->bulkstate, blkno, buf, true); + ItemPointerSetBlockNumber(&(union_tuple->t_tid), blkno); levelstate->last_blkno = blkno; /* @@ -629,7 +598,7 @@ gist_indexsortbuild_levelstate_flush(GISTBuildState *state, if (parent == NULL) { parent = palloc0(sizeof(GistSortedBuildLevelState)); - parent->pages[0] = (Page) palloc_io_aligned(BLCKSZ, 0); + parent->pages[0] = palloc(BLCKSZ); parent->parent = NULL; gistinitpage(parent->pages[0], 0); @@ -639,39 +608,6 @@ gist_indexsortbuild_levelstate_flush(GISTBuildState *state, } } -static void -gist_indexsortbuild_flush_ready_pages(GISTBuildState *state) -{ - if (state->ready_num_pages == 0) - return; - - for (int i = 0; i < state->ready_num_pages; i++) - { - Page page = state->ready_pages[i]; - BlockNumber blkno = state->ready_blknos[i]; - - /* Currently, the blocks must be buffered in order. 
*/ - if (blkno != state->pages_written) - elog(ERROR, "unexpected block number to flush GiST sorting build"); - - PageSetLSN(page, GistBuildLSN); - PageSetChecksumInplace(page, blkno); - smgrextend(RelationGetSmgr(state->indexrel), MAIN_FORKNUM, blkno, page, - true); - - state->pages_written++; - } - - if (RelationNeedsWAL(state->indexrel)) - log_newpages(&state->indexrel->rd_node, MAIN_FORKNUM, state->ready_num_pages, - state->ready_blknos, state->ready_pages, true); - - for (int i = 0; i < state->ready_num_pages; i++) - pfree(state->ready_pages[i]); - - state->ready_num_pages = 0; -} - /*------------------------------------------------------------------------- * Routines for non-sorted build diff --git a/src/backend/access/gist/gistbuildbuffers.c b/src/backend/access/gist/gistbuildbuffers.c index 4a0a91b7120..eabf7460182 100644 --- a/src/backend/access/gist/gistbuildbuffers.c +++ b/src/backend/access/gist/gistbuildbuffers.c @@ -186,9 +186,8 @@ gistAllocateNewPageBuffer(GISTBuildBuffers *gfbb) { GISTNodeBufferPage *pageBuffer; - pageBuffer = (GISTNodeBufferPage *) - MemoryContextAllocIOAligned(gfbb->context, - BLCKSZ, MCXT_ALLOC_ZERO); + pageBuffer = (GISTNodeBufferPage *) MemoryContextAllocZero(gfbb->context, + BLCKSZ); pageBuffer->prev = InvalidBlockNumber; /* Set page free space */ diff --git a/src/backend/access/gist/gistutil.c b/src/backend/access/gist/gistutil.c index d4bf0c7563d..d7029532db0 100644 --- a/src/backend/access/gist/gistutil.c +++ b/src/backend/access/gist/gistutil.c @@ -16,6 +16,7 @@ #include <math.h> #include "access/gist_private.h" +#include "access/hio.h" #include "access/htup_details.h" #include "access/reloptions.h" #include "catalog/pg_opclass.h" @@ -883,7 +884,7 @@ gistNewBuffer(Relation r) if (needLock) LockRelationForExtension(r, ExclusiveLock); - buffer = ReadBuffer(r, P_NEW); + buffer = polar_index_add_blocks(r); LockBuffer(buffer, GIST_EXCLUSIVE); if (needLock) diff --git a/src/backend/access/hash/hashpage.c b/src/backend/access/hash/hashpage.c index 9da5fb48658..0af9d6b0b95 100644 --- a/src/backend/access/hash/hashpage.c +++ b/src/backend/access/hash/hashpage.c @@ -992,7 +992,7 @@ static bool _hash_alloc_buckets(Relation rel, BlockNumber firstblock, uint32 nblocks) { BlockNumber lastblock; - PGAlignedBlock zerobuf; + PGIOAlignedBlock zerobuf; Page page; HashPageOpaque ovflopaque; diff --git a/src/backend/access/heap/hio.c b/src/backend/access/heap/hio.c index 080ba8b4f5c..dec8a6c4098 100644 --- a/src/backend/access/heap/hio.c +++ b/src/backend/access/heap/hio.c @@ -22,14 +22,34 @@ #include "access/visibilitymap.h" #include "storage/bufmgr.h" #include "storage/freespace.h" +#include "storage/indexfsm.h" #include "storage/lmgr.h" #include "storage/smgr.h" -/* POLAR */ -#include "utils/guc.h" -/* POLAR end */ +/* GUCs */ +int polar_heap_bulk_extend_size = 512; +int polar_index_bulk_extend_size = 128; -static Buffer polar_relation_add_extra_blocks_and_return_last_buffer(Relation relation, BulkInsertState bistate); +static Buffer polar_heap_add_blocks(Relation relation, BulkInsertState bistate); + +int +polar_get_bulk_extend_size(BlockNumber first_block, int bulk_extend_size) +{ + /* Avoid small table bloat */ + if (first_block < bulk_extend_size) + bulk_extend_size = 1; + + /* Avoid failure on extremely small DB */ + bulk_extend_size = Min(NBuffers / 4, bulk_extend_size); + + /* Avoid exceeding maximum possible length */ + bulk_extend_size = Min(MaxBlockNumber - first_block, bulk_extend_size); + + /* Extend by one page at least */ + bulk_extend_size = Max(1, 
bulk_extend_size); + + return bulk_extend_size; +} /* * RelationPutHeapTuple - place tuple at specified page @@ -351,7 +371,7 @@ RelationGetBufferForTuple(Relation relation, Size len, BlockNumber targetBlock, otherBlock; bool needLock; - int bulk_extend_size = polar_bulk_extend_size; + int bulk_extend_size = polar_heap_bulk_extend_size; len = MAXALIGN(len); /* be conservative */ @@ -369,7 +389,7 @@ RelationGetBufferForTuple(Relation relation, Size len, * POLAR: when enable preallocate_file, use fsm record blocks => if * preallocate_file is enabled, use fsm record blocks */ - if (polar_enable_shared_storage_mode && bulk_extend_size > 0) + if (bulk_extend_size > 0) use_fsm = true; /* @@ -629,7 +649,7 @@ RelationGetBufferForTuple(Relation relation, Size len, } /* POLAR: If we don't need file prealloc, use origin normal method */ - if (!(polar_enable_shared_storage_mode && bulk_extend_size > 0)) + if (bulk_extend_size <= 0) { /* Time to bulk-extend */ RelationAddExtraBlocks(relation, bistate); @@ -647,8 +667,8 @@ RelationGetBufferForTuple(Relation relation, Size len, * rather than relying on the kernel to do it for us? */ /* POLAR: preallocate multiple block and only use one block */ - if (polar_enable_shared_storage_mode && bulk_extend_size > 0) - buffer = polar_relation_add_extra_blocks_and_return_last_buffer(relation, bistate); + if (bulk_extend_size > 0) + buffer = polar_heap_add_blocks(relation, bistate); else buffer = ReadBufferBI(relation, P_NEW, RBM_ZERO_AND_LOCK, bistate); @@ -754,10 +774,9 @@ RelationGetBufferForTuple(Relation relation, Size len, return buffer; } - /* * POLAR: - * polar_relation_add_extra_blocks_and_return_last_buffer - Extend a relation by multiple blocks + * polar_heap_add_blocks - Extend a relation by multiple blocks * to avoid future contention on the relation extension lock and expensive pfs extend operation. * * If bistate isn't NULL, bistate->current_buf is assigned to last buffer alloced. @@ -770,15 +789,18 @@ RelationGetBufferForTuple(Relation relation, Size len, * relation extension lock has been acquired if relation is not local. 
*/ static Buffer -polar_relation_add_extra_blocks_and_return_last_buffer(Relation relation, BulkInsertState bistate) +polar_heap_add_blocks(Relation relation, BulkInsertState bistate) { - BlockNumber first_block_num_extended = InvalidBlockNumber; + BlockNumber first_block = InvalidBlockNumber; int block_count = 0; Buffer last_buffer = InvalidBuffer; Buffer *buffers = NULL; int index = 0; - char *bulk_buf_block = NULL; BufferAccessStrategy strategy = NULL; + SMgrRelation smgr = RelationGetSmgr(relation); + + if (polar_heap_bulk_extend_size == 0) + return ReadBufferBI(relation, P_NEW, RBM_ZERO_AND_LOCK, bistate); if (bistate != NULL) { @@ -791,27 +813,17 @@ } } - /* Open it at the smgr level if not already done */ - RelationGetSmgr(relation); - /* bulk extend times */ polar_pgstat_count_bulk_extend_times(relation); PG_TRY(); { /* init bulk extend backend-local-variable */ - polar_smgr_init_bulk_extend(relation->rd_smgr, MAIN_FORKNUM); - - first_block_num_extended = relation->rd_smgr->polar_nblocks_faked_for_bulk_extend[MAIN_FORKNUM]; - block_count = Min(polar_bulk_extend_size, (BlockNumber) RELSEG_SIZE - (first_block_num_extended % ((BlockNumber) RELSEG_SIZE))); - if (block_count < 1) - block_count = 1; + polar_smgr_init_bulk_extend(smgr, MAIN_FORKNUM); - /* avoid small table bloat */ - if (first_block_num_extended < polar_min_bulk_extend_table_size) - block_count = 1; + first_block = smgr->polar_nblocks_faked_for_bulk_extend[MAIN_FORKNUM]; + block_count = polar_get_bulk_extend_size(first_block, polar_heap_bulk_extend_size); - bulk_buf_block = (char *) palloc_io_aligned(block_count * BLCKSZ, MCXT_ALLOC_ZERO); buffers = (Buffer *) palloc(block_count * sizeof(Buffer)); for (index = 0; index < block_count; index++) @@ -825,11 +837,10 @@ { /* * Lock last buffer in bulk extend, when last buffers are - * between buf head lock releasing and LockBuffer, the last - * buffers can be taken by other backends. Under this - * condition, the page will be init twice by two backend. we - * fix the bug by releasing buf head lock after LockBuffer in - * last buffer. + * between IO lock releasing and LockBuffer, the last buffer + * can be taken by other backends. Under this condition, the + * page would be initialized twice by two backends. We fix the + * bug by releasing the IO lock after LockBuffer on the last buffer. */ buffers[index] = ReadBufferExtended(relation, MAIN_FORKNUM, P_NEW, RBM_ZERO_AND_LOCK, strategy); } @@ -843,27 +854,20 @@ { /* * error recovery, very important, reset bulk extend * backend-local-variable */ - if (relation->rd_smgr != NULL) - polar_smgr_clear_bulk_extend(relation->rd_smgr, MAIN_FORKNUM); + polar_smgr_clear_bulk_extend(smgr, MAIN_FORKNUM); PG_RE_THROW(); } PG_END_TRY(); - /* - * Reset bulk extend backend-local-variable. The reason why we can use - * backend-local-variable in bulk extend is that we don't allow to extend - * concurrently. 
*/ - polar_smgr_clear_bulk_extend(relation->rd_smgr, MAIN_FORKNUM); + /* reset bulk extend backend-local-variable */ + polar_smgr_clear_bulk_extend(smgr, MAIN_FORKNUM); - /* bulk extend io */ - polar_smgrbulkextend(relation->rd_smgr, MAIN_FORKNUM, first_block_num_extended, block_count, bulk_buf_block, false); + /* bulk extend polar store */ + smgrzeroextend(smgr, MAIN_FORKNUM, first_block, block_count, false); - /* Update block countters */ + /* Update block counters */ polar_pgstat_count_bulk_extend_blocks(relation, block_count); - pfree(bulk_buf_block); - /* ---------------- * Until here, all alloced buffer are zero page(BM_VALID, non-BM_DIRTY). It is safe. * 1. They are not initialized, still zero-page. @@ -907,7 +911,7 @@ MarkBufferDirty(buffer); /* we'll need this info below */ - Assert((first_block_num_extended + index) == BufferGetBlockNumber(buffer)); + Assert((first_block + index) == BufferGetBlockNumber(buffer)); freespace = PageGetHeapFreeSpace(page); UnlockReleaseBuffer(buffer); @@ -917,7 +921,7 @@ * chance of making this page visible to other concurrently inserting * backends, and we want that to happen without delay. */ - RecordPageWithFreeSpace(relation, first_block_num_extended + index, freespace); + RecordPageWithFreeSpace(relation, first_block + index, freespace); } /* @@ -927,7 +931,7 @@ * inserted. skip last block. */ if (block_count > 0) - FreeSpaceMapVacuumRange(relation, first_block_num_extended, first_block_num_extended + block_count); + FreeSpaceMapVacuumRange(relation, first_block, first_block + block_count); /* last block */ last_buffer = buffers[block_count]; @@ -941,3 +945,111 @@ return last_buffer; } + +/* + * POLAR: index insert bulk extend. If we cannot find a free page in the index relation + * while doing an index insert, we do an index bulk extend. The free blocks are registered + * in the FSM. + */ +Buffer +polar_index_add_blocks(Relation relation) +{ + BlockNumber first_block = InvalidBlockNumber; + int block_count = 0; + Buffer last_buffer = InvalidBuffer; + Buffer *buffers = NULL; + int index = 0; + SMgrRelation smgr = RelationGetSmgr(relation); + + if (polar_index_bulk_extend_size == 0) + return ReadBuffer(relation, P_NEW); + + PG_TRY(); + { + /* init bulk extend backend-local-variable */ + polar_smgr_init_bulk_extend(smgr, MAIN_FORKNUM); + + first_block = smgr->polar_nblocks_faked_for_bulk_extend[MAIN_FORKNUM]; + block_count = polar_get_bulk_extend_size(first_block, polar_index_bulk_extend_size); + + buffers = (Buffer *) palloc(block_count * sizeof(Buffer)); + + /* + * The difference from polar_heap_add_blocks here is that all the + * buffers are read with RBM_NORMAL, not RBM_ZERO_AND_LOCK for the last + * buffer, because the btree code will not try to grab the last blkno + * when the FSM has no free page, which is what happens in heap_insert. + * The returned last buffer will be locked by the caller. + */ + for (index = 0; index < block_count; index++) + { + /* + * Extend by one page. This should generally match the main-line + * extension code in RelationGetBufferForTuple, except that we + * hold the relation extension lock throughout. 
+ */ + buffers[index] = ReadBufferExtended(relation, MAIN_FORKNUM, P_NEW, RBM_NORMAL, NULL); + } + } + PG_CATCH(); + { + /* + * error recovery, very important, reset bulk extend + * backend-local-variable + */ + polar_smgr_clear_bulk_extend(smgr, MAIN_FORKNUM); + PG_RE_THROW(); + } + PG_END_TRY(); + + /* reset bulk extend backend-local-variable */ + polar_smgr_clear_bulk_extend(smgr, MAIN_FORKNUM); + + /* bulk extend polar store */ + smgrzeroextend(smgr, MAIN_FORKNUM, first_block, block_count, false); + + /* process left (block_count - 1) blocks, skip last block */ + block_count--; + for (index = 0; index < block_count; index++) + { + Buffer buffer; + Page page; + + buffer = buffers[index]; + LockBuffer(buffer, BUFFER_LOCK_SHARE); + page = BufferGetPage(buffer); + if (!PageIsNew(page)) + elog(ERROR, "index bulk extend page %u of relation \"%s\" should be empty but is not", + BufferGetBlockNumber(buffer), + RelationGetRelationName(relation)); + + /* + * The difference from polar_heap_add_blocks here is that we don't + * need to init the new page or MarkBufferDirty: when a btree index + * gets a page from the FSM, it always calls _bt_pageinit on the new + * page, while a heap page must be inited by the caller. + */ + + Assert((first_block + index) == BufferGetBlockNumber(buffer)); + UnlockReleaseBuffer(buffer); + + /* + * We just register the free pages into FSM, no need to mark all the + * new buffers dirty + */ + RecordFreeIndexPage(relation, first_block + index); + } + + /* + * Finally, vacuum the FSM. Update the upper-level FSM pages to ensure + * that searchers can find them. + */ + if (block_count > 0) + FreeSpaceMapVacuumRange(relation, first_block, first_block + block_count); + + /* last block */ + last_buffer = buffers[block_count]; + pfree(buffers); + + return last_buffer; +} diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c index 21c6dbf0cc3..aa163ef664e 100644 --- a/src/backend/access/heap/rewriteheap.c +++ b/src/backend/access/heap/rewriteheap.c @@ -87,8 +87,8 @@ * is optimized for bulk inserting a lot of tuples, knowing that we have * exclusive access to the heap. raw_heap_insert builds new pages in * local storage. When a page is full, or at the end of the process, - * we insert it to WAL as a single record and then write it to disk - * directly through smgr. Note, however, that any data sent to the new + * we insert it to WAL as a single record and then write it to disk with + * the bulk smgr writer. Note, however, that any data sent to the new heap's TOAST table will go through the normal bufmgr. 
* * @@ -119,9 +119,9 @@ #include "replication/logical.h" #include "replication/slot.h" #include "storage/bufmgr.h" +#include "storage/bulk_write.h" #include "storage/fd.h" #include "storage/procarray.h" -#include "storage/smgr.h" #include "utils/memutils.h" #include "utils/rel.h" @@ -136,9 +136,9 @@ typedef struct RewriteStateData { Relation rs_old_rel; /* source heap */ Relation rs_new_rel; /* destination heap */ - Page rs_buffer; /* page currently being built */ + BulkWriteState *rs_bulkstate; /* writer for the destination */ + BulkWriteBuffer rs_buffer; /* page currently being built */ BlockNumber rs_blockno; /* block where page will go */ - bool rs_buffer_valid; /* T if any tuples in buffer */ bool rs_logical_rewrite; /* do we need to do logical rewriting */ TransactionId rs_oldest_xmin; /* oldest xmin used by caller to determine * tuple visibility */ @@ -259,14 +259,14 @@ begin_heap_rewrite(Relation old_heap, Relation new_heap, TransactionId oldest_xm state->rs_old_rel = old_heap; state->rs_new_rel = new_heap; - state->rs_buffer = (Page) palloc_io_aligned(BLCKSZ, 0); + state->rs_buffer = NULL; /* new_heap needn't be empty, just locked */ state->rs_blockno = RelationGetNumberOfBlocks(new_heap); - state->rs_buffer_valid = false; state->rs_oldest_xmin = oldest_xmin; state->rs_freeze_xid = freeze_xid; state->rs_cutoff_multi = cutoff_multi; state->rs_cxt = rw_cxt; + state->rs_bulkstate = smgr_bulk_start_rel(new_heap, MAIN_FORKNUM); /* Initialize hash tables used to track update chains */ hash_ctl.keysize = sizeof(TidHashKey); @@ -318,30 +318,13 @@ end_heap_rewrite(RewriteState state) } /* Write the last page, if any */ - if (state->rs_buffer_valid) + if (state->rs_buffer) { - if (RelationNeedsWAL(state->rs_new_rel)) - log_newpage(&state->rs_new_rel->rd_node, - MAIN_FORKNUM, - state->rs_blockno, - state->rs_buffer, - true); - - PageSetChecksumInplace(state->rs_buffer, state->rs_blockno); - - smgrextend(RelationGetSmgr(state->rs_new_rel), MAIN_FORKNUM, - state->rs_blockno, (char *) state->rs_buffer, true); + smgr_bulk_write(state->rs_bulkstate, state->rs_blockno, state->rs_buffer, true); + state->rs_buffer = NULL; } - /* - * When we WAL-logged rel pages, we must nonetheless fsync them. The - * reason is the same as in storage.c's RelationCopyStorage(): we're - * writing data that's not in shared buffers, and so a CHECKPOINT - * occurring during the rewriteheap operation won't have fsync'd data we - * wrote before the checkpoint. - */ - if (RelationNeedsWAL(state->rs_new_rel)) - smgrimmedsync(RelationGetSmgr(state->rs_new_rel), MAIN_FORKNUM); + smgr_bulk_finish(state->rs_bulkstate); logical_end_heap_rewrite(state); @@ -615,7 +598,7 @@ rewrite_heap_dead_tuple(RewriteState state, HeapTuple old_tuple) static void raw_heap_insert(RewriteState state, HeapTuple tup) { - Page page = state->rs_buffer; + Page page; Size pageFreeSpace, saveFreeSpace; Size len; @@ -668,7 +651,8 @@ raw_heap_insert(RewriteState state, HeapTuple tup) HEAP_DEFAULT_FILLFACTOR); /* Now we can check to see if there's enough free space already. */ - if (state->rs_buffer_valid) + page = (Page) state->rs_buffer; + if (page) { pageFreeSpace = PageGetHeapFreeSpace(page); @@ -679,35 +663,19 @@ raw_heap_insert(RewriteState state, HeapTuple tup) * contains a tuple. Hence, unlike RelationGetBufferForTuple(), * enforce saveFreeSpace unconditionally. 
*/ - - /* XLOG stuff */ - if (RelationNeedsWAL(state->rs_new_rel)) - log_newpage(&state->rs_new_rel->rd_node, - MAIN_FORKNUM, - state->rs_blockno, - page, - true); - - /* - * Now write the page. We say skipFsync = true because there's no - * need for smgr to schedule an fsync for this write; we'll do it - * ourselves in end_heap_rewrite. - */ - PageSetChecksumInplace(page, state->rs_blockno); - - smgrextend(RelationGetSmgr(state->rs_new_rel), MAIN_FORKNUM, - state->rs_blockno, (char *) page, true); - + smgr_bulk_write(state->rs_bulkstate, state->rs_blockno, state->rs_buffer, true); + state->rs_buffer = NULL; + page = NULL; state->rs_blockno++; - state->rs_buffer_valid = false; } } - if (!state->rs_buffer_valid) + if (!page) { /* Initialize a new empty page */ + state->rs_buffer = smgr_bulk_get_buf(state->rs_bulkstate); + page = (Page) state->rs_buffer; PageInit(page, BLCKSZ, 0); - state->rs_buffer_valid = true; } /* And now we can insert the tuple into the page */ diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c index 35fe597f239..9290b03a319 100644 --- a/src/backend/access/heap/vacuumlazy.c +++ b/src/backend/access/heap/vacuumlazy.c @@ -66,7 +66,7 @@ #include "utils/timestamp.h" /* POLAR */ -#include "storage/polar_bufmgr.h" +#include "access/hio.h" /* * Space/time tradeoff parameters: do these need to be user-tunable? @@ -75,7 +75,7 @@ * REL_TRUNCATE_MINIMUM or (relsize / REL_TRUNCATE_FRACTION) (whichever * is less) potentially-freeable pages. */ -#define REL_TRUNCATE_MINIMUM 1000 +#define REL_TRUNCATE_MINIMUM MAX_BUFFERS_TO_EXTEND_BY #define REL_TRUNCATE_FRACTION 16 /* @@ -2853,21 +2853,24 @@ static bool should_attempt_truncation(LVRelState *vacrel) { BlockNumber possibly_freeable; + BlockNumber min_possibly_freeable; if (!vacrel->do_rel_truncate || vacrel->failsafe_active || old_snapshot_threshold >= 0) return false; - possibly_freeable = vacrel->rel_pages - vacrel->nonempty_pages; - - /* POLAR: We don't expect that vacuum cleanup our prealloc file blocks */ - Assert(vacrel->rel); - if (polar_bulk_extend_size > 0 && - !RelationUsesLocalBuffers(vacrel->rel) && - possibly_freeable <= polar_bulk_extend_size) - return false; + /* + * POLAR: If the table can be truncated to empty, let's do this without + * considering bulk extend. Else if the table is truncatable, reserve bulk + * extended pages. + */ + if (vacrel->nonempty_pages == 0) + min_possibly_freeable = 0; + else + min_possibly_freeable = polar_heap_bulk_extend_size; - if (possibly_freeable > 0 && + possibly_freeable = vacrel->rel_pages - vacrel->nonempty_pages; + if (possibly_freeable > min_possibly_freeable && (possibly_freeable >= REL_TRUNCATE_MINIMUM || possibly_freeable >= vacrel->rel_pages / REL_TRUNCATE_FRACTION)) return true; @@ -2965,6 +2968,15 @@ lazy_truncate_heap(LVRelState *vacrel) * were vacuuming. */ new_rel_pages = count_nondeletable_pages(vacrel, &lock_waiter_detected); + + /* + * POLAR: If the table can be truncated to empty, let's do this + * without considering bulk extend. Else if the table is truncatable, + * reserve bulk extended pages. 
+ */ + if (new_rel_pages != 0 && orig_rel_pages > new_rel_pages) + new_rel_pages = Min(orig_rel_pages, new_rel_pages + polar_heap_bulk_extend_size); + vacrel->blkno = new_rel_pages; if (new_rel_pages >= orig_rel_pages) @@ -3133,14 +3145,7 @@ count_nondeletable_pages(LVRelState *vacrel, bool *lock_waiter_detected) /* Done scanning if we found a tuple here */ if (hastup) - { - Assert(vacrel->rel); - /* POLAR: bulk_extend_page is empty page, should be included */ - if (polar_bulk_extend_size > 0 && !RelationUsesLocalBuffers(vacrel->rel)) - return blkno + polar_bulk_extend_size; - else - return blkno + 1; - } + return blkno + 1; } /* diff --git a/src/backend/access/logindex/polar_fullpage.c b/src/backend/access/logindex/polar_fullpage.c index 7361d5be272..ea126f4f98e 100644 --- a/src/backend/access/logindex/polar_fullpage.c +++ b/src/backend/access/logindex/polar_fullpage.c @@ -17,7 +17,7 @@ * limitations under the License. * * IDENTIFICATION - * src/backend/access/logindex/polar_fullpage.c + * src/backend/access/logindex/polar_fullpage.c * *------------------------------------------------------------------------- */ @@ -33,6 +33,7 @@ #include "access/xlog_internal.h" #include "access/xloginsert.h" #include "catalog/pg_control.h" +#include "common/file_utils.h" #include "pgstat.h" #include "replication/walreceiver.h" #include "storage/buf_internals.h" @@ -221,46 +222,33 @@ install_fullpage_file_segment(polar_fullpage_ctl_t ctl, uint64 *seg_no, static void fill_fullpage_file_zero_pages(int fd, char *tmppath) { -#define ONE_MB (1024 * 1024L) - char *data; - int nbytes = 0; - - data = palloc_io_aligned(ONE_MB, MCXT_ALLOC_ZERO); + int rc; pgstat_report_wait_start(WAIT_EVENT_FULLPAGE_FILE_INIT_WRITE); - for (nbytes = 0; nbytes < FULLPAGE_SEGMENT_SIZE; nbytes += ONE_MB) - { - int rc = 0; - - errno = 0; - rc = (int) polar_pwrite(fd, data, ONE_MB, nbytes); + rc = polar_pwrite_zeros(fd, FULLPAGE_SEGMENT_SIZE, 0); - if (rc != ONE_MB) - { - int save_errno = errno; + if (rc < 0) + { + int save_errno = errno; - /* - * If we fail to make the file, delete it to release disk space - */ - polar_unlink(tmppath); + /* + * If we fail to make the file, delete it to release disk space + */ + polar_unlink(tmppath); - polar_close(fd); + polar_close(fd); - /* if write didn't set errno, assume problem is no disk space */ - errno = save_errno ? save_errno : ENOSPC; + /* if write didn't set errno, assume problem is no disk space */ + errno = save_errno ? save_errno : ENOSPC; - pfree(data); - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not write to file \"%s\": %m", tmppath))); - } + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write to file \"%s\": %m", tmppath))); } pgstat_report_wait_end(); - pfree(data); - if (polar_fsync(fd) != 0) { int save_errno = errno; @@ -327,9 +315,25 @@ polar_fullpage_file_init(polar_fullpage_ctl_t ctl, uint64 fullpage_no) (errcode_for_file_access(), errmsg("could not create file \"%s\": %m", tmppath))); - /* POLAR: File allocate, juse change file metadata once */ - if (polar_fallocate(fd, 0, FULLPAGE_SEGMENT_SIZE) != 0) - elog(ERROR, "polar_posix_fallocate fail in polar_fullpage_file_init"); +#ifdef __linux__ + + /* + * POLAR: use FALLOC_FL_NO_HIDE_STALE on PFS to optimize appending writes. 
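+	 * (Assumption, for the reader: the flag allocates extents without
+	 * marking them unwritten, so later appends avoid the unwritten-extent
+	 * conversion; any stale data it could expose is overwritten right away
+	 * by fill_fullpage_file_zero_pages() below.)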
+ */ + if (polar_enable_fallocate_no_hide_stale && + polar_vfs_type(fd) == POLAR_VFS_PFS && + polar_fallocate(fd, FALLOC_FL_NO_HIDE_STALE, 0, FULLPAGE_SEGMENT_SIZE) != 0) + { + int save_errno = errno; + + polar_unlink(tmppath); + polar_close(fd); + errno = save_errno; + + elog(ERROR, "fallocate failed \"%s\": %m", tmppath); + } + /* POLAR end */ +#endif fill_fullpage_file_zero_pages(fd, tmppath); @@ -652,7 +656,7 @@ polar_log_fullpage_snapshot_image(polar_fullpage_ctl_t ctl, Buffer buffer, XLogR * call. */ if (fullpage == NULL) - fullpage = MemoryContextAllocIOAligned(TopMemoryContext, BLCKSZ, 0); + fullpage = MemoryContextAllocAligned(TopMemoryContext, BLCKSZ, PG_IO_ALIGN_SIZE, 0); Assert(polar_is_primary()); diff --git a/src/backend/access/logindex/polar_logindex_redo.c b/src/backend/access/logindex/polar_logindex_redo.c index fe032f5c1eb..f5923880fb2 100644 --- a/src/backend/access/logindex/polar_logindex_redo.c +++ b/src/backend/access/logindex/polar_logindex_redo.c @@ -170,51 +170,27 @@ polar_logindex_abort_replaying_buffer(void) } /* - * For the block file in tag, extend it to tag->blockNum blocks. + * For the block file in tag, extend it to at least tag->blockNum blocks (might + * be longer to reduce smgrzeroextend invocations). */ static void polar_extend_block_if_not_exist(BufferTag *tag) { SMgrRelation smgr; BlockNumber nblocks; - char *extendBuf = NULL; - int copy_bulk_extend_size; - int copy_min_bulk_extend_table_size; smgr = smgropen(tag->rnode, InvalidBackendId); smgrcreate(smgr, tag->forkNum, true); nblocks = smgrnblocks(smgr, tag->forkNum); - /* When close the bulk extend, we still palloc one size */ - copy_bulk_extend_size = polar_recovery_bulk_extend_size; - copy_min_bulk_extend_table_size = polar_min_bulk_extend_table_size; - - while (tag->blockNum >= nblocks) + if (tag->blockNum >= nblocks) { - int block_count; + int block_count = polar_get_recovery_bulk_extend_size(tag->blockNum, nblocks); - if (copy_bulk_extend_size > 0 - && nblocks >= copy_min_bulk_extend_table_size - /* Avoid small table bloat */ ) - { - block_count = Min(copy_bulk_extend_size, - (BlockNumber) RELSEG_SIZE - nblocks % ((BlockNumber) RELSEG_SIZE)); - } - else - block_count = 1; + Assert(nblocks + block_count > tag->blockNum); - /* - * When close the bulk extend, we still palloc one size. We palloc - * only once. 
- */ - if (extendBuf == NULL) - extendBuf = palloc_io_aligned(BLCKSZ * Max(block_count, copy_bulk_extend_size), MCXT_ALLOC_ZERO); - polar_smgrbulkextend(smgr, tag->forkNum, nblocks, block_count, extendBuf, false); - nblocks += block_count; + smgrzeroextend(smgr, tag->forkNum, nblocks, block_count, false); } - - if (extendBuf != NULL) - pfree(extendBuf); } void diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c index d3acbe0fb73..75ddf15fa5d 100644 --- a/src/backend/access/nbtree/nbtpage.c +++ b/src/backend/access/nbtree/nbtpage.c @@ -39,6 +39,7 @@ #include "utils/snapmgr.h" /* POLAR */ +#include "access/hio.h" #include "storage/smgr.h" #include "utils/guc.h" @@ -68,9 +69,6 @@ static bool _bt_lock_subtree_parent(Relation rel, BlockNumber child, static void _bt_pendingfsm_add(BTVacState *vstate, BlockNumber target, FullTransactionId safexid); -/* POLAR */ -static Buffer polar_index_add_extra_blocks_and_return_last_buffer(Relation reln, BlockNumber blockNum); - /* * _bt_initmetapage() -- Fill a page buffer with a correct metapage image */ @@ -983,14 +981,7 @@ _bt_getbuf(Relation rel, BlockNumber blkno, int access) if (needLock) LockRelationForExtension(rel, ExclusiveLock); - /* - * POLAR: if polar_index_bulk_extend_size > 0, use index bulk extend - * instead of extend one page - */ - if (polar_enable_shared_storage_mode && polar_index_bulk_extend_size > 0) - buf = polar_index_add_extra_blocks_and_return_last_buffer(rel, P_NEW); - else - buf = ReadBuffer(rel, P_NEW); + buf = polar_index_add_blocks(rel); /* Acquire buffer lock on new page */ _bt_lockbuf(rel, buf, BT_WRITE); @@ -3121,136 +3112,3 @@ _bt_pendingfsm_add(BTVacState *vstate, vstate->pendingpages[vstate->npendingpages].safexid = safexid; vstate->npendingpages++; } - -/* - * POLAR: index insert bulk extend. If we can not find free pages in index relation while - * doing index insert, we will do index bulk extend. The free blocks will be registered - * in FSM. - * We now only support bulk extend for btree index. - * Note: raw LockBuffer() calls are disallowed in nbtree; all - * buffer lock requests need to go through wrapper functions such - * as _bt_lockbuf(). - */ -static Buffer -polar_index_add_extra_blocks_and_return_last_buffer(Relation reln, BlockNumber blockNum) -{ - BlockNumber first_block_num_extended = InvalidBlockNumber; - int block_count = 0; - Buffer last_buffer = InvalidBuffer; - Buffer *buffers = NULL; - int index = 0; - char *bulk_buf_block = NULL; - - PG_TRY(); - { - /* init bulk extend backend-local-variable */ - polar_smgr_init_bulk_extend(RelationGetSmgr(reln), MAIN_FORKNUM); - - first_block_num_extended = RelationGetSmgr(reln)->polar_nblocks_faked_for_bulk_extend[MAIN_FORKNUM]; - block_count = Min(polar_index_bulk_extend_size, (BlockNumber) RELSEG_SIZE - (first_block_num_extended % ((BlockNumber) RELSEG_SIZE))); - if (block_count < 1) - block_count = 1; - - /* avoid small table bloat */ - if (first_block_num_extended < polar_min_bulk_extend_table_size) - block_count = 1; - - bulk_buf_block = (char *) palloc_io_aligned(block_count * BLCKSZ, MCXT_ALLOC_ZERO); - buffers = (Buffer *) palloc0(block_count * sizeof(Buffer)); - - /* - * The difference between - * polar_relation_add_extra_blocks_and_return_last_buffer here is - * that: All the buffer is RBM_NORMAL, not RBM_ZERO_AND_LOCK for the - * last buffer. Because the btree index will not try to get the last - * blkno when fsm is not free. The btree index must get free page from - * fsm. 
When we check PageIsNew(), these bulk extend pages will not - * added into fsm. Moreover, The caller will init bulk extend pages - * which get from fsm, but bulk extend doesn't init index pages. There - * is no problem for init page twice. The return last buffer will be - * locked by _bt_lockbuf from caller. - */ - for (index = 0; index < block_count; index++) - { - /* - * Extend by one page. This should generally match the main-line - * extension code in RelationGetBufferForTuple, except that we - * hold the relation extension lock throughout. - */ - buffers[index] = ReadBufferExtended(reln, MAIN_FORKNUM, P_NEW, RBM_NORMAL, NULL); - } - } - PG_CATCH(); - { - /* - * error recovery, very important, reset bulk extend - * backend-local-variable - */ - if (RelationGetSmgr(reln) != NULL) - polar_smgr_clear_bulk_extend(RelationGetSmgr(reln), MAIN_FORKNUM); - PG_RE_THROW(); - } - PG_END_TRY(); - - /* reset bulk extend backend-local-variable */ - polar_smgr_clear_bulk_extend(RelationGetSmgr(reln), MAIN_FORKNUM); - - /* bulk extend polar store */ - polar_smgrbulkextend(RelationGetSmgr(reln), MAIN_FORKNUM, first_block_num_extended, block_count, bulk_buf_block, false); - - pfree(bulk_buf_block); - - /* process left (block_count-1) blocks, skip last block */ - block_count--; - for (index = 0; index < block_count; index++) - { - Buffer buffer; - Page page; - - buffer = buffers[index]; - - /* - * In polar_relation_add_extra_blocks_and_return_last_buffer, we have - * no need to init page. We just check zero page. we only need to add - * share lock to index buffer. - */ - _bt_lockbuf(reln, buffer, BT_READ); - page = BufferGetPage(buffer); - if (!PageIsNew(page)) - elog(ERROR, "index bulk extend page %u of relation \"%s\" should be empty but is not", - BufferGetBlockNumber(buffer), - RelationGetRelationName(reln)); - - /* - * The difference between - * polar_relation_add_extra_blocks_and_return_last_buffer here is - * that: we don't need to init new page and MarkBufferDirty. Because - * for btree index, when it get one page from fsm, it always call - * _bt_pageinit for a new page. While heap page should be inited by - * the caller. - */ - - Assert((first_block_num_extended + index) == BufferGetBlockNumber(buffer)); - /* unlock index buffer and release pin */ - _bt_relbuf(reln, buffer); - - /* - * We just register the free pages into FSM, no need to mark all the - * new buffers dirty - */ - RecordFreeIndexPage(reln, first_block_num_extended + index); - } - - /* - * Finally, vacuum the FSM. Update the upper-level FSM pages to ensure - * that searchers can find them. 
- */ - if (block_count > 0) - IndexFreeSpaceMapVacuum(reln); - - /* last block */ - last_buffer = buffers[block_count]; - pfree(buffers); - - return last_buffer; -} diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 406f2a930ba..8fd10487efd 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -29,11 +29,11 @@ #include "nodes/execnodes.h" #include "pgstat.h" #include "postmaster/autovacuum.h" +#include "storage/bulk_write.h" #include "storage/condition_variable.h" #include "storage/indexfsm.h" #include "storage/ipc.h" #include "storage/lmgr.h" -#include "storage/smgr.h" #include "utils/builtins.h" #include "utils/index_selfuncs.h" #include "utils/memutils.h" @@ -153,31 +153,18 @@ bthandler(PG_FUNCTION_ARGS) void btbuildempty(Relation index) { - Page metapage; + bool allequalimage = _bt_allequalimage(index, false); + BulkWriteState *bulkstate; + BulkWriteBuffer metabuf; - /* Construct metapage. */ - metapage = (Page) palloc_io_aligned(BLCKSZ, 0); - _bt_initmetapage(metapage, P_NONE, 0, _bt_allequalimage(index, false)); + bulkstate = smgr_bulk_start_rel(index, INIT_FORKNUM); - /* - * Write the page and log it. It might seem that an immediate sync would - * be sufficient to guarantee that the file exists on disk, but recovery - * itself might remove it while replaying, for example, an - * XLOG_DBASE_CREATE* or XLOG_TBLSPC_CREATE record. Therefore, we need - * this even when wal_level=minimal. - */ - PageSetChecksumInplace(metapage, BTREE_METAPAGE); - smgrwrite(RelationGetSmgr(index), INIT_FORKNUM, BTREE_METAPAGE, - (char *) metapage, true); - log_newpage(&RelationGetSmgr(index)->smgr_rnode.node, INIT_FORKNUM, - BTREE_METAPAGE, metapage, true); + /* Construct metapage. */ + metabuf = smgr_bulk_get_buf(bulkstate); + _bt_initmetapage((Page) metabuf, P_NONE, 0, allequalimage); + smgr_bulk_write(bulkstate, BTREE_METAPAGE, metabuf, true); - /* - * An immediate sync is required even if we xlog'd the page, because the - * write did not go through shared_buffers and therefore a concurrent - * checkpoint may have moved the redo pointer past our xlog record. - */ - smgrimmedsync(RelationGetSmgr(index), INIT_FORKNUM); + smgr_bulk_finish(bulkstate); } /* diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index aa7d378f3d9..d0c921346d5 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -23,13 +23,8 @@ * many upper pages if the keys are reasonable-size) without risking a lot of * cascading splits during early insertions. * - * Formerly the index pages being built were kept in shared buffers, but - * that is of no value (since other backends have no interest in them yet) - * and it created locking problems for CHECKPOINT, because the upper-level - * pages were held exclusive-locked for long periods. Now we just build - * the pages in local memory and smgrwrite or smgrextend them as we finish - * them. They will need to be re-read into shared buffers on first use after - * the build finishes. + * We use the bulk smgr loading facility to bypass the buffer cache and + * WAL-log the pages efficiently. * * This code isn't concerned about the FSM at all. The caller is responsible * for initializing that. 
@@ -58,7 +53,7 @@ #include "executor/instrument.h" #include "miscadmin.h" #include "pgstat.h" -#include "storage/smgr.h" +#include "storage/bulk_write.h" #include "tcop/tcopprot.h" /* pgrminclude ignore */ #include "utils/rel.h" #include "utils/sortsupport.h" @@ -235,7 +230,7 @@ typedef struct BTBuildState */ typedef struct BTPageState { - Page btps_page; /* workspace for page building */ + BulkWriteBuffer btps_buf; /* workspace for page building */ BlockNumber btps_blkno; /* block # to write this page at */ IndexTuple btps_lowkey; /* page's strict lower bound pivot tuple */ OffsetNumber btps_lastoff; /* last item offset loaded */ @@ -252,19 +247,9 @@ typedef struct BTWriteState { Relation heap; Relation index; + BulkWriteState *bulkstate; BTScanInsert inskey; /* generic insertion scankey */ - bool btws_use_wal; /* dump pages to WAL? */ BlockNumber btws_pages_alloced; /* # pages allocated */ - BlockNumber btws_pages_written; /* # pages written out */ - Page btws_zeropage; /* workspace for filling zeroes */ - - /* - * POLAR: bulk extend index file. polar_index_create_bulk_extend_size_copy - * is a copy of polar_index_create_bulk_extend_size, to avoid the impact - * of polar_index_create_bulk_extend_size realtime modifications. - */ - int polar_index_create_bulk_extend_size_copy; - /* POLAR end */ } BTWriteState; @@ -276,7 +261,7 @@ static void _bt_spool(BTSpool *btspool, ItemPointer self, static void _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2); static void _bt_build_callback(Relation index, ItemPointer tid, Datum *values, bool *isnull, bool tupleIsAlive, void *state); -static Page _bt_blnewpage(uint32 level); +static BulkWriteBuffer _bt_blnewpage(BTWriteState *wstate, uint32 level); static BTPageState *_bt_pagestate(BTWriteState *wstate, uint32 level); static void _bt_slideleft(Page rightmostpage); static void _bt_sortaddtup(Page page, Size itemsize, @@ -302,11 +287,6 @@ static void _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2, Sharedsort *sharedsort2, int sortmem, bool progress); -/* POLAR */ -__attribute__((__unused__)) -static bool polar_bt_check_bulk_extend(BTWriteState *wstate); - -/* POLAR end */ /* * btbuild() -- build a new btree index. @@ -583,19 +563,12 @@ _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2) wstate.inskey = _bt_mkscankey(wstate.index, NULL); /* _bt_mkscankey() won't set allequalimage without metapage */ wstate.inskey->allequalimage = _bt_allequalimage(wstate.index, true); - wstate.btws_use_wal = RelationNeedsWAL(wstate.index); /* reserve the metapage */ wstate.btws_pages_alloced = BTREE_METAPAGE + 1; - wstate.btws_pages_written = 0; - wstate.btws_zeropage = NULL; /* until needed */ pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE, PROGRESS_BTREE_PHASE_LEAF_LOAD); - - /* POLAR: bulk extend index file */ - wstate.polar_index_create_bulk_extend_size_copy = polar_index_create_bulk_extend_size; - _bt_load(&wstate, btspool, btspool2); } @@ -631,13 +604,15 @@ _bt_build_callback(Relation index, /* * allocate workspace for a new, clean btree page, not linked to any siblings. 
*/ -static Page -_bt_blnewpage(uint32 level) +static BulkWriteBuffer +_bt_blnewpage(BTWriteState *wstate, uint32 level) { + BulkWriteBuffer buf; Page page; BTPageOpaque opaque; - page = (Page) palloc_io_aligned(BLCKSZ, 0); + buf = smgr_bulk_get_buf(wstate->bulkstate); + page = (Page) buf; /* Zero the page and set up standard page header info */ _bt_pageinit(page, BLCKSZ); @@ -652,136 +627,17 @@ _bt_blnewpage(uint32 level) /* Make the P_HIKEY line pointer appear allocated */ ((PageHeader) page)->pd_lower += sizeof(ItemIdData); - return page; + return buf; } /* * emit a completed btree page, and release the working storage. */ static void -_bt_blwritepage(BTWriteState *wstate, Page page, BlockNumber blkno) +_bt_blwritepage(BTWriteState *wstate, BulkWriteBuffer buf, BlockNumber blkno) { - /* XLOG stuff */ - if (wstate->btws_use_wal) - { - /* We use the XLOG_FPI record type for this */ - log_newpage(&wstate->index->rd_node, MAIN_FORKNUM, blkno, page, true); - } - - /* - * If we have to write pages nonsequentially, fill in the space with - * zeroes until we come back and overwrite. This is not logically - * necessary on standard Unix filesystems (unwritten space will read as - * zeroes anyway), but it should help to avoid fragmentation. The dummy - * pages aren't WAL-logged though. - */ - while (blkno > wstate->btws_pages_written) - { - /* ---------------- - * POLAR: bulk extend index relation in "create index", not "insert a index tuple". - * Extra zero-page are safe. - * In "create index", index_create(). Before it finished, it can't be accessed by other backend. - * 1. For a index relation, only one backend writes the index relation, even in parallel index build. - * 2. Logical nblocks is controlled by wstate->btws_pages_alloced, not file lseek. - * 3. zero-page will be overwrite by smgrwrite(). - * 4. When index_create() finished, small amount of last zero pages will be truncated. - * ---------------- - */ - if (polar_enable_shared_storage_mode && wstate->polar_index_create_bulk_extend_size_copy > 0) - { - int polar_block_count = Min(wstate->polar_index_create_bulk_extend_size_copy, (BlockNumber) RELSEG_SIZE - (blkno % ((BlockNumber) RELSEG_SIZE))); - - if (polar_block_count < 0) - polar_block_count = 1; - /* ---------------- - * btws_zeropage has fixed size wstate->polar_index_create_bulk_extend_size_copy. - * polar_block_count <= wstate->polar_index_create_bulk_extend_size_copy. - * ---------------- - */ - if (!wstate->btws_zeropage) - wstate->btws_zeropage = (Page) palloc_io_aligned(BLCKSZ * wstate->polar_index_create_bulk_extend_size_copy, MCXT_ALLOC_ZERO); - - polar_smgrbulkextend(RelationGetSmgr(wstate->index), MAIN_FORKNUM, blkno, polar_block_count, (char *) wstate->btws_zeropage, true); - wstate->btws_pages_written += polar_block_count; - - polar_pgstat_count_bulk_create_index_extend_times(wstate->heap); - } /* POLAR end */ - else - { - if (!wstate->btws_zeropage) - wstate->btws_zeropage = (Page) palloc_io_aligned(BLCKSZ, MCXT_ALLOC_ZERO); - /* don't set checksum for all-zero page */ - smgrextend(RelationGetSmgr(wstate->index), MAIN_FORKNUM, - wstate->btws_pages_written++, - (char *) wstate->btws_zeropage, - true); - } - } - - PageSetChecksumInplace(page, blkno); - - /* - * Now write the page. There's no need for smgr to schedule an fsync for - * this write; we'll do it ourselves before ending the build. - */ - if (blkno == wstate->btws_pages_written) - { - /* ---------------- - * POLAR: bulk extend index relation in "create index", not "insert a index tuple". 
- * Extra zero-page are safe. - * In "create index", index_create(). Before it finished, it can't be accessed by other backend. - * 1. For a index relation, only one backend writes the index relation, even in parallel index build. - * 2. Logical nblocks is controlled by wstate->btws_pages_alloced, not file lseek. - * 3. zero-page will be overwrite by smgrwrite(). - * 4. When index_create() finished, small amount of last zero pages will be truncated. - * ---------------- - */ - if (polar_enable_shared_storage_mode && wstate->polar_index_create_bulk_extend_size_copy > 0) - { - int polar_block_count = Min(wstate->polar_index_create_bulk_extend_size_copy, (BlockNumber) RELSEG_SIZE - (blkno % ((BlockNumber) RELSEG_SIZE))); - - if (polar_block_count < 0) - polar_block_count = 1; - - /* ---------------- - * btws_zeropage has fixed size wstate->polar_index_create_bulk_extend_size_copy. - * polar_block_count <= wstate->polar_index_create_bulk_extend_size_copy. - * ---------------- - */ - if (!wstate->btws_zeropage) - wstate->btws_zeropage = (Page) palloc_io_aligned(BLCKSZ * wstate->polar_index_create_bulk_extend_size_copy, MCXT_ALLOC_ZERO); - - /* first page hold blkno's content */ - memcpy((char *) wstate->btws_zeropage, (char *) page, BLCKSZ); - - polar_smgrbulkextend(RelationGetSmgr(wstate->index), MAIN_FORKNUM, blkno, polar_block_count, (char *) wstate->btws_zeropage, true); - - wstate->btws_pages_written += polar_block_count; - - polar_pgstat_count_bulk_create_index_extend_times(wstate->heap); - } /* POLAR end */ - else - { - /* extending the file... */ - smgrextend(RelationGetSmgr(wstate->index), MAIN_FORKNUM, blkno, - (char *) page, true); - wstate->btws_pages_written++; - } - } - else - { - /* overwriting a block we zero-filled before */ - smgrwrite(RelationGetSmgr(wstate->index), MAIN_FORKNUM, blkno, - (char *) page, true); - } - - pfree(page); - if (polar_enable_shared_storage_mode && wstate->polar_index_create_bulk_extend_size_copy > 0 - && wstate->btws_zeropage) - { - pfree(wstate->btws_zeropage); - wstate->btws_zeropage = NULL; - } + smgr_bulk_write(wstate->bulkstate, blkno, buf, true); + /* smgr_bulk_write took ownership of 'buf' */ } /* @@ -794,7 +650,7 @@ _bt_pagestate(BTWriteState *wstate, uint32 level) BTPageState *state = (BTPageState *) palloc0(sizeof(BTPageState)); /* create initial page for level */ - state->btps_page = _bt_blnewpage(level); + state->btps_buf = _bt_blnewpage(wstate, level); /* and assign it a page position */ state->btps_blkno = wstate->btws_pages_alloced++; @@ -930,6 +786,7 @@ static void _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup, Size truncextra) { + BulkWriteBuffer nbuf; Page npage; BlockNumber nblkno; OffsetNumber last_off; @@ -944,7 +801,8 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup, */ CHECK_FOR_INTERRUPTS(); - npage = state->btps_page; + nbuf = state->btps_buf; + npage = (Page) nbuf; nblkno = state->btps_blkno; last_off = state->btps_lastoff; last_truncextra = state->btps_lastextra; @@ -1000,6 +858,7 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup, /* * Finish off the page and write it out. 
*/ + BulkWriteBuffer obuf = nbuf; Page opage = npage; BlockNumber oblkno = nblkno; ItemId ii; @@ -1007,7 +866,8 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup, IndexTuple oitup; /* Create new page of same level */ - npage = _bt_blnewpage(state->btps_level); + nbuf = _bt_blnewpage(wstate, state->btps_level); + npage = (Page) nbuf; /* and assign it a page position */ nblkno = wstate->btws_pages_alloced++; @@ -1119,10 +979,10 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup, } /* - * Write out the old page. We never need to touch it again, so we can - * free the opage workspace too. + * Write out the old page. _bt_blwritepage takes ownership of the + * 'opage' buffer. */ - _bt_blwritepage(wstate, opage, oblkno); + _bt_blwritepage(wstate, obuf, oblkno); /* * Reset last_off to point to new page @@ -1155,7 +1015,7 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup, _bt_sortaddtup(npage, itupsz, itup, last_off, !isleaf && last_off == P_FIRSTKEY); - state->btps_page = npage; + state->btps_buf = nbuf; state->btps_blkno = nblkno; state->btps_lastoff = last_off; } @@ -1207,7 +1067,7 @@ _bt_uppershutdown(BTWriteState *wstate, BTPageState *state) BTPageState *s; BlockNumber rootblkno = P_NONE; uint32 rootlevel = 0; - Page metapage; + BulkWriteBuffer metabuf; /* * Each iteration of this loop completes one more level of the tree. @@ -1218,7 +1078,7 @@ _bt_uppershutdown(BTWriteState *wstate, BTPageState *state) BTPageOpaque opaque; blkno = s->btps_blkno; - opaque = BTPageGetOpaque(s->btps_page); + opaque = BTPageGetOpaque((Page) s->btps_buf); /* * We have to link the last page on this level to somewhere. @@ -1252,9 +1112,9 @@ _bt_uppershutdown(BTWriteState *wstate, BTPageState *state) * This is the rightmost page, so the ItemId array needs to be slid * back one slot. Then we can dump out the page. */ - _bt_slideleft(s->btps_page); - _bt_blwritepage(wstate, s->btps_page, s->btps_blkno); - s->btps_page = NULL; /* writepage freed the workspace */ + _bt_slideleft((Page) s->btps_buf); + _bt_blwritepage(wstate, s->btps_buf, s->btps_blkno); + s->btps_buf = NULL; /* writepage took ownership of the buffer */ } /* @@ -1263,10 +1123,10 @@ _bt_uppershutdown(BTWriteState *wstate, BTPageState *state) * set to point to "P_NONE"). This changes the index to the "valid" state * by filling in a valid magic number in the metapage. 
*/ - metapage = (Page) palloc_io_aligned(BLCKSZ, 0); - _bt_initmetapage(metapage, rootblkno, rootlevel, + metabuf = smgr_bulk_get_buf(wstate->bulkstate); + _bt_initmetapage((Page) metabuf, rootblkno, rootlevel, wstate->inskey->allequalimage); - _bt_blwritepage(wstate, metapage, BTREE_METAPAGE); + _bt_blwritepage(wstate, metabuf, BTREE_METAPAGE); } /* @@ -1287,7 +1147,8 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) SortSupport sortKeys; int64 tuples_done = 0; bool deduplicate; - ForkNumber bulk_index_extend_forknum; + + wstate->bulkstate = smgr_bulk_start_rel(wstate->index, MAIN_FORKNUM); deduplicate = wstate->inskey->allequalimage && !btspool->isunique && BTGetDeduplicateItems(wstate->index); @@ -1444,7 +1305,7 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) */ dstate->maxpostingsize = MAXALIGN_DOWN((BLCKSZ * 10 / 100)) - sizeof(ItemIdData); - Assert(dstate->maxpostingsize <= BTMaxItemSize(state->btps_page) && + Assert(dstate->maxpostingsize <= BTMaxItemSize((Page) state->btps_buf) && dstate->maxpostingsize <= INDEX_SIZE_MASK); dstate->htids = palloc(dstate->maxpostingsize); @@ -1514,32 +1375,7 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) /* Close down final pages and write the metapage */ _bt_uppershutdown(wstate, state); - - /* - * When we WAL-logged index pages, we must nonetheless fsync index files. - * Since we're building outside shared buffers, a CHECKPOINT occurring - * during the build has no way to flush the previously written data to - * disk (indeed it won't know the index even exists). A crash later on - * would replay WAL from the checkpoint, therefore it wouldn't replay our - * earlier WAL entries. If we do not fsync those pages here, they might - * still not be on disk when the crash occurs. - */ - if (wstate->btws_use_wal) - smgrimmedsync(RelationGetSmgr(wstate->index), MAIN_FORKNUM); - - /* - * POLAR: bulk extend index file, truncate amount of zero page at the end - * of file - */ - if (polar_enable_shared_storage_mode && wstate->polar_index_create_bulk_extend_size_copy > 0) - { - if (wstate->btws_pages_alloced < wstate->btws_pages_written) - { - bulk_index_extend_forknum = MAIN_FORKNUM; - Assert(polar_bt_check_bulk_extend(wstate)); - smgrtruncate(RelationGetSmgr(wstate->index), &bulk_index_extend_forknum, 1, &(wstate->btws_pages_alloced)); - } - } + smgr_bulk_finish(wstate->bulkstate); } /* @@ -2127,39 +1963,3 @@ _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2, if (btspool2) tuplesort_end(btspool2->sortstate); } - -/* - * POLAR: check if page content is expected. - * return false means something error, true OK. - */ -pg_attribute_unused() -static bool -polar_bt_check_bulk_extend(BTWriteState *wstate) -{ - PGAlignedBlock pg; - - MemSet(pg.data, 0, BLCKSZ); - /* wstate->btws_pages_alloced always >= 1 */ - - /* - * btws_pages_alloced page must be PageInit(). These pages are not - * zeroed-filled. - */ - smgrread(RelationGetSmgr(wstate->index), MAIN_FORKNUM, wstate->btws_pages_alloced - 1, pg.data); - if (PageIsNew(pg.data)) - return false; - - if (wstate->btws_pages_alloced < wstate->btws_pages_written) - { - memset(pg.data, 0, BLCKSZ); - smgrread(RelationGetSmgr(wstate->index), MAIN_FORKNUM, wstate->btws_pages_alloced, pg.data); - - /* - * The pages between btws_pages_alloced and btws_pages_written are - * bulk extend. These pages are zeroed-filled. 
- */ - if (!PageIsNew(pg.data)) - return false; - } - return true; -} diff --git a/src/backend/access/spgist/spginsert.c b/src/backend/access/spgist/spginsert.c index e7b748700d0..f97f9b0bac0 100644 --- a/src/backend/access/spgist/spginsert.c +++ b/src/backend/access/spgist/spginsert.c @@ -25,7 +25,7 @@ #include "catalog/index.h" #include "miscadmin.h" #include "storage/bufmgr.h" -#include "storage/smgr.h" +#include "storage/bulk_write.h" #include "utils/memutils.h" #include "utils/rel.h" @@ -155,49 +155,27 @@ spgbuild(Relation heap, Relation index, IndexInfo *indexInfo) void spgbuildempty(Relation index) { - Page page; + BulkWriteState *bulkstate; + BulkWriteBuffer buf; - /* Construct metapage. */ - page = (Page) palloc_io_aligned(BLCKSZ, 0); - SpGistInitMetapage(page); + bulkstate = smgr_bulk_start_rel(index, INIT_FORKNUM); - /* - * Write the page and log it unconditionally. This is important - * particularly for indexes created on tablespaces and databases whose - * creation happened after the last redo pointer as recovery removes any - * of their existing content when the corresponding create records are - * replayed. - */ - PageSetChecksumInplace(page, SPGIST_METAPAGE_BLKNO); - smgrwrite(RelationGetSmgr(index), INIT_FORKNUM, SPGIST_METAPAGE_BLKNO, - (char *) page, true); - log_newpage(&(RelationGetSmgr(index))->smgr_rnode.node, INIT_FORKNUM, - SPGIST_METAPAGE_BLKNO, page, true); + /* Construct metapage. */ + buf = smgr_bulk_get_buf(bulkstate); + SpGistInitMetapage((Page) buf); + smgr_bulk_write(bulkstate, SPGIST_METAPAGE_BLKNO, buf, true); /* Likewise for the root page. */ - SpGistInitPage(page, SPGIST_LEAF); - - PageSetChecksumInplace(page, SPGIST_ROOT_BLKNO); - smgrwrite(RelationGetSmgr(index), INIT_FORKNUM, SPGIST_ROOT_BLKNO, - (char *) page, true); - log_newpage(&(RelationGetSmgr(index))->smgr_rnode.node, INIT_FORKNUM, - SPGIST_ROOT_BLKNO, page, true); + buf = smgr_bulk_get_buf(bulkstate); + SpGistInitPage((Page) buf, SPGIST_LEAF); + smgr_bulk_write(bulkstate, SPGIST_ROOT_BLKNO, buf, true); /* Likewise for the null-tuples root page. */ - SpGistInitPage(page, SPGIST_LEAF | SPGIST_NULLS); - - PageSetChecksumInplace(page, SPGIST_NULL_BLKNO); - smgrwrite(RelationGetSmgr(index), INIT_FORKNUM, SPGIST_NULL_BLKNO, - (char *) page, true); - log_newpage(&(RelationGetSmgr(index))->smgr_rnode.node, INIT_FORKNUM, - SPGIST_NULL_BLKNO, page, true); + buf = smgr_bulk_get_buf(bulkstate); + SpGistInitPage((Page) buf, SPGIST_LEAF | SPGIST_NULLS); + smgr_bulk_write(bulkstate, SPGIST_NULL_BLKNO, buf, true); - /* - * An immediate sync is required even if we xlog'd the pages, because the - * writes did not go through shared buffers and therefore a concurrent - * checkpoint may have moved the redo pointer past our xlog record. 
- */ - smgrimmedsync(RelationGetSmgr(index), INIT_FORKNUM); + smgr_bulk_finish(bulkstate); } /* diff --git a/src/backend/access/spgist/spgutils.c b/src/backend/access/spgist/spgutils.c index 99fbe655318..bd993421251 100644 --- a/src/backend/access/spgist/spgutils.c +++ b/src/backend/access/spgist/spgutils.c @@ -16,6 +16,7 @@ #include "postgres.h" #include "access/amvalidate.h" +#include "access/hio.h" #include "access/htup_details.h" #include "access/reloptions.h" #include "access/spgist_private.h" @@ -428,7 +429,7 @@ SpGistNewBuffer(Relation index) if (needLock) LockRelationForExtension(index, ExclusiveLock); - buffer = ReadBuffer(index, P_NEW); + buffer = polar_index_add_blocks(index); LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); if (needLock) diff --git a/src/backend/access/transam/generic_xlog.c b/src/backend/access/transam/generic_xlog.c index a4dddb0dcea..55503b18105 100644 --- a/src/backend/access/transam/generic_xlog.c +++ b/src/backend/access/transam/generic_xlog.c @@ -58,14 +58,17 @@ typedef struct char delta[MAX_DELTA_SIZE]; /* delta between page images */ } PageData; -/* State of generic xlog record construction */ +/* + * State of generic xlog record construction. Must be allocated at an I/O + * aligned address. + */ struct GenericXLogState { + /* Page images (properly aligned, must be first) */ + PGIOAlignedBlock images[MAX_GENERIC_XLOG_PAGES]; /* Info about each page, see above */ PageData pages[MAX_GENERIC_XLOG_PAGES]; bool isLogged; - /* Page images (properly aligned) */ - PGAlignedBlock images[MAX_GENERIC_XLOG_PAGES]; }; static void writeFragment(PageData *pageData, OffsetNumber offset, @@ -268,7 +271,9 @@ GenericXLogStart(Relation relation) GenericXLogState *state; int i; - state = (GenericXLogState *) palloc(sizeof(GenericXLogState)); + state = (GenericXLogState *) palloc_aligned(sizeof(GenericXLogState), + PG_IO_ALIGN_SIZE, + 0); state->isLogged = RelationNeedsWAL(relation); for (i = 0; i < MAX_GENERIC_XLOG_PAGES; i++) diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index 593768dc92f..1d3be6c0d1a 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -2787,8 +2787,7 @@ AbortTransaction(void) pgstat_report_wait_end(); pgstat_progress_end_command(); - /* Clean up buffer I/O and buffer context locks, too */ - AbortBufferIO(); + /* Clean up buffer context locks, too */ UnlockBuffers(); /* Reset WAL record construction state */ @@ -5145,7 +5144,6 @@ AbortSubTransaction(void) pgstat_report_wait_end(); pgstat_progress_end_command(); - AbortBufferIO(); UnlockBuffers(); /* Reset WAL record construction state */ diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index dd66766aec3..2d624cc85a6 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -154,7 +154,6 @@ bool XLOG_DEBUG = false; #endif int wal_segment_size = DEFAULT_XLOG_SEG_SIZE; -int polar_wal_init_set_size = POLAR_DEFAULT_XLOG_FILL_ZERO_SIZE; /* * Number of WAL insertion locks to use. A higher value allows more insertions @@ -3111,20 +3110,30 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli, memset(zbuffer.data, 0, XLOG_BLCKSZ); pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_WRITE); - save_errno = 0; - if (wal_init_zero) + +#ifdef __linux__ + + /* + * POLAR: use FALLOC_FL_NO_HIDE_STALE on PFS to optimize appending writes. 
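+	 * Note that the fallocate only reserves space: when wal_init_zero is
+	 * enabled, the segment is still explicitly zero-filled just below via
+	 * polar_pwrite_zeros().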
+ */ + if (polar_enable_fallocate_no_hide_stale && + polar_vfs_type(fd) == POLAR_VFS_PFS && + polar_fallocate(fd, FALLOC_FL_NO_HIDE_STALE, 0, (off_t) wal_segment_size) != 0) { - struct iovec iov[PG_IOV_MAX]; - int blocks; - static char zero_data[POLAR_MAX_XLOG_FILL_ZERO_SIZE]; + save_errno = errno; + polar_unlink(tmppath); + polar_close(fd); + errno = save_errno; - blocks = polar_wal_init_set_size / XLOG_BLCKSZ; - polar_wal_init_set_size = INTALIGN(blocks) * XLOG_BLCKSZ; + elog(ERROR, "fallocate failed \"%s\": %m", tmppath); + } + /* POLAR end */ +#endif - if (polar_wal_init_set_size > POLAR_MAX_XLOG_FILL_ZERO_SIZE) - memset(zero_data, 0, POLAR_MAX_XLOG_FILL_ZERO_SIZE); - else - memset(zero_data, 0, polar_wal_init_set_size); + save_errno = 0; + if (wal_init_zero) + { + ssize_t rc; /* * Zero-fill the file. With this setting, we do this the hard way to @@ -3135,29 +3144,10 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli, * indirect blocks are down on disk. Therefore, fdatasync(2) or * O_DSYNC will be sufficient to sync future writes to the log file. */ + rc = polar_pwrite_zeros(fd, wal_segment_size, 0); - /* Prepare to write out a lot of copies of our zero buffer at once. */ - for (int i = 0; i < lengthof(iov); ++i) - { - iov[i].iov_base = zero_data; - iov[i].iov_len = polar_wal_init_set_size; - } - - /* Loop, writing as many blocks as we can for each system call. */ - blocks = wal_segment_size / polar_wal_init_set_size; - for (int i = 0; i < blocks;) - { - int iovcnt = Min(blocks - i, lengthof(iov)); - off_t offset = i * polar_wal_init_set_size; - - if (pg_pwritev_with_retry(fd, iov, iovcnt, offset) < 0) - { - save_errno = errno; - break; - } - - i += iovcnt; - } + if (rc < 0) + save_errno = errno; } else { @@ -4610,7 +4600,7 @@ XLOGShmemSize(void) /* xlblocks array */ size = add_size(size, mul_size(sizeof(XLogRecPtr), XLOGbuffers)); /* extra alignment padding for XLOG I/O buffers */ - size = add_size(size, XLOG_BLCKSZ); + size = add_size(size, Max(XLOG_BLCKSZ, PG_IO_ALIGN_SIZE)); /* and the buffers themselves */ size = add_size(size, mul_size(XLOG_BLCKSZ, XLOGbuffers)); diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c index 4e26c40810d..b63d4fe9c59 100644 --- a/src/backend/access/transam/xlogutils.c +++ b/src/backend/access/transam/xlogutils.c @@ -34,6 +34,7 @@ #include "utils/rel.h" /* POLAR */ +#include "access/hio.h" #include "access/polar_logindex_redo.h" #include "storage/bufpage.h" #include "storage/buf_internals.h" @@ -45,6 +46,7 @@ /* GUC variable */ bool ignore_invalid_pages = false; +int polar_recovery_bulk_extend_size = 512; /* * Are we doing recovery from XLOG? @@ -478,33 +480,21 @@ XLogReadBufferForRedoExtended(XLogReaderState *record, * exceed one file size. */ static Buffer -polar_xlog_relation_bulk_extend_within_segment(SMgrRelationData *smgr, - RelFileNode rnode, - ReadBufferMode mode, - ForkNumber forknum, - int num_bulk_block_once) +polar_recovery_add_blocks(SMgrRelation smgr, ReadBufferMode mode, + ForkNumber forknum, BlockNumber blockNum) { - char *bulk_buf_block = NULL; - BlockNumber first_block_num_extended = InvalidBlockNumber; + BlockNumber first_block = InvalidBlockNumber; Buffer buffer = InvalidBuffer; - - Assert(num_bulk_block_once > 0); + int block_count = 0; PG_TRY(); { int index = 0; - /* - * POLAR: acquire relation file lock to avoid extend the same block - * concurrently which may result in EOF error when proc1 read block - * extended and modified by proc2. 
- */ - polar_acquire_relfile_lock(rnode, forknum, ExclusiveLock); - polar_smgr_init_bulk_extend(smgr, forknum); - first_block_num_extended = smgr->polar_nblocks_faked_for_bulk_extend[forknum]; - bulk_buf_block = (char *) palloc_io_aligned(num_bulk_block_once * BLCKSZ, MCXT_ALLOC_ZERO); + first_block = smgr->polar_nblocks_faked_for_bulk_extend[forknum]; + block_count = polar_get_bulk_extend_size(first_block, polar_recovery_bulk_extend_size); do { @@ -518,9 +508,8 @@ polar_xlog_relation_bulk_extend_within_segment(SMgrRelationData *smgr, * read transaction arrives now, and this buffer will be * evicted,it will not launch any write to disk. So, it is * Safe. By the way, maybe the most safe algo is to do this - * after polar_smgrbulkextend, but it's ok to leave it here - * currently. The last buffer will hold on buffer content - * exclusive lock. + * after smgrzeroextend, but it's ok to leave it here + * currently. */ if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK) { @@ -528,87 +517,24 @@ polar_xlog_relation_bulk_extend_within_segment(SMgrRelationData *smgr, } ReleaseBuffer(buffer); } - buffer = ReadBufferWithoutRelcache(rnode, forknum, P_NEW, mode, NULL, true); - - } while (index < num_bulk_block_once); + buffer = ReadBufferWithoutRelcache(smgr->smgr_rnode.node, forknum, P_NEW, mode, NULL, true); + } while (index < block_count); } PG_CATCH(); { polar_smgr_clear_bulk_extend(smgr, forknum); - - /* POLAR: release relation file lock after extend */ - polar_release_relfile_lock(rnode, forknum, ExclusiveLock); - PG_RE_THROW(); } PG_END_TRY(); polar_smgr_clear_bulk_extend(smgr, forknum); - if (bulk_buf_block == NULL) - elog(FATAL, "Block buffer is NULL in recovery bulk extend"); - /* - * In polar_smgrbulkextend, if first_block_num_extended is a new segment - * start block, then, it will open a new file, and write it from 0 to - * num_bulk_block_once * 8K. else, will just return an existing file - * handler to write into. + * in smgrzeroextend, if first_block is a new segment start block, then, + * it will open a new file, and write it from 0 to block_count * 8K. else, + * will just return an existing file handler to write into. */ - polar_smgrbulkextend(smgr, forknum, first_block_num_extended, num_bulk_block_once, bulk_buf_block, false); - - /* POLAR: release relation file lock after extend */ - polar_release_relfile_lock(rnode, forknum, ExclusiveLock); - - pfree(bulk_buf_block); - return buffer; -} - -/* - * POLAR: - * polar_xlog_relation_bulk_extend_blocks -- to extend relation file size more once, - * worked only on polar store env, with recoverying mode. - * - * Returns: the target block's buffer. - * - * Params: - * current_total_blocks: the total number of rel blocks currently, which is the - * start position of PwriteExtend. - * target_blkno: the block id to be replayed, target_blkno - current_total_blocks is - * the total blocks to be extended. 
- */ -static Buffer -polar_xlog_relation_bulk_extend_blocks(SMgrRelationData *smgr, - RelFileNode rnode, - ReadBufferMode mode, - ForkNumber forknum, - BlockNumber current_total_blocks, - BlockNumber target_blkno, - int one_bulk_max_size) -{ - int left_blocks_seg = 0; - int num_bulk_block = 0; - int left_blocks = target_blkno + 1 - current_total_blocks; - Buffer buffer = InvalidBuffer; - - do - { - if (buffer != InvalidBuffer) - { - if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK) - { - LockBuffer(buffer, BUFFER_LOCK_UNLOCK); - } - ReleaseBuffer(buffer); - } - left_blocks_seg = (BlockNumber) RELSEG_SIZE - - (current_total_blocks % (BlockNumber) RELSEG_SIZE); - num_bulk_block = Min(left_blocks, left_blocks_seg); - num_bulk_block = Min(num_bulk_block, one_bulk_max_size); - buffer = polar_xlog_relation_bulk_extend_within_segment(smgr, - rnode, mode, forknum, num_bulk_block); - current_total_blocks += num_bulk_block; - left_blocks -= num_bulk_block; - } while (left_blocks > 0); + smgrzeroextend(smgr, forknum, first_block, block_count, false); return buffer; } @@ -650,8 +576,6 @@ XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum, BlockNumber lastblock = InvalidBlockNumber; Buffer buffer; SMgrRelation smgr; - int one_bulk_max_size = 0; - bool can_bulk = false; Assert(blkno != P_NEW); @@ -703,66 +627,32 @@ XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum, Assert(InRecovery || POLAR_IN_LOGINDEX_PARALLEL_REPLAY()); buffer = InvalidBuffer; - /* POLAR: bulk block extend opt start */ - one_bulk_max_size = polar_recovery_bulk_extend_size; - can_bulk = false; - if (InHotStandby) - { - can_bulk = true; - } - else if (InRecovery && polar_enable_primary_recovery_bulk_extend) - { - /* - * can_bulk set true when RW crashes recovery and page in xlog - * will not exist in heap table. The latter will occur in insert - * and truncate relfilenode. In online promote, can_bulk will not - * be true because RO type has been RW when xlog replay. - * InRecovery here is always false in online promote. - */ - can_bulk = true; - } - - /* get last blocks */ - if (lastblock == InvalidBlockNumber) - lastblock = smgrnblocks(smgr, forknum); - /* avoid small table bloat */ - if (lastblock < polar_min_bulk_extend_table_size) - can_bulk = false; - - if (can_bulk && polar_enable_shared_storage_mode && - one_bulk_max_size > 0) - { - buffer = polar_xlog_relation_bulk_extend_blocks(smgr, - rnode, mode, forknum, lastblock, blkno, - one_bulk_max_size); - } - else + do { - do + if (buffer != InvalidBuffer) { - if (buffer != InvalidBuffer) - { - if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK) - LockBuffer(buffer, BUFFER_LOCK_UNLOCK); - ReleaseBuffer(buffer); - } + if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK) + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + ReleaseBuffer(buffer); + } - /* - * POLAR: acquire relation file lock to avoid extend the same - * block concurrently which may result in EOF error when proc1 - * read block extended and modified by proc2. - */ - polar_acquire_relfile_lock(rnode, forknum, ExclusiveLock); + /* + * POLAR: acquire relation file lock to avoid extend the same + * block concurrently which may result in EOF error when proc1 + * read block extended and modified by proc2. 
+ */ + polar_acquire_relfile_lock(rnode, forknum, ExclusiveLock); - buffer = ReadBufferWithoutRelcache(rnode, forknum, - P_NEW, mode, NULL, true); + if ((InHotStandby || InRecovery) && + polar_recovery_bulk_extend_size > 0) + buffer = polar_recovery_add_blocks(smgr, mode, forknum, blkno); + else + buffer = ReadBufferWithoutRelcache(rnode, forknum, P_NEW, mode, NULL, true); - /* POLAR: release relation file lock after extend */ - polar_release_relfile_lock(rnode, forknum, ExclusiveLock); - } - while (BufferGetBlockNumber(buffer) < blkno); + /* POLAR: release relation file lock after extend */ + polar_release_relfile_lock(rnode, forknum, ExclusiveLock); } - /* POLAR: bulk block extend opt end */ + while (BufferGetBlockNumber(buffer) < blkno); /* Handle the corner case that P_NEW returns non-consecutive pages */ if (BufferGetBlockNumber(buffer) != blkno) @@ -1374,3 +1264,23 @@ WALReadRaiseError(WALReadError *errinfo) errinfo->wre_req))); } } + +int +polar_get_recovery_bulk_extend_size(BlockNumber target_block, BlockNumber nblocks) +{ + int bulk_extend_size = polar_recovery_bulk_extend_size; + + Assert(target_block >= nblocks); + + /* Avoid small table bloat */ + if (nblocks < polar_recovery_bulk_extend_size) + bulk_extend_size = 1; + + /* Avoid acceed maximum possible length */ + bulk_extend_size = Min(MaxBlockNumber - nblocks, bulk_extend_size); + + /* Extend the relation to blockNum + 1 at least */ + bulk_extend_size = Max(target_block - nblocks + 1, bulk_extend_size); + + return bulk_extend_size; +} diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c index c8250205919..270f53c323f 100644 --- a/src/backend/catalog/storage.c +++ b/src/backend/catalog/storage.c @@ -28,6 +28,7 @@ #include "catalog/storage.h" #include "catalog/storage_xlog.h" #include "miscadmin.h" +#include "storage/bulk_write.h" #include "storage/freespace.h" #include "storage/smgr.h" #include "utils/hsearch.h" @@ -474,14 +475,14 @@ void RelationCopyStorage(SMgrRelation src, SMgrRelation dst, ForkNumber forkNum, char relpersistence) { - PGAlignedBlock buf; - Page page; bool use_wal; bool copying_initfork; BlockNumber nblocks; BlockNumber blkno; - - page = (Page) buf.data; + BulkWriteState *bulkstate; + void *buffer; + int block_count; + int max_block_count; /* * The init fork for an unlogged relation in many respects has to be @@ -500,64 +501,67 @@ RelationCopyStorage(SMgrRelation src, SMgrRelation dst, use_wal = XLogIsNeeded() && (relpersistence == RELPERSISTENCE_PERMANENT || copying_initfork); + max_block_count = Max(1, polar_bulk_read_size); + buffer = palloc_aligned(max_block_count * BLCKSZ, PG_IO_ALIGN_SIZE, 0); + + bulkstate = smgr_bulk_start_smgr(dst, forkNum, use_wal, relpersistence); + nblocks = smgrnblocks(src, forkNum); - for (blkno = 0; blkno < nblocks; blkno++) + for (blkno = 0; blkno < nblocks; blkno += block_count) { + BulkWriteBuffer buf; + /* If we got a cancel signal during the copy of the data, quit */ CHECK_FOR_INTERRUPTS(); - smgrread(src, forkNum, blkno, buf.data); + block_count = Min(max_block_count, nblocks - blkno); + + polar_smgrbulkread(src, forkNum, blkno, block_count, buffer); - if (!PageIsVerifiedExtended(page, blkno, - PIV_LOG_WARNING | PIV_REPORT_STAT)) + for (int i = 0; i < block_count; i++) { - /* - * For paranoia's sake, capture the file path before invoking the - * ereport machinery. This guards against the possibility of a - * relcache flush caused by, e.g., an errcontext callback. 
- * (errcontext callbacks shouldn't be risking any such thing, but - * people have been known to forget that rule.) - */ - char *relpath = relpathbackend(src->smgr_rnode.node, - src->smgr_rnode.backend, - forkNum); - - ereport(ERROR, - (errcode(ERRCODE_DATA_CORRUPTED), - errmsg("invalid page in block %u of relation %s", - blkno, relpath))); - } + BlockNumber cur_blkno = blkno + i; + Page page = (Page) ((char *) buffer + i * BLCKSZ); - /* - * WAL-log the copied page. Unfortunately we don't know what kind of a - * page this is, so we have to log the full page including any unused - * space. - */ - if (use_wal) - log_newpage(&dst->smgr_rnode.node, forkNum, blkno, page, false); + if (!PageIsVerifiedExtended(page, cur_blkno, + PIV_LOG_WARNING | PIV_REPORT_STAT)) + { + /* + * For paranoia's sake, capture the file path before invoking + * the ereport machinery. This guards against the possibility + * of a relcache flush caused by, e.g., an errcontext + * callback. (errcontext callbacks shouldn't be risking any + * such thing, but people have been known to forget that + * rule.) + */ + char *relpath = relpathbackend(src->smgr_rnode.node, + src->smgr_rnode.backend, + forkNum); + + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("invalid page in block %u of relation %s", + cur_blkno, relpath))); + } - PageSetChecksumInplace(page, blkno); + buf = smgr_bulk_get_buf(bulkstate); - /* - * Now write the page. We say skipFsync = true because there's no - * need for smgr to schedule an fsync for this write; we'll do it - * ourselves below. - */ - smgrextend(dst, forkNum, blkno, buf.data, true); + memcpy(buf, page, BLCKSZ); + + /* + * Queue the page for WAL-logging and writing out. Unfortunately + * we don't know what kind of a page this is, so we have to log + * the full page including any unused space. + */ + smgr_bulk_write(bulkstate, cur_blkno, buf, false); + } } + Assert(blkno == nblocks); - /* - * When we WAL-logged rel pages, we must nonetheless fsync them. The - * reason is that since we're copying outside shared buffers, a CHECKPOINT - * occurring during the copy has no way to flush the previously written - * data to disk (indeed it won't know the new rel even exists). A crash - * later on would replay WAL from the checkpoint, therefore it wouldn't - * replay our earlier WAL entries. If we do not fsync those pages here, - * they might still not be on disk when the crash occurs. 
- */ - if (use_wal || copying_initfork) - smgrimmedsync(dst, forkNum); + smgr_bulk_finish(bulkstate); + + pfree(buffer); } /* diff --git a/src/backend/postmaster/autovacuum.c b/src/backend/postmaster/autovacuum.c index eaf155d8084..297f1399319 100644 --- a/src/backend/postmaster/autovacuum.c +++ b/src/backend/postmaster/autovacuum.c @@ -533,7 +533,6 @@ AutoVacLauncherMain(int argc, char *argv[]) */ LWLockReleaseAll(); pgstat_report_wait_end(); - AbortBufferIO(); UnlockBuffers(); /* this is probably dead code, but let's be safe: */ if (AuxProcessResourceOwner) diff --git a/src/backend/postmaster/bgwriter.c b/src/backend/postmaster/bgwriter.c index 18c7dfb78c0..14f519cf22a 100644 --- a/src/backend/postmaster/bgwriter.c +++ b/src/backend/postmaster/bgwriter.c @@ -187,7 +187,6 @@ BackgroundWriterMain(void) */ LWLockReleaseAll(); ConditionVariableCancelSleep(); - AbortBufferIO(); UnlockBuffers(); ReleaseAuxProcessResources(false); AtEOXact_Buffers(false); diff --git a/src/backend/postmaster/checkpointer.c b/src/backend/postmaster/checkpointer.c index a27962c3e38..ef3f4d49806 100644 --- a/src/backend/postmaster/checkpointer.c +++ b/src/backend/postmaster/checkpointer.c @@ -282,7 +282,6 @@ CheckpointerMain(void) LWLockReleaseAll(); ConditionVariableCancelSleep(); pgstat_report_wait_end(); - AbortBufferIO(); UnlockBuffers(); ReleaseAuxProcessResources(false); AtEOXact_Buffers(false); diff --git a/src/backend/postmaster/polar_parallel_bgwriter.c b/src/backend/postmaster/polar_parallel_bgwriter.c index 6557be3192f..94e9233b0a0 100644 --- a/src/backend/postmaster/polar_parallel_bgwriter.c +++ b/src/backend/postmaster/polar_parallel_bgwriter.c @@ -342,7 +342,6 @@ polar_parallel_bgwriter_worker_main(Datum main_arg) */ LWLockReleaseAll(); ConditionVariableCancelSleep(); - AbortBufferIO(); UnlockBuffers(); ReleaseAuxProcessResources(false); AtEOXact_Buffers(false); diff --git a/src/backend/postmaster/walwriter.c b/src/backend/postmaster/walwriter.c index 18c17623811..99210711779 100644 --- a/src/backend/postmaster/walwriter.c +++ b/src/backend/postmaster/walwriter.c @@ -174,7 +174,6 @@ WalWriterMain(void) LWLockReleaseAll(); ConditionVariableCancelSleep(); pgstat_report_wait_end(); - AbortBufferIO(); UnlockBuffers(); ReleaseAuxProcessResources(false); AtEOXact_Buffers(false); diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c index 3110a2dcc86..07e7494d3b1 100644 --- a/src/backend/replication/walsender.c +++ b/src/backend/replication/walsender.c @@ -990,8 +990,8 @@ polar_logical_read_xlog_page_bulk(XLogReaderState *state, pfree(state->bulk_read_buffer); state->bulk_read_buffer_size = polar_logical_repl_xlog_bulk_read_size; - state->bulk_read_buffer = palloc_extended(state->bulk_read_buffer_size * XLOG_BLCKSZ, - MCXT_ALLOC_NO_OOM); + state->bulk_read_buffer = palloc_aligned(state->bulk_read_buffer_size * XLOG_BLCKSZ, + PG_IO_ALIGN_SIZE, MCXT_ALLOC_NO_OOM); /* * Fail to allocate bulk buffer, turn to use original read buffer diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c index 01165d36eb7..50c6b23cc4a 100644 --- a/src/backend/storage/buffer/buf_init.c +++ b/src/backend/storage/buffer/buf_init.c @@ -19,6 +19,7 @@ #include "storage/proc.h" /* POLAR */ +#include "common/file_utils.h" #include "storage/polar_copybuf.h" #include "storage/polar_fd.h" #include "storage/polar_flush.h" @@ -63,6 +64,48 @@ CkptSortItem *CkptBufferIds; * multiple times. Check the PrivateRefCount infrastructure in bufmgr.c. 
*/ +static Size +polar_zero_buffer_shmem_size() +{ + /* -1 indicates a request for auto-tune. */ + if (polar_zero_buffers == -1) + { + /* Request according to NBuffers, which is in [16, INT_MAX / 2) */ + polar_zero_buffers = 4; + + if (NBuffers >= 1024) + polar_zero_buffers = 32; + + if (NBuffers >= 16384) + polar_zero_buffers = 512; + } + + /* 0 disables the zero buffer. */ + if (polar_zero_buffers == 0) + return 0; + + polar_zero_buffer_size = polar_zero_buffers * BLCKSZ; + + return polar_zero_buffer_size + PG_IO_ALIGN_SIZE; +} + +static void +polar_zero_buffer_init() +{ + bool found; + + if (polar_zero_buffer_size == 0) + return; + + polar_zero_buffer = (char *) + TYPEALIGN(PG_IO_ALIGN_SIZE, + ShmemInitStruct("Zero Buffer Blocks", + polar_zero_buffer_size + PG_IO_ALIGN_SIZE, + &found)); + + if (!found) + MemSet(polar_zero_buffer, 0, polar_zero_buffer_size); +} /* * Initialize shared buffer pool @@ -84,10 +127,11 @@ InitBufferPool(void) NBuffers * sizeof(BufferDescPadded), &foundDescs); + /* Align buffer pool on IO page size boundary. */ BufferBlocks = (char *) - TYPEALIGN(POLAR_BUFFER_ALIGN_LEN, + TYPEALIGN(PG_IO_ALIGN_SIZE, ShmemInitStruct("Buffer Blocks", - ((NBuffers * (Size) BLCKSZ) + POLAR_BUFFER_ALIGN_LEN), + NBuffers * (Size) BLCKSZ + PG_IO_ALIGN_SIZE, &foundBufs)); /* Align condition variables to cacheline boundary. */ @@ -169,6 +213,9 @@ InitBufferPool(void) /* POLAR: init copy buffer pool */ polar_init_copy_buffer_pool(); + /* POLAR: init global zero buffer */ + polar_zero_buffer_init(); + /* Initialize per-backend file flush context */ WritebackContextInit(&BackendWritebackContext, &backend_flush_after); @@ -211,5 +258,8 @@ BufferShmemSize(void) /* POLAR: add copy buffer shared memory size */ size = add_size(size, polar_copy_buffer_shmem_size()); + /* POLAR: size of global zero buffer */ + size = add_size(size, polar_zero_buffer_shmem_size()); + return size; } diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index d60e504d6cd..a07bbbec1b1 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -188,23 +188,6 @@ int checkpoint_flush_after = 0; int bgwriter_flush_after = 0; int backend_flush_after = 0; -/* local state for StartBufferIO and related functions */ -static BufferDesc *InProgressBuf = NULL; -static bool IsForInput; - -/* - * POLAR: bulk io local state for StartBufferIO/TerminateBufferIO/AbortBufferIO and related functions. - * - * notice: bulk read io may be mixed with temporary write io, for flushing dirty evicted page. - * So polar_bulk_io_is_for_input[] is required for error recovery. - */ -bool polar_bulk_io_is_in_progress = false; -int polar_bulk_io_in_progress_count = 0; -BufferDesc **polar_bulk_io_in_progress_buf = NULL; -static bool *polar_bulk_io_is_for_input = NULL; - -/* POLAR end */ - /* local state for LockBufferForCleanup */ static BufferDesc *PinCountWaitBuf = NULL; @@ -250,6 +233,9 @@ static PrivateRefCountEntry *GetPrivateRefCountEntry(Buffer buffer, bool do_move static inline int32 GetPrivateRefCount(Buffer buffer); static void ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref); +/* POLAR */ +bool polar_has_partial_write; +int polar_bulk_read_size = 16; static bool polar_apply_io_locked_page(BufferDesc *bufHdr, XLogRecPtr replay_from, XLogRecPtr checkpoint_lsn, SMgrRelation smgr, @@ -862,6 +848,19 @@ ReadBufferWithoutRelcache(RelFileNode rnode, ForkNumber forkNum, mode, strategy, &hit); } +/* + * Convenience wrapper for ReadBuffer_common, exported for outer usage. 
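+ * Unlike the Relation-based ReadBuffer variants, it takes an SMgrRelation
+ * and an explicit relpersistence, so callers without a Relation handle can
+ * still go through the regular ReadBuffer_common() path.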
+ */ +Buffer +polar_read_buffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, + BlockNumber blockNum, ReadBufferMode mode, + BufferAccessStrategy strategy) +{ + bool hit; + + return ReadBuffer_common(smgr, relpersistence, forkNum, + blockNum, mode, strategy, &hit); +} /* * ReadBuffer_common -- common logic for all ReadBuffer variants @@ -1071,7 +1070,7 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, /* new buffers are zero-filled */ MemSet((char *) bufBlock, 0, BLCKSZ); /* don't set checksum for all-zero page */ - smgrextend(smgr, forkNum, blockNum, (char *) bufBlock, false); + smgrzeroextend(smgr, forkNum, blockNum, 1, false); /* * NB: we're *not* doing a ScheduleBufferTagForWriteback here; @@ -1096,7 +1095,7 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, if (track_io_timing) INSTR_TIME_SET_CURRENT(io_start); - smgrread(smgr, forkNum, blockNum, (char *) bufBlock); + smgrread(smgr, forkNum, blockNum, bufBlock); if (track_io_timing) { @@ -2914,7 +2913,6 @@ InitBufferPoolAccess(void) static void AtProcExit_Buffers(int code, Datum arg) { - AbortBufferIO(); UnlockBuffers(); CheckForBufferLeaks(); @@ -4055,7 +4053,7 @@ RelationCopyStorageUsingBuffer(RelFileNode srcnode, bool use_wal; BlockNumber nblocks; BlockNumber blkno; - PGAlignedBlock buf; + PGIOAlignedBlock buf; BufferAccessStrategy bstrategy_src; BufferAccessStrategy bstrategy_dst; @@ -5038,14 +5036,7 @@ polar_start_buffer_io_extend(BufferDesc *buf, { uint32 buf_state; - /* POLAR: bulk io */ - if (!polar_bulk_io_is_in_progress) - { - /* single io */ - Assert(!InProgressBuf); - } - /* POLAR end */ - + ResourceOwnerEnlargeBufferIOs(CurrentResourceOwner); for (;;) { @@ -5085,27 +5076,8 @@ polar_start_buffer_io_extend(BufferDesc *buf, buf_state |= BM_IO_IN_PROGRESS; UnlockBufHdr(buf, buf_state); - /* POLAR: bulk io */ - if (!polar_bulk_io_is_in_progress) - { - /* single io */ - InProgressBuf = buf; - IsForInput = forInput; - } - else - { - /* bulk io */ - polar_bulk_io_in_progress_buf[polar_bulk_io_in_progress_count] = buf; - - /* - * bulk read io may be mixed with temporary write io, for flushing - * dirty evicted page. So polar_bulk_io_is_for_input[] is required for - * error recovery. 
- */ - polar_bulk_io_is_for_input[polar_bulk_io_in_progress_count] = forInput; - polar_bulk_io_in_progress_count++; - } - /* POLAR end */ + ResourceOwnerRememberBufferIO(CurrentResourceOwner, + BufferDescriptorGetBuffer(buf)); return true; } @@ -5131,32 +5103,6 @@ TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits) { uint32 buf_state; - /* POLAR:bulk io */ - - /* - * Because assert will be ignored during release mode, we only use - * Assert() - */ - /* ---------------- - * single io: - * if (!polar_bulk_io_is_in_progress) - { Assert(buf == InProgressBuf); } - * ---------------- - */ - Assert(polar_bulk_io_is_in_progress || buf == InProgressBuf); - /* ---------------- - * bulk io: - * if (polar_bulk_io_is_in_progress) - * { - * Assert(polar_bulk_io_in_progress_count > 0); - * Assert(buf == polar_bulk_io_in_progress_buf[polar_bulk_io_in_progress_count - 1]); - * } - * ---------------- - */ - Assert(!polar_bulk_io_is_in_progress || polar_bulk_io_in_progress_count > 0); - Assert(!polar_bulk_io_is_in_progress || buf == polar_bulk_io_in_progress_buf[polar_bulk_io_in_progress_count - 1]); - /* POLAR end */ - buf_state = LockBufHdr(buf); Assert(buf_state & BM_IO_IN_PROGRESS); @@ -5168,26 +5114,14 @@ TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits) buf_state |= set_flag_bits; UnlockBufHdr(buf, buf_state); - /* POLAR: bulk io */ - if (!polar_bulk_io_is_in_progress) - { - /* single io */ - InProgressBuf = NULL; - } - else - { - /* bulk io */ - polar_bulk_io_in_progress_count--; - } - /* POLAR end */ - - InProgressBuf = NULL; + ResourceOwnerForgetBufferIO(CurrentResourceOwner, + BufferDescriptorGetBuffer(buf)); ConditionVariableBroadcast(BufferDescriptorGetIOCV(buf)); } /* - * AbortBufferIO: Clean up any active buffer I/O after an error. + * AbortBufferIO: Clean up active buffer I/O after an error. * * All LWLocks we might have held have been released, * but we haven't yet released buffer pins, so the buffer is still pinned. @@ -5196,71 +5130,42 @@ TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits) * possible the error condition wasn't related to the I/O. */ void -AbortBufferIO(void) +AbortBufferIO(Buffer buf) { - BufferDesc *buf = InProgressBuf; + BufferDesc *buf_hdr = GetBufferDescriptor(buf - 1); + uint32 buf_state; -polar_bulk_read: + buf_state = LockBufHdr(buf_hdr); + Assert(buf_state & (BM_IO_IN_PROGRESS | BM_TAG_VALID)); - /* - * POLAR: deal with local buffer(buf->buf_id < 0) local buffer doesn't - * need to release source, just decrease io_in_progress_count - */ - if (buf && buf->buf_id < 0) - polar_bulk_io_in_progress_count--; - else if (buf) + if (!(buf_state & BM_VALID)) { - uint32 buf_state; - - buf_state = LockBufHdr(buf); - Assert(buf_state & BM_IO_IN_PROGRESS); - if (IsForInput) - { - Assert(!(buf_state & BM_DIRTY)); - - /* We'd better not think buffer is valid yet */ - Assert(!(buf_state & BM_VALID)); - UnlockBufHdr(buf, buf_state); - } - else - { - Assert(buf_state & BM_DIRTY); - UnlockBufHdr(buf, buf_state); - /* Issue notice if this is not the first failure... 
*/ - if (buf_state & BM_IO_ERROR) - { - /* Buffer is pinned, so we can read tag without spinlock */ - char *path; - - path = relpathperm(buf->tag.rnode, buf->tag.forkNum); - ereport(WARNING, - (errcode(ERRCODE_IO_ERROR), - errmsg("could not write block %u of %s", - buf->tag.blockNum, path), - errdetail("Multiple failures --- write error might be permanent."))); - pfree(path); - } - } - TerminateBufferIO(buf, false, BM_IO_ERROR); + Assert(!(buf_state & BM_DIRTY)); + UnlockBufHdr(buf_hdr, buf_state); } - - /* POLAR: bulk io recovery */ - if (polar_bulk_io_is_in_progress) + else { - if (polar_bulk_io_in_progress_count > 0) - { - buf = polar_bulk_io_in_progress_buf[polar_bulk_io_in_progress_count - 1]; - IsForInput = polar_bulk_io_is_for_input[polar_bulk_io_in_progress_count - 1]; + Assert(buf_state & BM_DIRTY); + UnlockBufHdr(buf_hdr, buf_state); - /* - * In TerminateBufferIO(), polar_bulk_io_in_progress_count was - * reduced by 1. - */ - goto polar_bulk_read; + /* Issue notice if this is not the first failure... */ + if (buf_state & BM_IO_ERROR) + { + /* Buffer is pinned, so we can read tag without spinlock */ + char *path; + + path = relpathperm(buf_hdr->tag.rnode, buf_hdr->tag.forkNum); + ereport(WARNING, + (errcode(ERRCODE_IO_ERROR), + errmsg("could not write block %u of %s", + buf_hdr->tag.blockNum, path), + errdetail("Multiple failures --- write error might be permanent."))); + pfree(path); } - polar_bulk_io_is_in_progress = false; } + TerminateBufferIO(buf_hdr, false, BM_IO_ERROR); + /* * POLAR: we must reset read_min_lsn where ERROR, otherwise bgwriter * cannot clean hashtable or logindex anymore diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c index e01dddd1108..7d5a7b2db03 100644 --- a/src/backend/storage/buffer/freelist.c +++ b/src/backend/storage/buffer/freelist.c @@ -202,8 +202,6 @@ have_free_buffer(void) * * To ensure that no one else can pin the buffer before we do, we must * return the buffer with the buffer header spinlock still held. - * POLAR: if reading bulk non-first page and most buffers are pinned. return NULL - * instead of log(error). */ BufferDesc * StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state) @@ -358,14 +356,6 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state) * infinite loop. */ UnlockBufHdr(buf, local_buf_state); - - /* - * POLAR: bulk read, alloc not-first page, if failed just ok, - * return NULL. - */ - if (polar_bulk_io_is_in_progress && polar_bulk_io_in_progress_count > 0) - return NULL; - elog(ERROR, "no unpinned buffers available"); } UnlockBufHdr(buf, local_buf_state); diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c index fd0b44c3e0f..39b2b2a9908 100644 --- a/src/backend/storage/buffer/localbuf.c +++ b/src/backend/storage/buffer/localbuf.c @@ -154,12 +154,6 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum, *foundPtr = true; else { - /* POLAR: bulk read. the same as StartBufferIO */ - if (polar_bulk_io_is_in_progress) - { - polar_bulk_io_in_progress_buf[polar_bulk_io_in_progress_count] = bufHdr; - polar_bulk_io_in_progress_count++; - } /* Previous read attempt must have failed; try again */ *foundPtr = false; } @@ -281,15 +275,6 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum, pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state); *foundPtr = false; - - /* POLAR: Bulk read. 
the same as StartBufferIO */ - if (polar_bulk_io_is_in_progress) - { - polar_bulk_io_in_progress_buf[polar_bulk_io_in_progress_count] = bufHdr; - polar_bulk_io_in_progress_count++; - } - /* POLAR end */ - return bufHdr; } @@ -543,8 +528,11 @@ GetLocalBufferStorage(void) /* And don't overflow MaxAllocSize, either */ num_bufs = Min(num_bufs, MaxAllocSize / BLCKSZ); - cur_block = (char *) MemoryContextAllocIOAligned(LocalBufferContext, - num_bufs * BLCKSZ, 0); + /* Buffers should be I/O aligned. */ + cur_block = (char *) + TYPEALIGN(PG_IO_ALIGN_SIZE, + MemoryContextAlloc(LocalBufferContext, + num_bufs * BLCKSZ + PG_IO_ALIGN_SIZE)); next_buf_in_block = 0; num_bufs_in_block = num_bufs; } diff --git a/src/backend/storage/buffer/polar_bufmgr.c b/src/backend/storage/buffer/polar_bufmgr.c index ae33fde212b..d202ff18557 100644 --- a/src/backend/storage/buffer/polar_bufmgr.c +++ b/src/backend/storage/buffer/polar_bufmgr.c @@ -1188,20 +1188,20 @@ polar_bulk_read_buffer_common(Relation reln, char relpersistence, ForkNumber for { SMgrRelation smgr = reln->rd_smgr; BufferDesc *bufHdr; - BufferDesc *first_buf_hdr; Block bufBlock; bool found; bool isLocalBuf = SmgrIsTemp(smgr); int actual_bulk_io_count; int index; char *buf_read; + BufferDesc *buffers[MAX_BUFFERS_TO_READ_BY]; + bool checksum_fail[MAX_BUFFERS_TO_READ_BY] = {false}; /* POLAR: start lsn to do replay */ XLogRecPtr checkpoint_redo_lsn = InvalidXLogRecPtr; XLogRecPtr replay_from; polar_redo_action redo_action; uint32 repeat_read_times = 0; - bool *checksum_fail; polar_pgstat_count_bulk_read_calls(reln); @@ -1215,28 +1215,6 @@ polar_bulk_read_buffer_common(Relation reln, char relpersistence, ForkNumber for mode, strategy, hit); } - Assert(!polar_bulk_io_is_in_progress); - Assert(0 == polar_bulk_io_in_progress_count); - - /* bulk read begin */ - polar_bulk_io_is_in_progress = true; - - /* - * Alloc buffer for polar_bulk_io_in_progress_buf and - * polar_bulk_io_is_for_input on demand. If bulk read is called once, - * there is a great possibility that bulk read will be called later. - * polar_bulk_io_in_progress_buf and polar_bulk_io_is_for_input will be - * not freed, until backend exit. - */ - if (NULL == polar_bulk_io_in_progress_buf) - { - Assert(NULL == polar_bulk_io_is_for_input); - polar_bulk_io_in_progress_buf = MemoryContextAlloc(TopMemoryContext, - POLAR_MAX_BULK_IO_SIZE * sizeof(polar_bulk_io_in_progress_buf[0])); - polar_bulk_io_is_for_input = MemoryContextAlloc(TopMemoryContext, - POLAR_MAX_BULK_IO_SIZE * sizeof(polar_bulk_io_is_for_input[0])); - } - *hit = false; /* Make sure we will have room to remember the buffer pin */ @@ -1259,11 +1237,6 @@ polar_bulk_read_buffer_common(Relation reln, char relpersistence, ForkNumber for } else { - /* - * lookup the buffer. IO_IN_PROGRESS is set if the requested block is - * not currently in memory. If not found, - * polar_bulk_io_in_progress_count will be added by 1. 
- */ bufHdr = BufferAlloc(smgr, relpersistence, forkNum, firstBlockNum, strategy, &found); if (found) @@ -1291,10 +1264,6 @@ polar_bulk_read_buffer_common(Relation reln, char relpersistence, ForkNumber for false, found); - Assert(0 == polar_bulk_io_in_progress_count); - /* important, mark bulk_io end */ - polar_bulk_io_is_in_progress = false; - return BufferDescriptorGetBuffer(bufHdr); } @@ -1312,14 +1281,7 @@ polar_bulk_read_buffer_common(Relation reln, char relpersistence, ForkNumber for */ Assert(!(pg_atomic_read_u32(&bufHdr->state) & BM_VALID)); /* spinlock not needed */ - Assert(1 == polar_bulk_io_in_progress_count); - Assert(bufHdr == polar_bulk_io_in_progress_buf[polar_bulk_io_in_progress_count - 1]); - - /* - * Hold the first block bufHdr, after TerminateBufferIO(), - * polar_bulk_io_in_progress_buf is freed. - */ - first_buf_hdr = bufHdr; + buffers[0] = bufHdr; /* * Make sure than single bulk_read will not read blocks across files. @@ -1340,9 +1302,6 @@ polar_bulk_read_buffer_common(Relation reln, char relpersistence, ForkNumber for /* * lookup the buffer. IO_IN_PROGRESS is set if the requested block is * not currently in memory. - * - * If not found, polar_bulk_io_in_progress_count will be added by 1 by - * StartBufferIO(). */ if (isLocalBuf) bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, &found); @@ -1350,32 +1309,30 @@ polar_bulk_read_buffer_common(Relation reln, char relpersistence, ForkNumber for bufHdr = BufferAlloc(smgr, relpersistence, forkNum, blockNum, strategy, &found); + Assert(bufHdr); + /* * For extra block, don't update pgBufferUsage.shared_blks_hit or * pgBufferUsage.shared_blks_read, also the blocks are not used now. */ - /* bufHdr == NULL, all buffers are pinned. */ - if (found || bufHdr == NULL) + if (found) { /* * important: this buffer is the upper boundary, it should be * excluded. */ - if (bufHdr != NULL) - { - ReleaseBuffer(BufferDescriptorGetBuffer(bufHdr)); - } + ReleaseBuffer(BufferDescriptorGetBuffer(bufHdr)); break; } Assert(!(pg_atomic_read_u32(&bufHdr->state) & BM_VALID)); /* spinlock not needed */ + buffers[index] = bufHdr; } - Assert(index == polar_bulk_io_in_progress_count); + actual_bulk_io_count = index; /* - * Until now, as to {blockNum + [0, polar_bulk_io_in_progress_count)} - * block buffers, IO_IN_PROGRESS flag is set and io_in_progress_lock is - * holded. + * Until now, as to {blockNum + [0, actual_bulk_io_count)} block buffers, + * IO_IN_PROGRESS flag is set and io_in_progress_lock is holded. * * Other proc(include backend sql exec、start xlog replay) which read * there buffers, would be blocked on io_in_progress_lock. @@ -1384,21 +1341,7 @@ polar_bulk_read_buffer_common(Relation reln, char relpersistence, ForkNumber for * lock on io_in_progress_lock. */ - /* - * polar_bulk_io_in_progress_count will be reduced by TerminateBufferIO(), - * For safety, its copy actual_bulk_io_count is used. - */ - actual_bulk_io_count = polar_bulk_io_in_progress_count; - - /* for eliminating palloc and memcpy */ - if (1 == actual_bulk_io_count) - buf_read = isLocalBuf ? 
- (char *) LocalBufHdrGetBlock(first_buf_hdr) : - (char *) BufHdrGetBlock(first_buf_hdr); - else - buf_read = (char *) palloc_io_aligned(actual_bulk_io_count * BLCKSZ, MCXT_ALLOC_ZERO); - - checksum_fail = (bool *) palloc0(actual_bulk_io_count * sizeof(bool)); + buf_read = (char *) palloc_aligned(actual_bulk_io_count * BLCKSZ, PG_IO_ALIGN_SIZE, MCXT_ALLOC_ZERO); repeat_read: @@ -1475,25 +1418,15 @@ polar_bulk_read_buffer_common(Relation reln, char relpersistence, ForkNumber for } } - /* - * notice: 1. buffers must be processed by TerminateBufferIO() from back - * to front. a) TerminateBufferIO() release - * polar_bulk_io_in_progress_buf[] in decrement order. b) For better - * performance, LWLockRelease() release io_in_progress_lock in decrement - * order. 2. polar_bulk_io_in_progress_count was reduced by - * TerminateBufferIO(). a) polar_bulk_io_in_progress_count must not be - * used here. - */ for (index = actual_bulk_io_count - 1; index >= 0; index--) { BlockNumber blockNum = firstBlockNum + index; - bufHdr = polar_bulk_io_in_progress_buf[index]; + bufHdr = buffers[index]; bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr); /* need copy page content from aligned_buf_read to block shared_buffer */ - if (actual_bulk_io_count != 1) - memcpy((char *) bufBlock, buf_read + index * BLCKSZ, BLCKSZ); + memcpy((char *) bufBlock, buf_read + index * BLCKSZ, BLCKSZ); if (unlikely(polar_trace_logindex_messages <= DEBUG3)) { @@ -1552,8 +1485,6 @@ polar_bulk_read_buffer_common(Relation reln, char relpersistence, ForkNumber for buf_state |= BM_VALID; pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state); - /* bulk io */ - polar_bulk_io_in_progress_count--; } else TerminateBufferIO(bufHdr, false, BM_VALID); @@ -1589,21 +1520,9 @@ polar_bulk_read_buffer_common(Relation reln, char relpersistence, ForkNumber for false, found); - /* - * notice: polar_bulk_io_in_progress_count was reduced by - * TerminateBufferIO(). polar_bulk_io_in_progress_count must not be used - * here. - */ - if (actual_bulk_io_count != 1) - pfree(buf_read); - - pfree(checksum_fail); - - Assert(0 == polar_bulk_io_in_progress_count); - /* important, mark bulk_io end */ - polar_bulk_io_is_in_progress = false; + pfree(buf_read); - return BufferDescriptorGetBuffer(first_buf_hdr); + return BufferDescriptorGetBuffer(buffers[0]); } bool polar_is_future_page(BufferDesc *buf_hdr) diff --git a/src/backend/storage/file/buffile.c b/src/backend/storage/file/buffile.c index 56b88594cc8..1b1d556fd4a 100644 --- a/src/backend/storage/file/buffile.c +++ b/src/backend/storage/file/buffile.c @@ -95,6 +95,11 @@ struct BufFile off_t curOffset; /* offset part of current pos */ int pos; /* next read/write position in buffer */ int nbytes; /* total # of valid bytes in buffer */ + + /* + * XXX Should ideally us PGIOAlignedBlock, but might need a way to avoid + * wasting per-file alignment padding when some users create many files. + */ PGAlignedBlock buffer; }; diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c index 2fe8c0334c9..198540106f0 100644 --- a/src/backend/storage/file/fd.c +++ b/src/backend/storage/file/fd.c @@ -96,7 +96,6 @@ #include "common/pg_prng.h" #include "miscadmin.h" #include "pgstat.h" -#include "port/pg_iovec.h" #include "portability/mem.h" #include "postmaster/startup.h" #include "storage/fd.h" @@ -169,6 +168,11 @@ bool data_sync_retry = false; /* How SyncDataDirectory() should do its job. 
*/ int recovery_init_sync_method = RECOVERY_INIT_SYNC_METHOD_FSYNC; +/* POLAR: GUC */ +bool polar_enable_fallocate_no_hide_stale; + +/* POLAR end */ + /* Debugging.... */ #ifdef FDDEBUG @@ -2080,16 +2084,16 @@ FileClose(File file) * to read into. */ int -FilePrefetch(File file, off_t offset, int amount, uint32 wait_event_info) +FilePrefetch(File file, off_t offset, off_t amount, uint32 wait_event_info) { #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_WILLNEED) int returnCode; Assert(FileIsValid(file)); - DO_DB(elog(LOG, "FilePrefetch: %d (%s) " INT64_FORMAT " %d", + DO_DB(elog(LOG, "FilePrefetch: %d (%s) " INT64_FORMAT " " INT64_FORMAT, file, VfdCache[file].fileName, - (int64) offset, amount)); + (int64) offset, (int64) amount)); returnCode = FileAccess(file); if (returnCode < 0) @@ -2130,16 +2134,16 @@ FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info) pgstat_report_wait_end(); } -int -FileRead(File file, char *buffer, int amount, off_t offset, +ssize_t +FileRead(File file, void *buffer, size_t amount, off_t offset, uint32 wait_event_info) { - int returnCode; + ssize_t returnCode; Vfd *vfdP; Assert(FileIsValid(file)); - DO_DB(elog(LOG, "FileRead: %d (%s) " INT64_FORMAT " %d %p", + DO_DB(elog(LOG, "FileRead: %d (%s) " INT64_FORMAT " %zu %p", file, VfdCache[file].fileName, (int64) offset, amount, buffer)); @@ -2188,16 +2192,16 @@ FileRead(File file, char *buffer, int amount, off_t offset, return returnCode; } -int -FileWrite(File file, char *buffer, int amount, off_t offset, +ssize_t +FileWrite(File file, const void *buffer, size_t amount, off_t offset, uint32 wait_event_info) { - int returnCode; + ssize_t returnCode; Vfd *vfdP; Assert(FileIsValid(file)); - DO_DB(elog(LOG, "FileWrite: %d (%s) " INT64_FORMAT " %d %p", + DO_DB(elog(LOG, "FileWrite: %d (%s) " INT64_FORMAT " %zu %p", file, VfdCache[file].fileName, (int64) offset, amount, buffer)); @@ -2307,6 +2311,100 @@ FileSync(File file, uint32 wait_event_info) return returnCode; } +/* + * Zero a region of the file. + * + * Returns 0 on success, -1 otherwise. In the latter case errno is set to the + * appropriate error. + */ +int +FileZero(File file, off_t offset, off_t amount, uint32 wait_event_info, bool bulkwrite) +{ + int returnCode; + ssize_t written; + + Assert(FileIsValid(file)); + + DO_DB(elog(LOG, "FileZero: %d (%s) " INT64_FORMAT " " INT64_FORMAT, + file, VfdCache[file].fileName, + (int64) offset, (int64) amount)); + + returnCode = FileAccess(file); + if (returnCode < 0) + return returnCode; + + pgstat_report_wait_start(wait_event_info); + if (bulkwrite) + written = polar_pwrite_zeros(VfdCache[file].fd, amount, offset); + else + written = pg_pwrite_zeros(VfdCache[file].fd, amount, offset); + pgstat_report_wait_end(); + + if (written < 0) + return -1; + else if (written != amount) + { + /* if errno is unset, assume problem is no disk space */ + if (errno == 0) + errno = ENOSPC; + return -1; + } + + return 0; +} + +/* + * Try to reserve file space with posix_fallocate(). If posix_fallocate() is + * not implemented on the operating system or fails with EINVAL / EOPNOTSUPP, + * use FileZero() instead. + * + * Note that at least glibc() implements posix_fallocate() in userspace if not + * implemented by the filesystem. That's not the case for all environments + * though. + * + * Returns 0 on success, -1 otherwise. In the latter case errno is set to the + * appropriate error. 
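
/*
 * Illustration (standalone sketch, not part of the patch): the strategy
 * documented for FileFallocate()/FileZero() above -- reserve space with
 * posix_fallocate(), retry on EINTR, and fall back to writing zeros when the
 * filesystem does not support it (EINVAL/EOPNOTSUPP). Assumes posix_fallocate()
 * is available; the in-tree code goes through the vfd layer and
 * pg_pwrite_zeros()/polar_pwrite_zeros() rather than raw pwrite().
 */
#include <errno.h>
#include <fcntl.h>
#include <unistd.h>

static int
demo_zero_range(int fd, off_t offset, off_t amount)
{
	static const char zeros[8192];	/* one reusable zero-filled chunk */

	while (amount > 0)
	{
		size_t		chunk = (amount < (off_t) sizeof(zeros)) ? (size_t) amount : sizeof(zeros);
		ssize_t		written = pwrite(fd, zeros, chunk, offset);

		if (written < 0)
		{
			if (errno == EINTR)
				continue;
			return -1;
		}
		offset += written;
		amount -= written;
	}
	return 0;
}

static int
demo_reserve_range(int fd, off_t offset, off_t amount)
{
	int			rc;

	do
	{
		rc = posix_fallocate(fd, offset, amount);
	} while (rc == EINTR);

	if (rc == 0)
		return 0;

	/* "real" failure: posix_fallocate() returns the error code, errno is unset */
	if (rc != EINVAL && rc != EOPNOTSUPP)
	{
		errno = rc;
		return -1;
	}

	/* not supported by this filesystem: fall back to writing zeros */
	return demo_zero_range(fd, offset, amount);
}
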
+ */ +int +FileFallocate(File file, off_t offset, off_t amount, uint32 wait_event_info) +{ +#ifdef HAVE_POSIX_FALLOCATE + int returnCode; + + Assert(FileIsValid(file)); + + DO_DB(elog(LOG, "FileFallocate: %d (%s) " INT64_FORMAT " " INT64_FORMAT, + file, VfdCache[file].fileName, + (int64) offset, (int64) amount)); + + returnCode = FileAccess(file); + if (returnCode < 0) + return -1; + +retry: + pgstat_report_wait_start(wait_event_info); + returnCode = polar_posix_fallocate(VfdCache[file].fd, offset, amount); + pgstat_report_wait_end(); + + if (returnCode == 0) + return 0; + else if (returnCode == EINTR) + goto retry; + + /* for compatibility with %m printing etc */ + errno = returnCode; + + /* + * Return in cases of a "real" failure, if fallocate is not supported, + * fall through to the FileZero() backed implementation. + */ + if (returnCode != EINVAL && returnCode != EOPNOTSUPP) + return -1; +#endif + + return FileZero(file, offset, amount, wait_event_info, true); +} + off_t FileSize(File file) { @@ -3846,67 +3944,3 @@ data_sync_elevel(int elevel) { return data_sync_retry ? elevel : PANIC; } - -/* - * A convenience wrapper for pg_pwritev() that retries on partial write. If an - * error is returned, it is unspecified how much has been written. - */ -ssize_t -pg_pwritev_with_retry(int fd, const struct iovec *iov, int iovcnt, off_t offset) -{ - struct iovec iov_copy[PG_IOV_MAX]; - ssize_t sum = 0; - ssize_t part; - - /* We'd better have space to make a copy, in case we need to retry. */ - if (iovcnt > PG_IOV_MAX) - { - errno = EINVAL; - return -1; - } - - for (;;) - { - /* Write as much as we can. */ - part = polar_pwritev(fd, iov, iovcnt, offset); - if (part < 0) - return -1; - -#ifdef SIMULATE_SHORT_WRITE - part = Min(part, 4096); -#endif - - /* Count our progress. */ - sum += part; - offset += part; - - /* Step over iovecs that are done. */ - while (iovcnt > 0 && iov->iov_len <= part) - { - part -= iov->iov_len; - ++iov; - --iovcnt; - } - - /* Are they all done? */ - if (iovcnt == 0) - { - /* We don't expect the kernel to write more than requested. */ - Assert(part == 0); - break; - } - - /* - * Move whatever's left to the front of our mutable copy and adjust - * the leading iovec. 
- */ - Assert(iovcnt > 0); - memmove(iov_copy, iov, sizeof(*iov) * iovcnt); - Assert(iov->iov_len > part); - iov_copy[0].iov_base = (char *) iov_copy[0].iov_base + part; - iov_copy[0].iov_len -= part; - iov = iov_copy; - } - - return sum; -} diff --git a/src/backend/storage/file/polar_fd.c b/src/backend/storage/file/polar_fd.c index 1f16b79f156..a668752ff7f 100644 --- a/src/backend/storage/file/polar_fd.c +++ b/src/backend/storage/file/polar_fd.c @@ -104,7 +104,12 @@ vfs_mgr polar_vfs[] = .vfs_fsync = pg_fsync, .vfs_unlink = unlink, .vfs_rename = rename, - .vfs_fallocate = posix_fallocate, + .vfs_posix_fallocate = posix_fallocate, +#ifdef __linux__ + .vfs_fallocate = fallocate, +#else + .vfs_fallocate = NULL, +#endif .vfs_ftruncate = ftruncate, .vfs_truncate = truncate, .vfs_opendir = opendir, @@ -125,6 +130,7 @@ vfs_mgr polar_vfs[] = #else .vfs_posix_fadvise = NULL, #endif + .vfs_type = polar_bufferio_vfs_type, }, { .vfs_env_init = NULL, @@ -149,6 +155,7 @@ vfs_mgr polar_vfs[] = .vfs_fsync = NULL, .vfs_unlink = NULL, .vfs_rename = NULL, + .vfs_posix_fallocate = NULL, .vfs_fallocate = NULL, .vfs_ftruncate = NULL, .vfs_truncate = NULL, @@ -162,6 +169,7 @@ vfs_mgr polar_vfs[] = .vfs_sync_file_range = NULL, .vfs_posix_fadvise = NULL, .vfs_mmap = NULL, + .vfs_type = NULL, } }; diff --git a/src/backend/storage/page/bufpage.c b/src/backend/storage/page/bufpage.c index 9d14151b572..d72d4502652 100644 --- a/src/backend/storage/page/bufpage.c +++ b/src/backend/storage/page/bufpage.c @@ -1525,7 +1525,10 @@ PageSetChecksumCopy(Page page, BlockNumber blkno) * and second to avoid wasting space in processes that never call this. */ if (pageCopy == NULL) - pageCopy = MemoryContextAllocIOAligned(TopMemoryContext, BLCKSZ, 0); + pageCopy = MemoryContextAllocAligned(TopMemoryContext, + BLCKSZ, + PG_IO_ALIGN_SIZE, + 0); memcpy(pageCopy, (char *) page, BLCKSZ); ((PageHeader) pageCopy)->pd_checksum = pg_checksum_page(pageCopy, blkno); diff --git a/src/backend/storage/smgr/Makefile b/src/backend/storage/smgr/Makefile index 7b629f08d25..a36313f2c39 100644 --- a/src/backend/storage/smgr/Makefile +++ b/src/backend/storage/smgr/Makefile @@ -13,6 +13,7 @@ top_builddir = ../../../.. include $(top_builddir)/src/Makefile.global OBJS = \ + bulk_write.o \ md.o \ smgr.o diff --git a/src/backend/storage/smgr/bulk_write.c b/src/backend/storage/smgr/bulk_write.c new file mode 100644 index 00000000000..dbda0e26982 --- /dev/null +++ b/src/backend/storage/smgr/bulk_write.c @@ -0,0 +1,275 @@ +/*------------------------------------------------------------------------- + * + * bulk_write.c + * Efficiently and reliably populate a new relation + * + * The assumption is that no other backends access the relation while we are + * loading it, so we can take some shortcuts. Do not mix operations through + * the regular buffer manager and the bulk loading interface! + * + * We bypass the buffer manager to avoid the locking overhead, and call + * smgrextend() directly. A downside is that the pages will need to be + * re-read into shared buffers on first use after the build finishes. That's + * usually a good tradeoff for large relations, and for small relations, the + * overhead isn't very significant compared to creating the relation in the + * first place. + * + * The pages are WAL-logged if needed. To save on WAL header overhead, we + * WAL-log several pages in one record. 
+ * + * One tricky point is that because we bypass the buffer manager, we need to + * register the relation for fsyncing at the next checkpoint ourselves, and + * make sure that the relation is correctly fsync'd by us or the checkpointer + * even if a checkpoint happens concurrently. + * + * NOTE: + * fsync is removed from PolarDB for we use buffer pool to cache those pages. + * + * Portions Copyright (c) 2024, Alibaba Group Holding Limited + * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/storage/smgr/bulk_write.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/xloginsert.h" +#include "access/xlogrecord.h" +#include "storage/bufmgr.h" +#include "storage/bufpage.h" +#include "storage/bulk_write.h" +#include "storage/proc.h" +#include "storage/smgr.h" +#include "utils/rel.h" + +#define MAX_PENDING_WRITES 512 + +typedef struct PendingWrite +{ + BulkWriteBuffer buf; + BlockNumber blkno; + bool page_std; +} PendingWrite; + +/* + * Bulk writer state for one relation fork. + */ +struct BulkWriteState +{ + /* Information about the target relation we're writing */ + SMgrRelation smgr; + ForkNumber forknum; + bool use_wal; + char relpersistence; + + /* We keep several writes queued, and WAL-log them in batches */ + int npending; + PendingWrite pending_writes[MAX_PENDING_WRITES]; + + /* Current size of the relation */ + BlockNumber pages_written; + + MemoryContext memcxt; +}; + +/* GUCs */ +int polar_bulk_write_maxpages; + +static void smgr_bulk_flush(BulkWriteState *bulkstate); + +/* + * Start a bulk write operation on a relation fork. + */ +BulkWriteState * +smgr_bulk_start_rel(Relation rel, ForkNumber forknum) +{ + return smgr_bulk_start_smgr(RelationGetSmgr(rel), + forknum, + RelationNeedsWAL(rel) || forknum == INIT_FORKNUM, + rel->rd_rel->relpersistence); +} + +/* + * Start a bulk write operation on a relation fork. + * + * This is like smgr_bulk_start_rel, but can be used without a relcache entry. + */ +BulkWriteState * +smgr_bulk_start_smgr(SMgrRelation smgr, ForkNumber forknum, bool use_wal, char relpersistence) +{ + BulkWriteState *state; + + state = palloc(sizeof(BulkWriteState)); + state->smgr = smgr; + state->forknum = forknum; + state->use_wal = use_wal; + state->relpersistence = relpersistence; + + state->npending = 0; + state->pages_written = 0; + + /* + * Remember the memory context. We will use it to allocate all the + * buffers later. + */ + state->memcxt = CurrentMemoryContext; + + return state; +} + +/* + * Finish bulk write operation. + */ +void +smgr_bulk_finish(BulkWriteState *bulkstate) +{ + /* WAL-log and flush any remaining pages */ + smgr_bulk_flush(bulkstate); +} + +static int +buffer_cmp(const void *a, const void *b) +{ + const PendingWrite *bufa = (const PendingWrite *) a; + const PendingWrite *bufb = (const PendingWrite *) b; + + /* We should not see duplicated writes for the same block */ + Assert(bufa->blkno != bufb->blkno); + if (bufa->blkno > bufb->blkno) + return 1; + else + return -1; +} + +/* + * Finish all the pending writes. 
+ */ +static void +smgr_bulk_flush(BulkWriteState *bulkstate) +{ + int npending = bulkstate->npending; + PendingWrite *pending_writes = bulkstate->pending_writes; + BlockNumber nblocks; + + if (npending == 0) + return; + + if (npending > 1) + qsort(pending_writes, npending, sizeof(PendingWrite), buffer_cmp); + + nblocks = bulkstate->pending_writes[npending - 1].blkno + 1; + + /* + * Before we alloc buffers from buffer pool for those pages, extend the + * underlying file first. + */ + if (nblocks > bulkstate->pages_written) + { + smgrzeroextend(bulkstate->smgr, bulkstate->forknum, bulkstate->pages_written, + nblocks - bulkstate->pages_written, true); + bulkstate->pages_written = nblocks; + } + + for (int i = 0; i < npending;) + { + int nbatch = 0; + BlockNumber blknos[XLR_MAX_BLOCK_ID]; + Page pages[XLR_MAX_BLOCK_ID]; + Buffer buffers[XLR_MAX_BLOCK_ID]; + bool page_std = true; + + /* + * Accumulate XLR_MAX_BLOCK_ID pages at most per round. For + * log_newpages takes those count of pages into one record. Also to + * reduce the usage of LWLock to avoid "too many LWLocks taken" ERROR. + */ + do + { + BlockNumber blkno = pending_writes[i].blkno; + Page cached_page = pending_writes[i].buf->data; + Page page; + Buffer buffer; + + buffer = polar_read_buffer_common(bulkstate->smgr, bulkstate->relpersistence, + bulkstate->forknum, blkno, RBM_ZERO_AND_LOCK, NULL); + page = BufferGetPage(buffer); + + memcpy(page, cached_page, BLCKSZ); + pfree(cached_page); + + MarkBufferDirty(buffer); + + /* + * If any of the pages use !page_std, we log them all as such. + * That's a bit wasteful, but in practice, a mix of standard and + * non-standard page layout is rare. None of the built-in AMs do + * that. + */ + if (!pending_writes[i].page_std) + page_std = false; + + blknos[nbatch] = blkno; + pages[nbatch] = page; + buffers[nbatch] = buffer; + + i++; + nbatch++; + } while (i < npending && nbatch < XLR_MAX_BLOCK_ID); + + /* + * log_newpages takes pages from buffer pool, it will do PageSetLSN + * for those pages. After the logging stuff, we can mark dirty and + * release those buffers. + */ + if (bulkstate->use_wal) + log_newpages(&bulkstate->smgr->smgr_rnode.node, bulkstate->forknum, + nbatch, blknos, pages, page_std); + + for (int j = 0; j < nbatch; j++) + UnlockReleaseBuffer(buffers[j]); + } + + bulkstate->npending = 0; +} + +/* + * Queue write of 'buf'. + * + * NB: this takes ownership of 'buf'! + * + * You are only allowed to write a given block once as part of one bulk write + * operation. + */ +void +smgr_bulk_write(BulkWriteState *bulkstate, BlockNumber blocknum, BulkWriteBuffer buf, bool page_std) +{ + PendingWrite *w; + + w = &bulkstate->pending_writes[bulkstate->npending++]; + w->buf = buf; + w->blkno = blocknum; + w->page_std = page_std; + + if (bulkstate->npending >= polar_bulk_write_maxpages) + smgr_bulk_flush(bulkstate); +} + +/* + * Allocate a new buffer which can later be written with smgr_bulk_write(). + * + * There is no function to free the buffer. When you pass it to + * smgr_bulk_write(), it takes ownership and frees it when it's no longer + * needed. + * + * This is currently implemented as a simple palloc, but could be implemented + * using a ring buffer or larger chunks in the future, so don't rely on it. 
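
/*
 * Illustration (sketch, not part of the patch): how a caller is expected to
 * drive the API defined in this file -- smgr_bulk_start_rel(),
 * smgr_bulk_get_buf() (defined just below), smgr_bulk_write() and
 * smgr_bulk_finish(). The helper and its page contents are hypothetical; the
 * real callers are the access methods converted elsewhere in this patch.
 */
#include "postgres.h"

#include "storage/bufpage.h"
#include "storage/bulk_write.h"
#include "utils/rel.h"

static void
demo_bulk_build(Relation rel, BlockNumber npages)
{
	BulkWriteState *bulkstate = smgr_bulk_start_rel(rel, MAIN_FORKNUM);

	for (BlockNumber blkno = 0; blkno < npages; blkno++)
	{
		BulkWriteBuffer buf = smgr_bulk_get_buf(bulkstate);

		/* fill the page; smgr_bulk_write() takes ownership of buf */
		PageInit((Page) buf->data, BLCKSZ, 0);
		smgr_bulk_write(bulkstate, blkno, buf, true);
	}

	/* WAL-log and write out anything still queued */
	smgr_bulk_finish(bulkstate);
}
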
+ */ +BulkWriteBuffer +smgr_bulk_get_buf(BulkWriteState *bulkstate) +{ + return MemoryContextAllocAligned(bulkstate->memcxt, BLCKSZ, PG_IO_ALIGN_SIZE, 0); +} diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c index c986868dd81..dc4c0e6f1aa 100644 --- a/src/backend/storage/smgr/md.c +++ b/src/backend/storage/smgr/md.c @@ -45,6 +45,7 @@ /* POLAR */ #include "access/polar_logindex_redo.h" #include "storage/polar_fd.h" +#include "utils/guc.h" /* * The magnetic disk storage manager keeps track of open file @@ -125,6 +126,17 @@ static MemoryContext MdCxt; /* context for all MdfdVec objects */ #define EXTENSION_DONT_OPEN (1 << 5) +/* POLAR: GUCs */ +int polar_zero_extend_method = POLAR_ZERO_EXTEND_BULKWRITE; + +const struct config_enum_entry polar_zero_extend_method_options[] = { + {"none", POLAR_ZERO_EXTEND_NONE, false}, + {"bulkwrite", POLAR_ZERO_EXTEND_BULKWRITE, false}, + {"fallocate", POLAR_ZERO_EXTEND_FALLOCATE, false}, + {NULL, 0, false} +}; + + /* local routines */ static void mdunlinkfork(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo); @@ -457,13 +469,13 @@ mdunlinkfork(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo) */ void mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, - char *buffer, bool skipFsync) + const void *buffer, bool skipFsync) { off_t seekpos; int nbytes; MdfdVec *v; - AssertPointerAlignment(buffer, POLAR_BUFFER_ALIGN_LEN); + AssertPointerAlignment(buffer, PG_IO_ALIGN_SIZE); /* POLAR: bulk extend */ if (reln->polar_flag_for_bulk_extend[forknum]) @@ -492,13 +504,29 @@ mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, relpath(reln->smgr_rnode, forknum), InvalidBlockNumber))); + TRACE_POSTGRESQL_SMGR_MD_EXTEND_START(forknum, blocknum, + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + reln->smgr_rnode.backend); + v = _mdfd_getseg(reln, forknum, blocknum, skipFsync, EXTENSION_CREATE); seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE); - if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_EXTEND)) != BLCKSZ) + nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_EXTEND); + + TRACE_POSTGRESQL_SMGR_MD_EXTEND_DONE(forknum, blocknum, + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + reln->smgr_rnode.backend, + nbytes, + BLCKSZ); + + if (nbytes != BLCKSZ) { if (nbytes < 0) ereport(ERROR, @@ -521,6 +549,262 @@ mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE)); } +/* + * mdzeroextend() -- Add new zeroed out blocks to the specified relation. + * + * Similar to mdextend(), except the relation can be extended by multiple + * blocks at once and the added blocks will be filled with zeroes. + */ +void +mdzeroextend(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, int nblocks, bool skipFsync) +{ + MdfdVec *v; + BlockNumber curblocknum = blocknum; + int remblocks = nblocks; + + Assert(nblocks > 0); + + /* POLAR: bulk extend */ + if (reln->polar_flag_for_bulk_extend[forknum]) + { + Assert(reln->polar_nblocks_faked_for_bulk_extend[forknum] == blocknum); + reln->polar_nblocks_faked_for_bulk_extend[forknum] += nblocks; + return; + } + /* POLAR end */ + + /* This assert is too expensive to have on normally ... 
*/ +#ifdef CHECK_WRITE_VS_EXTEND + Assert(blocknum >= mdnblocks(reln, forknum)); +#endif + + /* + * If a relation manages to grow to 2^32-1 blocks, refuse to extend it any + * more --- we mustn't create a block whose number actually is + * InvalidBlockNumber or larger. + */ + if ((uint64) blocknum + nblocks >= (uint64) InvalidBlockNumber) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("cannot extend file \"%s\" beyond %u blocks", + relpath(reln->smgr_rnode, forknum), + InvalidBlockNumber))); + + while (remblocks > 0) + { + BlockNumber segstartblock = curblocknum % ((BlockNumber) RELSEG_SIZE); + off_t seekpos = (off_t) BLCKSZ * segstartblock; + int numblocks; + + if (segstartblock + remblocks > RELSEG_SIZE) + numblocks = RELSEG_SIZE - segstartblock; + else + numblocks = remblocks; + + TRACE_POSTGRESQL_SMGR_MD_ZEROEXTEND_START(forknum, blocknum, + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + reln->smgr_rnode.backend); + + v = _mdfd_getseg(reln, forknum, curblocknum, skipFsync, EXTENSION_CREATE); + + Assert(segstartblock < RELSEG_SIZE); + Assert(segstartblock + numblocks <= RELSEG_SIZE); + + /* + * If available and useful, use posix_fallocate() (via + * FileFallocate()) to extend the relation. That's often more + * efficient than using write(), as it commonly won't cause the kernel + * to allocate page cache space for the extended pages. + * + * However, we don't use FileFallocate() for small extensions, as it + * defeats delayed allocation on some filesystems. Not clear where + * that decision should be made though? For now just use a cutoff of + * 8, anything between 4 and 8 worked OK in some local testing. + */ + if (polar_zero_extend_method == POLAR_ZERO_EXTEND_FALLOCATE) + { + int ret; + + ret = FileFallocate(v->mdfd_vfd, + seekpos, (off_t) BLCKSZ * numblocks, + WAIT_EVENT_DATA_FILE_EXTEND); + if (ret != 0) + { + ereport(ERROR, + errcode_for_file_access(), + errmsg("could not extend file \"%s\" with FileFallocate(): %m", + FilePathName(v->mdfd_vfd)), + errhint("Check free disk space.")); + } + } + else if (polar_zero_extend_method == POLAR_ZERO_EXTEND_BULKWRITE) + { + int ret; + + /* + * Even if we don't want to use fallocate, we can still extend a + * bit more efficiently than writing each 8kB block individually. + * polar_pwrite_zeros() (via FileZero()) uses bulk zero buffers to + * avoid multiple writes or needing a zeroed buffer for the whole + * length of the extension. + */ + ret = FileZero(v->mdfd_vfd, + seekpos, (off_t) BLCKSZ * numblocks, + WAIT_EVENT_DATA_FILE_EXTEND, true); + if (ret < 0) + ereport(ERROR, + errcode_for_file_access(), + errmsg("could not extend file \"%s\": %m", + FilePathName(v->mdfd_vfd)), + errhint("Check free disk space.")); + } + else if (polar_zero_extend_method == POLAR_ZERO_EXTEND_NONE) + { + int ret; + + /* + * Even if we don't want to use fallocate, we can still extend a + * bit more efficiently than writing each 8kB block individually. + * pg_pwrite_zeroes() (via FileZero()) uses + * pg_pwritev_with_retry() to avoid multiple writes or needing a + * zeroed buffer for the whole length of the extension. 
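
/*
 * Illustration (standalone sketch, not part of the patch): the
 * while (remblocks > 0) loop above splits one logical extension at segment
 * boundaries. With the default RELSEG_SIZE of 131072 blocks (1 GB at
 * BLCKSZ = 8192), zero-extending 10 blocks starting at block 131070 becomes
 * 2 blocks at the tail of segment 0 plus 8 blocks at the head of segment 1.
 */
#include <stdio.h>

#define DEMO_RELSEG_SIZE 131072	/* blocks per segment file (assumed default) */

static void
demo_segment_split(unsigned int curblock, int remblocks)
{
	while (remblocks > 0)
	{
		unsigned int segstart = curblock % DEMO_RELSEG_SIZE;
		int			nthisseg = remblocks;

		if (segstart + (unsigned int) remblocks > DEMO_RELSEG_SIZE)
			nthisseg = (int) (DEMO_RELSEG_SIZE - segstart);

		printf("segment %u: %d block(s) starting at in-segment block %u\n",
			   curblock / DEMO_RELSEG_SIZE, nthisseg, segstart);

		remblocks -= nthisseg;
		curblock += (unsigned int) nthisseg;
	}
}
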
+ */ + ret = FileZero(v->mdfd_vfd, + seekpos, (off_t) BLCKSZ * numblocks, + WAIT_EVENT_DATA_FILE_EXTEND, false); + if (ret < 0) + ereport(ERROR, + errcode_for_file_access(), + errmsg("could not extend file \"%s\": %m", + FilePathName(v->mdfd_vfd)), + errhint("Check free disk space.")); + } + else + elog(ERROR, "Invalid polar_zero_extend_method %d", polar_zero_extend_method); + + TRACE_POSTGRESQL_SMGR_MD_ZEROEXTEND_DONE(forknum, blocknum, + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + reln->smgr_rnode.backend, + BLCKSZ * numblocks, + BLCKSZ * numblocks); + + if (!skipFsync && !SmgrIsTemp(reln)) + register_dirty_segment(reln, forknum, v); + + Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE)); + + remblocks -= numblocks; + curblocknum += numblocks; + } +} + +/* + * polar_mdbulkextend() -- Add blocks to the specified relation. + * + * The semantics are nearly the same as mdwrite(): write at the + * specified position. However, this is to be used for the case of + * extending a relation (i.e., blocknum is at or beyond the current + * EOF). Note that we assume writing a block beyond current EOF + * causes intervening file space to become filled with zeroes. + */ +void +polar_mdbulkextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + int nblocks, const void *buffer, bool skipFsync) +{ + MdfdVec *v; + BlockNumber curblocknum = blocknum; + int remblocks = nblocks; + + AssertPointerAlignment(buffer, PG_IO_ALIGN_SIZE); + + Assert(nblocks > 0); + + /* This assert is too expensive to have on normally ... */ +#ifdef CHECK_WRITE_VS_EXTEND + Assert(blocknum >= mdnblocks(reln, forknum)); +#endif + + /* + * If a relation manages to grow to 2^32-1 blocks, refuse to extend it any + * more --- we mustn't create a block whose number actually is + * InvalidBlockNumber or larger. 
+ */ + if ((uint64) blocknum + nblocks >= (uint64) InvalidBlockNumber) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("cannot extend file \"%s\" beyond %u blocks", + relpath(reln->smgr_rnode, forknum), + InvalidBlockNumber))); + + while (remblocks > 0) + { + BlockNumber segstartblock = curblocknum % ((BlockNumber) RELSEG_SIZE); + off_t seekpos = (off_t) BLCKSZ * segstartblock; + int numblocks; + int nbytes; + int amount; + + if (segstartblock + remblocks > RELSEG_SIZE) + numblocks = RELSEG_SIZE - segstartblock; + else + numblocks = remblocks; + + amount = BLCKSZ * numblocks; + + TRACE_POSTGRESQL_SMGR_MD_EXTEND_START(forknum, blocknum, + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + reln->smgr_rnode.backend); + + v = _mdfd_getseg(reln, forknum, curblocknum, skipFsync, EXTENSION_CREATE); + + seekpos = (off_t) BLCKSZ * segstartblock; + + Assert(segstartblock < RELSEG_SIZE); + Assert(segstartblock + numblocks <= RELSEG_SIZE); + + nbytes = FileWrite(v->mdfd_vfd, buffer, amount, seekpos, WAIT_EVENT_DATA_FILE_EXTEND); + + TRACE_POSTGRESQL_SMGR_MD_EXTEND_DONE(forknum, blocknum, + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + reln->smgr_rnode.backend, + nbytes, + amount); + + if (nbytes != amount) + { + if (nbytes < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not extend file \"%s\": %m", + FilePathName(v->mdfd_vfd)), + errhint("Check free disk space."))); + /* short write: complain appropriately */ + ereport(ERROR, + (errcode(ERRCODE_DISK_FULL), + errmsg("could not extend file \"%s\": wrote only %d of %d bytes at block %u", + FilePathName(v->mdfd_vfd), + nbytes, amount, blocknum), + errhint("Check free disk space."))); + } + + if (!skipFsync && !SmgrIsTemp(reln)) + register_dirty_segment(reln, forknum, v); + + remblocks -= numblocks; + curblocknum += numblocks; + buffer = (char *) buffer + amount; + } +} + /* * mdopenfork() -- Open one fork of the specified relation. * @@ -690,13 +974,13 @@ mdwriteback(SMgrRelation reln, ForkNumber forknum, */ void mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, - char *buffer) + void *buffer) { off_t seekpos; int nbytes; MdfdVec *v; - AssertPointerAlignment(buffer, POLAR_BUFFER_ALIGN_LEN); + AssertPointerAlignment(buffer, PG_IO_ALIGN_SIZE); TRACE_POSTGRESQL_SMGR_MD_READ_START(forknum, blocknum, reln->smgr_rnode.node.spcNode, @@ -752,6 +1036,96 @@ mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, } } +/* + * polar_mdbulkread() -- Read the specified continuous blocks from a relation. 
+ */ +void +polar_mdbulkread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + int nblocks, void *buffer) +{ + MdfdVec *v; + BlockNumber curblocknum = blocknum; + int remblocks = nblocks; + + AssertPointerAlignment(buffer, PG_IO_ALIGN_SIZE); + + while (remblocks > 0) + { + BlockNumber segstartblock = curblocknum % ((BlockNumber) RELSEG_SIZE); + off_t seekpos = (off_t) BLCKSZ * segstartblock; + int numblocks; + int nbytes; + int amount; + + if (segstartblock + remblocks > RELSEG_SIZE) + numblocks = RELSEG_SIZE - segstartblock; + else + numblocks = remblocks; + + amount = BLCKSZ * numblocks; + + TRACE_POSTGRESQL_SMGR_MD_READ_START(forknum, curblocknum, + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + reln->smgr_rnode.backend); + + v = _mdfd_getseg(reln, forknum, curblocknum, false, + EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY); + + seekpos = (off_t) BLCKSZ * segstartblock; + + Assert(segstartblock < RELSEG_SIZE); + Assert(segstartblock + numblocks <= RELSEG_SIZE); + + nbytes = FileRead(v->mdfd_vfd, buffer, amount, seekpos, WAIT_EVENT_DATA_FILE_READ); + + TRACE_POSTGRESQL_SMGR_MD_READ_DONE(forknum, curblocknum, + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + reln->smgr_rnode.backend, + nbytes, + amount); + + if (nbytes != amount) + { + if (nbytes < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read block %u in file \"%s\": %m", + blocknum, FilePathName(v->mdfd_vfd)))); + + /* + * Short read: we are at or past EOF, or we read a partial block + * at EOF. Normally this is an error; upper levels should never + * try to read a nonexistent block. However, if + * zero_damaged_pages is ON or we are InRecovery, we should + * instead return zeroes without complaining. This allows, for + * example, the case of trying to update a block that was later + * truncated away. + */ + if (zero_damaged_pages || InRecovery) + { + /* only zero damaged_pages */ + int damaged_pages_start_offset = nbytes - nbytes % BLCKSZ; + + MemSet((char *) buffer + damaged_pages_start_offset, 0, amount - damaged_pages_start_offset); + } + else + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("could not bulk read block %u in file \"%s\": read only %d of %d bytes", + blocknum, FilePathName(v->mdfd_vfd), + nbytes, amount))); + } + + remblocks -= numblocks; + curblocknum += numblocks; + buffer = (char *) buffer + amount; + } +} + /* * mdwrite() -- Write the supplied block at the appropriate location. * @@ -761,13 +1135,13 @@ mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, */ void mdwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, - char *buffer, bool skipFsync) + const void *buffer, bool skipFsync) { off_t seekpos; int nbytes; MdfdVec *v; - AssertPointerAlignment(buffer, POLAR_BUFFER_ALIGN_LEN); + AssertPointerAlignment(buffer, PG_IO_ALIGN_SIZE); /* This assert is too expensive to have on normally ... */ #ifdef CHECK_WRITE_VS_EXTEND @@ -818,6 +1192,93 @@ mdwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, register_dirty_segment(reln, forknum, v); } +/* + * polar_mdbulkwrite() -- Write the supplied continuous blocks at the appropriate location. + * + * This is to be used only for updating already-existing blocks of a + * relation (ie, those before the current EOF). To extend a relation, + * use mdextend(). 
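
/*
 * Illustration (standalone sketch, not part of the patch): the short-read
 * handling in polar_mdbulkread() above keeps every fully-read block and zeroes
 * only the trailing, partially-read part of the bulk buffer. For example,
 * requesting 3 blocks (24576 bytes at BLCKSZ = 8192) but reading back 20000
 * bytes keeps blocks 0 and 1 and zeroes the buffer from offset 16384 onward.
 */
#include <string.h>

#define DEMO_BLCKSZ 8192		/* stand-in for BLCKSZ */

static void
demo_zero_damaged_tail(char *buffer, int amount, int nbytes)
{
	/* first byte of the first block that was not read in full */
	int			start = nbytes - nbytes % DEMO_BLCKSZ;

	memset(buffer + start, 0, amount - start);
}
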
+ */ +void +polar_mdbulkwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + int nblocks, const void *buffer, bool skipFsync) +{ + MdfdVec *v; + BlockNumber curblocknum = blocknum; + int remblocks = nblocks; + + AssertPointerAlignment(buffer, PG_IO_ALIGN_SIZE); + + /* This assert is too expensive to have on normally ... */ +#ifdef CHECK_WRITE_VS_EXTEND + Assert(blocknum < mdnblocks(reln, forknum)); +#endif + + while (remblocks > 0) + { + BlockNumber segstartblock = curblocknum % ((BlockNumber) RELSEG_SIZE); + off_t seekpos = (off_t) BLCKSZ * segstartblock; + int numblocks; + int nbytes; + int amount; + + if (segstartblock + remblocks > RELSEG_SIZE) + numblocks = RELSEG_SIZE - segstartblock; + else + numblocks = remblocks; + + amount = BLCKSZ * numblocks; + + TRACE_POSTGRESQL_SMGR_MD_WRITE_START(forknum, curblocknum, + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + reln->smgr_rnode.backend); + + v = _mdfd_getseg(reln, forknum, curblocknum, skipFsync, + EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY); + + seekpos = (off_t) BLCKSZ * segstartblock; + + Assert(segstartblock < RELSEG_SIZE); + Assert(segstartblock + numblocks <= RELSEG_SIZE); + + nbytes = FileWrite(v->mdfd_vfd, buffer, amount, seekpos, WAIT_EVENT_DATA_FILE_WRITE); + + TRACE_POSTGRESQL_SMGR_MD_WRITE_DONE(forknum, curblocknum, + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + reln->smgr_rnode.backend, + nbytes, + amount); + + if (nbytes != amount) + { + if (nbytes < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write block %u in file \"%s\": %m", + blocknum, FilePathName(v->mdfd_vfd)))); + /* short write: complain appropriately */ + ereport(ERROR, + (errcode(ERRCODE_DISK_FULL), + errmsg("could not write block %u in file \"%s\": wrote only %d of %d bytes", + blocknum, + FilePathName(v->mdfd_vfd), + nbytes, BLCKSZ), + errhint("Check free disk space."))); + } + + if (!skipFsync && !SmgrIsTemp(reln)) + register_dirty_segment(reln, forknum, v); + + remblocks -= numblocks; + curblocknum += numblocks; + buffer = (char *) buffer + amount; + } +} + /* * mdnblocks() -- Get the number of blocks stored in a relation. * @@ -979,6 +1440,49 @@ mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) } } +/* + * mdregistersync() -- Mark whole relation as needing fsync + */ +void +mdregistersync(SMgrRelation reln, ForkNumber forknum) +{ + int segno; + int min_inactive_seg; + + /* + * NOTE: mdnblocks makes sure we have opened all active segments, so that + * the loop below will get them all! + */ + mdnblocks(reln, forknum); + + min_inactive_seg = segno = reln->md_num_open_segs[forknum]; + + /* + * Temporarily open inactive segments, then close them after sync. There + * may be some inactive segments left opened after error, but that is + * harmless. We don't bother to clean them up and take a risk of further + * trouble. The next mdclose() will soon close them. + */ + while (_mdfd_openseg(reln, forknum, segno, 0) != NULL) + segno++; + + while (segno > 0) + { + MdfdVec *v = &reln->md_seg_fds[forknum][segno - 1]; + + register_dirty_segment(reln, forknum, v); + + /* Close inactive segments immediately */ + if (segno > min_inactive_seg) + { + FileClose(v->mdfd_vfd); + _fdvec_resize(reln, forknum, segno - 1); + } + + segno--; + } +} + /* * mdimmedsync() -- Immediately sync a relation to stable storage. 
* @@ -998,7 +1502,7 @@ mdimmedsync(SMgrRelation reln, ForkNumber forknum) /* * NOTE: mdnblocks makes sure we have opened all active segments, so that - * fsync loop will get them all! + * the loop below will get them all! */ mdnblocks(reln, forknum); @@ -1328,7 +1832,7 @@ _mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, */ if (nblocks < ((BlockNumber) RELSEG_SIZE)) { - char *zerobuf = palloc_io_aligned(BLCKSZ, MCXT_ALLOC_ZERO); + char *zerobuf = palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, MCXT_ALLOC_ZERO); mdextend(reln, forknum, nextsegno * ((BlockNumber) RELSEG_SIZE) - 1, @@ -1484,141 +1988,3 @@ mdfiletagmatches(const FileTag *ftag, const FileTag *candidate) */ return ftag->rnode.dbNode == candidate->rnode.dbNode; } - -/* - * polar_mdbulkextend() -- Add a block to the specified relation. - * - * The semantics are nearly the same as mdwrite(): write at the - * specified position. However, this is to be used for the case of - * extending a relation (i.e., blocknum is at or beyond the current - * EOF). Note that we assume writing a block beyond current EOF - * causes intervening file space to become filled with zeroes. - */ -void -polar_mdbulkextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, - int blockCount, char *buffer, bool skipFsync) -{ - off_t seekpos; - int nbytes; - MdfdVec *v; - uint64 newblocknum = blocknum + blockCount; - - AssertPointerAlignment(buffer, POLAR_BUFFER_ALIGN_LEN); - - /* This assert is too expensive to have on normally ... */ -#ifdef CHECK_WRITE_VS_EXTEND - Assert(blocknum >= mdnblocks(reln, forknum)); -#endif - - /* - * If a relation manages to grow to 2^32-1 blocks, refuse to extend it any - * more --- we mustn't create a block whose number actually is - * InvalidBlockNumber. - */ - if (newblocknum >= InvalidBlockNumber) - ereport(ERROR, - (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), - errmsg("cannot extend file \"%s\" beyond %u blocks", - relpath(reln->smgr_rnode, forknum), - InvalidBlockNumber))); - - v = _mdfd_getseg(reln, forknum, blocknum, skipFsync, EXTENSION_CREATE); - - seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); - - Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE); - - if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ * blockCount, seekpos, WAIT_EVENT_DATA_FILE_EXTEND)) != BLCKSZ * blockCount) - { - if (nbytes < 0) - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not extend file \"%s\": %m", - FilePathName(v->mdfd_vfd)), - errhint("Check free disk space."))); - /* short write: complain appropriately */ - ereport(ERROR, - (errcode(ERRCODE_DISK_FULL), - errmsg("could not extend file \"%s\": wrote only %d of %d bytes at block %u", - FilePathName(v->mdfd_vfd), - nbytes, BLCKSZ * blockCount, blocknum), - errhint("Check free disk space."))); - } - - if (!skipFsync && !SmgrIsTemp(reln)) - register_dirty_segment(reln, forknum, v); -} - -/* - * POLAR: bulk read - * - * polar_mdbulkread() -- Read the specified continuous blocks from a relation. - * - * Caller must ensure that the blockcount does not exceed the length of the relation file. 
- */ -void -polar_mdbulkread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, - int blockCount, char *buffer) -{ - off_t seekpos; - int nbytes; - MdfdVec *v; - int amount = blockCount * BLCKSZ; - - AssertPointerAlignment(buffer, POLAR_BUFFER_ALIGN_LEN); - - TRACE_POSTGRESQL_SMGR_MD_READ_START(forknum, blocknum, - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, - reln->smgr_rnode.backend); - - v = _mdfd_getseg(reln, forknum, blocknum, false, - EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY); - - seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); - - Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE); - Assert(seekpos + (off_t) amount <= (off_t) BLCKSZ * RELSEG_SIZE); - - nbytes = FileRead(v->mdfd_vfd, buffer, amount, seekpos, WAIT_EVENT_DATA_FILE_READ); - - TRACE_POSTGRESQL_SMGR_MD_READ_DONE(forknum, blocknum, - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, - reln->smgr_rnode.backend, - nbytes, - amount); - - if (nbytes != amount) - { - if (nbytes < 0) - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not read block %u in file \"%s\": %m", - blocknum, FilePathName(v->mdfd_vfd)))); - - /* - * Short read: we are at or past EOF, or we read a partial block at - * EOF. Normally this is an error; upper levels should never try to - * read a nonexistent block. However, if zero_damaged_pages is ON or - * we are InRecovery, we should instead return zeroes without - * complaining. This allows, for example, the case of trying to - * update a block that was later truncated away. - */ - if (zero_damaged_pages || InRecovery || POLAR_IN_LOGINDEX_PARALLEL_REPLAY()) - { - /* only zero damaged_pages */ - int damaged_pages_start_offset = nbytes - nbytes % BLCKSZ; - - MemSet((char *) buffer + damaged_pages_start_offset, 0, amount - damaged_pages_start_offset); - } - else - ereport(ERROR, - (errcode(ERRCODE_DATA_CORRUPTED), - errmsg("could not bulk read block %u in file \"%s\": read only %d of %d bytes", - blocknum, FilePathName(v->mdfd_vfd), - nbytes, amount))); - } -} diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c index bdf159e75dc..d0b69d3238c 100644 --- a/src/backend/storage/smgr/smgr.c +++ b/src/backend/storage/smgr/smgr.c @@ -6,6 +6,7 @@ * All file system operations in POSTGRES dispatch through these * routines. 
* + * Portions Copyright (c) 2024, Alibaba Group Holding Limited * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -53,24 +54,31 @@ typedef struct f_smgr void (*smgr_unlink) (RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo); void (*smgr_extend) (SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, char *buffer, bool skipFsync); + BlockNumber blocknum, const void *buffer, bool skipFsync); + void (*smgr_zeroextend) (SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, int nblocks, bool skipFsync); bool (*smgr_prefetch) (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum); void (*smgr_read) (SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, char *buffer); + BlockNumber blocknum, void *buffer); void (*smgr_write) (SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, char *buffer, bool skipFsync); + BlockNumber blocknum, const void *buffer, bool skipFsync); void (*smgr_writeback) (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber nblocks); BlockNumber (*smgr_nblocks) (SMgrRelation reln, ForkNumber forknum); void (*smgr_truncate) (SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks); void (*smgr_immedsync) (SMgrRelation reln, ForkNumber forknum); - /* POLAR: bulk io */ - void (*polar_smgr_bulkextend) (SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, int blockCount, char *buffer, bool skipFsync); + void (*smgr_registersync) (SMgrRelation reln, ForkNumber forknum); + /* POLAR: bulk read */ void (*polar_smgr_bulkread) (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, - int blockCount, char *buffer); + int nblocks, void *buffer); + /* POLAR: bulk write */ + void (*polar_smgr_bulkwrite) (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + int nblocks, const void *buffer, bool skipFsync); + /* POLAR: bulk extend */ + void (*polar_smgr_bulkextend) (SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, int nblocks, const void *buffer, bool skipFsync); /* POLAR end */ } f_smgr; @@ -85,6 +93,7 @@ static const f_smgr smgrsw[] = { .smgr_exists = mdexists, .smgr_unlink = mdunlink, .smgr_extend = mdextend, + .smgr_zeroextend = mdzeroextend, .smgr_prefetch = mdprefetch, .smgr_read = mdread, .smgr_write = mdwrite, @@ -92,9 +101,13 @@ static const f_smgr smgrsw[] = { .smgr_nblocks = mdnblocks, .smgr_truncate = mdtruncate, .smgr_immedsync = mdimmedsync, - /* POLAR: extend io */ - .polar_smgr_bulkextend = polar_mdbulkextend, + .smgr_registersync = mdregistersync, + /* POLAR: bulk read */ .polar_smgr_bulkread = polar_mdbulkread, + /* POLAR: bulk write */ + .polar_smgr_bulkwrite = polar_mdbulkwrite, + /* POLAR: extend batch */ + .polar_smgr_bulkextend = polar_mdbulkextend, /* POLAR end */ } }; @@ -544,7 +557,7 @@ smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo) */ void smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, - char *buffer, bool skipFsync) + const void *buffer, bool skipFsync) { smgrsw[reln->smgr_which].smgr_extend(reln, forknum, blocknum, buffer, skipFsync); @@ -573,23 +586,27 @@ smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, /* POLAR end */ } -/* POLAR: bulk extend */ +/* + * smgrzeroextend() -- Add new zeroed out blocks to a file. + * + * Similar to smgrextend(), except the relation can be extended by + * multiple blocks at once and the added blocks will be filled with + * zeroes. 
+ */ void -polar_smgrbulkextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, - int blockCount, char *buffer, bool skipFsync) +smgrzeroextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + int nblocks, bool skipFsync) { - Assert(blockCount >= 1); - Assert(!reln->polar_flag_for_bulk_extend[forknum]); - - smgrsw[reln->smgr_which].polar_smgr_bulkextend(reln, forknum, blocknum, blockCount, buffer, skipFsync); + smgrsw[reln->smgr_which].smgr_zeroextend(reln, forknum, blocknum, + nblocks, skipFsync); /* - * Normally we expect this to increase nblocks by one, but if the cached - * value isn't as expected, just invalidate it so the next call asks the - * kernel. nblock should be blocknum + blockCount in bulkExtend + * Normally we expect this to increase the fork size by nblocks, but if + * the cached value isn't as expected, just invalidate it so the next call + * asks the kernel. */ if (reln->smgr_cached_nblocks[forknum] == blocknum) - reln->smgr_cached_nblocks[forknum] = blocknum + blockCount; + reln->smgr_cached_nblocks[forknum] = blocknum + nblocks; else reln->smgr_cached_nblocks[forknum] = InvalidBlockNumber; @@ -597,24 +614,41 @@ polar_smgrbulkextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum * POLAR RSC: update new blocknum into entry. */ if (POLAR_RSC_SHOULD_UPDATE(reln, forknum)) - polar_rsc_update_entry(reln, forknum, blocknum + blockCount); + polar_rsc_update_entry(reln, forknum, blocknum + nblocks); + /* POLAR end */ } /* - * POLAR: bulk read - * - * polar_smgrbulkread() -- read multi particular block from a relation into the supplied - * buffer. + * polar_smgrbulkextend() -- Add new blocks to a file. * - * This routine is called from the buffer manager in order to - * instantiate pages in the shared buffer cache. All storage managers - * return pages in the format that POSTGRES expects. + * The semantics are nearly the same as smgrwrite(): write at the + * specified position. However, this is to be used for the case of + * extending a relation (i.e., blocknum is at or beyond the current + * EOF). Note that we assume writing nblocks beyond current EOF + * causes intervening file space to become filled with zeroes. */ void -polar_smgrbulkread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, - int blockCount, char *buffer) +polar_smgrbulkextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + int nblocks, const void *buffer, bool skipFsync) { - smgrsw[reln->smgr_which].polar_smgr_bulkread(reln, forknum, blocknum, blockCount, buffer); + smgrsw[reln->smgr_which].polar_smgr_bulkextend(reln, forknum, blocknum, nblocks, buffer, skipFsync); + + /* + * Normally we expect this to increase the fork size by nblocks, but if + * the cached value isn't as expected, just invalidate it so the next call + * asks the kernel. + */ + if (reln->smgr_cached_nblocks[forknum] == blocknum) + reln->smgr_cached_nblocks[forknum] = blocknum + nblocks; + else + reln->smgr_cached_nblocks[forknum] = InvalidBlockNumber; + + /* + * POLAR RSC: update new blocknum into entry. 
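/*
 * Illustrative sketch, not part of this patch: how a caller might use the
 * new smgrzeroextend() entry point shown above to grow a fork by a batch of
 * zero-filled blocks, instead of issuing one smgrextend() per block with a
 * zeroed buffer.  The helper name and the "nblocks" parameter are invented
 * for this example.
 */
static void
extend_fork_with_zeroes(SMgrRelation reln, ForkNumber forknum, int nblocks)
{
	BlockNumber	first_block = smgrnblocks(reln, forknum);

	/* One call appends nblocks zeroed blocks at the current end of the fork. */
	smgrzeroextend(reln, forknum, first_block, nblocks, false);
}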
+ */ + if (POLAR_RSC_SHOULD_UPDATE(reln, forknum)) + polar_rsc_update_entry(reln, forknum, blocknum + nblocks); + /* POLAR end */ } /* @@ -640,11 +674,26 @@ smgrprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) */ void smgrread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, - char *buffer) + void *buffer) { smgrsw[reln->smgr_which].smgr_read(reln, forknum, blocknum, buffer); } +/* + * polar_smgrbulkread() -- read multi particular blocks from a relation into + * the supplied buffers. + * + * This routine is called from the buffer manager in order to + * instantiate pages in the shared buffer cache. All storage managers + * return pages in the format that POSTGRES expects. + */ +void +polar_smgrbulkread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + int nblocks, void *buffer) +{ + smgrsw[reln->smgr_which].polar_smgr_bulkread(reln, forknum, blocknum, nblocks, buffer); +} + /* * smgrwrite() -- Write the supplied buffer out. * @@ -662,12 +711,42 @@ smgrread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, */ void smgrwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, - char *buffer, bool skipFsync) + const void *buffer, bool skipFsync) { smgrsw[reln->smgr_which].smgr_write(reln, forknum, blocknum, buffer, skipFsync); } +/* + * polar_smgrbulkwrite() -- Write the supplied buffers out. + * + * This is to be used only for updating already-existing blocks of a + * relation (ie, those before the current EOF). To extend a relation, + * use polar_smgrbulkextend(). + * + * This is not a synchronous write -- the block is not necessarily + * on disk at return, only dumped out to the kernel. However, + * provisions will be made to fsync the write before the next checkpoint. + * + * NB: The mechanism to ensure fsync at next checkpoint assumes that there is + * something that prevents a concurrent checkpoint from "racing ahead" of the + * write. One way to prevent that is by holding a lock on the buffer; the + * buffer manager's writes are protected by that. The bulk writer facility + * in bulk_write.c checks the redo pointer and calls smgrimmedsync() if a + * checkpoint happened; that relies on the fact that no other backend can be + * concurrently modifying the page. + * + * skipFsync indicates that the caller will make other provisions to + * fsync the relation, so we needn't bother. Temporary relations also + * do not require fsync. + */ +void +polar_smgrbulkwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + int nblocks, const void *buffer, bool skipFsync) +{ + smgrsw[reln->smgr_which].polar_smgr_bulkwrite(reln, forknum, blocknum, nblocks, + buffer, skipFsync); +} /* * smgrwriteback() -- Trigger kernel writeback for the supplied range of @@ -847,6 +926,24 @@ smgrtruncate(SMgrRelation reln, ForkNumber *forknum, int nforks, BlockNumber *nb } } +/* + * smgrregistersync() -- Request a relation to be sync'd at next checkpoint + * + * This can be used after calling smgrwrite() or smgrextend() with skipFsync = + * true, to register the fsyncs that were skipped earlier. + * + * Note: be mindful that a checkpoint could already have happened between the + * smgrwrite or smgrextend calls and this! In that case, the checkpoint + * already missed fsyncing this relation, and you should use smgrimmedsync + * instead. Most callers should use the bulk loading facility in bulk_write.c + * which handles all that. 
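/*
 * Illustrative sketch, not part of this patch: the write-then-register
 * pattern that smgrregistersync() (below) is intended for.  Pages are
 * written with skipFsync = true and a single deferred sync request is
 * registered at the end.  The helper name and parameters are invented for
 * this example; as the comment below notes, most callers should instead go
 * through bulk_write.c, which also copes with an intervening checkpoint.
 */
static void
write_batch_then_register_sync(SMgrRelation reln, ForkNumber forknum,
							   BlockNumber first, int npages,
							   const PGIOAlignedBlock *pages)
{
	for (int i = 0; i < npages; i++)
		smgrwrite(reln, forknum, first + i, pages[i].data, true);

	/* One fsync request for the fork instead of one per smgrwrite() call. */
	smgrregistersync(reln, forknum);
}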
+ */ +void +smgrregistersync(SMgrRelation reln, ForkNumber forknum) +{ + smgrsw[reln->smgr_which].smgr_registersync(reln, forknum); +} + /* * smgrimmedsync() -- Force the specified relation to stable storage. * diff --git a/src/backend/utils/activity/pgstat_relation.c b/src/backend/utils/activity/pgstat_relation.c index 6dddd180afe..44541f42a76 100644 --- a/src/backend/utils/activity/pgstat_relation.c +++ b/src/backend/utils/activity/pgstat_relation.c @@ -819,9 +819,6 @@ pgstat_relation_flush_cb(PgStat_EntryRef *entry_ref, bool nowait) tabentry->polar_bulk_read_calls += lstats->t_counts.polar_t_bulk_read_calls; tabentry->polar_bulk_read_calls_IO += lstats->t_counts.polar_t_bulk_read_calls_IO; tabentry->polar_bulk_read_blocks_IO += lstats->t_counts.polar_t_bulk_read_blocks_IO; - - /* POLAR: create index bulk extend */ - tabentry->polar_bulk_create_index_extends_times += lstats->t_counts.polar_t_bulk_create_index_extends_times; /* POLAR end */ diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index de70701465e..884dee08be4 100755 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -117,13 +117,16 @@ #include "utils/xml.h" /* POLAR */ +#include "access/hio.h" #include "access/multixact.h" #include "access/polar_logindex.h" #include "access/polar_logindex_redo.h" #include "access/subtrans.h" #include "access/slru.h" #include "commands/tablecmds.h" +#include "common/file_utils.h" #include "common/username.h" +#include "storage/bulk_write.h" #include "storage/polar_fd.h" #include "storage/polar_rsc.h" #include "storage/polar_xlogbuf.h" @@ -715,6 +718,7 @@ extern const struct config_enum_entry dynamic_shared_memory_options[]; /* POLAR enum GUC options start */ extern const struct config_enum_entry polar_release_assert_level_options[]; +extern const struct config_enum_entry polar_zero_extend_method_options[]; const struct config_enum_entry polar_session_id_display_options[] = { {"proxy", POLAR_SID_DISPLAY_PROXY, false}, @@ -862,20 +866,8 @@ bool polar_enable_alloc_checkinterrupts; bool polar_enable_sync_ddl; -/* POLAR: bulk io */ -int polar_recovery_bulk_extend_size = 0; -int polar_min_bulk_extend_table_size = 0; -bool polar_enable_primary_recovery_bulk_extend = false; -int polar_bulk_extend_size = 0; -int polar_bulk_read_size = 0; -int polar_index_bulk_extend_size = 0; -int polar_index_create_bulk_extend_size = 0; - /* POLAR end */ -/* POLAR: partial write */ -bool polar_has_partial_write; - bool polar_disable_escape_inside_gbk_character; /* @@ -1074,8 +1066,8 @@ const char *const config_group_names[] = gettext_noop("PolarDB Buffer Management"), /* POLAR_PROXY */ gettext_noop("PolarDB Proxy"), - /* POLAR_BULK_READ_EXTEND */ - gettext_noop("PolarDB bulk read/extend"), + /* POLAR I/O management */ + gettext_noop("PolarDB I/O Management"), /* POLAR end */ /* DEVELOPER_OPTIONS */ @@ -1233,7 +1225,7 @@ static const unit_conversion time_unit_conversion_table[] = /******** option records follow ********/ static struct config_bool ConfigureNamesBool[] = { - /* POLAR boolean GUCs start */ + /* POLAR bool GUCs start */ { {"polar_enable_persisted_logical_slot", PGC_POSTMASTER, UNGROUPED, gettext_noop("Enable persisted logical slot on shared storage."), @@ -1664,6 +1656,17 @@ static struct config_bool ConfigureNamesBool[] = NULL, NULL, NULL }, + { + {"polar_enable_fallocate_no_hide_stale", PGC_USERSET, POLAR_IO_MANAGEMENT, + gettext_noop("Allow using FALLOC_FL_NO_HIDE_STALE during file extension."), + NULL, + POLAR_GUC_IS_CHANGABLE | POLAR_GUC_IS_INVISIBLE + }, + 
&polar_enable_fallocate_no_hide_stale, + true, + NULL, NULL, NULL + }, + /* * POLAR: enable to send SIGSTOP rather than SIGQUIT to all peers when * backend exit abnormally, this is set with -T parameter when start @@ -1724,20 +1727,6 @@ static struct config_bool ConfigureNamesBool[] = NULL, NULL, NULL }, - /* - * POLAR: bulk io - */ - { - {"polar_enable_primary_recovery_bulk_extend", PGC_SIGHUP, POLAR_BULK_READ_EXTEND, - gettext_noop("A switch to control whether to use xlog bulk extend opt during recovery on primary."), - NULL, - GUC_NO_RESET_ALL | POLAR_GUC_IS_INVISIBLE | POLAR_GUC_IS_CHANGABLE - }, - &polar_enable_primary_recovery_bulk_extend, - true, - NULL, NULL, NULL - }, - { {"polar_enable_async_lock_replay_debug", PGC_SIGHUP, REPLICATION_STANDBY, gettext_noop("Enable async lock replay debug logging."), @@ -1793,7 +1782,7 @@ static struct config_bool ConfigureNamesBool[] = NULL, NULL, NULL }, - /* POLAR boolean GUCs end */ + /* POLAR bool GUCs end */ { {"enable_seqscan", PGC_USERSET, QUERY_TUNING_METHOD, @@ -3067,8 +3056,6 @@ static struct config_bool ConfigureNamesBool[] = }, - /* POLAR bool GUCs end */ - /* End-of-list marker */ { {NULL, 0, 0, NULL, NULL}, NULL, false, NULL, NULL, NULL @@ -3078,7 +3065,7 @@ static struct config_bool ConfigureNamesBool[] = static struct config_int ConfigureNamesInt[] = { - /* POLAR integer GUCs start */ + /* POLAR int GUCs start */ { /* see max_connections */ @@ -3701,89 +3688,76 @@ static struct config_int ConfigureNamesInt[] = GUC_UNIT_BLOCKS | POLAR_GUC_IS_VISIBLE | POLAR_GUC_IS_CHANGABLE }, &polar_ring_buffer_vacuum_size, - 0, 0, (10 * 1024 * 1024 * 1024L) / BLCKSZ, + 128 * 1024 * 1024 / BLCKSZ, 0, (10 * 1024 * 1024 * 1024L) / BLCKSZ, NULL, NULL, NULL }, { - {"polar_recovery_bulk_extend_size", PGC_SIGHUP, POLAR_BULK_READ_EXTEND, - gettext_noop("Sets the size for bulk file extension while replaying xlog on standby (0 turns this feature off)."), - NULL, + {"polar_bulk_read_size", PGC_USERSET, POLAR_IO_MANAGEMENT, + gettext_noop("Size of bulk read."), + gettext_noop("0 turns this feature off."), GUC_UNIT_BLOCKS | POLAR_GUC_IS_INVISIBLE | POLAR_GUC_IS_CHANGABLE }, - &polar_recovery_bulk_extend_size, - 512, 0, 2048, + &polar_bulk_read_size, + 16, 0, MAX_BUFFERS_TO_READ_BY, NULL, NULL, NULL }, { - {"polar_min_bulk_extend_table_size", PGC_USERSET, POLAR_BULK_READ_EXTEND, - gettext_noop("Sets the minimum amount of table data for bulk extend," - "bulk extend is enabled only when the table size >= polar_min_bulk_extend_table_size."), - NULL, + {"polar_heap_bulk_extend_size", PGC_USERSET, POLAR_IO_MANAGEMENT, + gettext_noop("Size of bulk extend for heap table."), + gettext_noop("0 turns this feature off."), GUC_UNIT_BLOCKS | POLAR_GUC_IS_INVISIBLE | POLAR_GUC_IS_CHANGABLE }, - &polar_min_bulk_extend_table_size, - (8 * 1024 * 1024) / BLCKSZ, 0, INT_MAX / 2, + &polar_heap_bulk_extend_size, + 512, 0, MAX_BUFFERS_TO_EXTEND_BY, NULL, NULL, NULL }, { - {"polar_bulk_extend_size", PGC_USERSET, POLAR_BULK_READ_EXTEND, - gettext_noop("Sets the size of preallocate file, 0 (turning this feature off)."), - NULL, + {"polar_index_bulk_extend_size", PGC_USERSET, POLAR_IO_MANAGEMENT, + gettext_noop("Size of bulk extend for index table."), + gettext_noop("0 turns this feature off."), GUC_UNIT_BLOCKS | POLAR_GUC_IS_INVISIBLE | POLAR_GUC_IS_CHANGABLE }, - &polar_bulk_extend_size, - 512, 0, INT_MAX / 2, + &polar_index_bulk_extend_size, + 128, 0, MAX_BUFFERS_TO_EXTEND_BY, NULL, NULL, NULL }, { - {"polar_bulk_read_size", PGC_USERSET, POLAR_BULK_READ_EXTEND, - gettext_noop("Sets 
size of bulk read, 0 (turning this feature off). polar_bulk_read_size= 16 means 128KB."), - NULL, + {"polar_recovery_bulk_extend_size", PGC_SIGHUP, POLAR_IO_MANAGEMENT, + gettext_noop("Size for bulk extend during recovery."), + gettext_noop("0 turns this feature off."), GUC_UNIT_BLOCKS | POLAR_GUC_IS_INVISIBLE | POLAR_GUC_IS_CHANGABLE }, - &polar_bulk_read_size, - 16, 0, POLAR_MAX_BULK_IO_SIZE, - NULL, NULL, NULL - }, - - { - {"polar_xlog_page_buffers", PGC_POSTMASTER, RESOURCES_MEM, - gettext_noop("Sets the size of xlog buffer used by multi processes."), - NULL, - GUC_UNIT_MB | POLAR_GUC_IS_INVISIBLE | POLAR_GUC_IS_CHANGABLE - }, - &polar_xlog_page_buffers, - 0, 0, INT_MAX / 2, + &polar_recovery_bulk_extend_size, + 512, 0, MAX_BUFFERS_TO_EXTEND_BY, NULL, NULL, NULL }, { - {"polar_index_bulk_extend_size", PGC_SIGHUP, POLAR_BULK_READ_EXTEND, - gettext_noop("Sets the size of preallocate file for index, 0 (turning this feature off)."), + {"polar_bulk_write_maxpages", PGC_USERSET, POLAR_IO_MANAGEMENT, + gettext_noop("Max cached pages in bulk write."), NULL, GUC_UNIT_BLOCKS | POLAR_GUC_IS_INVISIBLE | POLAR_GUC_IS_CHANGABLE }, - /* POLAR: default 1MB */ - &polar_index_bulk_extend_size, - 128, 0, INT_MAX / 2, + &polar_bulk_write_maxpages, + 128, 1, 512, NULL, NULL, NULL }, { - {"polar_index_create_bulk_extend_size", PGC_SIGHUP, POLAR_BULK_READ_EXTEND, - gettext_noop("Sets the size of preallocate file for index create, 0 (turning this feature off)."), + {"polar_xlog_page_buffers", PGC_POSTMASTER, RESOURCES_MEM, + gettext_noop("Sets the size of xlog buffer used by multi processes."), NULL, - GUC_UNIT_BLOCKS | POLAR_GUC_IS_INVISIBLE | POLAR_GUC_IS_CHANGABLE + GUC_UNIT_MB | POLAR_GUC_IS_INVISIBLE | POLAR_GUC_IS_CHANGABLE }, - /* POLAR: default 4MB */ - &polar_index_create_bulk_extend_size, - 512, 0, INT_MAX / 2, + &polar_xlog_page_buffers, + 0, 0, INT_MAX / 2, NULL, NULL, NULL }, + /* POLAR int GUCs end */ { @@ -5415,16 +5389,17 @@ static struct config_int ConfigureNamesInt[] = 30000, 0, INT_MAX, NULL, NULL, NULL }, + { - {"polar_wal_init_set_size", PGC_SIGHUP, WAL_SETTINGS, - gettext_noop("Set the size of each data block written when initializing the zero wal file."), - NULL, - GUC_UNIT_BYTE | POLAR_GUC_IS_INVISIBLE | POLAR_GUC_IS_CHANGABLE + {"polar_zero_buffers", PGC_POSTMASTER, RESOURCES_MEM, + gettext_noop("Set the size of zero buffer, which is used to init zero wal and data file."), + gettext_noop("A value of -1 indicates a request for auto-tune, 0 disables it."), + GUC_UNIT_BLOCKS | POLAR_GUC_IS_INVISIBLE | POLAR_GUC_IS_CHANGABLE }, - &polar_wal_init_set_size, - POLAR_DEFAULT_XLOG_FILL_ZERO_SIZE, POLAR_MIN_XLOG_FILL_ZERO_SIZE, POLAR_MAX_XLOG_FILL_ZERO_SIZE, - NULL, NULL, NULL + &polar_zero_buffers, + -1, -1, 4096, }, + { {"polar_instance_spec_mem", PGC_SIGHUP, DEVELOPER_OPTIONS, gettext_noop("PolarDB instance specification for memory."), @@ -6765,6 +6740,17 @@ static struct config_enum ConfigureNamesEnum[] = NULL, NULL, NULL }, + { + {"polar_zero_extend_method", PGC_USERSET, POLAR_IO_MANAGEMENT, + gettext_noop("Selects the method of zero extend to use."), + NULL, + GUC_NOT_IN_SAMPLE | GUC_NO_SHOW_ALL | POLAR_GUC_IS_INVISIBLE | POLAR_GUC_IS_CHANGABLE + }, + &polar_zero_extend_method, + POLAR_ZERO_EXTEND_FALLOCATE, polar_zero_extend_method_options, + NULL, NULL, NULL + }, + /* POLAR enum GUCs end */ { @@ -7183,6 +7169,8 @@ static const char *const map_old_guc_names[] = { /* POLAR */ "smgr_shared_relations", "polar_rsc_shared_relations", /* RSC old style GUC */ "smgr_pool_sweep_times", 
"polar_rsc_pool_sweep_times", /* RSC old style GUC */ + "polar_bulk_extend_size", "polar_heap_bulk_extend_size", /* bulk extend old style + * GUC */ /* POLAR */ NULL diff --git a/src/backend/utils/mmgr/mcxt.c b/src/backend/utils/mmgr/mcxt.c index 29eeaf70f62..08f52a2aa79 100644 --- a/src/backend/utils/mmgr/mcxt.c +++ b/src/backend/utils/mmgr/mcxt.c @@ -1434,25 +1434,12 @@ MemoryContextAllocAligned(MemoryContext context, return aligned; } -void * -MemoryContextAllocIOAligned(MemoryContext context, Size size, int flags) -{ - /* FIXME: don't hardcode page size */ - return MemoryContextAllocAligned(context, size, POLAR_BUFFER_ALIGN_LEN, flags); -} - void * palloc_aligned(Size size, Size alignto, int flags) { return MemoryContextAllocAligned(CurrentMemoryContext, size, alignto, flags); } -void * -palloc_io_aligned(Size size, int flags) -{ - return MemoryContextAllocIOAligned(CurrentMemoryContext, size, flags); -} - /* * POLAR: Like palloc, * but allow to work well in the critical section temporarily. diff --git a/src/backend/utils/probes.d b/src/backend/utils/probes.d index 3ebbcf88ebe..b4be30ec717 100644 --- a/src/backend/utils/probes.d +++ b/src/backend/utils/probes.d @@ -86,6 +86,10 @@ provider postgresql { probe smgr__md__read__done(ForkNumber, BlockNumber, Oid, Oid, Oid, int, int, int); probe smgr__md__write__start(ForkNumber, BlockNumber, Oid, Oid, Oid, int); probe smgr__md__write__done(ForkNumber, BlockNumber, Oid, Oid, Oid, int, int, int); + probe smgr__md__extend__start(ForkNumber, BlockNumber, Oid, Oid, Oid, int); + probe smgr__md__extend__done(ForkNumber, BlockNumber, Oid, Oid, Oid, int, int, int); + probe smgr__md__zeroextend__start(ForkNumber, BlockNumber, Oid, Oid, Oid, int); + probe smgr__md__zeroextend__done(ForkNumber, BlockNumber, Oid, Oid, Oid, int, int, int); probe wal__insert(unsigned char, unsigned char); probe wal__switch(); diff --git a/src/backend/utils/resowner/resowner.c b/src/backend/utils/resowner/resowner.c index 559401dda9d..cde8d3b8c91 100644 --- a/src/backend/utils/resowner/resowner.c +++ b/src/backend/utils/resowner/resowner.c @@ -121,6 +121,7 @@ typedef struct ResourceOwnerData /* We have built-in support for remembering: */ ResourceArray bufferarr; /* owned buffers */ + ResourceArray bufferioarr; /* in-progress buffer IO */ ResourceArray catrefarr; /* catcache references */ ResourceArray catlistrefarr; /* catcache-list pins */ ResourceArray relrefarr; /* relcache references */ @@ -441,6 +442,7 @@ ResourceOwnerCreate(ResourceOwner parent, const char *name) } ResourceArrayInit(&(owner->bufferarr), BufferGetDatum(InvalidBuffer)); + ResourceArrayInit(&(owner->bufferioarr), BufferGetDatum(InvalidBuffer)); ResourceArrayInit(&(owner->catrefarr), PointerGetDatum(NULL)); ResourceArrayInit(&(owner->catlistrefarr), PointerGetDatum(NULL)); ResourceArrayInit(&(owner->relrefarr), PointerGetDatum(NULL)); @@ -516,6 +518,24 @@ ResourceOwnerReleaseInternal(ResourceOwner owner, if (phase == RESOURCE_RELEASE_BEFORE_LOCKS) { + /* + * Abort failed buffer IO. AbortBufferIO()->TerminateBufferIO() calls + * ResourceOwnerForgetBufferIOs(), so we just have to iterate till + * there are none. + * + * Needs to be before we release buffer pins. + * + * During a commit, there shouldn't be any in-progress IO. + */ + while (ResourceArrayGetAny(&(owner->bufferioarr), &foundres)) + { + Buffer res = DatumGetBuffer(foundres); + + if (isCommit) + elog(PANIC, "lost track of buffer IO on buffer %u", res); + AbortBufferIO(res); + } + /* * Release buffer pins. 
Note that ReleaseBuffer will remove the * buffer entry from our array, so we just have to iterate till there @@ -741,6 +761,7 @@ ResourceOwnerDelete(ResourceOwner owner) /* And it better not own any resources, either */ Assert(owner->bufferarr.nitems == 0); + Assert(owner->bufferioarr.nitems == 0); Assert(owner->catrefarr.nitems == 0); Assert(owner->catlistrefarr.nitems == 0); Assert(owner->relrefarr.nitems == 0); @@ -770,6 +791,7 @@ ResourceOwnerDelete(ResourceOwner owner) /* And free the object. */ ResourceArrayFree(&(owner->bufferarr)); + ResourceArrayFree(&(owner->bufferioarr)); ResourceArrayFree(&(owner->catrefarr)); ResourceArrayFree(&(owner->catlistrefarr)); ResourceArrayFree(&(owner->relrefarr)); @@ -971,6 +993,43 @@ ResourceOwnerForgetBuffer(ResourceOwner owner, Buffer buffer) buffer, owner->name); } +/* + * Make sure there is room for at least one more entry in a ResourceOwner's + * buffer array. + * + * This is separate from actually inserting an entry because if we run out + * of memory, it's critical to do so *before* acquiring the resource. + */ +void +ResourceOwnerEnlargeBufferIOs(ResourceOwner owner) +{ + /* We used to allow pinning buffers without a resowner, but no more */ + Assert(owner != NULL); + ResourceArrayEnlarge(&(owner->bufferioarr)); +} + +/* + * Remember that a buffer IO is owned by a ResourceOwner + * + * Caller must have previously done ResourceOwnerEnlargeBufferIOs() + */ +void +ResourceOwnerRememberBufferIO(ResourceOwner owner, Buffer buffer) +{ + ResourceArrayAdd(&(owner->bufferioarr), BufferGetDatum(buffer)); +} + +/* + * Forget that a buffer IO is owned by a ResourceOwner + */ +void +ResourceOwnerForgetBufferIO(ResourceOwner owner, Buffer buffer) +{ + if (!ResourceArrayRemove(&(owner->bufferioarr), BufferGetDatum(buffer))) + elog(PANIC, "buffer IO %d is not owned by resource owner %s", + buffer, owner->name); +} + /* * Remember that a Local Lock is owned by a ResourceOwner * diff --git a/src/backend/utils/sort/logtape.c b/src/backend/utils/sort/logtape.c index fb0d395f44a..4e4ca750deb 100644 --- a/src/backend/utils/sort/logtape.c +++ b/src/backend/utils/sort/logtape.c @@ -252,7 +252,7 @@ ltsWriteBlock(LogicalTapeSet *lts, long blocknum, void *buffer) */ while (blocknum > lts->nBlocksWritten) { - PGAlignedBlock zerobuf; + PGIOAlignedBlock zerobuf; MemSet(zerobuf.data, 0, sizeof(zerobuf)); diff --git a/src/bin/pg_basebackup/pg_basebackup.c b/src/bin/pg_basebackup/pg_basebackup.c index b0045746b0b..8864460c7cf 100644 --- a/src/bin/pg_basebackup/pg_basebackup.c +++ b/src/bin/pg_basebackup/pg_basebackup.c @@ -2965,7 +2965,7 @@ main(int argc, char **argv) BaseBackup(compression_algorithm, compression_detail, compressloc, &client_compress); - polar_vfs_destory_fe(ftype, polar_disk_name); + polar_vfs_destroy_fe(ftype, polar_disk_name); success = true; return 0; } diff --git a/src/bin/pg_basebackup/walmethods.c b/src/bin/pg_basebackup/walmethods.c index d363c11f20b..98926a64bac 100644 --- a/src/bin/pg_basebackup/walmethods.c +++ b/src/bin/pg_basebackup/walmethods.c @@ -213,33 +213,27 @@ dir_open_for_write(const char *pathname, const char *temp_suffix, size_t pad_to_ /* Do pre-padding on non-compressed files */ if (pad_to_size && dir_data->compression_algorithm == PG_COMPRESSION_NONE) { - char *data; - int bytes; - int write_once_bytes; + ssize_t rc; - write_once_bytes = polar_is_write_pfs ? 
MAX_SEND_SIZE : XLOG_BLCKSZ; - data = (char *) pg_malloc0(write_once_bytes); - for (bytes = 0; bytes < pad_to_size; bytes += write_once_bytes) + rc = polar_pwrite_zeros(fd, pad_to_size, 0); + + if (rc < 0) { - errno = 0; - if (polar_write(fd, data, write_once_bytes) != write_once_bytes) - { - /* If write didn't set errno, assume problem is no disk space */ - dir_data->lasterrno = errno ? errno : ENOSPC; - pg_free(data); - polar_close(fd); - return NULL; - } + dir_data->lasterrno = errno; + polar_close(fd); + return NULL; } + /* + * pg_pwrite() (called via polar_pwrite_zeros()) may have moved the + * file position, so reset it (see win32pwrite.c). + */ if (polar_lseek(fd, 0, SEEK_SET) != 0) { dir_data->lasterrno = errno; - pg_free(data); polar_close(fd); return NULL; } - pg_free(data); } /* diff --git a/src/bin/pg_checksums/pg_checksums.c b/src/bin/pg_checksums/pg_checksums.c index 7987c734317..d843328458b 100644 --- a/src/bin/pg_checksums/pg_checksums.c +++ b/src/bin/pg_checksums/pg_checksums.c @@ -188,7 +188,7 @@ skipfile(const char *fn) static void scan_file(const char *fn, int segmentno) { - PGAlignedBlock buf; + PGIOAlignedBlock buf; PageHeader header = (PageHeader) buf.data; int f; BlockNumber blockno; @@ -686,7 +686,7 @@ main(int argc, char *argv[]) printf(_("Checksums disabled in cluster\n")); } - polar_vfs_destory_simple_fe(); + polar_vfs_destroy_simple_fe(); return 0; } diff --git a/src/bin/pg_controldata/pg_controldata.c b/src/bin/pg_controldata/pg_controldata.c index 6cf3b455650..ed900116490 100644 --- a/src/bin/pg_controldata/pg_controldata.c +++ b/src/bin/pg_controldata/pg_controldata.c @@ -183,7 +183,7 @@ main(int argc, char *argv[]) "Either the file is corrupt, or it has a different layout than this program\n" "is expecting. The results below are untrustworthy.\n\n")); - polar_vfs_destory_simple_fe(); + polar_vfs_destroy_simple_fe(); /* set wal segment size */ WalSegSz = ControlFile->xlog_seg_size; diff --git a/src/bin/pg_ctl/pg_ctl.c b/src/bin/pg_ctl/pg_ctl.c index 7b1b3a9a54f..22e60239703 100644 --- a/src/bin/pg_ctl/pg_ctl.c +++ b/src/bin/pg_ctl/pg_ctl.c @@ -2296,7 +2296,7 @@ get_control_dbstate(void) /* POLAR: umount */ if (pg_config) - polar_vfs_destory_simple_fe(); + polar_vfs_destroy_simple_fe(); return ret; } diff --git a/src/bin/pg_resetwal/pg_resetwal.c b/src/bin/pg_resetwal/pg_resetwal.c index a6ff206f289..a040f5053c4 100644 --- a/src/bin/pg_resetwal/pg_resetwal.c +++ b/src/bin/pg_resetwal/pg_resetwal.c @@ -520,7 +520,7 @@ main(int argc, char *argv[]) if (polar_real_datadir) free(polar_real_datadir); - polar_vfs_destory_simple_fe(); + polar_vfs_destroy_simple_fe(); printf(_("Write-ahead log reset\n")); return 0; diff --git a/src/bin/pg_rewind/local_source.c b/src/bin/pg_rewind/local_source.c index 2e50485c395..83b37a1e91c 100644 --- a/src/bin/pg_rewind/local_source.c +++ b/src/bin/pg_rewind/local_source.c @@ -77,7 +77,7 @@ static void local_queue_fetch_file(rewind_source *source, const char *path, size_t len) { const char *datadir = ((local_source *) source)->datadir; - PGAlignedBlock buf; + PGIOAlignedBlock buf; char srcpath[MAXPGPATH]; int srcfd; size_t written_len; @@ -129,7 +129,7 @@ local_queue_fetch_range(rewind_source *source, const char *path, off_t off, size_t len) { const char *datadir = ((local_source *) source)->datadir; - PGAlignedBlock buf; + PGIOAlignedBlock buf; char srcpath[MAXPGPATH]; int srcfd; off_t begin = off; diff --git a/src/bin/pg_upgrade/file.c b/src/bin/pg_upgrade/file.c index e16c2e009eb..7365bdef2e1 100644 --- 
a/src/bin/pg_upgrade/file.c +++ b/src/bin/pg_upgrade/file.c @@ -177,8 +177,8 @@ rewriteVisibilityMap(const char *fromfile, const char *tofile, { int src_fd; int dst_fd; - PGAlignedBlock buffer; - PGAlignedBlock new_vmbuf; + PGIOAlignedBlock buffer; + PGIOAlignedBlock new_vmbuf; ssize_t totalBytesRead = 0; ssize_t src_filesize; int rewriteVmBytesPerPage; diff --git a/src/bin/pg_waldump/pg_waldump.c b/src/bin/pg_waldump/pg_waldump.c index c07dc28809e..7f6af3935fd 100644 --- a/src/bin/pg_waldump/pg_waldump.c +++ b/src/bin/pg_waldump/pg_waldump.c @@ -1196,13 +1196,13 @@ main(int argc, char **argv) XLogReaderFree(xlogreader_state); if (polar_disk_name && polar_storage_cluster_name) - polar_vfs_destory_fe(ftype, polar_disk_name); + polar_vfs_destroy_fe(ftype, polar_disk_name); return EXIT_SUCCESS; bad_argument: if (polar_disk_name && polar_storage_cluster_name) - polar_vfs_destory_fe(ftype, polar_disk_name); + polar_vfs_destroy_fe(ftype, polar_disk_name); pg_log_error_hint("Try \"%s --help\" for more information.", progname); return EXIT_FAILURE; } diff --git a/src/common/file_utils.c b/src/common/file_utils.c index ccb9133c2ee..68ddc694c99 100644 --- a/src/common/file_utils.c +++ b/src/common/file_utils.c @@ -28,11 +28,16 @@ #ifdef FRONTEND #include "common/logging.h" #endif +#include "port/pg_iovec.h" /* POLAR */ #include "storage/polar_fd.h" /* POLAR end */ +int polar_zero_buffer_size = 0; +int polar_zero_buffers = -1; +void *polar_zero_buffer = NULL; + #ifdef FRONTEND /* Define PG_FLUSH_DATA_WORKS if we have an implementation for pg_flush_data */ @@ -498,3 +503,203 @@ get_dirent_type(const char *path, return result; } + +/* + * Compute what remains to be done after a possibly partial vectored read or + * write. The part of 'source' beginning after 'transferred' bytes is copied + * to 'destination', and its length is returned. 'source' and 'destination' + * may point to the same array, for in-place adjustment. A return value of + * zero indicates completion (for callers without a cheaper way to know that). + */ +int +compute_remaining_iovec(struct iovec *destination, + const struct iovec *source, + int iovcnt, + size_t transferred) +{ + Assert(iovcnt > 0); + + /* Skip wholly transferred iovecs. */ + while (source->iov_len <= transferred) + { + transferred -= source->iov_len; + source++; + iovcnt--; + + /* All iovecs transferred? */ + if (iovcnt == 0) + { + /* + * We don't expect the kernel to transfer more than we asked it + * to, or something is out of sync. + */ + Assert(transferred == 0); + return 0; + } + } + + /* Copy the remaining iovecs to the front of the array. */ + if (source != destination) + memmove(destination, source, sizeof(*source) * iovcnt); + + /* Adjust leading iovec, which may have been partially transferred. */ + Assert(destination->iov_len > transferred); + destination->iov_base = (char *) destination->iov_base + transferred; + destination->iov_len -= transferred; + + return iovcnt; +} + +/* + * pg_pwritev_with_retry + * + * Convenience wrapper for pg_pwritev() that retries on partial write. If an + * error is returned, it is unspecified how much has been written. + */ +ssize_t +pg_pwritev_with_retry(int fd, const struct iovec *iov, int iovcnt, off_t offset) +{ + struct iovec iov_copy[PG_IOV_MAX]; + ssize_t sum = 0; + ssize_t part; + + /* We'd better have space to make a copy, in case we need to retry. */ + if (iovcnt > PG_IOV_MAX) + { + errno = EINVAL; + return -1; + } + + do + { + /* Write as much as we can. 
*/ + part = polar_pwritev(fd, iov, iovcnt, offset); + if (part < 0) + return -1; + +#ifdef SIMULATE_SHORT_WRITE + part = Min(part, 4096); +#endif + + /* Count our progress. */ + sum += part; + offset += part; + + /* + * See what is left. On the first loop we used the caller's array, + * but in later loops we'll use our local copy that we are allowed to + * mutate. + */ + iovcnt = compute_remaining_iovec(iov_copy, iov, iovcnt, part); + iov = iov_copy; + } while (iovcnt > 0); + + return sum; +} + +/* + * pg_pwrite_zeros + * + * Writes zeros to file worth "size" bytes at "offset" (from the start of the + * file), using vectored I/O. + * + * Returns the total amount of data written. On failure, a negative value + * is returned with errno set. + */ +ssize_t +pg_pwrite_zeros(int fd, size_t size, off_t offset) +{ + static const PGIOAlignedBlock zbuffer = {{0}}; /* worth BLCKSZ */ + void *zerobuf_addr = unconstify(PGIOAlignedBlock *, &zbuffer)->data; + struct iovec iov[PG_IOV_MAX]; + size_t remaining_size = size; + ssize_t total_written = 0; + + /* Loop, writing as many blocks as we can for each system call. */ + while (remaining_size > 0) + { + int iovcnt = 0; + ssize_t written; + + for (; iovcnt < PG_IOV_MAX && remaining_size > 0; iovcnt++) + { + size_t this_iov_size; + + iov[iovcnt].iov_base = zerobuf_addr; + + if (remaining_size < BLCKSZ) + this_iov_size = remaining_size; + else + this_iov_size = BLCKSZ; + + iov[iovcnt].iov_len = this_iov_size; + remaining_size -= this_iov_size; + } + + written = pg_pwritev_with_retry(fd, iov, iovcnt, offset); + + if (written < 0) + return written; + + offset += written; + total_written += written; + } + + Assert(total_written == size); + + return total_written; +} + +/* + * polar_pwrite_zeros + * + * Writes zeros to file worth "size" bytes at "offset" (from the start of the + * file), using bulk I/O. + * + * Returns the total amount of data written. On failure, a negative value + * is returned with errno set. + * + * If there is no valid global zero buffer, it will fallback to pg_pwrite_zeros. + */ +ssize_t +polar_pwrite_zeros(int fd, size_t size, off_t offset) +{ + size_t remaining_size = size; + ssize_t total_written = 0; + +#ifdef FRONTEND + if (polar_zero_buffer_size == 0) + { +#define FRONTEND_ZERO_BUFFER_SIZE (1024 * 1024) + /* In frontend, we malloc a fixed size of 1MB and never free */ + polar_zero_buffer = (void *) TYPEALIGN(PG_IO_ALIGN_SIZE, + malloc(FRONTEND_ZERO_BUFFER_SIZE + PG_IO_ALIGN_SIZE)); + polar_zero_buffer_size = FRONTEND_ZERO_BUFFER_SIZE; + } +#else + if (polar_zero_buffer_size == 0) + return pg_pwrite_zeros(fd, size, offset); + + Assert(polar_zero_buffer); +#endif + + /* Loop, writing as many blocks as we can for each system call. 
*/ + while (remaining_size > 0) + { + ssize_t written; + size_t amount = Min(remaining_size, polar_zero_buffer_size); + + written = polar_pwrite(fd, polar_zero_buffer, amount, offset); + + if (written != amount) + return -1; + + remaining_size -= written; + offset += written; + total_written += written; + } + + Assert(total_written == size); + + return total_written; +} diff --git a/src/include/access/hio.h b/src/include/access/hio.h index bb90c6fad81..73b5f98c0e7 100644 --- a/src/include/access/hio.h +++ b/src/include/access/hio.h @@ -32,6 +32,9 @@ typedef struct BulkInsertStateData Buffer current_buf; /* current insertion target page */ } BulkInsertStateData; +/* GUCs */ +extern PGDLLIMPORT int polar_index_bulk_extend_size; +extern PGDLLIMPORT int polar_heap_bulk_extend_size; extern void RelationPutHeapTuple(Relation relation, Buffer buffer, HeapTuple tuple, bool token); @@ -40,4 +43,7 @@ extern Buffer RelationGetBufferForTuple(Relation relation, Size len, BulkInsertStateData *bistate, Buffer *vmbuffer, Buffer *vmbuffer_other); +extern int polar_get_bulk_extend_size(BlockNumber first_block, int bulk_extend_size); +extern Buffer polar_index_add_blocks(Relation relation); + #endif /* HIO_H */ diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index d4ff4843eb4..ff836f7769b 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -58,12 +58,6 @@ extern PGDLLIMPORT bool track_wal_io_timing; extern PGDLLIMPORT int wal_decode_buffer_size; extern PGDLLIMPORT int CheckPointSegments; -extern int polar_wal_init_set_size; - -/* xlog init zero file write size */ -#define POLAR_DEFAULT_XLOG_FILL_ZERO_SIZE 1024 * 1024 -#define POLAR_MIN_XLOG_FILL_ZERO_SIZE XLOG_BLCKSZ -#define POLAR_MAX_XLOG_FILL_ZERO_SIZE 4 * 1024 * 1024 /* Archive modes */ typedef enum ArchiveMode diff --git a/src/include/access/xlogutils.h b/src/include/access/xlogutils.h index 0e47962daa4..a77b5270464 100644 --- a/src/include/access/xlogutils.h +++ b/src/include/access/xlogutils.h @@ -14,6 +14,9 @@ #include "access/xlogreader.h" #include "storage/bufmgr.h" +/* POLAR */ +#include "storage/polar_bufmgr.h" + /* * Prior to 8.4, all activity during recovery was carried out by the startup * process. This local variable continues to be used in many parts of the @@ -56,9 +59,8 @@ extern PGDLLIMPORT HotStandbyState standbyState; #define InHotStandby (standbyState >= STANDBY_SNAPSHOT_PENDING) - -/* POLAR */ -#include "storage/polar_bufmgr.h" +/* GUCs */ +extern PGDLLIMPORT int polar_recovery_bulk_extend_size; extern bool XLogHaveInvalidPages(void); extern void XLogCheckInvalidPages(void); @@ -118,4 +120,6 @@ extern void XLogReadDetermineTimeline(XLogReaderState *state, extern void WALReadRaiseError(WALReadError *errinfo); +extern int polar_get_recovery_bulk_extend_size(BlockNumber target_block, BlockNumber nblocks); + #endif diff --git a/src/include/c.h b/src/include/c.h index 8ae7471d72f..5a4f094e78b 100644 --- a/src/include/c.h +++ b/src/include/c.h @@ -1247,34 +1247,45 @@ extern void polar_exceptional_condition(const char *conditionName, /* * Use this, not "char buf[BLCKSZ]", to declare a field or local variable - * holding a page buffer, if that page might be accessed as a page and not - * just a string of bytes. Otherwise the variable might be under-aligned, - * causing problems on alignment-picky hardware. 
(In some places, we use - * this to declare buffers even though we only pass them to read() and - * write(), because copying to/from aligned buffers is usually faster than - * using unaligned buffers.) We include both "double" and "int64" in the - * union to ensure that the compiler knows the value must be MAXALIGN'ed - * (cf. configure's computation of MAXIMUM_ALIGNOF). + * holding a page buffer, if that page might be accessed as a page. Otherwise + * the variable might be under-aligned, causing problems on alignment-picky + * hardware. We include both "double" and "int64" in the union to ensure that + * the compiler knows the value must be MAXALIGN'ed (cf. configure's + * computation of MAXIMUM_ALIGNOF). */ typedef union PGAlignedBlock { #ifdef pg_attribute_aligned - pg_attribute_aligned(4096) -#else - __declspec(align(4096)) + pg_attribute_aligned(PG_IO_ALIGN_SIZE) #endif char data[BLCKSZ]; double force_align_d; int64 force_align_i64; } PGAlignedBlock; +/* + * Use this to declare a field or local variable holding a page buffer, if that + * page might be accessed as a page or passed to an SMgr I/O function. If + * allocating using the MemoryContext API, the aligned allocation functions + * should be used with PG_IO_ALIGN_SIZE. This alignment may be more efficient + * for I/O in general, but may be strictly required on some platforms when + * using direct I/O. + */ +typedef union PGIOAlignedBlock +{ +#ifdef pg_attribute_aligned + pg_attribute_aligned(PG_IO_ALIGN_SIZE) +#endif + char data[BLCKSZ]; + double force_align_d; + int64 force_align_i64; +} PGIOAlignedBlock; + /* Same, but for an XLOG_BLCKSZ-sized buffer */ typedef union PGAlignedXLogBlock { #ifdef pg_attribute_aligned - pg_attribute_aligned(4096) -#else - __declspec(align(4096)) + pg_attribute_aligned(PG_IO_ALIGN_SIZE) #endif char data[XLOG_BLCKSZ]; double force_align_d; diff --git a/src/include/common/file_utils.h b/src/include/common/file_utils.h index 9e423ed9027..743edcdd655 100644 --- a/src/include/common/file_utils.h +++ b/src/include/common/file_utils.h @@ -24,6 +24,12 @@ typedef enum PGFileType PGFILETYPE_LNK } PGFileType; +struct iovec; /* avoid including port/pg_iovec.h here */ + +extern int polar_zero_buffer_size; +extern int polar_zero_buffers; +extern void *polar_zero_buffer; + #ifdef FRONTEND extern int fsync_fname(const char *fname, bool isdir); extern void fsync_pgdata(const char *pg_data, int serverVersion); @@ -38,4 +44,18 @@ extern PGFileType get_dirent_type(const char *path, bool look_through_symlinks, int elevel); +extern int compute_remaining_iovec(struct iovec *destination, + const struct iovec *source, + int iovcnt, + size_t transferred); + +extern ssize_t pg_pwritev_with_retry(int fd, + const struct iovec *iov, + int iovcnt, + off_t offset); + +extern ssize_t pg_pwrite_zeros(int fd, size_t size, off_t offset); + +extern ssize_t polar_pwrite_zeros(int fd, size_t size, off_t offset); + #endif /* FILE_UTILS_H */ diff --git a/src/include/pg_config_manual.h b/src/include/pg_config_manual.h index a1910756a88..9f612eaac30 100644 --- a/src/include/pg_config_manual.h +++ b/src/include/pg_config_manual.h @@ -271,6 +271,12 @@ */ #define PG_CACHE_LINE_SIZE 128 +/* + * Assumed alignment requirement for direct I/O. 4K corresponds to common + * sector and memory page size. 
+ */ +#define PG_IO_ALIGN_SIZE 4096 + /* *------------------------------------------------------------------------ * The following symbols are for enabling debugging code, not for diff --git a/src/include/pgstat.h b/src/include/pgstat.h index 18257077e0d..cec3e1490a2 100644 --- a/src/include/pgstat.h +++ b/src/include/pgstat.h @@ -196,9 +196,6 @@ typedef struct PgStat_TableCounts PgStat_Counter polar_t_bulk_read_calls_IO; PgStat_Counter polar_t_bulk_read_blocks_IO; /* POLAR end */ - - /* bulk create index extend times */ - PgStat_Counter polar_t_bulk_create_index_extends_times; } PgStat_TableCounts; /* ---------- @@ -257,7 +254,7 @@ typedef struct PgStat_TableXactStatus * ------------------------------------------------------------ */ -#define PGSTAT_FILE_FORMAT_ID 0x01A5BCA9 +#define PGSTAT_FILE_FORMAT_ID 0x01A5BCA8 typedef struct PgStat_ArchiverStats { @@ -412,10 +409,6 @@ typedef struct PgStat_StatTabEntry /* bulk read calls, IO read blocks counts */ PgStat_Counter polar_bulk_read_blocks_IO; /* POLAR end */ - - /* POLAR: bulk extend */ - PgStat_Counter polar_bulk_create_index_extends_times; - /* POLAR end */ } PgStat_StatTabEntry; typedef struct PgStat_WalStats @@ -685,13 +678,6 @@ polar_stat_wait_obj_and_time_clear(void) /* POLAR: end */ -/* POLAR: bulk create index extend stats */ -#define polar_pgstat_count_bulk_create_index_extend_times(rel) \ - do { \ - if ((rel)->pgstat_info != NULL) \ - (rel)->pgstat_info->t_counts.polar_t_bulk_create_index_extends_times++; \ - } while (0) -/* POLAR end */ extern void pgstat_count_heap_insert(Relation rel, PgStat_Counter n); extern void pgstat_count_heap_update(Relation rel, bool hot); diff --git a/src/include/polar_vfs/polar_directio.h b/src/include/polar_vfs/polar_directio.h index d1e043f87b2..e2c3f900e6f 100644 --- a/src/include/polar_vfs/polar_directio.h +++ b/src/include/polar_vfs/polar_directio.h @@ -17,8 +17,7 @@ * limitations under the License. 
* * IDENTIFICATION - * src/include/polar_vfs/polar_directio.h - * + * src/include/polar_vfs/polar_directio.h * *------------------------------------------------------------------------- */ @@ -46,10 +45,9 @@ extern char *polar_directio_buffer; extern const vfs_mgr polar_vfs_dio; #define POLAR_ACCESS_MODE_MASK 0x3 -#define POLAR_DIRECTIO_ALIGN_LEN POLAR_BUFFER_ALIGN_LEN -#define POLAR_DIRECTIO_ALIGN_DOWN(LEN) TYPEALIGN_DOWN(POLAR_DIRECTIO_ALIGN_LEN, LEN) -#define POLAR_DIRECTIO_ALIGN(LEN) TYPEALIGN(POLAR_DIRECTIO_ALIGN_LEN, LEN) -#define POLAR_DIECRTIO_IS_ALIGNED(LEN) !((uintptr_t)(LEN) & (uintptr_t)(POLAR_DIRECTIO_ALIGN_LEN - 1)) +#define POLAR_DIRECTIO_ALIGN_DOWN(LEN) TYPEALIGN_DOWN(PG_IO_ALIGN_SIZE, LEN) +#define POLAR_DIRECTIO_ALIGN(LEN) TYPEALIGN(PG_IO_ALIGN_SIZE, LEN) +#define POLAR_DIRECTIO_IS_ALIGNED(LEN) !((uintptr_t)(LEN) & (uintptr_t)(PG_IO_ALIGN_SIZE - 1)) extern int polar_directio_open(const char *path, int flags, mode_t mode); extern ssize_t polar_directio_read(int fd, void *buf, size_t len); diff --git a/src/include/polar_vfs/polar_vfs_fe.h b/src/include/polar_vfs/polar_vfs_fe.h index 3a70e9a10b0..8f315dd2a08 100644 --- a/src/include/polar_vfs/polar_vfs_fe.h +++ b/src/include/polar_vfs/polar_vfs_fe.h @@ -44,12 +44,12 @@ extern char *polar_storage_cluster_name; extern int polar_mkdir_p(char *path, int omode); extern void polar_vfs_init_fe(bool is_pfs, char *fname, char *storage_cluster_name, char *polar_disk_name, int flag); -extern void polar_vfs_destory_fe(char *ftype, char *polar_disk_name); +extern void polar_vfs_destroy_fe(char *ftype, char *polar_disk_name); extern bool polar_in_shared_storage_mode_fe(char *pgconfig); extern bool polar_in_localfs_mode_fe(char *pgconfig); extern bool polar_in_replica_mode_fe(const char *pgconfig); extern void polar_vfs_init_simple_fe(char *pgconfig, char *pg_datadir, int flag); -extern void polar_vfs_destory_simple_fe(void); +extern void polar_vfs_destroy_simple_fe(void); extern int polar_vfs_state_backup_current(void); extern int polar_vfs_state_restore_current(int index); extern int polar_vfs_state_backup(bool is_shared, bool is_localfs, int hostid, diff --git a/src/include/portability/instr_time.h b/src/include/portability/instr_time.h index ca09a1608f8..451a01fa6bc 100644 --- a/src/include/portability/instr_time.h +++ b/src/include/portability/instr_time.h @@ -141,6 +141,9 @@ typedef struct timespec instr_time; #define INSTR_TIME_GET_MICROSEC(t) \ (((uint64) (t).tv_sec * (uint64) 1000000) + (uint64) ((t).tv_nsec / 1000)) +#define INSTR_TIME_GET_NANOSEC(t) \ + (((uint64) (t).tv_sec * (uint64) 1000000000) + (uint64) ((t).tv_nsec)) + #else /* !HAVE_CLOCK_GETTIME */ /* Use gettimeofday() */ diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h index e23e252b7e5..fb04b3c678b 100644 --- a/src/include/storage/buf_internals.h +++ b/src/include/storage/buf_internals.h @@ -243,7 +243,7 @@ typedef struct BufferDesc * platform with either 32 or 128 byte line sizes, it's good to align to * boundaries and avoid false sharing. */ -#define BUFFERDESC_PAD_TO_SIZE (SIZEOF_VOID_P == 8 ? 64 : 1) +#define BUFFERDESC_PAD_TO_SIZE (SIZEOF_VOID_P == 8 ? 
128 : 1) typedef union BufferDescPadded { @@ -251,6 +251,9 @@ typedef union BufferDescPadded char pad[BUFFERDESC_PAD_TO_SIZE]; } BufferDescPadded; +StaticAssertDecl(sizeof(BufferDesc) <= BUFFERDESC_PAD_TO_SIZE, + "padding size is too small to fit BufferDesc"); + #define GetBufferDescriptor(id) (&BufferDescriptors[(id)].bufferdesc) #define GetLocalBufferDescriptor(id) (&LocalBufferDescriptors[(id)]) diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h index df531edfff3..ed2d2617d37 100644 --- a/src/include/storage/bufmgr.h +++ b/src/include/storage/bufmgr.h @@ -77,6 +77,9 @@ extern PGDLLIMPORT bool track_io_timing; extern PGDLLIMPORT int effective_io_concurrency; extern PGDLLIMPORT int maintenance_io_concurrency; +#define MAX_BUFFERS_TO_READ_BY 64 +#define MAX_BUFFERS_TO_EXTEND_BY 1024 + extern PGDLLIMPORT int checkpoint_flush_after; extern PGDLLIMPORT int backend_flush_after; extern PGDLLIMPORT int bgwriter_flush_after; @@ -89,12 +92,6 @@ extern PGDLLIMPORT int NLocBuffer; extern PGDLLIMPORT Block *LocalBufferBlockPointers; extern PGDLLIMPORT int32 *LocalRefCount; -/* POLAR: bulk read */ -extern bool polar_bulk_io_is_in_progress; -extern int polar_bulk_io_in_progress_count; - -/* POLAR end */ - /* upper limit for effective_io_concurrency */ #define MAX_IO_CONCURRENCY 1000 @@ -255,7 +252,7 @@ extern bool ConditionalLockBufferForCleanup(Buffer buffer); extern bool IsBufferCleanupOK(Buffer buffer); extern bool HoldingBufferPinThatDelaysRecovery(void); -extern void AbortBufferIO(void); +extern void AbortBufferIO(Buffer buffer); extern void BufmgrCommit(void); extern bool BgBufferSync(struct WritebackContext *wb_context, int flags); @@ -281,7 +278,10 @@ extern bool StartBufferIO(BufferDesc *buf, bool forInput); /* POLAR: bulk read */ extern int polar_get_buffer_access_strategy_ring_size(BufferAccessStrategy strategy); -extern BufferDesc **polar_bulk_io_in_progress_buf; + +extern Buffer polar_read_buffer_common(struct SMgrRelationData *smgr, char relpersistence, ForkNumber forkNum, + BlockNumber blockNum, ReadBufferMode mode, + BufferAccessStrategy strategy); /* inline functions */ @@ -326,6 +326,9 @@ TestForOldSnapshot(Snapshot snapshot, Relation relation, Page page) } /* POLAR */ +extern PGDLLIMPORT bool polar_has_partial_write; +extern PGDLLIMPORT int polar_bulk_read_size; + extern void polar_lock_buffer_for_cleanup_ext(Buffer buffer, bool fresh_check); extern void polar_lock_buffer_ext(Buffer buffer, int mode, bool fresh_check); extern bool polar_conditional_lock_buffer_ext(Buffer buffer, bool fresh_check); diff --git a/src/include/storage/bulk_write.h b/src/include/storage/bulk_write.h new file mode 100644 index 00000000000..73e4f08dde8 --- /dev/null +++ b/src/include/storage/bulk_write.h @@ -0,0 +1,44 @@ +/*------------------------------------------------------------------------- + * + * bulk_write.h + * Efficiently and reliably populate a new relation + * + * + * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/storage/bulk_write.h + * + *------------------------------------------------------------------------- + */ +#ifndef BULK_WRITE_H +#define BULK_WRITE_H + +#include "storage/smgr.h" +#include "utils/rel.h" + +/* GUCs */ +extern PGDLLIMPORT int polar_bulk_write_maxpages; + +/* Bulk writer state, contents are private to bulk_write.c */ +typedef struct BulkWriteState BulkWriteState; + +/* + * Temporary buffer to hold a page to until it's written out. 
Use + * smgr_bulk_get_buf() to reserve one of these. This is a separate typedef to + * distinguish it from other block-sized buffers passed around in the system. + */ +typedef PGIOAlignedBlock *BulkWriteBuffer; + +/* forward declared from smgr.h */ +struct SMgrRelationData; + +extern BulkWriteState *smgr_bulk_start_rel(Relation rel, ForkNumber forknum); +extern BulkWriteState *smgr_bulk_start_smgr(struct SMgrRelationData *smgr, ForkNumber forknum, bool use_wal, char relpersistence); + +extern BulkWriteBuffer smgr_bulk_get_buf(BulkWriteState *bulkstate); +extern void smgr_bulk_write(BulkWriteState *bulkstate, BlockNumber blocknum, BulkWriteBuffer buf, bool page_std); + +extern void smgr_bulk_finish(BulkWriteState *bulkstate); + +#endif /* BULK_WRITE_H */ diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h index 69549b000fa..b5673adac80 100644 --- a/src/include/storage/fd.h +++ b/src/include/storage/fd.h @@ -51,8 +51,6 @@ typedef enum RecoveryInitSyncMethod RECOVERY_INIT_SYNC_METHOD_SYNCFS } RecoveryInitSyncMethod; -struct iovec; /* avoid including port/pg_iovec.h here */ - typedef int File; @@ -61,6 +59,11 @@ extern PGDLLIMPORT int max_files_per_process; extern PGDLLIMPORT bool data_sync_retry; extern PGDLLIMPORT int recovery_init_sync_method; +/* POLAR: GUC */ +extern PGDLLIMPORT bool polar_enable_fallocate_no_hide_stale; + +/* POLAR end */ + /* * This is private to fd.c, but exported for save/restore_backend_variables() */ @@ -84,9 +87,10 @@ extern PGDLLIMPORT int max_safe_fds; * to the appropriate Windows flag in src/port/open.c. We simulate it with * fcntl(F_NOCACHE) on macOS inside fd.c's open() wrapper. We use the name * PG_O_DIRECT rather than defining O_DIRECT in that case (probably not a good - * idea on a Unix). + * idea on a Unix). We can only use it if the compiler will correctly align + * PGIOAlignedBlock for us, though. 
*/ -#if defined(O_DIRECT) +#if defined(O_DIRECT) && defined(pg_attribute_aligned) #define PG_O_DIRECT O_DIRECT #elif defined(F_NOCACHE) #define PG_O_DIRECT 0x80000000 @@ -104,10 +108,13 @@ extern File PathNameOpenFile(const char *fileName, int fileFlags); extern File PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode); extern File OpenTemporaryFile(bool interXact); extern void FileClose(File file); -extern int FilePrefetch(File file, off_t offset, int amount, uint32 wait_event_info); -extern int FileRead(File file, char *buffer, int amount, off_t offset, uint32 wait_event_info); -extern int FileWrite(File file, char *buffer, int amount, off_t offset, uint32 wait_event_info); +extern int FilePrefetch(File file, off_t offset, off_t amount, uint32 wait_event_info); +extern ssize_t FileRead(File file, void *buffer, size_t amount, off_t offset, uint32 wait_event_info); +extern ssize_t FileWrite(File file, const void *buffer, size_t amount, off_t offset, uint32 wait_event_info); extern int FileSync(File file, uint32 wait_event_info); +extern int FileZero(File file, off_t offset, off_t amount, uint32 wait_event_info, bool bulkwrite); +extern int FileFallocate(File file, off_t offset, off_t amount, uint32 wait_event_info); + extern off_t FileSize(File file); extern int FileTruncate(File file, off_t offset, uint32 wait_event_info); extern void FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info); @@ -178,10 +185,6 @@ extern int pg_fsync_no_writethrough(int fd); extern int pg_fsync_writethrough(int fd); extern int pg_fdatasync(int fd); extern void pg_flush_data(int fd, off_t offset, off_t amount); -extern ssize_t pg_pwritev_with_retry(int fd, - const struct iovec *iov, - int iovcnt, - off_t offset); extern int pg_truncate(const char *path, off_t length); extern void fsync_fname(const char *fname, bool isdir); extern int fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel); diff --git a/src/include/storage/md.h b/src/include/storage/md.h index 781f43ec910..89552788fd7 100644 --- a/src/include/storage/md.h +++ b/src/include/storage/md.h @@ -27,19 +27,22 @@ extern void mdcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo); extern bool mdexists(SMgrRelation reln, ForkNumber forknum); extern void mdunlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo); extern void mdextend(SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, char *buffer, bool skipFsync); + BlockNumber blocknum, const void *buffer, bool skipFsync); +extern void mdzeroextend(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, int nblocks, bool skipFsync); extern bool mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum); extern void mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, - char *buffer); + void *buffer); extern void mdwrite(SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, char *buffer, bool skipFsync); + BlockNumber blocknum, const void *buffer, bool skipFsync); extern void mdwriteback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber nblocks); extern BlockNumber mdnblocks(SMgrRelation reln, ForkNumber forknum); extern void mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks); extern void mdimmedsync(SMgrRelation reln, ForkNumber forknum); +extern void mdregistersync(SMgrRelation reln, ForkNumber forknum); extern void ForgetDatabaseSyncRequests(Oid dbid); extern void DropRelationFiles(RelFileNode *delrels, int ndelrels, bool 
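/*
 * Illustrative sketch, not part of this patch: one plausible way the
 * FileFallocate()/FileZero() primitives declared above could be combined to
 * zero-extend a segment -- prefer fallocate() and fall back to writing
 * zeroes when the filesystem does not support it.  The helper name, the
 * exact fallback condition, and the bulkwrite = false choice are assumptions
 * of this example.
 */
static void
zero_extend_segment(File file, off_t offset, off_t amount)
{
	int			ret;

	ret = FileFallocate(file, offset, amount, WAIT_EVENT_DATA_FILE_EXTEND);
	if (ret != 0 && errno == EOPNOTSUPP)
		ret = FileZero(file, offset, amount, WAIT_EVENT_DATA_FILE_EXTEND, false);

	if (ret != 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not extend file \"%s\": %m",
						FilePathName(file))));
}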
isRedo); @@ -49,9 +52,14 @@ extern int mdsyncfiletag(const FileTag *ftag, char *path); extern int mdunlinkfiletag(const FileTag *ftag, char *path); extern bool mdfiletagmatches(const FileTag *ftag, const FileTag *candidate); -/* POLAR: bulk io */ -extern void polar_mdbulkextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, - int blockCount, char *buffer, bool skipFsync); +/* POLAR */ extern void polar_mdbulkread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, - int blockCount, char *buffer); + int nblocks, void *buffer); +extern void polar_mdbulkwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + int nblocks, const void *buffer, bool skipFsync); +extern void polar_mdbulkextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + int nblocks, const void *buffer, bool skipFsync); + +/* POLAR end */ + #endif /* MD_H */ diff --git a/src/include/storage/polar_copybuf.h b/src/include/storage/polar_copybuf.h index 1841c620dee..9bc0cf30c5e 100644 --- a/src/include/storage/polar_copybuf.h +++ b/src/include/storage/polar_copybuf.h @@ -19,7 +19,7 @@ * limitations under the License. * * IDENTIFICATION - * src/include/storage/polar_copybuf.h + * src/include/storage/polar_copybuf.h * *------------------------------------------------------------------------- */ @@ -72,6 +72,9 @@ typedef union CopyBufferDescPadded char pad[COPYBUFFERDESC_PAD_TO_SIZE]; } CopyBufferDescPadded; +StaticAssertDecl(sizeof(CopyBufferDesc) <= COPYBUFFERDESC_PAD_TO_SIZE, + "padding size is too small to fit CopyBufferDesc"); + #define CopyBufHdrGetBlock(copy_buf_hdr) \ ((Block) (polar_copy_buffer_blocks + ((Size) (copy_buf_hdr)->buf_id) * BLCKSZ)) diff --git a/src/include/storage/polar_fd.h b/src/include/storage/polar_fd.h index 46d0458b934..cd84c17471d 100644 --- a/src/include/storage/polar_fd.h +++ b/src/include/storage/polar_fd.h @@ -18,7 +18,7 @@ * limitations under the License. 
* * IDENTIFICATION - * src/include/storage/polar_fd.h + * src/include/storage/polar_fd.h * *------------------------------------------------------------------------- */ @@ -156,7 +156,8 @@ typedef struct vfs_mgr int (*vfs_fsync) (int fd); int (*vfs_unlink) (const char *path); int (*vfs_rename) (const char *oldpath, const char *newpath); - int (*vfs_fallocate) (int fd, off_t offset, off_t len); + int (*vfs_posix_fallocate) (int fd, off_t offset, off_t len); + int (*vfs_fallocate) (int fd, int mode, off_t offset, off_t len); int (*vfs_ftruncate) (int fd, off_t len); int (*vfs_truncate) (const char *path, off_t len); DIR *(*vfs_opendir) (const char *path); @@ -170,10 +171,17 @@ typedef struct vfs_mgr int (*vfs_sync_file_range) (int fd, off_t offset, off_t nbytes, unsigned int flags); int (*vfs_posix_fadvise) (int fd, off_t offset, off_t len, int advice); int (*vfs_umount) (char *ftype, const char *pbdname); + PolarVFSKind (*vfs_type) (int fd); } vfs_mgr; extern vfs_mgr polar_vfs[]; +static inline PolarVFSKind +polar_bufferio_vfs_type(int fd) +{ + return POLAR_VFS_LOCAL_BIO; +} + extern ssize_t polar_read_line(int fd, void *buffer, size_t len); extern int polar_copy_file(char *fromfile, char *tofile, bool skiperr); extern void polar_copydir(char *fromdir, char *todir, bool recurse, bool clean, bool skip_file_err); @@ -327,9 +335,15 @@ polar_rename(const char *oldfile, const char *newfile) } static inline int -polar_fallocate(int fd, off_t offset, off_t len) +polar_posix_fallocate(int fd, off_t offset, off_t len) +{ + return polar_vfs[polar_vfs_switch].vfs_posix_fallocate(fd, offset, len); +} + +static inline int +polar_fallocate(int fd, int mode, off_t offset, off_t len) { - return polar_vfs[polar_vfs_switch].vfs_fallocate(fd, offset, len); + return polar_vfs[polar_vfs_switch].vfs_fallocate(fd, mode, offset, len); } static inline int @@ -486,4 +500,10 @@ polar_umount(char *ftype, const char *pbdname) return rc; } +static inline PolarVFSKind +polar_vfs_type(int fd) +{ + return polar_vfs[polar_vfs_switch].vfs_type(fd); +} + #endif diff --git a/src/include/storage/polar_xlogbuf.h b/src/include/storage/polar_xlogbuf.h index 548fc687107..0b4f031ebe6 100644 --- a/src/include/storage/polar_xlogbuf.h +++ b/src/include/storage/polar_xlogbuf.h @@ -97,6 +97,9 @@ typedef union polar_xlog_buffer_desc_padded char pad[XLOGBUFFERDESC_PAD_TO_SIZE]; } polar_xlog_buffer_desc_padded; +StaticAssertDecl(sizeof(polar_xlog_buffer_desc) <= XLOGBUFFERDESC_PAD_TO_SIZE, + "padding size is too small to fit polar_xlog_buffer_desc"); + typedef struct polar_xlog_buffer_ctl_t { polar_xlog_buffer_desc_padded *buffer_descriptors; diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h index a415f3ac0c0..d40ad2ee183 100644 --- a/src/include/storage/smgr.h +++ b/src/include/storage/smgr.h @@ -105,13 +105,15 @@ extern void smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo); extern void smgrdosyncall(SMgrRelation *rels, int nrels); extern void smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo); extern void smgrextend(SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, char *buffer, bool skipFsync); + BlockNumber blocknum, const void *buffer, bool skipFsync); +extern void smgrzeroextend(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, int nblocks, bool skipFsync); extern bool smgrprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum); extern void smgrread(SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, char *buffer); + BlockNumber blocknum, void 
*buffer); extern void smgrwrite(SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, char *buffer, bool skipFsync); + BlockNumber blocknum, const void *buffer, bool skipFsync); extern void smgrwriteback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber nblocks); extern BlockNumber smgrnblocks(SMgrRelation reln, ForkNumber forknum); @@ -119,16 +121,25 @@ extern BlockNumber smgrnblocks_cached(SMgrRelation reln, ForkNumber forknum); extern void smgrtruncate(SMgrRelation reln, ForkNumber *forknum, int nforks, BlockNumber *nblocks); extern void smgrimmedsync(SMgrRelation reln, ForkNumber forknum); +extern void smgrregistersync(SMgrRelation reln, ForkNumber forknum); extern void AtEOXact_SMgr(void); extern bool ProcessBarrierSmgrRelease(void); -/* POLAR: bulk io */ +/* POLAR */ +#define POLAR_ZERO_EXTEND_NONE 0 +#define POLAR_ZERO_EXTEND_BULKWRITE 1 +#define POLAR_ZERO_EXTEND_FALLOCATE 2 + +extern PGDLLIMPORT int polar_zero_extend_method; + +extern void polar_smgrbulkread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + int nblocks, void *buffer); +extern void polar_smgrbulkwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + int nblocks, const void *buffer, bool skipFsync); extern void polar_smgrbulkextend(SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, int blockCount, char *buffer, bool skipFsync); + BlockNumber blocknum, int nblocks, const void *buffer, bool skipFsync); extern void polar_smgr_init_bulk_extend(SMgrRelation reln, ForkNumber forknum); extern void polar_smgr_clear_bulk_extend(SMgrRelation reln, ForkNumber forknum); -extern void polar_smgrbulkread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, - int blockCount, char *buffer); /* POLAR end */ diff --git a/src/include/utils/guc.h b/src/include/utils/guc.h index 8716c95a335..dfb8b63789a 100755 --- a/src/include/utils/guc.h +++ b/src/include/utils/guc.h @@ -272,10 +272,6 @@ extern struct config_generic *polar_parameter_check_name_internal(const char *gu /* POLAR end */ -/* POLAR */ -#define POLAR_MAX_BULK_IO_SIZE 64 -/* POLAR end */ - /* GUC vars that are actually declared in guc.c, rather than elsewhere */ extern PGDLLIMPORT bool Debug_print_plan; extern PGDLLIMPORT bool Debug_print_parse; @@ -615,22 +611,8 @@ extern bool polar_enable_track_lock_timing; extern bool polar_enable_track_network_stat; extern bool polar_enable_track_network_timing; -/* POLAR: bulk io */ -extern int polar_recovery_bulk_extend_size; -extern int polar_min_bulk_extend_table_size; -extern bool polar_enable_primary_recovery_bulk_extend; -extern int polar_bulk_extend_size; -extern int polar_bulk_read_size; - -extern int polar_index_bulk_extend_size; - - -extern int polar_index_create_bulk_extend_size; - /* POLAR end */ -/* POLAR: partial write */ -extern bool polar_has_partial_write; extern bool polar_find_in_string_list(const char *itemname, const char *stringlist); diff --git a/src/include/utils/guc_tables.h b/src/include/utils/guc_tables.h index 5d7e6f3e66e..f3aae6a0bae 100644 --- a/src/include/utils/guc_tables.h +++ b/src/include/utils/guc_tables.h @@ -100,7 +100,7 @@ enum config_group POLAR_REL_SIZE_CACHE, POLAR_BUFFER_MANAGEMENT, POLAR_PROXY, - POLAR_BULK_READ_EXTEND, + POLAR_IO_MANAGEMENT, /* POLAR end */ DEVELOPER_OPTIONS }; diff --git a/src/include/utils/palloc.h b/src/include/utils/palloc.h index b050ab53db9..2100597c2df 100644 --- a/src/include/utils/palloc.h +++ b/src/include/utils/palloc.h @@ -75,13 +75,11 @@ extern void 
*MemoryContextAllocExtended(MemoryContext context, Size size, int flags); extern void *MemoryContextAllocAligned(MemoryContext context, Size size, Size alignto, int flags); -extern void *MemoryContextAllocIOAligned(MemoryContext context, Size size, int flags); extern void *palloc(Size size); extern void *palloc0(Size size); extern void *palloc_extended(Size size, int flags); extern void *palloc_aligned(Size size, Size alignto, int flags); -extern void *palloc_io_aligned(Size size, int flags); extern pg_nodiscard void *repalloc(void *pointer, Size size); extern void pfree(void *pointer); diff --git a/src/include/utils/resowner_private.h b/src/include/utils/resowner_private.h index d01cccc27c1..61a72ed52f5 100644 --- a/src/include/utils/resowner_private.h +++ b/src/include/utils/resowner_private.h @@ -30,6 +30,11 @@ extern void ResourceOwnerEnlargeBuffers(ResourceOwner owner); extern void ResourceOwnerRememberBuffer(ResourceOwner owner, Buffer buffer); extern void ResourceOwnerForgetBuffer(ResourceOwner owner, Buffer buffer); +/* support for IO-in-progress management */ +extern void ResourceOwnerEnlargeBufferIOs(ResourceOwner owner); +extern void ResourceOwnerRememberBufferIO(ResourceOwner owner, Buffer buffer); +extern void ResourceOwnerForgetBufferIO(ResourceOwner owner, Buffer buffer); + /* support for local lock management */ extern void ResourceOwnerRememberLock(ResourceOwner owner, LOCALLOCK *locallock); extern void ResourceOwnerForgetLock(ResourceOwner owner, LOCALLOCK *locallock); diff --git a/src/polar_vfs/polar_bufferio.c b/src/polar_vfs/polar_bufferio.c index f37090d26a0..ae8b99b0067 100644 --- a/src/polar_vfs/polar_bufferio.c +++ b/src/polar_vfs/polar_bufferio.c @@ -57,7 +57,12 @@ const vfs_mgr polar_vfs_bio = #endif .vfs_unlink = unlink, .vfs_rename = rename, - .vfs_fallocate = posix_fallocate, + .vfs_posix_fallocate = posix_fallocate, +#ifdef __linux__ + .vfs_fallocate = fallocate, +#else + .vfs_fallocate = NULL, +#endif .vfs_ftruncate = ftruncate, .vfs_truncate = truncate, .vfs_opendir = opendir, @@ -68,4 +73,5 @@ const vfs_mgr polar_vfs_bio = .vfs_mgr_func = NULL, .vfs_chmod = chmod, .vfs_mmap = mmap, + .vfs_type = polar_bufferio_vfs_type, }; diff --git a/src/polar_vfs/polar_directio.c b/src/polar_vfs/polar_directio.c index a16fc0c1bb3..186743d05ee 100644 --- a/src/polar_vfs/polar_directio.c +++ b/src/polar_vfs/polar_directio.c @@ -40,6 +40,12 @@ polar_directio_fsync(int fd) #endif } +static inline PolarVFSKind +polar_directio_vfs_type(int fd) +{ + return POLAR_VFS_LOCAL_DIO; +} + /* * Local file system interface with O_DIRECT flag. 
* It use original file system interface to do other jobs @@ -73,7 +79,12 @@ const vfs_mgr polar_vfs_dio = .vfs_fsync = polar_directio_fsync, .vfs_unlink = unlink, .vfs_rename = rename, - .vfs_fallocate = posix_fallocate, + .vfs_posix_fallocate = posix_fallocate, +#ifdef __linux__ + .vfs_fallocate = fallocate, +#else + .vfs_fallocate = NULL, +#endif .vfs_ftruncate = ftruncate, .vfs_truncate = truncate, .vfs_opendir = opendir, @@ -84,6 +95,7 @@ const vfs_mgr polar_vfs_dio = .vfs_mgr_func = NULL, .vfs_chmod = chmod, .vfs_mmap = mmap, + .vfs_type = polar_directio_vfs_type, }; /* @@ -125,9 +137,9 @@ polar_directio_write(int fd, const void *buf, size_t len) if (offset < 0) return res; - if (POLAR_DIECRTIO_IS_ALIGNED(buf) && - POLAR_DIECRTIO_IS_ALIGNED(len) && - POLAR_DIECRTIO_IS_ALIGNED(offset)) + if (POLAR_DIRECTIO_IS_ALIGNED(buf) && + POLAR_DIRECTIO_IS_ALIGNED(len) && + POLAR_DIRECTIO_IS_ALIGNED(offset)) return write(fd, buf, len); res = polar_directio_pwrite(fd, buf, len, offset); @@ -148,9 +160,9 @@ polar_directio_read(int fd, void *buf, size_t len) if (offset < 0) return res; - if (POLAR_DIECRTIO_IS_ALIGNED(buf) && - POLAR_DIECRTIO_IS_ALIGNED(len) && - POLAR_DIECRTIO_IS_ALIGNED(offset)) + if (POLAR_DIRECTIO_IS_ALIGNED(buf) && + POLAR_DIRECTIO_IS_ALIGNED(len) && + POLAR_DIRECTIO_IS_ALIGNED(offset)) return read(fd, buf, len); res = polar_directio_pread(fd, buf, len, offset); @@ -187,9 +199,9 @@ polar_directio_pread(int fd, void *buffer, size_t len, off_t offset) off_t nleft; ssize_t cplen; - if (POLAR_DIECRTIO_IS_ALIGNED(buffer) && - POLAR_DIECRTIO_IS_ALIGNED(len) && - POLAR_DIECRTIO_IS_ALIGNED(offset)) + if (POLAR_DIRECTIO_IS_ALIGNED(buffer) && + POLAR_DIRECTIO_IS_ALIGNED(len) && + POLAR_DIRECTIO_IS_ALIGNED(offset)) return pread(fd, buffer, len, offset); from = (char *) buffer; @@ -208,19 +220,19 @@ polar_directio_pread(int fd, void *buffer, size_t len, off_t offset) nleft > 0) { off = head_start; - res = pread(fd, buf, POLAR_DIRECTIO_ALIGN_LEN, off); + res = pread(fd, buf, PG_IO_ALIGN_SIZE, off); if (res < 0) return res; - else if (res <= (offset & (POLAR_DIRECTIO_ALIGN_LEN - 1))) + else if (res <= (offset & (PG_IO_ALIGN_SIZE - 1))) return count; else { - cplen = Min(res - (offset & (POLAR_DIRECTIO_ALIGN_LEN - 1)), len); + cplen = Min(res - (offset & (PG_IO_ALIGN_SIZE - 1)), len); cplen = Min(nleft, cplen); } - memcpy(from, buf + (offset & (POLAR_DIRECTIO_ALIGN_LEN - 1)), cplen); + memcpy(from, buf + (offset & (PG_IO_ALIGN_SIZE - 1)), cplen); from += cplen; count += cplen; nleft -= cplen; @@ -259,13 +271,13 @@ polar_directio_pread(int fd, void *buffer, size_t len, off_t offset) nleft > 0) { off = tail_start; - res = pread(fd, buf, POLAR_DIRECTIO_ALIGN_LEN, off); + res = pread(fd, buf, PG_IO_ALIGN_SIZE, off); if (res < 0) return res; else { - cplen = Min(res, ((offset + len) & (POLAR_DIRECTIO_ALIGN_LEN - 1))); + cplen = Min(res, ((offset + len) & (PG_IO_ALIGN_SIZE - 1))); cplen = Min(nleft, cplen); } @@ -289,9 +301,9 @@ polar_directio_preadv(int fd, const struct iovec *iov, int iovcnt, off_t offset) for (i = 0; i < iovcnt; i++) { - if (aligned && (!POLAR_DIECRTIO_IS_ALIGNED(iov[i].iov_base) || - !POLAR_DIECRTIO_IS_ALIGNED(iov[i].iov_len) || - !POLAR_DIECRTIO_IS_ALIGNED(offset))) + if (aligned && (!POLAR_DIRECTIO_IS_ALIGNED(iov[i].iov_base) || + !POLAR_DIRECTIO_IS_ALIGNED(iov[i].iov_len) || + !POLAR_DIRECTIO_IS_ALIGNED(offset))) aligned = false; bytes += iov[i].iov_len; @@ -341,15 +353,15 @@ polar_directio_pwrite(int fd, const void *buffer, size_t len, off_t offset) #define 
POLAR_DIRECTIO_PWRITE_SECTION(start, len) \ do \ { \ - MemSet(buf, 0x0, POLAR_DIRECTIO_ALIGN_LEN); \ - res = pread(fd, buf, POLAR_DIRECTIO_ALIGN_LEN, off); \ + MemSet(buf, 0x0, PG_IO_ALIGN_SIZE); \ + res = pread(fd, buf, PG_IO_ALIGN_SIZE, off); \ if (res < 0) \ return res; \ memcpy(buf + start, from, len); \ - res = pwrite(fd, buf, POLAR_DIRECTIO_ALIGN_LEN, off); \ + res = pwrite(fd, buf, PG_IO_ALIGN_SIZE, off); \ if (res < 0) \ return res; \ - Assert(res == POLAR_DIRECTIO_ALIGN_LEN); \ + Assert(res == PG_IO_ALIGN_SIZE); \ from += len; \ count += len; \ nleft -= len; \ @@ -369,9 +381,9 @@ polar_directio_pwrite(int fd, const void *buffer, size_t len, off_t offset) bool need_truncate = false; struct stat stat_buf; - if (POLAR_DIECRTIO_IS_ALIGNED(buffer) && - POLAR_DIECRTIO_IS_ALIGNED(len) && - POLAR_DIECRTIO_IS_ALIGNED(offset)) + if (POLAR_DIRECTIO_IS_ALIGNED(buffer) && + POLAR_DIRECTIO_IS_ALIGNED(len) && + POLAR_DIRECTIO_IS_ALIGNED(offset)) return pwrite(fd, buffer, len, offset); from = (char *) buffer; @@ -389,7 +401,7 @@ polar_directio_pwrite(int fd, const void *buffer, size_t len, off_t offset) * Whether we should truncate file to expected size or not. stat_buf * constains the original file's states including size. */ - if (!POLAR_DIECRTIO_IS_ALIGNED(offset + len)) + if (!POLAR_DIRECTIO_IS_ALIGNED(offset + len)) { res = fstat(fd, &stat_buf); if (res < 0) @@ -403,8 +415,8 @@ polar_directio_pwrite(int fd, const void *buffer, size_t len, off_t offset) nleft > 0) { off = head_start; - cplen = Min(nleft, POLAR_DIRECTIO_ALIGN_LEN - (offset & (POLAR_DIRECTIO_ALIGN_LEN - 1))); - POLAR_DIRECTIO_PWRITE_SECTION((offset & (POLAR_DIRECTIO_ALIGN_LEN - 1)), cplen); + cplen = Min(nleft, PG_IO_ALIGN_SIZE - (offset & (PG_IO_ALIGN_SIZE - 1))); + POLAR_DIRECTIO_PWRITE_SECTION((offset & (PG_IO_ALIGN_SIZE - 1)), cplen); } /* write the middle sections */ @@ -436,7 +448,7 @@ polar_directio_pwrite(int fd, const void *buffer, size_t len, off_t offset) nleft > 0) { off = tail_start; - cplen = Min(nleft, (offset + len) & (POLAR_DIRECTIO_ALIGN_LEN - 1)); + cplen = Min(nleft, (offset + len) & (PG_IO_ALIGN_SIZE - 1)); POLAR_DIRECTIO_PWRITE_SECTION(0, cplen); } @@ -463,9 +475,9 @@ polar_directio_pwritev(int fd, const struct iovec *iov, int iovcnt, off_t offset for (i = 0; i < iovcnt; i++) { - if (aligned && (!POLAR_DIECRTIO_IS_ALIGNED(iov[i].iov_base) || - !POLAR_DIECRTIO_IS_ALIGNED(iov[i].iov_len) || - !POLAR_DIECRTIO_IS_ALIGNED(offset))) + if (aligned && (!POLAR_DIRECTIO_IS_ALIGNED(iov[i].iov_base) || + !POLAR_DIRECTIO_IS_ALIGNED(iov[i].iov_len) || + !POLAR_DIRECTIO_IS_ALIGNED(offset))) aligned = false; bytes += iov[i].iov_len; diff --git a/src/polar_vfs/polar_pfsd.c b/src/polar_vfs/polar_pfsd.c index 84e25c3b226..5dd4015fb97 100644 --- a/src/polar_vfs/polar_pfsd.c +++ b/src/polar_vfs/polar_pfsd.c @@ -42,6 +42,12 @@ static ssize_t polar_pfsd_pwritev(int fd, const struct iovec *iov, int iovcnt, o int max_pfsd_io_size = PFSD_DEFAULT_MAX_IOSIZE; +static inline PolarVFSKind +polar_pfsd_vfs_type(int fd) +{ + return POLAR_VFS_PFS; +} + /* * Pfsd file system interface. * It use original pfsd's file access interface. 
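For readers following the new fd.h primitives introduced earlier in this patch, here is a rough, hypothetical sketch (not part of the patch) of how an smgr-level zero-extend could choose between FileFallocate() and FileZero() based on the polar_zero_extend_method GUC declared in smgr.h. The function name, the wait event, and the meaning of FileZero()'s final bulk-write flag are assumptions for illustration only; the real call sites live in md.c and are not shown in these hunks.

#include "postgres.h"
#include "storage/fd.h"
#include "storage/smgr.h"
#include "utils/wait_event.h"

/* Sketch only: pick a zero-extend primitive per polar_zero_extend_method. */
static void
zero_extend_segment(File seg_fd, off_t seekpos, int nblocks)
{
	off_t		nbytes = (off_t) nblocks * BLCKSZ;
	int			ret;

	if (polar_zero_extend_method == POLAR_ZERO_EXTEND_FALLOCATE)
		/* Reserve space without writing zeros; cheap where fallocate works. */
		ret = FileFallocate(seg_fd, seekpos, nbytes,
							WAIT_EVENT_DATA_FILE_EXTEND);
	else
		/* Write explicit zeros; last argument is an assumed bulk-write hint. */
		ret = FileZero(seg_fd, seekpos, nbytes,
					   WAIT_EVENT_DATA_FILE_EXTEND,
					   polar_zero_extend_method == POLAR_ZERO_EXTEND_BULKWRITE);

	if (ret != 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not extend file: %m")));
}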
@@ -74,7 +80,8 @@ const vfs_mgr polar_vfs_pfsd = .vfs_fsync = pfsd_fsync, .vfs_unlink = pfsd_unlink, .vfs_rename = pfsd_rename, - .vfs_fallocate = pfsd_posix_fallocate, + .vfs_posix_fallocate = pfsd_posix_fallocate, + .vfs_fallocate = pfsd_fallocate, .vfs_ftruncate = pfsd_ftruncate, .vfs_truncate = pfsd_truncate, .vfs_opendir = pfsd_opendir, @@ -85,6 +92,7 @@ const vfs_mgr polar_vfs_pfsd = .vfs_mgr_func = NULL, .vfs_chmod = pfsd_chmod, .vfs_mmap = NULL, + .vfs_type = polar_pfsd_vfs_type, #else .vfs_env_init = NULL, .vfs_env_destroy = NULL, @@ -108,6 +116,7 @@ const vfs_mgr polar_vfs_pfsd = .vfs_fsync = NULL, .vfs_unlink = NULL, .vfs_rename = NULL, + .vfs_posix_fallocate = NULL, .vfs_fallocate = NULL, .vfs_ftruncate = NULL, .vfs_truncate = NULL, @@ -119,6 +128,7 @@ const vfs_mgr polar_vfs_pfsd = .vfs_mgr_func = NULL, .vfs_chmod = NULL, .vfs_mmap = NULL, + .vfs_type = NULL, #endif }; diff --git a/src/polar_vfs/polar_vfs.c b/src/polar_vfs/polar_vfs.c index a361e612e8e..66d1524be9f 100644 --- a/src/polar_vfs/polar_vfs.c +++ b/src/polar_vfs/polar_vfs.c @@ -238,11 +238,11 @@ polar_vfs_init(void) if (localfs_mode) { - if (!POLAR_DIECRTIO_IS_ALIGNED(polar_max_direct_io_size)) + if (!POLAR_DIRECTIO_IS_ALIGNED(polar_max_direct_io_size)) elog(FATAL, "polar_max_direct_io_size is not aligned!"); else if (polar_directio_buffer == NULL && posix_memalign((void **) &polar_directio_buffer, - POLAR_DIRECTIO_ALIGN_LEN, + PG_IO_ALIGN_SIZE, polar_max_direct_io_size) != 0) { elog(ERROR, "posix_memalign alloc polar_directio_buffer failed!"); diff --git a/src/polar_vfs/polar_vfs_fe.c b/src/polar_vfs/polar_vfs_fe.c index 762e94f8acb..8377920e13f 100644 --- a/src/polar_vfs/polar_vfs_fe.c +++ b/src/polar_vfs/polar_vfs_fe.c @@ -105,7 +105,12 @@ vfs_mgr polar_vfs[] = .vfs_fsync = fsync, .vfs_unlink = unlink, .vfs_rename = rename, - .vfs_fallocate = posix_fallocate, + .vfs_posix_fallocate = posix_fallocate, +#ifdef __linux__ + .vfs_fallocate = fallocate, +#else + .vfs_fallocate = NULL, +#endif .vfs_ftruncate = ftruncate, .vfs_truncate = truncate, .vfs_opendir = opendir, @@ -116,6 +121,7 @@ vfs_mgr polar_vfs[] = .vfs_mgr_func = polar_get_local_vfs_mgr, .vfs_chmod = chmod, .vfs_mmap = mmap, + .vfs_type = NULL, }, { .vfs_env_init = NULL, @@ -139,6 +145,7 @@ vfs_mgr polar_vfs[] = .vfs_fsync = NULL, .vfs_unlink = NULL, .vfs_rename = NULL, + .vfs_posix_fallocate = NULL, .vfs_fallocate = NULL, .vfs_ftruncate = NULL, .vfs_truncate = NULL, @@ -150,6 +157,7 @@ vfs_mgr polar_vfs[] = .vfs_mgr_func = NULL, .vfs_chmod = NULL, .vfs_mmap = NULL, + .vfs_type = NULL, } }; @@ -407,14 +415,14 @@ polar_vfs_init_fe(bool is_pfs, char *fname, char *storage_cluster_name, char *po if (localfs_mode) { - if (!POLAR_DIECRTIO_IS_ALIGNED(polar_max_direct_io_size)) + if (!POLAR_DIRECTIO_IS_ALIGNED(polar_max_direct_io_size)) { fprintf(stderr, "polar_max_direct_io_size is not aligned!\n"); exit(EXIT_FAILURE); } else if (polar_directio_buffer == NULL && posix_memalign((void **) &polar_directio_buffer, - POLAR_DIRECTIO_ALIGN_LEN, + PG_IO_ALIGN_SIZE, polar_max_direct_io_size) != 0) { fprintf(stderr, "posix_memalign alloc polar_directio_buffer failed!\n"); @@ -457,10 +465,10 @@ polar_vfs_init_fe(bool is_pfs, char *fname, char *storage_cluster_name, char *po * Unmount polar file system for frontend. */ void -polar_vfs_destory_fe(char *ftype, char *disk_name) +polar_vfs_destroy_fe(char *ftype, char *disk_name) { /* - * Do not destory polar vfs when instance is not in shared storage mode. 
+ * Do not destroy polar vfs when instance is not in shared storage mode. */ if (localfs_mode || !polar_enable_shared_storage_mode) return; @@ -713,11 +721,11 @@ polar_vfs_init_simple_fe(char *pgconfig, char *pg_datadir, int flag) } void -polar_vfs_destory_simple_fe(void) +polar_vfs_destroy_simple_fe(void) { if (polar_disk_name != NULL) { - polar_vfs_destory_fe(polar_datadir, polar_disk_name); + polar_vfs_destroy_fe(polar_datadir, polar_disk_name); pg_free(polar_disk_name); polar_disk_name = NULL; } diff --git a/src/polar_vfs/polar_vfs_interface.c b/src/polar_vfs/polar_vfs_interface.c index 1b2aaa17b35..278e0a9ba18 100644 --- a/src/polar_vfs/polar_vfs_interface.c +++ b/src/polar_vfs/polar_vfs_interface.c @@ -94,7 +94,8 @@ static int vfs_access(const char *path, int mode); static int vfs_fsync(int file); static int vfs_unlink(const char *fname); static int vfs_rename(const char *oldfile, const char *newfile); -static int vfs_fallocate(int file, off_t offset, off_t len); +static int vfs_posix_fallocate(int file, off_t offset, off_t len); +static int vfs_fallocate(int file, int mode, off_t offset, off_t len); static int vfs_ftruncate(int file, off_t len); static int vfs_truncate(const char *path, off_t len); @@ -114,6 +115,8 @@ static int vfs_chmod(const char *path, mode_t mode); static inline const char *polar_vfs_file_type_and_path(const char *path, int *kind); static void *vfs_mmap(void *start, size_t length, int prot, int flags, int file, off_t offset); +static PolarVFSKind vfs_type(int fd); + static const vfs_mgr *const vfs[POLAR_VFS_KIND_SIZE] = { /* Local file system interface. */ @@ -171,6 +174,7 @@ static const vfs_mgr vfs_interface = .vfs_fsync = vfs_fsync, .vfs_unlink = vfs_unlink, .vfs_rename = vfs_rename, + .vfs_posix_fallocate = vfs_posix_fallocate, .vfs_fallocate = vfs_fallocate, .vfs_ftruncate = vfs_ftruncate, .vfs_truncate = vfs_truncate, @@ -182,6 +186,7 @@ static const vfs_mgr vfs_interface = .vfs_mgr_func = vfs_get_mgr, .vfs_chmod = vfs_chmod, .vfs_mmap = vfs_mmap, + .vfs_type = vfs_type, }; bool localfs_mode = false; @@ -840,7 +845,40 @@ vfs_rename(const char *oldfile, const char *newfile) } static int -vfs_fallocate(int file, off_t offset, off_t len) +vfs_posix_fallocate(int file, off_t offset, off_t len) +{ + vfs_vfd *vfdP = NULL; + int rc = 0; + int save_errno; + + VFS_HOLD_INTERRUPTS(); + + CHECK_FD_REENTRANT_BEGIN(); + POLAR_VFS_FD_MASK_RMOVE(file); + vfdP = vfs_find_file(file); + + if (unlikely(polar_vfs_debug)) + elog(LOG, "vfs_posix_fallocate from %s", vfdP->file_name); + + if (polar_vfs_io_before_hook) + polar_vfs_io_before_hook(vfdP, 0, VFS_FALLOCATE); + + rc = vfs[vfdP->kind]->vfs_posix_fallocate(vfdP->fd, offset, len); + save_errno = errno; + + if (polar_vfs_io_after_hook) + polar_vfs_io_after_hook(vfdP, 0, VFS_FALLOCATE); + + CHECK_FD_REENTRANT_END(); + + VFS_RESUME_INTERRUPTS(); + + errno = save_errno; + return rc; +} + +static int +vfs_fallocate(int file, int mode, off_t offset, off_t len) { vfs_vfd *vfdP = NULL; int rc = 0; @@ -852,12 +890,13 @@ vfs_fallocate(int file, off_t offset, off_t len) POLAR_VFS_FD_MASK_RMOVE(file); vfdP = vfs_find_file(file); - elog(LOG, "vfs_fallocate from %s", vfdP->file_name); + if (unlikely(polar_vfs_debug)) + elog(LOG, "vfs_fallocate from %s", vfdP->file_name); if (polar_vfs_io_before_hook) polar_vfs_io_before_hook(vfdP, 0, VFS_FALLOCATE); - rc = vfs[vfdP->kind]->vfs_fallocate(vfdP->fd, offset, len); + rc = vfs[vfdP->kind]->vfs_fallocate(vfdP->fd, mode, offset, len); save_errno = errno; if (polar_vfs_io_after_hook) @@ -1272,3 
+1311,14 @@ vfs_mmap(void *start, size_t length, int prot, int flags, int file, off_t offset return vfs[vfdP->kind]->vfs_mmap(start, length, prot, flags, vfdP->fd, offset); } + +static PolarVFSKind +vfs_type(int fd) +{ + vfs_vfd *vfdP = NULL; + + POLAR_VFS_FD_MASK_RMOVE(fd); + vfdP = vfs_find_file(fd); + + return vfs[vfdP->kind]->vfs_type(vfdP->fd); +} diff --git a/src/test/modules/test_bulkio/.gitignore b/src/test/modules/test_bulkio/.gitignore new file mode 100644 index 00000000000..5dcb3ff9723 --- /dev/null +++ b/src/test/modules/test_bulkio/.gitignore @@ -0,0 +1,4 @@ +# Generated subdirectories +/log/ +/results/ +/tmp_check/ diff --git a/src/test/modules/test_bulkio/Makefile b/src/test/modules/test_bulkio/Makefile new file mode 100644 index 00000000000..4daf0189666 --- /dev/null +++ b/src/test/modules/test_bulkio/Makefile @@ -0,0 +1,20 @@ +# src/test/modules/test_bulkio/Makefile + +MODULE_big = test_bulkio +OBJS = test_bulkio.o $(WIN32RES) +PGFILEDESC = "test_bulkio - test code for bulk IO interface" + +EXTENSION = test_bulkio +DATA = test_bulkio--1.0.sql +REGRESS = test_bulkio + +ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +subdir = src/test/modules/test_bulkio +top_builddir = ../../../.. +include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif diff --git a/src/test/modules/test_bulkio/expected/test_bulkio.out b/src/test/modules/test_bulkio/expected/test_bulkio.out new file mode 100644 index 00000000000..dbb150a0e64 --- /dev/null +++ b/src/test/modules/test_bulkio/expected/test_bulkio.out @@ -0,0 +1,10 @@ +CREATE EXTENSION test_bulkio; +-- The default RELSEG_SIZE is 128GB on this PolarDB version, skip test with +-- polar_zero_extend_method = 'none' or 'bulkwrite' +set polar_zero_extend_method = 'fallocate'; +SELECT test_bulkio(); + test_bulkio +------------- + +(1 row) + diff --git a/src/test/modules/test_bulkio/sql/test_bulkio.sql b/src/test/modules/test_bulkio/sql/test_bulkio.sql new file mode 100644 index 00000000000..21ecae8211d --- /dev/null +++ b/src/test/modules/test_bulkio/sql/test_bulkio.sql @@ -0,0 +1,6 @@ +CREATE EXTENSION test_bulkio; + +-- The default RELSEG_SIZE is 128GB on this PolarDB version, skip test with +-- polar_zero_extend_method = 'none' or 'bulkwrite' +set polar_zero_extend_method = 'fallocate'; +SELECT test_bulkio(); diff --git a/src/test/modules/test_bulkio/test_bulkio--1.0.sql b/src/test/modules/test_bulkio/test_bulkio--1.0.sql new file mode 100644 index 00000000000..fffe8a339f3 --- /dev/null +++ b/src/test/modules/test_bulkio/test_bulkio--1.0.sql @@ -0,0 +1,8 @@ +/* src/test/modules/test_bulkio/test_bulkio--1.0.sql */ + +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "CREATE EXTENSION test_bulkio" to load this file. \quit + +CREATE FUNCTION test_bulkio() +RETURNS VOID STRICT +AS 'MODULE_PATHNAME' LANGUAGE C; diff --git a/src/test/modules/test_bulkio/test_bulkio.c b/src/test/modules/test_bulkio/test_bulkio.c new file mode 100644 index 00000000000..16b5372e663 --- /dev/null +++ b/src/test/modules/test_bulkio/test_bulkio.c @@ -0,0 +1,131 @@ +/*------------------------------------------------------------------------- + * + * test_bulkio.c + * Test module for bulk IO interface + * + * Copyright (c) 2024, Alibaba Group Holding Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * IDENTIFICATION + * src/test/modules/test_bulkio/test_bulkio.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "fmgr.h" +#include "miscadmin.h" +#include "storage/smgr.h" + +PG_MODULE_MAGIC; + +static int blk_range = 20; +static ForkNumber forknum = MAIN_FORKNUM; +PGIOAlignedBlock write_buffers[32]; +PGIOAlignedBlock read_buffers[32]; + +static void +test_bulkread(SMgrRelation smgr, BlockNumber begin_blkno) +{ + MemSet(read_buffers, 0, BLCKSZ * blk_range); + + for (int i = 0; i < blk_range; i++) + { + smgrextend(smgr, forknum, begin_blkno + i, &write_buffers[i], true); + smgrread(smgr, forknum, begin_blkno + i, &read_buffers[i]); + + /* Cross validation */ + if (memcmp(&write_buffers[i], &read_buffers[i], BLCKSZ) != 0) + elog(ERROR, "bulkio test read failed"); + } + + MemSet(read_buffers, 0, BLCKSZ * blk_range); + polar_smgrbulkread(smgr, forknum, begin_blkno, blk_range, &read_buffers); + + if (memcmp(&write_buffers, &read_buffers, BLCKSZ * blk_range) != 0) + elog(ERROR, "bulkio test bulk read failed"); + + smgrtruncate(smgr, &forknum, 1, &begin_blkno); +} + +static void +test_bulkwrite(SMgrRelation smgr, BlockNumber begin_blkno) +{ + smgrzeroextend(smgr, forknum, begin_blkno, blk_range, true); + + polar_smgrbulkwrite(smgr, forknum, begin_blkno, blk_range, &write_buffers, true); + + MemSet(read_buffers, 0, BLCKSZ * blk_range); + polar_smgrbulkread(smgr, forknum, begin_blkno, blk_range, &read_buffers); + + if (memcmp(&write_buffers, &read_buffers, BLCKSZ * blk_range) != 0) + elog(ERROR, "bulkio test bulk write failed"); + + smgrtruncate(smgr, &forknum, 1, &begin_blkno); +} + +static void +test_bulkextend(SMgrRelation smgr, BlockNumber begin_blkno) +{ + polar_smgrbulkextend(smgr, forknum, begin_blkno, blk_range, &write_buffers, true); + + MemSet(read_buffers, 0, BLCKSZ * blk_range); + polar_smgrbulkread(smgr, forknum, begin_blkno, blk_range, &read_buffers); + + if (memcmp(&write_buffers, &read_buffers, BLCKSZ * blk_range) != 0) + elog(ERROR, "bulkio test bulk extend failed"); + + smgrtruncate(smgr, &forknum, 1, &begin_blkno); +} + +static void +test_bulkio_aux(SMgrRelation smgr, BlockNumber begin_blkno) +{ + BlockNumber nblocks = smgrnblocks(smgr, forknum); + + if (begin_blkno - nblocks > 0) + smgrzeroextend(smgr, forknum, nblocks, begin_blkno - nblocks, true); + + for (int i = 0; i < blk_range; i++) + MemSet(&write_buffers[i], begin_blkno + i, BLCKSZ); + + test_bulkread(smgr, begin_blkno); + test_bulkwrite(smgr, begin_blkno); + test_bulkextend(smgr, begin_blkno); +} + +PG_FUNCTION_INFO_V1(test_bulkio); +Datum +test_bulkio(PG_FUNCTION_ARGS) +{ + BlockNumber zero_blkno = 0; + RelFileNode perf_rlocator = {MyDatabaseTableSpace, MyDatabaseId, 1}; + SMgrRelation smgr = smgropen(perf_rlocator, InvalidBackendId); + + if (!smgrexists(smgr, forknum)) + smgrcreate(smgr, forknum, false); + else + smgrtruncate(smgr, &forknum, 1, &zero_blkno); + + test_bulkio_aux(smgr, 0); + test_bulkio_aux(smgr, 1 * RELSEG_SIZE - 10); + test_bulkio_aux(smgr, 2 * RELSEG_SIZE - 10); + test_bulkio_aux(smgr, 3 * RELSEG_SIZE - 
10); + + smgrdounlinkall(&smgr, 1, false); + smgrclose(smgr); + + PG_RETURN_VOID(); +} diff --git a/src/test/modules/test_bulkio/test_bulkio.control b/src/test/modules/test_bulkio/test_bulkio.control new file mode 100644 index 00000000000..47db048caa2 --- /dev/null +++ b/src/test/modules/test_bulkio/test_bulkio.control @@ -0,0 +1,4 @@ +comment = 'Test code for bulk IO interface' +default_version = '1.0' +module_pathname = '$libdir/test_bulkio' +relocatable = true diff --git a/src/test/modules/test_polar_directio/test_directio.c b/src/test/modules/test_polar_directio/test_directio.c index 1ea2f99bf34..4f01c15c0e8 100644 --- a/src/test/modules/test_polar_directio/test_directio.c +++ b/src/test/modules/test_polar_directio/test_directio.c @@ -89,7 +89,7 @@ test_directio(PG_FUNCTION_ARGS) if (polar_directio_buffer == NULL && posix_memalign((void **) &polar_directio_buffer, - POLAR_DIRECTIO_ALIGN_LEN, + PG_IO_ALIGN_SIZE, polar_max_direct_io_size) != 0) elog(PANIC, "posix_memalign alloc polar_directio_buffer failed!"); @@ -442,7 +442,7 @@ test_aligned_buffer_offset_len(int directio_fd, int bufferio_fd) for (i = 0; i < SUBAPI_LOOP; i++) { len = POLAR_DIRECTIO_ALIGN(random() % polar_max_direct_io_size); - Assert(0 == posix_memalign((void **) &buffer, POLAR_DIRECTIO_ALIGN_LEN, len)); + Assert(0 == posix_memalign((void **) &buffer, PG_IO_ALIGN_SIZE, len)); MemSet(buffer, 0x4, len); Assert(0 == polar_stat(directio_file, &stat_buf)); offset = POLAR_DIRECTIO_ALIGN_DOWN(random() % stat_buf.st_size); diff --git a/src/test/polar_pl/Makefile b/src/test/polar_pl/Makefile index 52bb7ab580f..5fbc74c99b6 100644 --- a/src/test/polar_pl/Makefile +++ b/src/test/polar_pl/Makefile @@ -6,6 +6,9 @@ # #------------------------------------------------------------------------- +export enable_fault_injector +export with_ssl + EXTRA_INSTALL = external/polar_monitor EXTRA_INSTALL += contrib/pg_stat_statements @@ -17,8 +20,6 @@ subdir = src/test/polar_pl top_builddir = ../../.. include $(top_builddir)/src/Makefile.global -export with_ssl - check: $(prove_check) diff --git a/src/test/polar_pl/t/011_polar_bulk_extend.pl b/src/test/polar_pl/t/011_polar_bulk_extend.pl deleted file mode 100644 index 36a6015460e..00000000000 --- a/src/test/polar_pl/t/011_polar_bulk_extend.pl +++ /dev/null @@ -1,104 +0,0 @@ -# 011_polar_bulk_extend.pl -# In this cases, we will check bulk extending in 100 MB table. -# -# Copyright (c) 2024, Alibaba Group Holding Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -# IDENTIFICATION -# src/test/polar_pl/t/011_polar_bulk_extend.pl - -use strict; -use warnings; -use PostgreSQL::Test::Cluster; -use PostgreSQL::Test::Utils; -use Test::More; - -plan tests => 4; - -# Start Server -my $node_primary = PostgreSQL::Test::Cluster->new('primary'); -$node_primary->polar_init_primary; -$node_primary->start; - -# Create extension polar_monitor -$node_primary->safe_psql('postgres', - 'CREATE EXTENSION IF NOT EXISTS polar_monitor;'); - - -# Create table -$node_primary->safe_psql('postgres', - q[create table bulk_extend_tbl(id int8, value int8);]); - - -# Close the feature -$node_primary->safe_psql('postgres', - 'alter system set polar_bulk_extend_size = 0;'); -$node_primary->safe_psql('postgres', - 'alter system set polar_min_bulk_extend_table_size = 0;'); -$node_primary->reload; - -# Load Data -$node_primary->safe_psql('postgres', - q[INSERT INTO bulk_extend_tbl select generate_series,generate_series from generate_series(0, 12800*185 + 184);] -); - -is( $node_primary->safe_psql( - 'postgres', - 'select heap_bulk_extend_times = 0 from polar_pg_stat_bulk_extend_all_tables where relname=\'bulk_extend_tbl\';' - ), - 't', - 'heap_bulk_extend_times should be 0'); - -is( $node_primary->safe_psql( - 'postgres', - 'select heap_bulk_extend_blocks = 0 from polar_pg_stat_bulk_extend_all_tables where relname=\'bulk_extend_tbl\';' - ), - 't', - 'heap_bulk_extend_blocks should be 0'); - -# Reset table -$node_primary->safe_psql('postgres', q[drop table bulk_extend_tbl;]); - -# Open the feature -$node_primary->safe_psql('postgres', - 'alter system set polar_bulk_extend_size = 512;'); -$node_primary->reload; - -$node_primary->safe_psql('postgres', - q[create table bulk_extend_tbl(id int8, value int8);]); - -# Load Data -$node_primary->safe_psql('postgres', - q[INSERT INTO bulk_extend_tbl select generate_series,generate_series from generate_series(0, 12800*185 + 184);] -); - -my $bulk_extend_times = $node_primary->safe_psql('postgres', - 'select heap_bulk_extend_times > 20 from polar_pg_stat_bulk_extend_all_tables where relname=\'bulk_extend_tbl\';' -); - -my $bulk_extend_blocks = $node_primary->safe_psql('postgres', - 'select heap_bulk_extend_blocks > 10000 from polar_pg_stat_bulk_extend_all_tables where relname=\'bulk_extend_tbl\';' -); - -# For stable cases, we use > 20/ > 10000 instead of =25/=13312. And we print acutal values -is($bulk_extend_times, 't', - "heap_bulk_extend_times should be 25 > 20. But actual heap_bulk_extend_times is $bulk_extend_times." -); -is($bulk_extend_blocks, 't', - "heap_bulk_extend_blocks should be 13312 > 10000. But actual heap_bulk_extend_blocks is $bulk_extend_blocks." -); -print - "The actual heap_bulk_extend_times is $bulk_extend_times, actual heap_bulk_extend_blocks is $bulk_extend_blocks"; - -$node_primary->stop; diff --git a/src/test/polar_pl/t/012_polar_create_index_bulk_extend.pl b/src/test/polar_pl/t/012_polar_create_index_bulk_extend.pl deleted file mode 100644 index 6ef57a43aaf..00000000000 --- a/src/test/polar_pl/t/012_polar_create_index_bulk_extend.pl +++ /dev/null @@ -1,93 +0,0 @@ -use strict; -# 012_polar_create_index_bulk_extend.pl -# create index bulk extend test -# -# Copyright (c) 2024, Alibaba Group Holding Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# IDENTIFICATION -# src/test/polar_pl/t/012_polar_create_index_bulk_extend.pl - -use warnings; -use PostgreSQL::Test::Cluster; -use PostgreSQL::Test::Utils; -use Test::More; - -plan tests => 2; - -# Start Server -my $node_primary = PostgreSQL::Test::Cluster->new('primary'); -$node_primary->polar_init_primary; -$node_primary->start; - -# Create extension polar_monitor -$node_primary->safe_psql('postgres', - 'CREATE EXTENSION IF NOT EXISTS polar_monitor;'); - - -# Create table -$node_primary->safe_psql('postgres', - q[create table bulk_extend_tbl(id int8, value int8);]); - - -# Close the feature -$node_primary->safe_psql('postgres', - 'alter system set polar_index_create_bulk_extend_size = 0;'); -$node_primary->safe_psql('postgres', - 'alter system set polar_min_bulk_extend_table_size = 0;'); -$node_primary->reload; - -# Load Data -$node_primary->safe_psql('postgres', - q[INSERT INTO bulk_extend_tbl select generate_series,generate_series from generate_series(0, 12800*185 + 184);] -); - -# Create index -$node_primary->safe_psql('postgres', - q[CREATE INDEX bulk_extend_idx on bulk_extend_tbl(id);]); - -is( $node_primary->safe_psql( - 'postgres', - 'select idx_create_extend_times = 0 from polar_pg_stat_all_index_extend_stats where relname=\'bulk_extend_tbl\';' - ), - 't', - 'idx_create_extend_times should be 0'); - -# Reset table -$node_primary->safe_psql('postgres', q[drop table bulk_extend_tbl;]); - -# Open the feature -$node_primary->safe_psql('postgres', - 'alter system set polar_index_create_bulk_extend_size = 512;'); -$node_primary->reload; - -$node_primary->safe_psql('postgres', - q[create table bulk_extend_tbl(id int8, value int8);]); - -# Load Data -$node_primary->safe_psql('postgres', - q[INSERT INTO bulk_extend_tbl select generate_series,generate_series from generate_series(0, 12800*185 + 184);] -); - -$node_primary->safe_psql('postgres', - q[CREATE INDEX bulk_extend_idx on bulk_extend_tbl(id);]); - -is( $node_primary->safe_psql( - 'postgres', - 'select idx_create_extend_times = 13 from polar_pg_stat_all_index_extend_stats where relname=\'bulk_extend_tbl\';' - ), - 't', - 'idx_create_extend_times should be 13'); - -$node_primary->stop; diff --git a/src/test/polar_pl/t/013_polar_index_bulk_extend.pl b/src/test/polar_pl/t/013_polar_index_bulk_extend.pl deleted file mode 100644 index 3cc0aea793b..00000000000 --- a/src/test/polar_pl/t/013_polar_index_bulk_extend.pl +++ /dev/null @@ -1,77 +0,0 @@ -# 013_polar_index_bulk_extend.pl -# polar index insert bulk extend test -# -# Copyright (c) 2024, Alibaba Group Holding Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -# IDENTIFICATION -# src/test/polar_pl/t/013_polar_index_bulk_extend.pl - -use strict; -use warnings; -use PostgreSQL::Test::Cluster; -use PostgreSQL::Test::Utils; -use Test::More; - -plan tests => 2; - -my $node = PostgreSQL::Test::Cluster->new('primary'); -$node->polar_init_primary; -$node->start; - -# Set the min bulk extend table size to 0, so the index bulk -# extend always hits. -$node->safe_psql('postgres', - 'alter system set polar_min_bulk_extend_table_size = 0;'); - -# Set the index bulk extend size to 256 (2MB), the index -# size will larger than 2MB. -$node->safe_psql('postgres', - 'alter system set polar_index_bulk_extend_size = 256;'); -$node->reload; - -$node->safe_psql( - 'postgres', - q[create table test_index_bulk_extend(test1 int); - create index test_index on test_index_bulk_extend(test1);]); - -$node->safe_psql('postgres', - q[insert into test_index_bulk_extend values(1);]); - -# 2 * 1024 * 1024 = 2097152 = 2MB -is( $node->safe_psql( - 'postgres', - "select pg_indexes_size('test_index_bulk_extend') > 2097152;"), - 't', - 'index bulk extend 2MB'); - -$node->safe_psql('postgres', q[truncate test_index_bulk_extend;]); - -# Set the index bulk extend size to 512 (4MB), the index -# size will larger than 4MB. -$node->safe_psql('postgres', - 'alter system set polar_index_bulk_extend_size = 512;'); -$node->reload; - -$node->safe_psql('postgres', - q[insert into test_index_bulk_extend values(1);]); - -# 4 * 1024 * 1024 = 4194304 = 4MB -is( $node->safe_psql( - 'postgres', - "select pg_indexes_size('test_index_bulk_extend') > 4194304;"), - 't', - 'index bulk extend 4MB'); - -$node->stop; diff --git a/src/test/polar_pl/t/038_bulk_write.pl b/src/test/polar_pl/t/038_bulk_write.pl new file mode 100644 index 00000000000..26512ef3a25 --- /dev/null +++ b/src/test/polar_pl/t/038_bulk_write.pl @@ -0,0 +1,149 @@ +#!/usr/bin/perl + +# 038_bulk_write.pl +# +# Copyright (c) 2024, Alibaba Group Holding Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# IDENTIFICATION +# src/test/polar_pl/t/038_bulk_write.pl + +use strict; +use warnings; +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; + +my $node_primary; +my $node_replica; +my $node_standby; + +sub test_index_create() +{ + $node_primary->safe_psql('postgres', + 'CREATE TABLE t(id int, ir int4range)'); + $node_primary->safe_psql('postgres', + 'INSERT INTO t SELECT i, int4range(i, i+100) FROM generate_series(1,10000) AS i' + ); + $node_primary->safe_psql('postgres', 'CREATE INDEX ON t(id)'); + $node_primary->safe_psql('postgres', + 'CREATE INDEX ON t USING SPGIST (ir)'); + $node_primary->safe_psql('postgres', 'VACUUM FULL t'); + is( $node_primary->safe_psql( + 'postgres', 'SET enable_seqscan = off; SELECT count(*) FROM t'), + 10000, + "btree index is ok"); + $node_primary->safe_psql('postgres', 'TRUNCATE t'); + $node_primary->safe_psql('postgres', + 'INSERT INTO t SELECT i, int4range(i, i+100) FROM generate_series(1,10000) AS i' + ); + is( $node_primary->safe_psql( + 'postgres', 'SET enable_seqscan = off; SELECT count(*) FROM t'), + 10000, + "btree index is ok"); + + $node_primary->wait_for_catchup($node_replica); + is( $node_replica->safe_psql( + 'postgres', 'SET enable_seqscan = off; SELECT count(*) FROM t'), + 10000, + "btree index is ok in replica"); + + $node_primary->wait_for_catchup($node_standby); + is( $node_standby->safe_psql( + 'postgres', 'SET enable_seqscan = off; SELECT count(*) FROM t'), + 10000, + "btree index is ok in standby"); + + $node_primary->safe_psql('postgres', 'DROP TABLE t'); + $node_primary->safe_psql('postgres', + 'CREATE TABLE t(id int, ir int4range)'); + $node_primary->safe_psql('postgres', + 'INSERT INTO t SELECT i, int4range(i, i+100) FROM generate_series(1,10000) AS i' + ); + $node_primary->safe_psql('postgres', 'CREATE INDEX ON t(id)'); + $node_primary->stop('immediate'); + $node_primary->start; + is( $node_primary->safe_psql( + 'postgres', 'SET enable_seqscan = off; SELECT count(*) FROM t'), + 10000, + "btree index is ok after crash"); + + $node_primary->safe_psql('postgres', 'DROP TABLE t'); +} + +$node_primary = PostgreSQL::Test::Cluster->new('primary'); +$node_primary->polar_init_primary; + +$node_replica = PostgreSQL::Test::Cluster->new('replica'); +$node_replica->polar_init_replica($node_primary); + +$node_standby = PostgreSQL::Test::Cluster->new('standby'); +$node_standby->polar_init_standby($node_primary); + +$node_primary->start; + +$node_primary->polar_create_slot($node_replica->name); +$node_primary->polar_create_slot($node_standby->name); + +$node_replica->start; +$node_standby->start; + +# test bulk write with polar_zero_extend_method = none +$node_primary->append_conf('postgresql.conf', + "polar_zero_extend_method = none"); +$node_primary->reload; + +foreach my $maxpages (1, 2, 16, 128, 512) +{ + print("test maxpages=$maxpages\n"); + $node_primary->append_conf('postgresql.conf', + "polar_bulk_write_maxpages = $maxpages"); + $node_primary->reload; + test_index_create; +} + +# test bulk write with polar_zero_extend_method = bulkwrite +$node_primary->append_conf('postgresql.conf', + "polar_zero_extend_method = bulkwrite"); +$node_primary->reload; + +foreach my $maxpages (1, 2, 16, 128, 512) +{ + print("test maxpages=$maxpages\n"); + $node_primary->append_conf('postgresql.conf', + "polar_bulk_write_maxpages = $maxpages"); + $node_primary->reload; + test_index_create; +} + +# test bulk write with polar_zero_extend_method = fallocate +$node_primary->append_conf('postgresql.conf', + "polar_zero_extend_method = 
fallocate"); +$node_primary->reload; + +foreach my $maxpages (1, 2, 16, 128, 512) +{ + print("test maxpages=$maxpages\n"); + $node_primary->append_conf('postgresql.conf', + "polar_bulk_write_maxpages = $maxpages"); + $node_primary->reload; + test_index_create; +} + +# done with the node +$node_primary->stop; +$node_replica->stop; +$node_standby->stop; + +done_testing(); diff --git a/src/test/polar_pl/t/044_polar_zero_buffers.pl b/src/test/polar_pl/t/044_polar_zero_buffers.pl new file mode 100644 index 00000000000..1f7989c34ee --- /dev/null +++ b/src/test/polar_pl/t/044_polar_zero_buffers.pl @@ -0,0 +1,84 @@ +#!/usr/bin/perl +# 044_polar_zero_buffers.pl +# Test polar_pwrite_zeros with different size of GUC polar_zero_buffers. +# +# Copyright (c) 2024, Alibaba Group Holding Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# IDENTIFICATION +# src/test/polar_pl/t/044_polar_zero_buffers.pl + +use strict; +use warnings; +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; + +# Initialize primary node +my $node_primary = PostgreSQL::Test::Cluster->new('primary'); +$node_primary->init(allows_streaming => 1); +$node_primary->start; +my $backup_name = 'my_backup'; + +# Take backup +$node_primary->backup($backup_name); + +# Create streaming standby linking to primary +my $node_standby = PostgreSQL::Test::Cluster->new('standby'); +$node_standby->init_from_backup($node_primary, $backup_name, + has_streaming => 1); +$node_standby->start; + +sub test_polar_pwrite_zeros +{ + $node_primary->safe_psql('postgres', 'CREATE TABLE t(id int)'); + $node_primary->safe_psql('postgres', + 'INSERT INTO t SELECT generate_series(1,1000000)'); + $node_primary->safe_psql('postgres', 'CREATE INDEX ON t(id)'); + is( $node_primary->safe_psql( + 'postgres', 'SET enable_indexscan = off; SELECT count(*) FROM t'), + 1000000, + "heap is ok on primary"); + is( $node_primary->safe_psql( + 'postgres', 'SET enable_seqscan = off; SELECT count(*) FROM t'), + 1000000, + "btree index is ok on primary"); + + $node_primary->wait_for_catchup($node_standby); + + is( $node_standby->safe_psql( + 'postgres', 'SET enable_indexscan = off; SELECT count(*) FROM t'), + 1000000, + "heap is ok on standby"); + is( $node_standby->safe_psql( + 'postgres', 'SET enable_seqscan = off; SELECT count(*) FROM t'), + 1000000, + "btree index is ok on standby"); + + $node_primary->safe_psql('postgres', 'DROP TABLE t'); +} + +foreach my $zero_buffers (-1, 0, 1, 3, 16, 512) +{ + print("test zero_buffers=$zero_buffers\n"); + $node_primary->append_conf('postgresql.conf', + "polar_zero_buffers = $zero_buffers"); + $node_primary->restart; + $node_standby->append_conf('postgresql.conf', + "polar_zero_buffers = $zero_buffers"); + $node_standby->restart; + test_polar_pwrite_zeros; +} + +done_testing(); diff --git a/src/test/polar_pl/t/045_bulk_extend.pl b/src/test/polar_pl/t/045_bulk_extend.pl new file mode 100644 index 00000000000..6ac33ec1643 --- /dev/null +++ b/src/test/polar_pl/t/045_bulk_extend.pl @@ -0,0 +1,220 @@ +#!/usr/bin/perl +# 
045_bulk_extend.pl +# Test bulk extend for heap_tbl table and btree index. +# +# Copyright (c) 2024, Alibaba Group Holding Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# IDENTIFICATION +# src/test/polar_pl/t/045_bulk_extend.pl + +use strict; +use warnings; +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; + +# Initialize primary node +my $node_primary = PostgreSQL::Test::Cluster->new('primary'); +$node_primary->init(allows_streaming => 1); +$node_primary->append_conf('postgresql.conf', + "polar_heap_bulk_extend_size = 0"); +$node_primary->append_conf('postgresql.conf', + "polar_index_bulk_extend_size = 0"); +$node_primary->append_conf('postgresql.conf', + "polar_recovery_bulk_extend_size = 0"); +$node_primary->append_conf('postgresql.conf', "max_connections = 10"); +$node_primary->append_conf('postgresql.conf', "shared_buffers = 16MB"); +$node_primary->append_conf('postgresql.conf', "enable_seqscan = off"); +$node_primary->start; +my $backup_name = 'my_backup'; + +# Take backup +$node_primary->backup($backup_name); + +# Create streaming standby linking to primary +my $node_standby = PostgreSQL::Test::Cluster->new('standby'); +$node_standby->init_from_backup($node_primary, $backup_name, + has_streaming => 1); +$node_standby->start; + +$node_primary->safe_psql('postgres', 'CREATE EXTENSION amcheck'); +$node_primary->safe_psql('postgres', 'CREATE EXTENSION bloom'); + +$node_primary->safe_psql('postgres', 'CREATE TABLE heap_tbl(id int)'); +$node_primary->safe_psql('postgres', + 'CREATE INDEX btree_idx ON heap_tbl(id)'); +$node_primary->safe_psql('postgres', + 'CREATE TABLE misc_tbl(id int4, arr int4[], gp point, sp point, m int4)'); +$node_primary->safe_psql('postgres', + 'CREATE INDEX gin_idx ON misc_tbl USING gin(arr)'); +$node_primary->safe_psql('postgres', + 'CREATE INDEX gist_idx ON misc_tbl USING gist(gp)'); +$node_primary->safe_psql('postgres', + 'CREATE INDEX spgist_idx ON misc_tbl USING spgist(sp)'); +$node_primary->safe_psql('postgres', + 'CREATE INDEX bloom_idx ON misc_tbl USING bloom(m, id)'); + +my ($base_heap_size, $base_btree_size, $base_gin_size, + $base_gist_size, $base_spgist_size, $base_bloom_size); + +sub bulk_extend_sanity_check +{ + my $node = shift; + my $extend_size = shift; + + my $heap_size = $node->safe_psql('postgres', + "SELECT pg_relation_size('heap_tbl')/8192"); + my $btree_size = $node->safe_psql('postgres', + "SELECT pg_relation_size('btree_idx')/8192"); + my $gin_size = + $node->safe_psql('postgres', "SELECT pg_relation_size('gin_idx')/8192"); + my $gist_size = $node->safe_psql('postgres', + "SELECT pg_relation_size('gist_idx')/8192"); + my $spgist_size = $node->safe_psql('postgres', + "SELECT pg_relation_size('spgist_idx')/8192"); + my $bloom_size = $node->safe_psql('postgres', + "SELECT pg_relation_size('bloom_idx')/8192"); + + if ($extend_size == 0) + { + if ($node eq $node_primary) + { + $base_heap_size = $node->safe_psql('postgres', + "SELECT pg_relation_size('heap_tbl')/8192"); + $base_btree_size = 
$node->safe_psql('postgres', + "SELECT pg_relation_size('btree_idx')/8192"); + $base_gin_size = $node->safe_psql('postgres', + "SELECT pg_relation_size('gin_idx')/8192"); + $base_gist_size = $node->safe_psql('postgres', + "SELECT pg_relation_size('gist_idx')/8192"); + $base_spgist_size = $node->safe_psql('postgres', + "SELECT pg_relation_size('spgist_idx')/8192"); + $base_bloom_size = $node->safe_psql('postgres', + "SELECT pg_relation_size('bloom_idx')/8192"); + } + } + else + { + ok(($heap_size - $base_heap_size) < $extend_size, + 'no waste in heap bulk extend'); + print("heap_size: $heap_size, base_heap_size: $base_heap_size\n"); + ok(($btree_size - $base_btree_size) < $extend_size, + 'no waste in btree bulk extend'); + print("btree_size: $btree_size, base_btree_size: $base_btree_size\n"); + ok(($gin_size - $base_gin_size) < $extend_size, + 'no waste in gin bulk extend'); + print("gin_size: $gin_size, base_gin_size: $base_gin_size\n"); + ok(($gist_size - $base_gist_size) < $extend_size, + 'no waste in gist bulk extend'); + print("gist_size: $gist_size, base_gist_size: $base_gist_size\n"); + ok(($spgist_size - $base_spgist_size) < $extend_size, + 'no waste in spgist bulk extend'); + print( + "spgist_size: $spgist_size, base_spgist_size: $base_spgist_size\n" + ); + ok(($bloom_size - $base_bloom_size) < $extend_size, + 'no waste in bloom bulk extend'); + print("bloom_size: $bloom_size, base_bloom_size: $base_bloom_size\n"); + } + + # heap and btree got amcheck, use it + $node->safe_psql('postgres', "SELECT verify_heapam('heap_tbl')"); + if ($node eq $node_primary) + { + $node->safe_psql('postgres', + "SELECT bt_index_parent_check('btree_idx', 't', 't')"); + } + else + { + $node->safe_psql('postgres', + "SELECT bt_index_check('btree_idx', 't')"); + } + + is( $node->safe_psql( + 'postgres', + 'WITH rand AS (SELECT (floor(random() * 99998) + 3)::int v) + SELECT count(*) FROM misc_tbl, rand + WHERE arr @> array[1, rand.v::int]' + ), + 1, + 'gin index check'); + is( $node->safe_psql( + 'postgres', + 'WITH rand AS (SELECT ceil(random() * 100000)::int v) + SELECT count(*) FROM misc_tbl, rand + WHERE gp <@ box(point(rand.v*10,rand.v*10), point((rand.v+1)*10, (rand.v+1)*10))' + ), + 1, + 'gist index check'); + is( $node->safe_psql( + 'postgres', + 'WITH rand AS (SELECT ceil(random() * 100000)::int v) + SELECT count(*) FROM misc_tbl, rand + WHERE sp <@ box(point(rand.v*10,rand.v*10), point((rand.v+1)*10, (rand.v+1)*10))' + ), + 1, + 'spgist index check'); + is( $node->safe_psql( + 'postgres', + 'WITH rand AS (SELECT (ceil(random() * 99999) + 1)::int v) + SELECT count(*) FROM misc_tbl, rand + WHERE m = rand.v%10 and id = rand.v' + ), + 1, + 'bloom index check'); +} + +sub test_bulk_extend +{ + my $extend_size = shift; + + print("Test bulk extend with size of $extend_size blocks\n"); + + $node_primary->safe_psql('postgres', 'TRUNCATE TABLE heap_tbl'); + $node_primary->safe_psql('postgres', 'TRUNCATE TABLE misc_tbl'); + + $node_primary->safe_psql('postgres', + 'INSERT INTO heap_tbl SELECT generate_series(1,1000000)'); + $node_primary->safe_psql( + 'postgres', + "INSERT INTO misc_tbl + SELECT g, array[1, 2, g], point(g*10+1, g*10+1), point(g*10+1, g*10+1), g%10 + FROM generate_series(1, 100000) g" + ); + + bulk_extend_sanity_check($node_primary, $extend_size); + + $node_primary->stop('immediate'); + $node_primary->start; + bulk_extend_sanity_check($node_primary, $extend_size); + + $node_primary->wait_for_catchup($node_standby); + bulk_extend_sanity_check($node_standby, $extend_size); +} + +foreach my 
+foreach my $extend_size (0, 1, 3, 16, 512)
+{
+	print("test extend_size=$extend_size\n");
+	$node_primary->append_conf('postgresql.conf',
+		"polar_heap_bulk_extend_size = $extend_size");
+	$node_primary->append_conf('postgresql.conf',
+		"polar_index_bulk_extend_size = $extend_size");
+	$node_primary->append_conf('postgresql.conf',
+		"polar_recovery_bulk_extend_size = $extend_size");
+	$node_primary->reload;
+	test_bulk_extend $extend_size;
+}
+
+done_testing();
diff --git a/src/test/regress/expected/create_index.out b/src/test/regress/expected/create_index.out
index d55aec3a1d0..80f4bfe7cbb 100644
--- a/src/test/regress/expected/create_index.out
+++ b/src/test/regress/expected/create_index.out
@@ -2,6 +2,9 @@ --
 -- CREATE_INDEX
 -- Create ancillary data structures (i.e. indices)
 --
+-- Disable bulk extend to keep table sizes unchanged and thus keep plans stable.
+set polar_heap_bulk_extend_size = 0;
+set polar_index_bulk_extend_size = 0;
 -- directory paths are passed to us in environment variables
 \getenv abs_srcdir PG_ABS_SRCDIR
 --
diff --git a/src/test/regress/expected/polar/polar_index_bulk_extend_for_coverage.out b/src/test/regress/expected/polar/polar_index_bulk_extend_for_coverage.out
deleted file mode 100644
index 747e20f0a1e..00000000000
--- a/src/test/regress/expected/polar/polar_index_bulk_extend_for_coverage.out
+++ /dev/null
@@ -1,58 +0,0 @@
--- This regression test case is used for coverage testing.
--- The feature index bulk extend test is in test/polar_pl.
--- Index bulk extend is used in big table index insert,
--- the regression test cases don't create an big table.
-ALTER SYSTEM SET polar_index_bulk_extend_size = 512;
-ALTER SYSTEM SET polar_min_bulk_extend_table_size = 0;
-SELECT pg_reload_conf();
- pg_reload_conf 
-----------------
- t
-(1 row)
-
-SELECT pg_sleep(2);
- pg_sleep 
-----------
- 
-(1 row)
-
-show polar_index_bulk_extend_size;
- polar_index_bulk_extend_size 
-------------------------------
- 4MB
-(1 row)
-
-show polar_min_bulk_extend_table_size;
- polar_min_bulk_extend_table_size 
-----------------------------------
- 0
-(1 row)
-
-CREATE TABLE test_index_bulk_extend(test1 int, test2 int);
-CREATE INDEX test_index_bulk on test_index_bulk_extend(test1);
-INSERT INTO test_index_bulk_extend values(generate_series(1, 10000), generate_series(1, 10000));
-SELECT * FROM test_index_bulk_extend ORDER BY test1 limit 10;
- test1 | test2 
--------+-------
-     1 |     1
-     2 |     2
-     3 |     3
-     4 |     4
-     5 |     5
-     6 |     6
-     7 |     7
-     8 |     8
-     9 |     9
-    10 |    10
-(10 rows)
-
-DROP INDEX test_index_bulk;
-DROP TABLE test_index_bulk_extend;
-ALTER SYSTEM RESET polar_index_bulk_extend_size;
-ALTER SYSTEM RESET polar_min_bulk_extend_table_size;
-SELECT pg_reload_conf();
- pg_reload_conf 
-----------------
- t
-(1 row)
-
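A unit detail that the deleted expected output above makes visible: setting polar_index_bulk_extend_size = 512 is reported back by SHOW as 4MB, which is consistent with the GUC being measured in blocks of the default 8kB size. A small sketch of that arithmetic, under that assumption (not part of the patch):

# Assumption suggested by the expected output above: the GUC's unit is
# blocks, and a block is 8kB by default.
my $block_size = 8192;
my $setting_in_blocks = 512;
my $shown_mb = $setting_in_blocks * $block_size / (1024 * 1024);
print("512 blocks * 8kB = ${shown_mb}MB\n");    # prints 4MB, matching SHOW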
diff --git a/src/test/regress/expected/test_setup.out b/src/test/regress/expected/test_setup.out
index 391b36d1318..5900541141b 100644
--- a/src/test/regress/expected/test_setup.out
+++ b/src/test/regress/expected/test_setup.out
@@ -1,6 +1,9 @@
 --
 -- TEST_SETUP --- prepare environment expected by regression test scripts
 --
+-- Disable bulk extend to keep table sizes unchanged and thus keep plans stable.
+set polar_heap_bulk_extend_size = 0;
+set polar_index_bulk_extend_size = 0;
 -- directory paths and dlsuffix are passed to us in environment variables
 \getenv abs_srcdir PG_ABS_SRCDIR
 \getenv libdir PG_LIBDIR
diff --git a/src/test/regress/polar_check_schedule b/src/test/regress/polar_check_schedule
index 69c82eb9253..cc27a7895ad 100644
--- a/src/test/regress/polar_check_schedule
+++ b/src/test/regress/polar_check_schedule
@@ -20,4 +20,4 @@ polar_dir: polar
 test: force_unlogged_logged force_trans_ro_non_sup
 test: polar_parallel_bgwriter
 test: polar_invalid_memory_alloc_1 polar_shm_unused
-test: polar_support_gbk_encoding polar_copy_into_gbk polar_index_bulk_extend_for_coverage
+test: polar_support_gbk_encoding polar_copy_into_gbk
diff --git a/src/test/regress/sql/create_index.sql b/src/test/regress/sql/create_index.sql
index d8fded3d930..362531b6d51 100644
--- a/src/test/regress/sql/create_index.sql
+++ b/src/test/regress/sql/create_index.sql
@@ -3,6 +3,10 @@
 -- Create ancillary data structures (i.e. indices)
 --
 
+-- Disable bulk extend to keep table sizes unchanged and thus keep plans stable.
+set polar_heap_bulk_extend_size = 0;
+set polar_index_bulk_extend_size = 0;
+
 -- directory paths are passed to us in environment variables
 \getenv abs_srcdir PG_ABS_SRCDIR
 
diff --git a/src/test/regress/sql/polar/polar_index_bulk_extend_for_coverage.sql b/src/test/regress/sql/polar/polar_index_bulk_extend_for_coverage.sql
deleted file mode 100644
index e57144cb0a1..00000000000
--- a/src/test/regress/sql/polar/polar_index_bulk_extend_for_coverage.sql
+++ /dev/null
@@ -1,19 +0,0 @@
--- This regression test case is used for coverage testing.
--- The feature index bulk extend test is in test/polar_pl.
--- Index bulk extend is used in big table index insert,
--- the regression test cases don't create an big table.
-ALTER SYSTEM SET polar_index_bulk_extend_size = 512;
-ALTER SYSTEM SET polar_min_bulk_extend_table_size = 0;
-SELECT pg_reload_conf();
-SELECT pg_sleep(2);
-show polar_index_bulk_extend_size;
-show polar_min_bulk_extend_table_size;
-CREATE TABLE test_index_bulk_extend(test1 int, test2 int);
-CREATE INDEX test_index_bulk on test_index_bulk_extend(test1);
-INSERT INTO test_index_bulk_extend values(generate_series(1, 10000), generate_series(1, 10000));
-SELECT * FROM test_index_bulk_extend ORDER BY test1 limit 10;
-DROP INDEX test_index_bulk;
-DROP TABLE test_index_bulk_extend;
-ALTER SYSTEM RESET polar_index_bulk_extend_size;
-ALTER SYSTEM RESET polar_min_bulk_extend_table_size;
-SELECT pg_reload_conf();
\ No newline at end of file
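The deleted coverage script needed ALTER SYSTEM, pg_reload_conf() and a pg_sleep(2) before the new value became visible, whereas the session-level SET used by the new regression setup takes effect immediately in the issuing backend. A sketch of the simpler flow in TAP terms, reusing $node_primary from the test above (not part of the patch):

# Session-level SET is visible immediately in this backend, and only here,
# so no reload or sleep is needed.
my $val = $node_primary->safe_psql('postgres',
	"SET polar_index_bulk_extend_size = 0; SHOW polar_index_bulk_extend_size;");
print("session-level setting: $val\n");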
diff --git a/src/test/regress/sql/test_setup.sql b/src/test/regress/sql/test_setup.sql
index 02c0c84c3ad..2a0f3b5959b 100644
--- a/src/test/regress/sql/test_setup.sql
+++ b/src/test/regress/sql/test_setup.sql
@@ -2,6 +2,10 @@
 -- TEST_SETUP --- prepare environment expected by regression test scripts
 --
 
+-- Disable bulk extend to keep table sizes unchanged and thus keep plans stable.
+set polar_heap_bulk_extend_size = 0;
+set polar_index_bulk_extend_size = 0;
+
 -- directory paths and dlsuffix are passed to us in environment variables
 \getenv abs_srcdir PG_ABS_SRCDIR
 \getenv libdir PG_LIBDIR
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index 0eff305b4cd..3fee816fb01 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -323,6 +323,8 @@ BuildAccumulator
 BuiltinScript
 BulkInsertState
 BulkInsertStateData
+BulkWriteBuffer
+BulkWriteState
 CACHESIGN
 CAC_state
 CCFastEqualFN
@@ -1699,6 +1701,7 @@ PGEventResultDestroy
 PGFInfoFunction
 PGFileType
 PGFunction
+PGIOAlignedBlock
 PGLZ_HistEntry
 PGLZ_Strategy
 PGMessageField
@@ -1986,6 +1989,7 @@ PendingFsyncEntry
 PendingRelDelete
 PendingRelSync
 PendingUnlinkEntry
+PendingWrite
 PendingWriteback
 PerLockTagEntry
 PerlInterpreter
diff --git a/src/tools/polar_copyright_check.pl b/src/tools/polar_copyright_check.pl
index 98b010f9f44..19e8dcfe2fa 100755
--- a/src/tools/polar_copyright_check.pl
+++ b/src/tools/polar_copyright_check.pl
@@ -59,7 +59,10 @@
 my $invalid_comment_body = 'Invalid comment body';
 
-my @common_typo = ('poalr', 'wirte', 'wrod', 'confict');
+my @common_typo = (
+	'poalr', 'wirte', 'wrod', 'confict',
+	'enalbe', 'cleard', 'recognisable', 'exsits',
+	'conficts', 'sucess');
 my @standard_comment_prefix = ('\/\* POLAR px\:', '\/\* POLAR\:');
 
 #------ the max diff line before checking PG community files ------
@@ -552,6 +555,14 @@ sub c_apache_license_format_check
 		return 0;
 	}
 
+	# identification indent
+	if ($lines[$start_line] !~ m/^ \*\t .+/)
+	{
+		print
+		  " $logger_error $invalid_comment_body: invalid identification path indent.\n";
+		return 0;
+	}
+
 	# trim the identification and check
 	my $trim = substr($lines[$start_line], 2);
 	$trim = lstrip($trim);
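The new check requires the IDENTIFICATION path line of a C file header to be indented as " *", then a tab, then at least one space before the path. A standalone sketch of what that regex accepts and rejects (not part of the patch; the file path is made up for illustration):

my $regex = qr/^ \*\t .+/;

# " *", a tab, then a space before the path: accepted.
my $good = " *\t  contrib/polar_example/polar_example.c";
# Spaces instead of a tab after " *": rejected by the new check.
my $bad = " *    contrib/polar_example/polar_example.c";

print(($good =~ $regex) ? "good line passes\n" : "good line fails\n");
print(($bad =~ $regex)  ? "bad line passes\n"  : "bad line fails\n");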