diff --git a/contrib/bloom/blinsert.c b/contrib/bloom/blinsert.c index 805e8810ecf..a56a699a8d8 100644 --- a/contrib/bloom/blinsert.c +++ b/contrib/bloom/blinsert.c @@ -166,7 +166,7 @@ blbuildempty(Relation index) Page metapage; /* Construct metapage. */ - metapage = (Page) palloc_io_aligned(BLCKSZ, 0); + metapage = (Page) palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0); BloomFillMetapage(index, metapage); /* @@ -178,7 +178,7 @@ */ PageSetChecksumInplace(metapage, BLOOM_METAPAGE_BLKNO); smgrwrite(RelationGetSmgr(index), INIT_FORKNUM, BLOOM_METAPAGE_BLKNO, - (char *) metapage, true); + metapage, true); log_newpage(&(RelationGetSmgr(index))->smgr_rnode.node, INIT_FORKNUM, BLOOM_METAPAGE_BLKNO, metapage, true); diff --git a/contrib/bloom/blutils.c b/contrib/bloom/blutils.c index a434cf93efd..f091fd58166 100644 --- a/contrib/bloom/blutils.c +++ b/contrib/bloom/blutils.c @@ -15,6 +15,7 @@ #include "access/amapi.h" #include "access/generic_xlog.h" +#include "access/hio.h" #include "access/reloptions.h" #include "bloom.h" #include "catalog/index.h" @@ -391,7 +392,7 @@ BloomNewBuffer(Relation index) if (needLock) LockRelationForExtension(index, ExclusiveLock); - buffer = ReadBuffer(index, P_NEW); + buffer = polar_index_add_blocks(index); LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); if (needLock) diff --git a/contrib/pageinspect/Makefile b/contrib/pageinspect/Makefile index ad5a3ac5112..0e290ced337 100644 --- a/contrib/pageinspect/Makefile +++ b/contrib/pageinspect/Makefile @@ -22,7 +22,7 @@ DATA = pageinspect--1.10--1.11.sql \ pageinspect--1.0--1.1.sql PGFILEDESC = "pageinspect - functions to inspect contents of database pages" -REGRESS = page btree brin gin gist hash checksum oldextversions +REGRESS = page btree brin gin gist hash checksum oldextversions index_bulk_extend ifdef USE_PGXS PG_CONFIG = pg_config diff --git a/contrib/pageinspect/btreefuncs.c b/contrib/pageinspect/btreefuncs.c index 62f2c1b3159..2211a8ab23f 100644 --- a/contrib/pageinspect/btreefuncs.c +++ b/contrib/pageinspect/btreefuncs.c @@ -520,6 +520,18 @@ bt_page_items_internal(PG_FUNCTION_ARGS, enum pageinspect_version ext_version) uargs->page = palloc(BLCKSZ); memcpy(uargs->page, BufferGetPage(buffer), BLCKSZ); + /* + * POLAR: During bulk extend, pageinspect may access a zero page, which + * makes PageGetSpecialPointer fail an assertion. To handle this + * situation, we initialize these zero pages. + */ + if (PageIsNew(uargs->page)) + { + _bt_pageinit(uargs->page, BufferGetPageSize(buffer)); + elog(DEBUG1, "page from block " INT64_FORMAT " is new in index bulk extend", blkno); + } + /* POLAR end */ + UnlockReleaseBuffer(buffer); relation_close(rel, AccessShareLock); diff --git a/contrib/pageinspect/expected/gist.out b/contrib/pageinspect/expected/gist.out index d1adbab8ae2..2b1d54a6279 100644 --- a/contrib/pageinspect/expected/gist.out +++ b/contrib/pageinspect/expected/gist.out @@ -1,13 +1,6 @@ --- The gist_page_opaque_info() function prints the page's LSN. Normally, --- that's constant 1 (GistBuildLSN) on every page of a freshly built GiST --- index. But with wal_level=minimal, the whole relation is dumped to WAL at --- the end of the transaction if it's smaller than wal_skip_threshold, which --- updates the LSNs. Wrap the tests on gist_page_opaque_info() in the --- same transaction with the CREATE INDEX so that we see the LSNs before --- they are possibly overwritten at end of transaction. -BEGIN; --- Create a test table and GiST index. 
-CREATE TABLE test_gist AS SELECT point(i,i) p, i::text t FROM +-- The gist_page_opaque_info() function prints the page's LSN. +-- Use an unlogged index, so that the LSN is predictable. +CREATE UNLOGGED TABLE test_gist AS SELECT point(i,i) p, i::text t FROM generate_series(1,1000) i; CREATE INDEX test_gist_idx ON test_gist USING gist (p); -- Page 0 is the root, the rest are leaf pages @@ -29,7 +22,6 @@ SELECT * FROM gist_page_opaque_info(get_raw_page('test_gist_idx', 2)); 0/1 | 0/0 | 1 | {leaf} (1 row) -COMMIT; SELECT * FROM gist_page_items(get_raw_page('test_gist_idx', 0), 'test_gist_idx'); itemoffset | ctid | itemlen | dead | keys ------------+-----------+---------+------+------------------------------- diff --git a/contrib/pageinspect/expected/index_bulk_extend.out b/contrib/pageinspect/expected/index_bulk_extend.out new file mode 100644 index 00000000000..3fd298e8a6d --- /dev/null +++ b/contrib/pageinspect/expected/index_bulk_extend.out @@ -0,0 +1,45 @@ +create schema test_index_bulk_extend; +set search_path to test_index_bulk_extend; +set client_min_messages to error; +create extension if not exists pageinspect; +show polar_index_bulk_extend_size; + polar_index_bulk_extend_size +------------------------------ + 1MB +(1 row) + +drop table if exists p; +create table p(a int, b varchar, c numeric,d int8); +-- INSERT DATA +insert into p select +i, +md5(i::text), +i / 982.0, +i * -1 +from +generate_series(0,100000 - 1)i; +-- INIT INDEX +create index p_c_d_idx on p(c,d); +-- DML +insert into p select +i, +i::text || 'sdha&$#*&', +i / 160.0, +i / 32 +from +generate_series(0,100000 - 1)i; +-- For index bulk extend expansion +select pg_relation_size('p_c_d_idx') < 3 * pg_relation_size('p'); + ?column? +---------- + t +(1 row) + +-- For index bulk extend core +select count(*) from generate_series(1, pg_relation_size('p_c_d_idx') / current_setting('block_size')::bigint - 1) AS blkno,bt_page_items('p_c_d_idx', blkno); + count +-------- + 202716 +(1 row) + +drop schema test_index_bulk_extend cascade; diff --git a/contrib/pageinspect/sql/gist.sql b/contrib/pageinspect/sql/gist.sql index d263542ba15..85bc44b8000 100644 --- a/contrib/pageinspect/sql/gist.sql +++ b/contrib/pageinspect/sql/gist.sql @@ -1,14 +1,6 @@ --- The gist_page_opaque_info() function prints the page's LSN. Normally, --- that's constant 1 (GistBuildLSN) on every page of a freshly built GiST --- index. But with wal_level=minimal, the whole relation is dumped to WAL at --- the end of the transaction if it's smaller than wal_skip_threshold, which --- updates the LSNs. Wrap the tests on gist_page_opaque_info() in the --- same transaction with the CREATE INDEX so that we see the LSNs before --- they are possibly overwritten at end of transaction. -BEGIN; - --- Create a test table and GiST index. -CREATE TABLE test_gist AS SELECT point(i,i) p, i::text t FROM +-- The gist_page_opaque_info() function prints the page's LSN. +-- Use an unlogged index, so that the LSN is predictable. 
+CREATE UNLOGGED TABLE test_gist AS SELECT point(i,i) p, i::text t FROM generate_series(1,1000) i; CREATE INDEX test_gist_idx ON test_gist USING gist (p); @@ -17,8 +9,6 @@ SELECT * FROM gist_page_opaque_info(get_raw_page('test_gist_idx', 0)); SELECT * FROM gist_page_opaque_info(get_raw_page('test_gist_idx', 1)); SELECT * FROM gist_page_opaque_info(get_raw_page('test_gist_idx', 2)); -COMMIT; - SELECT * FROM gist_page_items(get_raw_page('test_gist_idx', 0), 'test_gist_idx'); SELECT * FROM gist_page_items(get_raw_page('test_gist_idx', 1), 'test_gist_idx') LIMIT 5; diff --git a/contrib/pageinspect/sql/index_bulk_extend.sql b/contrib/pageinspect/sql/index_bulk_extend.sql new file mode 100644 index 00000000000..f0d473391a6 --- /dev/null +++ b/contrib/pageinspect/sql/index_bulk_extend.sql @@ -0,0 +1,38 @@ +create schema test_index_bulk_extend; +set search_path to test_index_bulk_extend; +set client_min_messages to error; + +create extension if not exists pageinspect; + +show polar_index_bulk_extend_size; + +drop table if exists p; +create table p(a int, b varchar, c numeric,d int8); + + +-- INSERT DATA +insert into p select +i, +md5(i::text), +i / 982.0, +i * -1 +from +generate_series(0,100000 - 1)i; +-- INIT INDEX +create index p_c_d_idx on p(c,d); + +-- DML +insert into p select +i, +i::text || 'sdha&$#*&', +i / 160.0, +i / 32 +from +generate_series(0,100000 - 1)i; + +-- For index bulk extend expansion +select pg_relation_size('p_c_d_idx') < 3 * pg_relation_size('p'); +-- For index bulk extend core +select count(*) from generate_series(1, pg_relation_size('p_c_d_idx') / current_setting('block_size')::bigint - 1) AS blkno,bt_page_items('p_c_d_idx', blkno); + +drop schema test_index_bulk_extend cascade; \ No newline at end of file diff --git a/contrib/pg_freespacemap/expected/pg_freespacemap.out b/contrib/pg_freespacemap/expected/pg_freespacemap.out index 5c6d50ef82b..eb574c23736 100644 --- a/contrib/pg_freespacemap/expected/pg_freespacemap.out +++ b/contrib/pg_freespacemap/expected/pg_freespacemap.out @@ -54,17 +54,12 @@ WITH rel AS (SELECT oid::regclass AS id FROM pg_class WHERE relname ~ 'freespace DELETE FROM freespace_tab; VACUUM freespace_tab; --- In bulk extend, we will pre-extend pages. --- And these pages will not be expected to vacuum truncated to avoid --- repeating bulk extenion and truncating. --- So the relation will exist in free space map. WITH rel AS (SELECT oid::regclass AS id FROM pg_class WHERE relname ~ 'freespace') SELECT rel.id, fsm.blkno, (fsm.avail > 0) AS is_avail FROM rel, LATERAL pg_freespace(rel.id) AS fsm ORDER BY 1, 2; id | blkno | is_avail -----------------+-------+---------- - freespace_tab | 0 | t freespace_brin | 0 | f freespace_brin | 1 | f freespace_brin | 2 | t @@ -80,7 +75,7 @@ WITH rel AS (SELECT oid::regclass AS id FROM pg_class WHERE relname ~ 'freespace freespace_hash | 7 | f freespace_hash | 8 | f freespace_hash | 9 | f -(16 rows) +(15 rows) -- failures with incorrect block number SELECT * FROM pg_freespace('freespace_tab', -1); diff --git a/contrib/pg_freespacemap/sql/pg_freespacemap.sql b/contrib/pg_freespacemap/sql/pg_freespacemap.sql index 5fc3ee38948..06275d8fac8 100644 --- a/contrib/pg_freespacemap/sql/pg_freespacemap.sql +++ b/contrib/pg_freespacemap/sql/pg_freespacemap.sql @@ -20,10 +20,6 @@ WITH rel AS (SELECT oid::regclass AS id FROM pg_class WHERE relname ~ 'freespace DELETE FROM freespace_tab; VACUUM freespace_tab; --- In bulk extend, we will pre-extend pages. 
--- And these pages will not be expected to vacuum truncated to avoid --- repeating bulk extenion and truncating. --- So the relation will exist in free space map. WITH rel AS (SELECT oid::regclass AS id FROM pg_class WHERE relname ~ 'freespace') SELECT rel.id, fsm.blkno, (fsm.avail > 0) AS is_avail FROM rel, LATERAL pg_freespace(rel.id) AS fsm diff --git a/contrib/pg_prewarm/pg_prewarm.c b/contrib/pg_prewarm/pg_prewarm.c index caff5c4a80f..f50aa69eb2e 100644 --- a/contrib/pg_prewarm/pg_prewarm.c +++ b/contrib/pg_prewarm/pg_prewarm.c @@ -36,7 +36,7 @@ typedef enum PREWARM_BUFFER } PrewarmType; -static PGAlignedBlock blockbuffer; +static PGIOAlignedBlock blockbuffer; /* * pg_prewarm(regclass, mode text, fork text, diff --git a/external/Makefile b/external/Makefile index 629d1f918e9..8d0467dda55 100644 --- a/external/Makefile +++ b/external/Makefile @@ -4,7 +4,7 @@ subdir = external top_builddir = .. include $(top_builddir)/src/Makefile.global -SUBDIRS = +SUBDIRS = # multi-arch/0-dependency/fast-compile extensions can be added here # sort extention by names, less git conflict @@ -15,6 +15,7 @@ SUBDIRS += polar_monitor_preload SUBDIRS += polar_parameter_manager SUBDIRS += polar_proxy_utils SUBDIRS += polar_resource_manager +SUBDIRS += polar_smgrperf SUBDIRS += polar_stat_env SUBDIRS += polar_worker diff --git a/external/polar_monitor/polar_monitor--1.0.sql b/external/polar_monitor/polar_monitor--1.0.sql index ea75e515f12..1e4a70d803a 100644 --- a/external/polar_monitor/polar_monitor--1.0.sql +++ b/external/polar_monitor/polar_monitor--1.0.sql @@ -674,32 +674,6 @@ LANGUAGE C PARALLEL SAFE; REVOKE ALL ON FUNCTION polar_xlog_buffer_stat_reset() FROM PUBLIC; -/* Per Index */ -CREATE FUNCTION polar_pg_stat_get_bulk_create_index_extend_times( - IN oid, - OUT int8 -) -AS 'MODULE_PATHNAME', 'polar_pg_stat_get_bulk_create_index_extend_times' -LANGUAGE C PARALLEL SAFE; - --- Create View for create index extend stats -CREATE VIEW polar_pg_stat_all_index_extend_stats AS - SELECT - C.oid AS relid, - N.nspname AS schemaname, - C.relname AS relname, - polar_pg_stat_get_bulk_create_index_extend_times(C.oid) AS idx_create_extend_times - FROM pg_class C LEFT JOIN - pg_index I ON C.oid = I.indrelid LEFT JOIN - pg_class T ON C.reltoastrelid = T.oid LEFT JOIN - pg_index X ON T.oid = X.indrelid - LEFT JOIN pg_namespace N ON (N.oid = C.relnamespace) - WHERE C.relkind IN ('r', 't', 'm') - GROUP BY C.oid, N.nspname, C.relname, T.oid, X.indrelid; - -REVOKE ALL ON FUNCTION polar_pg_stat_get_bulk_create_index_extend_times(IN oid,OUT int8) FROM PUBLIC; -/* POLAR end */ - CREATE FUNCTION polar_get_slot_node_type(slot_name text) RETURNS text AS 'MODULE_PATHNAME', 'polar_get_slot_node_type' diff --git a/external/polar_monitor/polar_monitor.c b/external/polar_monitor/polar_monitor.c index 55476b92aed..4161a30d344 100644 --- a/external/polar_monitor/polar_monitor.c +++ b/external/polar_monitor/polar_monitor.c @@ -220,24 +220,6 @@ polar_pg_stat_get_bulk_read_blocks_IO(PG_FUNCTION_ARGS) PG_RETURN_INT64(result); } -/* POLAR: Bulk create index extend stats */ -/* Per table (or index) */ -PG_FUNCTION_INFO_V1(polar_pg_stat_get_bulk_create_index_extend_times); -Datum -polar_pg_stat_get_bulk_create_index_extend_times(PG_FUNCTION_ARGS) -{ - Oid relid = PG_GETARG_OID(0); - int64 result; - PgStat_StatTabEntry *tabentry; - - if ((tabentry = pgstat_fetch_stat_tabentry(relid)) == NULL) - result = 0; - else - result = (int64) (tabentry->polar_bulk_create_index_extends_times); - - PG_RETURN_INT64(result); -} - 
PG_FUNCTION_INFO_V1(polar_get_slot_node_type); Datum polar_get_slot_node_type(PG_FUNCTION_ARGS) diff --git a/external/polar_smgrperf/.gitignore b/external/polar_smgrperf/.gitignore new file mode 100644 index 00000000000..5dcb3ff9723 --- /dev/null +++ b/external/polar_smgrperf/.gitignore @@ -0,0 +1,4 @@ +# Generated subdirectories +/log/ +/results/ +/tmp_check/ diff --git a/external/polar_smgrperf/Makefile b/external/polar_smgrperf/Makefile new file mode 100644 index 00000000000..9312373f9f8 --- /dev/null +++ b/external/polar_smgrperf/Makefile @@ -0,0 +1,21 @@ +# external/polar_smgrperf/Makefile + +MODULE_big = polar_smgrperf +OBJS = polar_smgrperf.o $(WIN32RES) + +EXTENSION = polar_smgrperf +DATA = polar_smgrperf--1.0.sql +PGFILEDESC = "polar_smgrperf - perf test on smgr" + +TAP_TESTS = 1 + +ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +subdir = external/polar_smgrperf +top_builddir = ../.. +include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif diff --git a/external/polar_smgrperf/polar_smgrperf--1.0.sql b/external/polar_smgrperf/polar_smgrperf--1.0.sql new file mode 100644 index 00000000000..ca4ca055201 --- /dev/null +++ b/external/polar_smgrperf/polar_smgrperf--1.0.sql @@ -0,0 +1,45 @@ +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "CREATE EXTENSION polar_smgrperf" to load this file. \quit + +CREATE FUNCTION polar_smgrperf_prepare( + nblocks INT DEFAULT 131072) +RETURNS VOID +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT; + +CREATE FUNCTION polar_smgrperf_cleanup() +RETURNS VOID +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT; + +CREATE FUNCTION polar_smgrperf_read( + bs INT DEFAULT 1, + begin_blkno INT DEFAULT 0, + end_blkno INT DEFAULT 131072, + sequential BOOLEAN DEFAULT TRUE) +RETURNS VOID +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT; + +CREATE FUNCTION polar_smgrperf_write( + bs INT DEFAULT 1, + begin_blkno INT DEFAULT 0, + end_blkno INT DEFAULT 131072, + sequential BOOLEAN DEFAULT TRUE) +RETURNS VOID +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT; + +CREATE FUNCTION polar_smgrperf_extend( + bs INT DEFAULT 1) +RETURNS VOID +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT; + +CREATE FUNCTION polar_smgrperf_nblocks( + relnumber OID DEFAULT 1, + nblocks_cached BOOLEAN DEFAULT FALSE, + fd_cached BOOLEAN DEFAULT TRUE +) RETURNS VOID +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT; diff --git a/external/polar_smgrperf/polar_smgrperf.c b/external/polar_smgrperf/polar_smgrperf.c new file mode 100644 index 00000000000..913c205137a --- /dev/null +++ b/external/polar_smgrperf/polar_smgrperf.c @@ -0,0 +1,385 @@ +/*------------------------------------------------------------------------- + * + * polar_smgrperf.c + * + * Copyright (c) 2024, Alibaba Group Holding Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * IDENTIFICATION + * external/polar_smgrperf/polar_smgrperf.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "common/file_utils.h" +#include "funcapi.h" +#include "miscadmin.h" +#include "storage/smgr.h" +#include "utils/timeout.h" + +PG_MODULE_MAGIC; + +#define INVALID_PROC_NUMBER InvalidBackendId +#define RelFileLocator RelFileNode + +#define MAX_RELSEG (MaxBlockNumber / RELSEG_SIZE) +#define MAX_NBLOCKS (MAX_RELSEG * RELSEG_SIZE) + +#define PERF_REL_NUMBER 1 +#define PERF_RLOCATOR(relnumber) ((RelFileLocator) {MyDatabaseTableSpace, MyDatabaseId, relnumber}) +#define PERF_SMGROPEN(relnumber) smgropen(PERF_RLOCATOR(relnumber), INVALID_PROC_NUMBER) + +#define REPORT_PERF_STATS_PREPARE(with_bandwidth_option) \ + { \ + sigjmp_buf local_sigjmp_buf; \ + perf_exception_stack = PG_exception_stack; \ + with_bandwidth = with_bandwidth_option; \ + if (sigsetjmp(local_sigjmp_buf, 1) != 0) \ + { \ + PG_exception_stack = perf_exception_stack; \ + report_summary_perf_stats(); \ + if (perf_report_timerid != -1) \ + disable_timeout(perf_report_timerid, false); \ + pg_re_throw(); \ + } \ + PG_exception_stack = &local_sigjmp_buf; \ + MemSet(&stats, 0, sizeof(perf_stats)); \ + MemSet(&accum_stats, 0, sizeof(perf_stats)); \ + if (perf_report_timerid == -1) \ + perf_report_timerid = RegisterTimeout(USER_TIMEOUT, report_perf_stats_timeout_handler); \ + enable_timeout_after(perf_report_timerid, 1000); \ + } + +typedef struct perf_stats +{ + uint64 count; + uint64 blocks; + uint64 time; +} perf_stats; + +static perf_stats stats; +static perf_stats accum_stats; + +static ForkNumber forknum = MAIN_FORKNUM; +static BlockNumber zero_blkno = 0; +static void *zero_buffer = NULL; +static int max_bs = 0; +static int perf_report_timerid = -1; +static bool with_bandwidth = true; +static sigjmp_buf *perf_exception_stack = NULL; +static bool report_perf_stats_pending = false; +static instr_time start; + +static inline BlockNumber +select_next_blkno(BlockNumber current_blkno, BlockNumber begin_blkno, BlockNumber end_blkno, int bs, bool sequential) +{ + BlockNumber next_blkno = InvalidBlockNumber; + + if (sequential) + { + if (current_blkno == InvalidBlockNumber) + next_blkno = begin_blkno; + else + next_blkno = current_blkno + bs; + + if (next_blkno + bs > end_blkno) + next_blkno = begin_blkno; + } + else + next_blkno = begin_blkno + random() % (end_blkno - begin_blkno - bs + 1); + + return next_blkno; +} + +static void +report_perf_stats(perf_stats * stats, char *prefix) +{ + double iops, + bps, + mbps, + lat; +#define NANOPERSECOND ((uint64) 1000 * 1000 * 1000) + + if (stats->time == 0) + return; + + iops = (double) stats->count * NANOPERSECOND / stats->time; + lat = (double) stats->time / stats->count / 1000; /* to micro-second */ + + HOLD_INTERRUPTS(); + + if (with_bandwidth) + { + bps = (double) stats->blocks * NANOPERSECOND / stats->time; + mbps = (double) stats->blocks * NANOPERSECOND * BLCKSZ / 1024 / 1024 / stats->time; + + elog(INFO, "%siops=%.1f/s, lat=%.1fus, bps=%.1f/s, mbps=%.1fMB/s", + prefix, iops, lat, bps, mbps); + } + else + elog(INFO, "%siops=%.1f/s, lat=%.2fus", prefix, iops, lat); + + RESUME_INTERRUPTS(); + + MemSet(stats, 0, sizeof(perf_stats)); + +#undef NANOPERSECOND +} + +static void +report_perf_stats_timeout_handler(void) +{ + report_perf_stats_pending = true; +} + +static void +report_summary_perf_stats(void) +{ + report_perf_stats(&accum_stats, "Summary: "); +} + +static void +collect_perf_stats_begin(void) +{ + 
INSTR_TIME_SET_CURRENT(start); +} + +static void +collect_perf_stats_end(int blocks) +{ + instr_time duration; + + CHECK_FOR_INTERRUPTS(); + + INSTR_TIME_SET_CURRENT(duration); + INSTR_TIME_SUBTRACT(duration, start); + stats.time += INSTR_TIME_GET_NANOSEC(duration); + stats.blocks += blocks; + stats.count++; + + if (report_perf_stats_pending) + { + accum_stats.count += stats.count; + accum_stats.blocks += stats.blocks; + accum_stats.time += stats.time; + + report_perf_stats(&stats, ""); + + enable_timeout_after(perf_report_timerid, 1000); + + report_perf_stats_pending = false; + } +} + +static void +smgrperf_initialize() +{ + if (!superuser()) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + (errmsg("must be superuser to use smgrperf functions")))); + + /* initialize zero buffer */ + zero_buffer = polar_zero_buffer; + max_bs = polar_zero_buffer_size / BLCKSZ; +} + +PG_FUNCTION_INFO_V1(polar_smgrperf_prepare); +Datum +polar_smgrperf_prepare(PG_FUNCTION_ARGS) +{ + int nblocks = PG_GETARG_INT32(0); + SMgrRelation smgr = PERF_SMGROPEN(PERF_REL_NUMBER); + + if (nblocks < 0 || nblocks > MAX_NBLOCKS) + elog(ERROR, "nblocks should be in [1, %d], current %d", MAX_NBLOCKS, nblocks); + + smgrperf_initialize(); + + if (!smgrexists(smgr, forknum)) + smgrcreate(smgr, forknum, false); + + smgrtruncate(smgr, &forknum, 1, &zero_blkno); + + smgrzeroextend(smgr, forknum, 0, nblocks, true); + + PG_RETURN_VOID(); +} + +PG_FUNCTION_INFO_V1(polar_smgrperf_cleanup); +Datum +polar_smgrperf_cleanup(PG_FUNCTION_ARGS) +{ + SMgrRelation smgr = PERF_SMGROPEN(PERF_REL_NUMBER); + + smgrperf_initialize(); + + smgrdounlinkall(&smgr, 1, false); + smgrclose(smgr); + + PG_RETURN_VOID(); +} + +PG_FUNCTION_INFO_V1(polar_smgrperf_read); +Datum +polar_smgrperf_read(PG_FUNCTION_ARGS) +{ + int bs = PG_GETARG_INT32(0); + int begin_blkno = PG_GETARG_INT32(1); + int end_blkno = PG_GETARG_INT32(2); + bool sequential = PG_GETARG_BOOL(3); + BlockNumber current_blkno = InvalidBlockNumber; + SMgrRelation smgr = PERF_SMGROPEN(PERF_REL_NUMBER); + + smgrperf_initialize(); + + if (bs < 1 || bs > max_bs) + elog(ERROR, "bs should be in [1, %d], current %d", max_bs, bs); + + if (begin_blkno < 0 || begin_blkno >= end_blkno) + elog(ERROR, "\"begin_blkno\" should be in [0, %d), current %d", end_blkno, begin_blkno); + + if (end_blkno <= begin_blkno || end_blkno > MAX_NBLOCKS) + elog(ERROR, "\"end_blkno\" should be in (%d, %d], current %d", begin_blkno, MAX_NBLOCKS, end_blkno); + + REPORT_PERF_STATS_PREPARE(true); + + while (true) + { + current_blkno = select_next_blkno(current_blkno, begin_blkno, end_blkno, bs, sequential); + + collect_perf_stats_begin(); + polar_smgrbulkread(smgr, forknum, current_blkno, bs, zero_buffer); + collect_perf_stats_end(bs); + } + + PG_RETURN_VOID(); +} + +PG_FUNCTION_INFO_V1(polar_smgrperf_write); +Datum +polar_smgrperf_write(PG_FUNCTION_ARGS) +{ + int bs = PG_GETARG_INT32(0); + int begin_blkno = PG_GETARG_INT32(1); + int end_blkno = PG_GETARG_INT32(2); + bool sequential = PG_GETARG_BOOL(3); + BlockNumber current_blkno = InvalidBlockNumber; + SMgrRelation smgr = PERF_SMGROPEN(PERF_REL_NUMBER); + + smgrperf_initialize(); + + if (bs < 1 || bs > max_bs) + elog(ERROR, "bs should be in [1, %d], current %d", max_bs, bs); + + if (begin_blkno < 0 || begin_blkno >= end_blkno) + elog(ERROR, "\"begin_blkno\" should be in [0, %d), current %d", end_blkno, begin_blkno); + + if (end_blkno <= begin_blkno || end_blkno > MAX_NBLOCKS) + elog(ERROR, "\"end_blkno\" should be in (%d, %d], current %d", begin_blkno, MAX_NBLOCKS, 
end_blkno); + + REPORT_PERF_STATS_PREPARE(true); + + while (true) + { + current_blkno = select_next_blkno(current_blkno, begin_blkno, end_blkno, bs, sequential); + + collect_perf_stats_begin(); + polar_smgrbulkwrite(smgr, forknum, current_blkno, bs, zero_buffer, false); + collect_perf_stats_end(bs); + } + + PG_RETURN_VOID(); +} + +PG_FUNCTION_INFO_V1(polar_smgrperf_extend); +Datum +polar_smgrperf_extend(PG_FUNCTION_ARGS) +{ + int bs = PG_GETARG_INT32(0); + BlockNumber current_blkno = 0; + SMgrRelation smgr = PERF_SMGROPEN(PERF_REL_NUMBER); + + smgrperf_initialize(); + + if (bs < 1 || bs > max_bs) + elog(ERROR, "bs should be in [1, %d], current %d", max_bs, bs); + + REPORT_PERF_STATS_PREPARE(true); + + if (!smgrexists(smgr, forknum)) + smgrcreate(smgr, forknum, false); + + smgrtruncate(smgr, &forknum, 1, &zero_blkno); + + while (true) + { + if ((current_blkno + bs) >= RELSEG_SIZE) + { + smgrtruncate(smgr, &forknum, 1, &zero_blkno); + + current_blkno = 0; + } + + collect_perf_stats_begin(); + smgrzeroextend(smgr, forknum, current_blkno, bs, true); + collect_perf_stats_end(bs); + + current_blkno += bs; + } + + PG_RETURN_VOID(); +} + +PG_FUNCTION_INFO_V1(polar_smgrperf_nblocks); +Datum +polar_smgrperf_nblocks(PG_FUNCTION_ARGS) +{ + Oid relnumber = PG_GETARG_INT32(0); + bool nblocks_cached = PG_GETARG_BOOL(1); + bool fd_cached = PG_GETARG_BOOL(2); + + SMgrRelation smgr = smgropen(PERF_RLOCATOR(relnumber), INVALID_PROC_NUMBER); + + smgrperf_initialize(); + + if (relnumber == InvalidOid) + elog(ERROR, "relnumber cannot be %d", InvalidOid); + + REPORT_PERF_STATS_PREPARE(false); + + elog(INFO, "Testing smgrnblocks on file with %u blocks", smgrnblocks(smgr, forknum)); + + while (true) + { + if (!fd_cached) + { + smgrclose(smgr); + smgr = smgropen(PERF_RLOCATOR(relnumber), INVALID_PROC_NUMBER); + } + + collect_perf_stats_begin(); + + if (nblocks_cached) + smgrnblocks(smgr, forknum); + else + smgrnblocks_real(smgr, forknum); + + collect_perf_stats_end(0); + } + + PG_RETURN_VOID(); +} diff --git a/external/polar_smgrperf/polar_smgrperf.control b/external/polar_smgrperf/polar_smgrperf.control new file mode 100644 index 00000000000..c9db0f3db59 --- /dev/null +++ b/external/polar_smgrperf/polar_smgrperf.control @@ -0,0 +1,5 @@ +# polar_smgrperf extension +comment = 'smgr perf test extension' +default_version = '1.0' +module_pathname = '$libdir/polar_smgrperf' +schema = 'public' diff --git a/external/polar_smgrperf/t/001_smgrperf.pl b/external/polar_smgrperf/t/001_smgrperf.pl new file mode 100644 index 00000000000..3e6d1adcc1d --- /dev/null +++ b/external/polar_smgrperf/t/001_smgrperf.pl @@ -0,0 +1,114 @@ +#!/usr/bin/perl +# 001_smgrperf.pl +# Test smgrperf tool, for coverage. +# +# Copyright (c) 2024, Alibaba Group Holding Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# IDENTIFICATION +# external/polar_smgrperf/t/001_smgrperf.pl + +use strict; +use warnings; +use Config; +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; + +# Initialize primary node +my $node_primary = PostgreSQL::Test::Cluster->new('primary'); +$node_primary->init(); +$node_primary->append_conf('postgresql.conf', 'statement_timeout = 3s'); +$node_primary->start; + +$node_primary->safe_psql('postgres', 'CREATE EXTENSION polar_smgrperf'); + +my $stderr; + +# Run smgrperf tests +$node_primary->psql( + 'postgres', + qq[ + set polar_zero_extend_method to none; + select polar_smgrperf_extend(); + ], + stderr => \$stderr); +like( + $stderr, + qr/ERROR: canceling statement due to statement timeout/, + 'polar_smgrperf_extend canceled by statement timeout'); +like($stderr, qr/INFO: Summary:/, 'polar_smgrperf_extend (none) ok'); + +$node_primary->psql( + 'postgres', + qq[ + set polar_zero_extend_method to bulkwrite; + select polar_smgrperf_extend(); + ], + stderr => \$stderr); +like( + $stderr, + qr/ERROR: canceling statement due to statement timeout/, + 'polar_smgrperf_extend canceled by statement timeout'); +like($stderr, qr/INFO: Summary:/, 'polar_smgrperf_extend (bulkwrite) ok'); + +$node_primary->psql( + 'postgres', + qq[ + set polar_zero_extend_method to fallocate; + select polar_smgrperf_extend(); + ], + stderr => \$stderr); +like( + $stderr, + qr/ERROR: canceling statement due to statement timeout/, + 'polar_smgrperf_extend canceled by statement timeout'); +like($stderr, qr/INFO: Summary:/, 'polar_smgrperf_extend (fallocate) ok'); + +$node_primary->safe_psql('postgres', + 'set statement_timeout=0; select polar_smgrperf_prepare()'); + +$node_primary->psql( + 'postgres', + 'select polar_smgrperf_read()', + stderr => \$stderr); +like( + $stderr, + qr/ERROR: canceling statement due to statement timeout/, + 'polar_smgrperf_read canceled by statement timeout'); +like($stderr, qr/INFO: Summary:/, 'polar_smgrperf_read ok'); + +$node_primary->psql( + 'postgres', + 'select polar_smgrperf_write()', + stderr => \$stderr); +like( + $stderr, + qr/ERROR: canceling statement due to statement timeout/, + 'polar_smgrperf_write canceled by statement timeout'); +like($stderr, qr/INFO: Summary:/, 'polar_smgrperf_write ok'); + +$node_primary->psql( + 'postgres', + 'select polar_smgrperf_nblocks()', + stderr => \$stderr); +like( + $stderr, + qr/ERROR: canceling statement due to statement timeout/, + 'polar_smgrperf_nblocks canceled by statement timeout'); +like($stderr, qr/INFO: Summary:/, 'polar_smgrperf_nblocks ok'); + +$node_primary->safe_psql('postgres', 'select polar_smgrperf_cleanup()'); + +done_testing(); diff --git a/src/backend/access/gin/ginutil.c b/src/backend/access/gin/ginutil.c index 20f470648be..89ab52d4fa6 100644 --- a/src/backend/access/gin/ginutil.c +++ b/src/backend/access/gin/ginutil.c @@ -16,6 +16,7 @@ #include "access/gin_private.h" #include "access/ginxlog.h" +#include "access/hio.h" #include "access/reloptions.h" #include "access/xloginsert.h" #include "catalog/pg_collation.h" @@ -331,7 +332,7 @@ GinNewBuffer(Relation index) if (needLock) LockRelationForExtension(index, ExclusiveLock); - buffer = ReadBuffer(index, P_NEW); + buffer = polar_index_add_blocks(index); LockBuffer(buffer, GIN_EXCLUSIVE); if (needLock) diff --git a/src/backend/access/gist/gistbuild.c b/src/backend/access/gist/gistbuild.c index 6c48f56b3b2..798feee860c 100644 --- a/src/backend/access/gist/gistbuild.c +++ b/src/backend/access/gist/gistbuild.c @@ -43,7 +43,8 @@ #include "miscadmin.h" 
#include "optimizer/optimizer.h" #include "storage/bufmgr.h" -#include "storage/smgr.h" +#include "storage/bulk_write.h" + #include "utils/memutils.h" #include "utils/rel.h" #include "utils/tuplesort.h" @@ -106,11 +107,8 @@ typedef struct Tuplesortstate *sortstate; /* state data for tuplesort.c */ BlockNumber pages_allocated; - BlockNumber pages_written; - int ready_num_pages; - BlockNumber ready_blknos[XLR_MAX_BLOCK_ID]; - Page ready_pages[XLR_MAX_BLOCK_ID]; + BulkWriteState *bulkstate; } GISTBuildState; #define GIST_SORTED_BUILD_PAGE_NUM 4 @@ -142,7 +140,6 @@ static void gist_indexsortbuild_levelstate_add(GISTBuildState *state, IndexTuple itup); static void gist_indexsortbuild_levelstate_flush(GISTBuildState *state, GistSortedBuildLevelState *levelstate); -static void gist_indexsortbuild_flush_ready_pages(GISTBuildState *state); static void gistInitBuffering(GISTBuildState *buildstate); static int calculatePagesPerBuffer(GISTBuildState *buildstate, int levelStep); @@ -404,27 +401,18 @@ gist_indexsortbuild(GISTBuildState *state) { IndexTuple itup; GistSortedBuildLevelState *levelstate; - Page page; + BulkWriteBuffer rootbuf; - state->pages_allocated = 0; - state->pages_written = 0; - state->ready_num_pages = 0; + /* Reserve block 0 for the root page */ + state->pages_allocated = 1; - /* - * Write an empty page as a placeholder for the root page. It will be - * replaced with the real root page at the end. - */ - page = palloc_io_aligned(BLCKSZ, MCXT_ALLOC_ZERO); - smgrextend(RelationGetSmgr(state->indexrel), MAIN_FORKNUM, GIST_ROOT_BLKNO, - page, true); - state->pages_allocated++; - state->pages_written++; + state->bulkstate = smgr_bulk_start_rel(state->indexrel, MAIN_FORKNUM); /* Allocate a temporary buffer for the first leaf page batch. */ levelstate = palloc0(sizeof(GistSortedBuildLevelState)); - levelstate->pages[0] = page; + levelstate->pages[0] = palloc(BLCKSZ); levelstate->parent = NULL; - gistinitpage(page, F_LEAF); + gistinitpage(levelstate->pages[0], F_LEAF); /* * Fill index pages with tuples in the sorted order. @@ -454,31 +442,15 @@ gist_indexsortbuild(GISTBuildState *state) levelstate = parent; } - gist_indexsortbuild_flush_ready_pages(state); - /* Write out the root */ PageSetLSN(levelstate->pages[0], GistBuildLSN); - PageSetChecksumInplace(levelstate->pages[0], GIST_ROOT_BLKNO); - smgrwrite(RelationGetSmgr(state->indexrel), MAIN_FORKNUM, GIST_ROOT_BLKNO, - levelstate->pages[0], true); - if (RelationNeedsWAL(state->indexrel)) - log_newpage(&state->indexrel->rd_node, MAIN_FORKNUM, GIST_ROOT_BLKNO, - levelstate->pages[0], true); - - pfree(levelstate->pages[0]); + rootbuf = smgr_bulk_get_buf(state->bulkstate); + memcpy(rootbuf, levelstate->pages[0], BLCKSZ); + smgr_bulk_write(state->bulkstate, GIST_ROOT_BLKNO, rootbuf, true); + pfree(levelstate); - /* - * When we WAL-logged index pages, we must nonetheless fsync index files. - * Since we're building outside shared buffers, a CHECKPOINT occurring - * during the build has no way to flush the previously written data to - * disk (indeed it won't know the index even exists). A crash later on - * would replay WAL from the checkpoint, therefore it wouldn't replay our - * earlier WAL entries. If we do not fsync those pages here, they might - * still not be on disk when the crash occurs. 
- */ - if (RelationNeedsWAL(state->indexrel)) - smgrimmedsync(RelationGetSmgr(state->indexrel), MAIN_FORKNUM); + smgr_bulk_finish(state->bulkstate); } /* @@ -508,7 +480,7 @@ gist_indexsortbuild_levelstate_add(GISTBuildState *state, levelstate->current_page++; if (levelstate->pages[levelstate->current_page] == NULL) - levelstate->pages[levelstate->current_page] = palloc_io_aligned(BLCKSZ, 0); + levelstate->pages[levelstate->current_page] = palloc0(BLCKSZ); newPage = levelstate->pages[levelstate->current_page]; gistinitpage(newPage, old_page_flags); @@ -571,6 +543,7 @@ gist_indexsortbuild_levelstate_flush(GISTBuildState *state, for (; dist != NULL; dist = dist->next) { char *data; + BulkWriteBuffer buf; Page target; /* check once per page */ @@ -578,7 +551,8 @@ gist_indexsortbuild_levelstate_flush(GISTBuildState *state, /* Create page and copy data */ data = (char *) (dist->list); - target = palloc_io_aligned(BLCKSZ, MCXT_ALLOC_ZERO); + buf = smgr_bulk_get_buf(state->bulkstate); + target = (Page) buf; gistinitpage(target, isleaf ? F_LEAF : 0); for (int i = 0; i < dist->block.num; i++) { @@ -591,20 +565,6 @@ gist_indexsortbuild_levelstate_flush(GISTBuildState *state, } union_tuple = dist->itup; - if (state->ready_num_pages == XLR_MAX_BLOCK_ID) - gist_indexsortbuild_flush_ready_pages(state); - - /* - * The page is now complete. Assign a block number to it, and add it - * to the list of finished pages. (We don't write it out immediately, - * because we want to WAL-log the pages in batches.) - */ - blkno = state->pages_allocated++; - state->ready_blknos[state->ready_num_pages] = blkno; - state->ready_pages[state->ready_num_pages] = target; - state->ready_num_pages++; - ItemPointerSetBlockNumber(&(union_tuple->t_tid), blkno); - /* * Set the right link to point to the previous page. This is just for * debugging purposes: GiST only follows the right link if a page is @@ -619,6 +579,15 @@ gist_indexsortbuild_levelstate_flush(GISTBuildState *state, */ if (levelstate->last_blkno) GistPageGetOpaque(target)->rightlink = levelstate->last_blkno; + + /* + * The page is now complete. Assign a block number to it, and pass it + * to the bulk writer. + */ + blkno = state->pages_allocated++; + PageSetLSN(target, GistBuildLSN); + smgr_bulk_write(state->bulkstate, blkno, buf, true); + ItemPointerSetBlockNumber(&(union_tuple->t_tid), blkno); levelstate->last_blkno = blkno; /* @@ -629,7 +598,7 @@ gist_indexsortbuild_levelstate_flush(GISTBuildState *state, if (parent == NULL) { parent = palloc0(sizeof(GistSortedBuildLevelState)); - parent->pages[0] = (Page) palloc_io_aligned(BLCKSZ, 0); + parent->pages[0] = palloc(BLCKSZ); parent->parent = NULL; gistinitpage(parent->pages[0], 0); @@ -639,39 +608,6 @@ gist_indexsortbuild_levelstate_flush(GISTBuildState *state, } } -static void -gist_indexsortbuild_flush_ready_pages(GISTBuildState *state) -{ - if (state->ready_num_pages == 0) - return; - - for (int i = 0; i < state->ready_num_pages; i++) - { - Page page = state->ready_pages[i]; - BlockNumber blkno = state->ready_blknos[i]; - - /* Currently, the blocks must be buffered in order. 
*/ - if (blkno != state->pages_written) - elog(ERROR, "unexpected block number to flush GiST sorting build"); - - PageSetLSN(page, GistBuildLSN); - PageSetChecksumInplace(page, blkno); - smgrextend(RelationGetSmgr(state->indexrel), MAIN_FORKNUM, blkno, page, - true); - - state->pages_written++; - } - - if (RelationNeedsWAL(state->indexrel)) - log_newpages(&state->indexrel->rd_node, MAIN_FORKNUM, state->ready_num_pages, - state->ready_blknos, state->ready_pages, true); - - for (int i = 0; i < state->ready_num_pages; i++) - pfree(state->ready_pages[i]); - - state->ready_num_pages = 0; -} - /*------------------------------------------------------------------------- * Routines for non-sorted build diff --git a/src/backend/access/gist/gistbuildbuffers.c b/src/backend/access/gist/gistbuildbuffers.c index 4a0a91b7120..eabf7460182 100644 --- a/src/backend/access/gist/gistbuildbuffers.c +++ b/src/backend/access/gist/gistbuildbuffers.c @@ -186,9 +186,8 @@ gistAllocateNewPageBuffer(GISTBuildBuffers *gfbb) { GISTNodeBufferPage *pageBuffer; - pageBuffer = (GISTNodeBufferPage *) - MemoryContextAllocIOAligned(gfbb->context, - BLCKSZ, MCXT_ALLOC_ZERO); + pageBuffer = (GISTNodeBufferPage *) MemoryContextAllocZero(gfbb->context, + BLCKSZ); pageBuffer->prev = InvalidBlockNumber; /* Set page free space */ diff --git a/src/backend/access/gist/gistutil.c b/src/backend/access/gist/gistutil.c index d4bf0c7563d..d7029532db0 100644 --- a/src/backend/access/gist/gistutil.c +++ b/src/backend/access/gist/gistutil.c @@ -16,6 +16,7 @@ #include <math.h> #include "access/gist_private.h" +#include "access/hio.h" #include "access/htup_details.h" #include "access/reloptions.h" #include "catalog/pg_opclass.h" @@ -883,7 +884,7 @@ gistNewBuffer(Relation r) if (needLock) LockRelationForExtension(r, ExclusiveLock); - buffer = ReadBuffer(r, P_NEW); + buffer = polar_index_add_blocks(r); LockBuffer(buffer, GIST_EXCLUSIVE); if (needLock) diff --git a/src/backend/access/hash/hashpage.c b/src/backend/access/hash/hashpage.c index 9da5fb48658..0af9d6b0b95 100644 --- a/src/backend/access/hash/hashpage.c +++ b/src/backend/access/hash/hashpage.c @@ -992,7 +992,7 @@ static bool _hash_alloc_buckets(Relation rel, BlockNumber firstblock, uint32 nblocks) { BlockNumber lastblock; - PGAlignedBlock zerobuf; + PGIOAlignedBlock zerobuf; Page page; HashPageOpaque ovflopaque; diff --git a/src/backend/access/heap/hio.c b/src/backend/access/heap/hio.c index 080ba8b4f5c..dec8a6c4098 100644 --- a/src/backend/access/heap/hio.c +++ b/src/backend/access/heap/hio.c @@ -22,14 +22,34 @@ #include "access/visibilitymap.h" #include "storage/bufmgr.h" #include "storage/freespace.h" +#include "storage/indexfsm.h" #include "storage/lmgr.h" #include "storage/smgr.h" -/* POLAR */ -#include "utils/guc.h" -/* POLAR end */ +/* GUCs */ +int polar_heap_bulk_extend_size = 512; +int polar_index_bulk_extend_size = 128; -static Buffer polar_relation_add_extra_blocks_and_return_last_buffer(Relation relation, BulkInsertState bistate); +static Buffer polar_heap_add_blocks(Relation relation, BulkInsertState bistate); + +int +polar_get_bulk_extend_size(BlockNumber first_block, int bulk_extend_size) +{ + /* Avoid small table bloat */ + if (first_block < bulk_extend_size) + bulk_extend_size = 1; + + /* Avoid failure on extremely small DB */ + bulk_extend_size = Min(NBuffers / 4, bulk_extend_size); + + /* Avoid exceeding maximum possible length */ + bulk_extend_size = Min(MaxBlockNumber - first_block, bulk_extend_size); + + /* Extend by one page at least */ + bulk_extend_size = Max(1, 
bulk_extend_size); + + return bulk_extend_size; +} /* * RelationPutHeapTuple - place tuple at specified page @@ -351,7 +371,7 @@ RelationGetBufferForTuple(Relation relation, Size len, BlockNumber targetBlock, otherBlock; bool needLock; - int bulk_extend_size = polar_bulk_extend_size; + int bulk_extend_size = polar_heap_bulk_extend_size; len = MAXALIGN(len); /* be conservative */ @@ -369,7 +389,7 @@ RelationGetBufferForTuple(Relation relation, Size len, * POLAR: when enable preallocate_file, use fsm record blocks => if * preallocate_file is enabled, use fsm record blocks */ - if (polar_enable_shared_storage_mode && bulk_extend_size > 0) + if (bulk_extend_size > 0) use_fsm = true; /* @@ -629,7 +649,7 @@ RelationGetBufferForTuple(Relation relation, Size len, } /* POLAR: If we don't need file prealloc, use origin normal method */ - if (!(polar_enable_shared_storage_mode && bulk_extend_size > 0)) + if (bulk_extend_size <= 0) { /* Time to bulk-extend */ RelationAddExtraBlocks(relation, bistate); @@ -647,8 +667,8 @@ RelationGetBufferForTuple(Relation relation, Size len, * rather than relying on the kernel to do it for us? */ /* POLAR: preallocate multiple block and only use one block */ - if (polar_enable_shared_storage_mode && bulk_extend_size > 0) - buffer = polar_relation_add_extra_blocks_and_return_last_buffer(relation, bistate); + if (bulk_extend_size > 0) + buffer = polar_heap_add_blocks(relation, bistate); else buffer = ReadBufferBI(relation, P_NEW, RBM_ZERO_AND_LOCK, bistate); @@ -754,10 +774,9 @@ RelationGetBufferForTuple(Relation relation, Size len, return buffer; } - /* * POLAR: - * polar_relation_add_extra_blocks_and_return_last_buffer - Extend a relation by multiple blocks + * polar_heap_add_blocks - Extend a relation by multiple blocks * to avoid future contention on the relation extension lock and expensive pfs extend operation. * * If bistate isn't NULL, bistate->current_buf is assigned to last buffer alloced. @@ -770,15 +789,18 @@ RelationGetBufferForTuple(Relation relation, Size len, * relation extension lock has been acquired if relation is not local. 
*/ static Buffer -polar_relation_add_extra_blocks_and_return_last_buffer(Relation relation, BulkInsertState bistate) +polar_heap_add_blocks(Relation relation, BulkInsertState bistate) { - BlockNumber first_block_num_extended = InvalidBlockNumber; + BlockNumber first_block = InvalidBlockNumber; int block_count = 0; Buffer last_buffer = InvalidBuffer; Buffer *buffers = NULL; int index = 0; - char *bulk_buf_block = NULL; BufferAccessStrategy strategy = NULL; + SMgrRelation smgr = RelationGetSmgr(relation); + + if (polar_heap_bulk_extend_size == 0) + return ReadBufferBI(relation, P_NEW, RBM_ZERO_AND_LOCK, bistate); if (bistate != NULL) { @@ -791,27 +813,17 @@ } } - /* Open it at the smgr level if not already done */ - RelationGetSmgr(relation); - /* bulk extend times */ polar_pgstat_count_bulk_extend_times(relation); PG_TRY(); { /* init bulk extend backend-local-variable */ - polar_smgr_init_bulk_extend(relation->rd_smgr, MAIN_FORKNUM); - - first_block_num_extended = relation->rd_smgr->polar_nblocks_faked_for_bulk_extend[MAIN_FORKNUM]; - block_count = Min(polar_bulk_extend_size, (BlockNumber) RELSEG_SIZE - (first_block_num_extended % ((BlockNumber) RELSEG_SIZE))); - if (block_count < 1) - block_count = 1; + polar_smgr_init_bulk_extend(smgr, MAIN_FORKNUM); - /* avoid small table bloat */ - if (first_block_num_extended < polar_min_bulk_extend_table_size) - block_count = 1; + first_block = smgr->polar_nblocks_faked_for_bulk_extend[MAIN_FORKNUM]; + block_count = polar_get_bulk_extend_size(first_block, polar_heap_bulk_extend_size); - bulk_buf_block = (char *) palloc_io_aligned(block_count * BLCKSZ, MCXT_ALLOC_ZERO); buffers = (Buffer *) palloc(block_count * sizeof(Buffer)); for (index = 0; index < block_count; index++) @@ -825,11 +837,10 @@ { /* * Lock last buffer in bulk extend, when last buffers are - * between buf head lock releasing and LockBuffer, the last - * buffers can be taken by other backends. Under this - * condition, the page will be init twice by two backend. we - * fix the bug by releasing buf head lock after LockBuffer in - * last buffer. + * between IO lock releasing and LockBuffer, the last buffer + * can be taken by other backends. Under this condition, the + * page would be initialized twice by two backends. We fix the + * bug by releasing the IO lock after LockBuffer on the last buffer. */ buffers[index] = ReadBufferExtended(relation, MAIN_FORKNUM, P_NEW, RBM_ZERO_AND_LOCK, strategy); } @@ -843,27 +854,20 @@ { /* * error recovery, very important, reset bulk extend * backend-local-variable */ - if (relation->rd_smgr != NULL) - polar_smgr_clear_bulk_extend(relation->rd_smgr, MAIN_FORKNUM); + polar_smgr_clear_bulk_extend(smgr, MAIN_FORKNUM); PG_RE_THROW(); } PG_END_TRY(); - /* - * Reset bulk extend backend-local-variable. The reason why we can use - * backend-local-variable in bulk extend is that we don't allow to extend - * concurrently. 
*/ - polar_smgr_clear_bulk_extend(relation->rd_smgr, MAIN_FORKNUM); + /* reset bulk extend backend-local-variable */ + polar_smgr_clear_bulk_extend(smgr, MAIN_FORKNUM); - /* bulk extend io */ - polar_smgrbulkextend(relation->rd_smgr, MAIN_FORKNUM, first_block_num_extended, block_count, bulk_buf_block, false); + /* bulk extend polar store */ + smgrzeroextend(smgr, MAIN_FORKNUM, first_block, block_count, false); - /* Update block countters */ + /* Update block counters */ polar_pgstat_count_bulk_extend_blocks(relation, block_count); - pfree(bulk_buf_block); - /* ---------------- * Until here, all alloced buffer are zero page(BM_VALID, non-BM_DIRTY). It is safe. * 1. They are not initialized, still zero-page. @@ -907,7 +911,7 @@ MarkBufferDirty(buffer); /* we'll need this info below */ - Assert((first_block_num_extended + index) == BufferGetBlockNumber(buffer)); + Assert((first_block + index) == BufferGetBlockNumber(buffer)); freespace = PageGetHeapFreeSpace(page); UnlockReleaseBuffer(buffer); @@ -917,7 +921,7 @@ * chance of making this page visible to other concurrently inserting * backends, and we want that to happen without delay. */ - RecordPageWithFreeSpace(relation, first_block_num_extended + index, freespace); + RecordPageWithFreeSpace(relation, first_block + index, freespace); } /* @@ -927,7 +931,7 @@ * inserted. skip last block. */ if (block_count > 0) - FreeSpaceMapVacuumRange(relation, first_block_num_extended, first_block_num_extended + block_count); + FreeSpaceMapVacuumRange(relation, first_block, first_block + block_count); /* last block */ last_buffer = buffers[block_count]; @@ -941,3 +945,111 @@ return last_buffer; } + +/* + * POLAR: index insert bulk extend. If we cannot find a free page in the index relation + * while doing an index insert, we do an index bulk extend. The free blocks are registered + * in the FSM. + */ +Buffer +polar_index_add_blocks(Relation relation) +{ + BlockNumber first_block = InvalidBlockNumber; + int block_count = 0; + Buffer last_buffer = InvalidBuffer; + Buffer *buffers = NULL; + int index = 0; + SMgrRelation smgr = RelationGetSmgr(relation); + + if (polar_index_bulk_extend_size == 0) + return ReadBuffer(relation, P_NEW); + + PG_TRY(); + { + /* init bulk extend backend-local-variable */ + polar_smgr_init_bulk_extend(smgr, MAIN_FORKNUM); + + first_block = smgr->polar_nblocks_faked_for_bulk_extend[MAIN_FORKNUM]; + block_count = polar_get_bulk_extend_size(first_block, polar_index_bulk_extend_size); + + buffers = (Buffer *) palloc(block_count * sizeof(Buffer)); + + /* + * The difference from polar_heap_add_blocks here is that all the + * buffers are read with RBM_NORMAL, not RBM_ZERO_AND_LOCK for the last + * buffer, because the btree code will not try to grab the last blkno + * when the FSM has no free page, which is what happens in heap_insert. + * The returned last buffer will be locked by the caller. + */ + for (index = 0; index < block_count; index++) + { + /* + * Extend by one page. This should generally match the main-line + * extension code in RelationGetBufferForTuple, except that we + * hold the relation extension lock throughout. 
+ */ + buffers[index] = ReadBufferExtended(relation, MAIN_FORKNUM, P_NEW, RBM_NORMAL, NULL); + } + } + PG_CATCH(); + { + /* + * error recovery, very important, reset bulk extend + * backend-local-variable + */ + polar_smgr_clear_bulk_extend(smgr, MAIN_FORKNUM); + PG_RE_THROW(); + } + PG_END_TRY(); + + /* reset bulk extend backend-local-variable */ + polar_smgr_clear_bulk_extend(smgr, MAIN_FORKNUM); + + /* bulk extend polar store */ + smgrzeroextend(smgr, MAIN_FORKNUM, first_block, block_count, false); + + /* process left (block_count - 1) blocks, skip last block */ + block_count--; + for (index = 0; index < block_count; index++) + { + Buffer buffer; + Page page; + + buffer = buffers[index]; + LockBuffer(buffer, BUFFER_LOCK_SHARE); + page = BufferGetPage(buffer); + if (!PageIsNew(page)) + elog(ERROR, "index bulk extend page %u of relation \"%s\" should be empty but is not", + BufferGetBlockNumber(buffer), + RelationGetRelationName(relation)); + + /* + * The difference from polar_heap_add_blocks here is that we don't + * need to init the new page or MarkBufferDirty: when a btree index + * gets a page from the FSM, it always calls _bt_pageinit on the new + * page, while a heap page must be inited by the caller. + */ + + Assert((first_block + index) == BufferGetBlockNumber(buffer)); + UnlockReleaseBuffer(buffer); + + /* + * We just register the free pages into FSM, no need to mark all the + * new buffers dirty + */ + RecordFreeIndexPage(relation, first_block + index); + } + + /* + * Finally, vacuum the FSM. Update the upper-level FSM pages to ensure + * that searchers can find them. + */ + if (block_count > 0) + FreeSpaceMapVacuumRange(relation, first_block, first_block + block_count); + + /* last block */ + last_buffer = buffers[block_count]; + pfree(buffers); + + return last_buffer; +} diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c index 21c6dbf0cc3..aa163ef664e 100644 --- a/src/backend/access/heap/rewriteheap.c +++ b/src/backend/access/heap/rewriteheap.c @@ -87,8 +87,8 @@ * is optimized for bulk inserting a lot of tuples, knowing that we have * exclusive access to the heap. raw_heap_insert builds new pages in * local storage. When a page is full, or at the end of the process, - * we insert it to WAL as a single record and then write it to disk - * directly through smgr. Note, however, that any data sent to the new + * we insert it to WAL as a single record and then write it to disk with + * the bulk smgr writer. Note, however, that any data sent to the new heap's TOAST table will go through the normal bufmgr. 
* * @@ -119,9 +119,9 @@ #include "replication/logical.h" #include "replication/slot.h" #include "storage/bufmgr.h" +#include "storage/bulk_write.h" #include "storage/fd.h" #include "storage/procarray.h" -#include "storage/smgr.h" #include "utils/memutils.h" #include "utils/rel.h" @@ -136,9 +136,9 @@ typedef struct RewriteStateData { Relation rs_old_rel; /* source heap */ Relation rs_new_rel; /* destination heap */ - Page rs_buffer; /* page currently being built */ + BulkWriteState *rs_bulkstate; /* writer for the destination */ + BulkWriteBuffer rs_buffer; /* page currently being built */ BlockNumber rs_blockno; /* block where page will go */ - bool rs_buffer_valid; /* T if any tuples in buffer */ bool rs_logical_rewrite; /* do we need to do logical rewriting */ TransactionId rs_oldest_xmin; /* oldest xmin used by caller to determine * tuple visibility */ @@ -259,14 +259,14 @@ begin_heap_rewrite(Relation old_heap, Relation new_heap, TransactionId oldest_xm state->rs_old_rel = old_heap; state->rs_new_rel = new_heap; - state->rs_buffer = (Page) palloc_io_aligned(BLCKSZ, 0); + state->rs_buffer = NULL; /* new_heap needn't be empty, just locked */ state->rs_blockno = RelationGetNumberOfBlocks(new_heap); - state->rs_buffer_valid = false; state->rs_oldest_xmin = oldest_xmin; state->rs_freeze_xid = freeze_xid; state->rs_cutoff_multi = cutoff_multi; state->rs_cxt = rw_cxt; + state->rs_bulkstate = smgr_bulk_start_rel(new_heap, MAIN_FORKNUM); /* Initialize hash tables used to track update chains */ hash_ctl.keysize = sizeof(TidHashKey); @@ -318,30 +318,13 @@ end_heap_rewrite(RewriteState state) } /* Write the last page, if any */ - if (state->rs_buffer_valid) + if (state->rs_buffer) { - if (RelationNeedsWAL(state->rs_new_rel)) - log_newpage(&state->rs_new_rel->rd_node, - MAIN_FORKNUM, - state->rs_blockno, - state->rs_buffer, - true); - - PageSetChecksumInplace(state->rs_buffer, state->rs_blockno); - - smgrextend(RelationGetSmgr(state->rs_new_rel), MAIN_FORKNUM, - state->rs_blockno, (char *) state->rs_buffer, true); + smgr_bulk_write(state->rs_bulkstate, state->rs_blockno, state->rs_buffer, true); + state->rs_buffer = NULL; } - /* - * When we WAL-logged rel pages, we must nonetheless fsync them. The - * reason is the same as in storage.c's RelationCopyStorage(): we're - * writing data that's not in shared buffers, and so a CHECKPOINT - * occurring during the rewriteheap operation won't have fsync'd data we - * wrote before the checkpoint. - */ - if (RelationNeedsWAL(state->rs_new_rel)) - smgrimmedsync(RelationGetSmgr(state->rs_new_rel), MAIN_FORKNUM); + smgr_bulk_finish(state->rs_bulkstate); logical_end_heap_rewrite(state); @@ -615,7 +598,7 @@ rewrite_heap_dead_tuple(RewriteState state, HeapTuple old_tuple) static void raw_heap_insert(RewriteState state, HeapTuple tup) { - Page page = state->rs_buffer; + Page page; Size pageFreeSpace, saveFreeSpace; Size len; @@ -668,7 +651,8 @@ raw_heap_insert(RewriteState state, HeapTuple tup) HEAP_DEFAULT_FILLFACTOR); /* Now we can check to see if there's enough free space already. */ - if (state->rs_buffer_valid) + page = (Page) state->rs_buffer; + if (page) { pageFreeSpace = PageGetHeapFreeSpace(page); @@ -679,35 +663,19 @@ raw_heap_insert(RewriteState state, HeapTuple tup) * contains a tuple. Hence, unlike RelationGetBufferForTuple(), * enforce saveFreeSpace unconditionally. 
*/ - - /* XLOG stuff */ - if (RelationNeedsWAL(state->rs_new_rel)) - log_newpage(&state->rs_new_rel->rd_node, - MAIN_FORKNUM, - state->rs_blockno, - page, - true); - - /* - * Now write the page. We say skipFsync = true because there's no - * need for smgr to schedule an fsync for this write; we'll do it - * ourselves in end_heap_rewrite. - */ - PageSetChecksumInplace(page, state->rs_blockno); - - smgrextend(RelationGetSmgr(state->rs_new_rel), MAIN_FORKNUM, - state->rs_blockno, (char *) page, true); - + smgr_bulk_write(state->rs_bulkstate, state->rs_blockno, state->rs_buffer, true); + state->rs_buffer = NULL; + page = NULL; state->rs_blockno++; - state->rs_buffer_valid = false; } } - if (!state->rs_buffer_valid) + if (!page) { /* Initialize a new empty page */ + state->rs_buffer = smgr_bulk_get_buf(state->rs_bulkstate); + page = (Page) state->rs_buffer; PageInit(page, BLCKSZ, 0); - state->rs_buffer_valid = true; } /* And now we can insert the tuple into the page */ diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c index 35fe597f239..9290b03a319 100644 --- a/src/backend/access/heap/vacuumlazy.c +++ b/src/backend/access/heap/vacuumlazy.c @@ -66,7 +66,7 @@ #include "utils/timestamp.h" /* POLAR */ -#include "storage/polar_bufmgr.h" +#include "access/hio.h" /* * Space/time tradeoff parameters: do these need to be user-tunable? @@ -75,7 +75,7 @@ * REL_TRUNCATE_MINIMUM or (relsize / REL_TRUNCATE_FRACTION) (whichever * is less) potentially-freeable pages. */ -#define REL_TRUNCATE_MINIMUM 1000 +#define REL_TRUNCATE_MINIMUM MAX_BUFFERS_TO_EXTEND_BY #define REL_TRUNCATE_FRACTION 16 /* @@ -2853,21 +2853,24 @@ static bool should_attempt_truncation(LVRelState *vacrel) { BlockNumber possibly_freeable; + BlockNumber min_possibly_freeable; if (!vacrel->do_rel_truncate || vacrel->failsafe_active || old_snapshot_threshold >= 0) return false; - possibly_freeable = vacrel->rel_pages - vacrel->nonempty_pages; - - /* POLAR: We don't expect that vacuum cleanup our prealloc file blocks */ - Assert(vacrel->rel); - if (polar_bulk_extend_size > 0 && - !RelationUsesLocalBuffers(vacrel->rel) && - possibly_freeable <= polar_bulk_extend_size) - return false; + /* + * POLAR: If the table can be truncated to empty, let's do this without + * considering bulk extend. Else if the table is truncatable, reserve bulk + * extended pages. + */ + if (vacrel->nonempty_pages == 0) + min_possibly_freeable = 0; + else + min_possibly_freeable = polar_heap_bulk_extend_size; - if (possibly_freeable > 0 && + possibly_freeable = vacrel->rel_pages - vacrel->nonempty_pages; + if (possibly_freeable > min_possibly_freeable && (possibly_freeable >= REL_TRUNCATE_MINIMUM || possibly_freeable >= vacrel->rel_pages / REL_TRUNCATE_FRACTION)) return true; @@ -2965,6 +2968,15 @@ lazy_truncate_heap(LVRelState *vacrel) * were vacuuming. */ new_rel_pages = count_nondeletable_pages(vacrel, &lock_waiter_detected); + + /* + * POLAR: If the table can be truncated to empty, let's do this + * without considering bulk extend. Else if the table is truncatable, + * reserve bulk extended pages. 
+ */ + if (new_rel_pages != 0 && orig_rel_pages > new_rel_pages) + new_rel_pages = Min(orig_rel_pages, new_rel_pages + polar_heap_bulk_extend_size); + vacrel->blkno = new_rel_pages; if (new_rel_pages >= orig_rel_pages) @@ -3133,14 +3145,7 @@ count_nondeletable_pages(LVRelState *vacrel, bool *lock_waiter_detected) /* Done scanning if we found a tuple here */ if (hastup) - { - Assert(vacrel->rel); - /* POLAR: bulk_extend_page is empty page, should be included */ - if (polar_bulk_extend_size > 0 && !RelationUsesLocalBuffers(vacrel->rel)) - return blkno + polar_bulk_extend_size; - else - return blkno + 1; - } + return blkno + 1; } /* diff --git a/src/backend/access/logindex/polar_fullpage.c b/src/backend/access/logindex/polar_fullpage.c index 7361d5be272..ea126f4f98e 100644 --- a/src/backend/access/logindex/polar_fullpage.c +++ b/src/backend/access/logindex/polar_fullpage.c @@ -17,7 +17,7 @@ * limitations under the License. * * IDENTIFICATION - * src/backend/access/logindex/polar_fullpage.c + * src/backend/access/logindex/polar_fullpage.c * *------------------------------------------------------------------------- */ @@ -33,6 +33,7 @@ #include "access/xlog_internal.h" #include "access/xloginsert.h" #include "catalog/pg_control.h" +#include "common/file_utils.h" #include "pgstat.h" #include "replication/walreceiver.h" #include "storage/buf_internals.h" @@ -221,46 +222,33 @@ install_fullpage_file_segment(polar_fullpage_ctl_t ctl, uint64 *seg_no, static void fill_fullpage_file_zero_pages(int fd, char *tmppath) { -#define ONE_MB (1024 * 1024L) - char *data; - int nbytes = 0; - - data = palloc_io_aligned(ONE_MB, MCXT_ALLOC_ZERO); + int rc; pgstat_report_wait_start(WAIT_EVENT_FULLPAGE_FILE_INIT_WRITE); - for (nbytes = 0; nbytes < FULLPAGE_SEGMENT_SIZE; nbytes += ONE_MB) - { - int rc = 0; - - errno = 0; - rc = (int) polar_pwrite(fd, data, ONE_MB, nbytes); + rc = polar_pwrite_zeros(fd, FULLPAGE_SEGMENT_SIZE, 0); - if (rc != ONE_MB) - { - int save_errno = errno; + if (rc < 0) + { + int save_errno = errno; - /* - * If we fail to make the file, delete it to release disk space - */ - polar_unlink(tmppath); + /* + * If we fail to make the file, delete it to release disk space + */ + polar_unlink(tmppath); - polar_close(fd); + polar_close(fd); - /* if write didn't set errno, assume problem is no disk space */ - errno = save_errno ? save_errno : ENOSPC; + /* if write didn't set errno, assume problem is no disk space */ + errno = save_errno ? save_errno : ENOSPC; - pfree(data); - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not write to file \"%s\": %m", tmppath))); - } + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write to file \"%s\": %m", tmppath))); } pgstat_report_wait_end(); - pfree(data); - if (polar_fsync(fd) != 0) { int save_errno = errno; @@ -327,9 +315,25 @@ polar_fullpage_file_init(polar_fullpage_ctl_t ctl, uint64 fullpage_no) (errcode_for_file_access(), errmsg("could not create file \"%s\": %m", tmppath))); - /* POLAR: File allocate, juse change file metadata once */ - if (polar_fallocate(fd, 0, FULLPAGE_SEGMENT_SIZE) != 0) - elog(ERROR, "polar_posix_fallocate fail in polar_fullpage_file_init"); +#ifdef __linux__ + + /* + * POLAR: use FALLOC_FL_NO_HIDE_STALE on PFS to optimize appending writes. 
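+	 * (Assumption, for the reader: the flag allocates extents without
+	 * marking them unwritten, so later appends avoid the unwritten-extent
+	 * conversion; any stale data it could expose is overwritten right away
+	 * by fill_fullpage_file_zero_pages() below.)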
+ */ + if (polar_enable_fallocate_no_hide_stale && + polar_vfs_type(fd) == POLAR_VFS_PFS && + polar_fallocate(fd, FALLOC_FL_NO_HIDE_STALE, 0, FULLPAGE_SEGMENT_SIZE) != 0) + { + int save_errno = errno; + + polar_unlink(tmppath); + polar_close(fd); + errno = save_errno; + + elog(ERROR, "fallocate failed \"%s\": %m", tmppath); + } + /* POLAR end */ +#endif fill_fullpage_file_zero_pages(fd, tmppath); @@ -652,7 +656,7 @@ polar_log_fullpage_snapshot_image(polar_fullpage_ctl_t ctl, Buffer buffer, XLogR * call. */ if (fullpage == NULL) - fullpage = MemoryContextAllocIOAligned(TopMemoryContext, BLCKSZ, 0); + fullpage = MemoryContextAllocAligned(TopMemoryContext, BLCKSZ, PG_IO_ALIGN_SIZE, 0); Assert(polar_is_primary()); diff --git a/src/backend/access/logindex/polar_logindex_redo.c b/src/backend/access/logindex/polar_logindex_redo.c index fe032f5c1eb..f5923880fb2 100644 --- a/src/backend/access/logindex/polar_logindex_redo.c +++ b/src/backend/access/logindex/polar_logindex_redo.c @@ -170,51 +170,27 @@ polar_logindex_abort_replaying_buffer(void) } /* - * For the block file in tag, extend it to tag->blockNum blocks. + * For the block file in tag, extend it to at least tag->blockNum blocks (might + * be longer to reduce smgrzeroextend invocations). */ static void polar_extend_block_if_not_exist(BufferTag *tag) { SMgrRelation smgr; BlockNumber nblocks; - char *extendBuf = NULL; - int copy_bulk_extend_size; - int copy_min_bulk_extend_table_size; smgr = smgropen(tag->rnode, InvalidBackendId); smgrcreate(smgr, tag->forkNum, true); nblocks = smgrnblocks(smgr, tag->forkNum); - /* When close the bulk extend, we still palloc one size */ - copy_bulk_extend_size = polar_recovery_bulk_extend_size; - copy_min_bulk_extend_table_size = polar_min_bulk_extend_table_size; - - while (tag->blockNum >= nblocks) + if (tag->blockNum >= nblocks) { - int block_count; + int block_count = polar_get_recovery_bulk_extend_size(tag->blockNum, nblocks); - if (copy_bulk_extend_size > 0 - && nblocks >= copy_min_bulk_extend_table_size - /* Avoid small table bloat */ ) - { - block_count = Min(copy_bulk_extend_size, - (BlockNumber) RELSEG_SIZE - nblocks % ((BlockNumber) RELSEG_SIZE)); - } - else - block_count = 1; + Assert(nblocks + block_count > tag->blockNum); - /* - * When close the bulk extend, we still palloc one size. We palloc - * only once. 
- */ - if (extendBuf == NULL) - extendBuf = palloc_io_aligned(BLCKSZ * Max(block_count, copy_bulk_extend_size), MCXT_ALLOC_ZERO); - polar_smgrbulkextend(smgr, tag->forkNum, nblocks, block_count, extendBuf, false); - nblocks += block_count; + smgrzeroextend(smgr, tag->forkNum, nblocks, block_count, false); } - - if (extendBuf != NULL) - pfree(extendBuf); } void diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c index d3acbe0fb73..75ddf15fa5d 100644 --- a/src/backend/access/nbtree/nbtpage.c +++ b/src/backend/access/nbtree/nbtpage.c @@ -39,6 +39,7 @@ #include "utils/snapmgr.h" /* POLAR */ +#include "access/hio.h" #include "storage/smgr.h" #include "utils/guc.h" @@ -68,9 +69,6 @@ static bool _bt_lock_subtree_parent(Relation rel, BlockNumber child, static void _bt_pendingfsm_add(BTVacState *vstate, BlockNumber target, FullTransactionId safexid); -/* POLAR */ -static Buffer polar_index_add_extra_blocks_and_return_last_buffer(Relation reln, BlockNumber blockNum); - /* * _bt_initmetapage() -- Fill a page buffer with a correct metapage image */ @@ -983,14 +981,7 @@ _bt_getbuf(Relation rel, BlockNumber blkno, int access) if (needLock) LockRelationForExtension(rel, ExclusiveLock); - /* - * POLAR: if polar_index_bulk_extend_size > 0, use index bulk extend - * instead of extend one page - */ - if (polar_enable_shared_storage_mode && polar_index_bulk_extend_size > 0) - buf = polar_index_add_extra_blocks_and_return_last_buffer(rel, P_NEW); - else - buf = ReadBuffer(rel, P_NEW); + buf = polar_index_add_blocks(rel); /* Acquire buffer lock on new page */ _bt_lockbuf(rel, buf, BT_WRITE); @@ -3121,136 +3112,3 @@ _bt_pendingfsm_add(BTVacState *vstate, vstate->pendingpages[vstate->npendingpages].safexid = safexid; vstate->npendingpages++; } - -/* - * POLAR: index insert bulk extend. If we can not find free pages in index relation while - * doing index insert, we will do index bulk extend. The free blocks will be registered - * in FSM. - * We now only support bulk extend for btree index. - * Note: raw LockBuffer() calls are disallowed in nbtree; all - * buffer lock requests need to go through wrapper functions such - * as _bt_lockbuf(). - */ -static Buffer -polar_index_add_extra_blocks_and_return_last_buffer(Relation reln, BlockNumber blockNum) -{ - BlockNumber first_block_num_extended = InvalidBlockNumber; - int block_count = 0; - Buffer last_buffer = InvalidBuffer; - Buffer *buffers = NULL; - int index = 0; - char *bulk_buf_block = NULL; - - PG_TRY(); - { - /* init bulk extend backend-local-variable */ - polar_smgr_init_bulk_extend(RelationGetSmgr(reln), MAIN_FORKNUM); - - first_block_num_extended = RelationGetSmgr(reln)->polar_nblocks_faked_for_bulk_extend[MAIN_FORKNUM]; - block_count = Min(polar_index_bulk_extend_size, (BlockNumber) RELSEG_SIZE - (first_block_num_extended % ((BlockNumber) RELSEG_SIZE))); - if (block_count < 1) - block_count = 1; - - /* avoid small table bloat */ - if (first_block_num_extended < polar_min_bulk_extend_table_size) - block_count = 1; - - bulk_buf_block = (char *) palloc_io_aligned(block_count * BLCKSZ, MCXT_ALLOC_ZERO); - buffers = (Buffer *) palloc0(block_count * sizeof(Buffer)); - - /* - * The difference between - * polar_relation_add_extra_blocks_and_return_last_buffer here is - * that: All the buffer is RBM_NORMAL, not RBM_ZERO_AND_LOCK for the - * last buffer. Because the btree index will not try to get the last - * blkno when fsm is not free. The btree index must get free page from - * fsm. 
When we check PageIsNew(), these bulk extend pages will not - * added into fsm. Moreover, The caller will init bulk extend pages - * which get from fsm, but bulk extend doesn't init index pages. There - * is no problem for init page twice. The return last buffer will be - * locked by _bt_lockbuf from caller. - */ - for (index = 0; index < block_count; index++) - { - /* - * Extend by one page. This should generally match the main-line - * extension code in RelationGetBufferForTuple, except that we - * hold the relation extension lock throughout. - */ - buffers[index] = ReadBufferExtended(reln, MAIN_FORKNUM, P_NEW, RBM_NORMAL, NULL); - } - } - PG_CATCH(); - { - /* - * error recovery, very important, reset bulk extend - * backend-local-variable - */ - if (RelationGetSmgr(reln) != NULL) - polar_smgr_clear_bulk_extend(RelationGetSmgr(reln), MAIN_FORKNUM); - PG_RE_THROW(); - } - PG_END_TRY(); - - /* reset bulk extend backend-local-variable */ - polar_smgr_clear_bulk_extend(RelationGetSmgr(reln), MAIN_FORKNUM); - - /* bulk extend polar store */ - polar_smgrbulkextend(RelationGetSmgr(reln), MAIN_FORKNUM, first_block_num_extended, block_count, bulk_buf_block, false); - - pfree(bulk_buf_block); - - /* process left (block_count-1) blocks, skip last block */ - block_count--; - for (index = 0; index < block_count; index++) - { - Buffer buffer; - Page page; - - buffer = buffers[index]; - - /* - * In polar_relation_add_extra_blocks_and_return_last_buffer, we have - * no need to init page. We just check zero page. we only need to add - * share lock to index buffer. - */ - _bt_lockbuf(reln, buffer, BT_READ); - page = BufferGetPage(buffer); - if (!PageIsNew(page)) - elog(ERROR, "index bulk extend page %u of relation \"%s\" should be empty but is not", - BufferGetBlockNumber(buffer), - RelationGetRelationName(reln)); - - /* - * The difference between - * polar_relation_add_extra_blocks_and_return_last_buffer here is - * that: we don't need to init new page and MarkBufferDirty. Because - * for btree index, when it get one page from fsm, it always call - * _bt_pageinit for a new page. While heap page should be inited by - * the caller. - */ - - Assert((first_block_num_extended + index) == BufferGetBlockNumber(buffer)); - /* unlock index buffer and release pin */ - _bt_relbuf(reln, buffer); - - /* - * We just register the free pages into FSM, no need to mark all the - * new buffers dirty - */ - RecordFreeIndexPage(reln, first_block_num_extended + index); - } - - /* - * Finally, vacuum the FSM. Update the upper-level FSM pages to ensure - * that searchers can find them. 
- */ - if (block_count > 0) - IndexFreeSpaceMapVacuum(reln); - - /* last block */ - last_buffer = buffers[block_count]; - pfree(buffers); - - return last_buffer; -} diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 406f2a930ba..8fd10487efd 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -29,11 +29,11 @@ #include "nodes/execnodes.h" #include "pgstat.h" #include "postmaster/autovacuum.h" +#include "storage/bulk_write.h" #include "storage/condition_variable.h" #include "storage/indexfsm.h" #include "storage/ipc.h" #include "storage/lmgr.h" -#include "storage/smgr.h" #include "utils/builtins.h" #include "utils/index_selfuncs.h" #include "utils/memutils.h" @@ -153,31 +153,18 @@ bthandler(PG_FUNCTION_ARGS) void btbuildempty(Relation index) { - Page metapage; + bool allequalimage = _bt_allequalimage(index, false); + BulkWriteState *bulkstate; + BulkWriteBuffer metabuf; - /* Construct metapage. */ - metapage = (Page) palloc_io_aligned(BLCKSZ, 0); - _bt_initmetapage(metapage, P_NONE, 0, _bt_allequalimage(index, false)); + bulkstate = smgr_bulk_start_rel(index, INIT_FORKNUM); - /* - * Write the page and log it. It might seem that an immediate sync would - * be sufficient to guarantee that the file exists on disk, but recovery - * itself might remove it while replaying, for example, an - * XLOG_DBASE_CREATE* or XLOG_TBLSPC_CREATE record. Therefore, we need - * this even when wal_level=minimal. - */ - PageSetChecksumInplace(metapage, BTREE_METAPAGE); - smgrwrite(RelationGetSmgr(index), INIT_FORKNUM, BTREE_METAPAGE, - (char *) metapage, true); - log_newpage(&RelationGetSmgr(index)->smgr_rnode.node, INIT_FORKNUM, - BTREE_METAPAGE, metapage, true); + /* Construct metapage. */ + metabuf = smgr_bulk_get_buf(bulkstate); + _bt_initmetapage((Page) metabuf, P_NONE, 0, allequalimage); + smgr_bulk_write(bulkstate, BTREE_METAPAGE, metabuf, true); - /* - * An immediate sync is required even if we xlog'd the page, because the - * write did not go through shared_buffers and therefore a concurrent - * checkpoint may have moved the redo pointer past our xlog record. - */ - smgrimmedsync(RelationGetSmgr(index), INIT_FORKNUM); + smgr_bulk_finish(bulkstate); } /* diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index aa7d378f3d9..d0c921346d5 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -23,13 +23,8 @@ * many upper pages if the keys are reasonable-size) without risking a lot of * cascading splits during early insertions. * - * Formerly the index pages being built were kept in shared buffers, but - * that is of no value (since other backends have no interest in them yet) - * and it created locking problems for CHECKPOINT, because the upper-level - * pages were held exclusive-locked for long periods. Now we just build - * the pages in local memory and smgrwrite or smgrextend them as we finish - * them. They will need to be re-read into shared buffers on first use after - * the build finishes. + * We use the bulk smgr loading facility to bypass the buffer cache and + * WAL-log the pages efficiently. * * This code isn't concerned about the FSM at all. The caller is responsible * for initializing that. 
@@ -58,7 +53,7 @@ #include "executor/instrument.h" #include "miscadmin.h" #include "pgstat.h" -#include "storage/smgr.h" +#include "storage/bulk_write.h" #include "tcop/tcopprot.h" /* pgrminclude ignore */ #include "utils/rel.h" #include "utils/sortsupport.h" @@ -235,7 +230,7 @@ typedef struct BTBuildState */ typedef struct BTPageState { - Page btps_page; /* workspace for page building */ + BulkWriteBuffer btps_buf; /* workspace for page building */ BlockNumber btps_blkno; /* block # to write this page at */ IndexTuple btps_lowkey; /* page's strict lower bound pivot tuple */ OffsetNumber btps_lastoff; /* last item offset loaded */ @@ -252,19 +247,9 @@ typedef struct BTWriteState { Relation heap; Relation index; + BulkWriteState *bulkstate; BTScanInsert inskey; /* generic insertion scankey */ - bool btws_use_wal; /* dump pages to WAL? */ BlockNumber btws_pages_alloced; /* # pages allocated */ - BlockNumber btws_pages_written; /* # pages written out */ - Page btws_zeropage; /* workspace for filling zeroes */ - - /* - * POLAR: bulk extend index file. polar_index_create_bulk_extend_size_copy - * is a copy of polar_index_create_bulk_extend_size, to avoid the impact - * of polar_index_create_bulk_extend_size realtime modifications. - */ - int polar_index_create_bulk_extend_size_copy; - /* POLAR end */ } BTWriteState; @@ -276,7 +261,7 @@ static void _bt_spool(BTSpool *btspool, ItemPointer self, static void _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2); static void _bt_build_callback(Relation index, ItemPointer tid, Datum *values, bool *isnull, bool tupleIsAlive, void *state); -static Page _bt_blnewpage(uint32 level); +static BulkWriteBuffer _bt_blnewpage(BTWriteState *wstate, uint32 level); static BTPageState *_bt_pagestate(BTWriteState *wstate, uint32 level); static void _bt_slideleft(Page rightmostpage); static void _bt_sortaddtup(Page page, Size itemsize, @@ -302,11 +287,6 @@ static void _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2, Sharedsort *sharedsort2, int sortmem, bool progress); -/* POLAR */ -__attribute__((__unused__)) -static bool polar_bt_check_bulk_extend(BTWriteState *wstate); - -/* POLAR end */ /* * btbuild() -- build a new btree index. @@ -583,19 +563,12 @@ _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2) wstate.inskey = _bt_mkscankey(wstate.index, NULL); /* _bt_mkscankey() won't set allequalimage without metapage */ wstate.inskey->allequalimage = _bt_allequalimage(wstate.index, true); - wstate.btws_use_wal = RelationNeedsWAL(wstate.index); /* reserve the metapage */ wstate.btws_pages_alloced = BTREE_METAPAGE + 1; - wstate.btws_pages_written = 0; - wstate.btws_zeropage = NULL; /* until needed */ pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE, PROGRESS_BTREE_PHASE_LEAF_LOAD); - - /* POLAR: bulk extend index file */ - wstate.polar_index_create_bulk_extend_size_copy = polar_index_create_bulk_extend_size; - _bt_load(&wstate, btspool, btspool2); } @@ -631,13 +604,15 @@ _bt_build_callback(Relation index, /* * allocate workspace for a new, clean btree page, not linked to any siblings. 
*/ -static Page -_bt_blnewpage(uint32 level) +static BulkWriteBuffer +_bt_blnewpage(BTWriteState *wstate, uint32 level) { + BulkWriteBuffer buf; Page page; BTPageOpaque opaque; - page = (Page) palloc_io_aligned(BLCKSZ, 0); + buf = smgr_bulk_get_buf(wstate->bulkstate); + page = (Page) buf; /* Zero the page and set up standard page header info */ _bt_pageinit(page, BLCKSZ); @@ -652,136 +627,17 @@ _bt_blnewpage(uint32 level) /* Make the P_HIKEY line pointer appear allocated */ ((PageHeader) page)->pd_lower += sizeof(ItemIdData); - return page; + return buf; } /* * emit a completed btree page, and release the working storage. */ static void -_bt_blwritepage(BTWriteState *wstate, Page page, BlockNumber blkno) +_bt_blwritepage(BTWriteState *wstate, BulkWriteBuffer buf, BlockNumber blkno) { - /* XLOG stuff */ - if (wstate->btws_use_wal) - { - /* We use the XLOG_FPI record type for this */ - log_newpage(&wstate->index->rd_node, MAIN_FORKNUM, blkno, page, true); - } - - /* - * If we have to write pages nonsequentially, fill in the space with - * zeroes until we come back and overwrite. This is not logically - * necessary on standard Unix filesystems (unwritten space will read as - * zeroes anyway), but it should help to avoid fragmentation. The dummy - * pages aren't WAL-logged though. - */ - while (blkno > wstate->btws_pages_written) - { - /* ---------------- - * POLAR: bulk extend index relation in "create index", not "insert a index tuple". - * Extra zero-page are safe. - * In "create index", index_create(). Before it finished, it can't be accessed by other backend. - * 1. For a index relation, only one backend writes the index relation, even in parallel index build. - * 2. Logical nblocks is controlled by wstate->btws_pages_alloced, not file lseek. - * 3. zero-page will be overwrite by smgrwrite(). - * 4. When index_create() finished, small amount of last zero pages will be truncated. - * ---------------- - */ - if (polar_enable_shared_storage_mode && wstate->polar_index_create_bulk_extend_size_copy > 0) - { - int polar_block_count = Min(wstate->polar_index_create_bulk_extend_size_copy, (BlockNumber) RELSEG_SIZE - (blkno % ((BlockNumber) RELSEG_SIZE))); - - if (polar_block_count < 0) - polar_block_count = 1; - /* ---------------- - * btws_zeropage has fixed size wstate->polar_index_create_bulk_extend_size_copy. - * polar_block_count <= wstate->polar_index_create_bulk_extend_size_copy. - * ---------------- - */ - if (!wstate->btws_zeropage) - wstate->btws_zeropage = (Page) palloc_io_aligned(BLCKSZ * wstate->polar_index_create_bulk_extend_size_copy, MCXT_ALLOC_ZERO); - - polar_smgrbulkextend(RelationGetSmgr(wstate->index), MAIN_FORKNUM, blkno, polar_block_count, (char *) wstate->btws_zeropage, true); - wstate->btws_pages_written += polar_block_count; - - polar_pgstat_count_bulk_create_index_extend_times(wstate->heap); - } /* POLAR end */ - else - { - if (!wstate->btws_zeropage) - wstate->btws_zeropage = (Page) palloc_io_aligned(BLCKSZ, MCXT_ALLOC_ZERO); - /* don't set checksum for all-zero page */ - smgrextend(RelationGetSmgr(wstate->index), MAIN_FORKNUM, - wstate->btws_pages_written++, - (char *) wstate->btws_zeropage, - true); - } - } - - PageSetChecksumInplace(page, blkno); - - /* - * Now write the page. There's no need for smgr to schedule an fsync for - * this write; we'll do it ourselves before ending the build. - */ - if (blkno == wstate->btws_pages_written) - { - /* ---------------- - * POLAR: bulk extend index relation in "create index", not "insert a index tuple". 
- * Extra zero-page are safe. - * In "create index", index_create(). Before it finished, it can't be accessed by other backend. - * 1. For a index relation, only one backend writes the index relation, even in parallel index build. - * 2. Logical nblocks is controlled by wstate->btws_pages_alloced, not file lseek. - * 3. zero-page will be overwrite by smgrwrite(). - * 4. When index_create() finished, small amount of last zero pages will be truncated. - * ---------------- - */ - if (polar_enable_shared_storage_mode && wstate->polar_index_create_bulk_extend_size_copy > 0) - { - int polar_block_count = Min(wstate->polar_index_create_bulk_extend_size_copy, (BlockNumber) RELSEG_SIZE - (blkno % ((BlockNumber) RELSEG_SIZE))); - - if (polar_block_count < 0) - polar_block_count = 1; - - /* ---------------- - * btws_zeropage has fixed size wstate->polar_index_create_bulk_extend_size_copy. - * polar_block_count <= wstate->polar_index_create_bulk_extend_size_copy. - * ---------------- - */ - if (!wstate->btws_zeropage) - wstate->btws_zeropage = (Page) palloc_io_aligned(BLCKSZ * wstate->polar_index_create_bulk_extend_size_copy, MCXT_ALLOC_ZERO); - - /* first page hold blkno's content */ - memcpy((char *) wstate->btws_zeropage, (char *) page, BLCKSZ); - - polar_smgrbulkextend(RelationGetSmgr(wstate->index), MAIN_FORKNUM, blkno, polar_block_count, (char *) wstate->btws_zeropage, true); - - wstate->btws_pages_written += polar_block_count; - - polar_pgstat_count_bulk_create_index_extend_times(wstate->heap); - } /* POLAR end */ - else - { - /* extending the file... */ - smgrextend(RelationGetSmgr(wstate->index), MAIN_FORKNUM, blkno, - (char *) page, true); - wstate->btws_pages_written++; - } - } - else - { - /* overwriting a block we zero-filled before */ - smgrwrite(RelationGetSmgr(wstate->index), MAIN_FORKNUM, blkno, - (char *) page, true); - } - - pfree(page); - if (polar_enable_shared_storage_mode && wstate->polar_index_create_bulk_extend_size_copy > 0 - && wstate->btws_zeropage) - { - pfree(wstate->btws_zeropage); - wstate->btws_zeropage = NULL; - } + smgr_bulk_write(wstate->bulkstate, blkno, buf, true); + /* smgr_bulk_write took ownership of 'buf' */ } /* @@ -794,7 +650,7 @@ _bt_pagestate(BTWriteState *wstate, uint32 level) BTPageState *state = (BTPageState *) palloc0(sizeof(BTPageState)); /* create initial page for level */ - state->btps_page = _bt_blnewpage(level); + state->btps_buf = _bt_blnewpage(wstate, level); /* and assign it a page position */ state->btps_blkno = wstate->btws_pages_alloced++; @@ -930,6 +786,7 @@ static void _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup, Size truncextra) { + BulkWriteBuffer nbuf; Page npage; BlockNumber nblkno; OffsetNumber last_off; @@ -944,7 +801,8 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup, */ CHECK_FOR_INTERRUPTS(); - npage = state->btps_page; + nbuf = state->btps_buf; + npage = (Page) nbuf; nblkno = state->btps_blkno; last_off = state->btps_lastoff; last_truncextra = state->btps_lastextra; @@ -1000,6 +858,7 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup, /* * Finish off the page and write it out. 
*/ + BulkWriteBuffer obuf = nbuf; Page opage = npage; BlockNumber oblkno = nblkno; ItemId ii; @@ -1007,7 +866,8 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup, IndexTuple oitup; /* Create new page of same level */ - npage = _bt_blnewpage(state->btps_level); + nbuf = _bt_blnewpage(wstate, state->btps_level); + npage = (Page) nbuf; /* and assign it a page position */ nblkno = wstate->btws_pages_alloced++; @@ -1119,10 +979,10 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup, } /* - * Write out the old page. We never need to touch it again, so we can - * free the opage workspace too. + * Write out the old page. _bt_blwritepage takes ownership of the + * 'opage' buffer. */ - _bt_blwritepage(wstate, opage, oblkno); + _bt_blwritepage(wstate, obuf, oblkno); /* * Reset last_off to point to new page @@ -1155,7 +1015,7 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup, _bt_sortaddtup(npage, itupsz, itup, last_off, !isleaf && last_off == P_FIRSTKEY); - state->btps_page = npage; + state->btps_buf = nbuf; state->btps_blkno = nblkno; state->btps_lastoff = last_off; } @@ -1207,7 +1067,7 @@ _bt_uppershutdown(BTWriteState *wstate, BTPageState *state) BTPageState *s; BlockNumber rootblkno = P_NONE; uint32 rootlevel = 0; - Page metapage; + BulkWriteBuffer metabuf; /* * Each iteration of this loop completes one more level of the tree. @@ -1218,7 +1078,7 @@ _bt_uppershutdown(BTWriteState *wstate, BTPageState *state) BTPageOpaque opaque; blkno = s->btps_blkno; - opaque = BTPageGetOpaque(s->btps_page); + opaque = BTPageGetOpaque((Page) s->btps_buf); /* * We have to link the last page on this level to somewhere. @@ -1252,9 +1112,9 @@ _bt_uppershutdown(BTWriteState *wstate, BTPageState *state) * This is the rightmost page, so the ItemId array needs to be slid * back one slot. Then we can dump out the page. */ - _bt_slideleft(s->btps_page); - _bt_blwritepage(wstate, s->btps_page, s->btps_blkno); - s->btps_page = NULL; /* writepage freed the workspace */ + _bt_slideleft((Page) s->btps_buf); + _bt_blwritepage(wstate, s->btps_buf, s->btps_blkno); + s->btps_buf = NULL; /* writepage took ownership of the buffer */ } /* @@ -1263,10 +1123,10 @@ _bt_uppershutdown(BTWriteState *wstate, BTPageState *state) * set to point to "P_NONE"). This changes the index to the "valid" state * by filling in a valid magic number in the metapage. 
*/ - metapage = (Page) palloc_io_aligned(BLCKSZ, 0); - _bt_initmetapage(metapage, rootblkno, rootlevel, + metabuf = smgr_bulk_get_buf(wstate->bulkstate); + _bt_initmetapage((Page) metabuf, rootblkno, rootlevel, wstate->inskey->allequalimage); - _bt_blwritepage(wstate, metapage, BTREE_METAPAGE); + _bt_blwritepage(wstate, metabuf, BTREE_METAPAGE); } /* @@ -1287,7 +1147,8 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) SortSupport sortKeys; int64 tuples_done = 0; bool deduplicate; - ForkNumber bulk_index_extend_forknum; + + wstate->bulkstate = smgr_bulk_start_rel(wstate->index, MAIN_FORKNUM); deduplicate = wstate->inskey->allequalimage && !btspool->isunique && BTGetDeduplicateItems(wstate->index); @@ -1444,7 +1305,7 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) */ dstate->maxpostingsize = MAXALIGN_DOWN((BLCKSZ * 10 / 100)) - sizeof(ItemIdData); - Assert(dstate->maxpostingsize <= BTMaxItemSize(state->btps_page) && + Assert(dstate->maxpostingsize <= BTMaxItemSize((Page) state->btps_buf) && dstate->maxpostingsize <= INDEX_SIZE_MASK); dstate->htids = palloc(dstate->maxpostingsize); @@ -1514,32 +1375,7 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) /* Close down final pages and write the metapage */ _bt_uppershutdown(wstate, state); - - /* - * When we WAL-logged index pages, we must nonetheless fsync index files. - * Since we're building outside shared buffers, a CHECKPOINT occurring - * during the build has no way to flush the previously written data to - * disk (indeed it won't know the index even exists). A crash later on - * would replay WAL from the checkpoint, therefore it wouldn't replay our - * earlier WAL entries. If we do not fsync those pages here, they might - * still not be on disk when the crash occurs. - */ - if (wstate->btws_use_wal) - smgrimmedsync(RelationGetSmgr(wstate->index), MAIN_FORKNUM); - - /* - * POLAR: bulk extend index file, truncate amount of zero page at the end - * of file - */ - if (polar_enable_shared_storage_mode && wstate->polar_index_create_bulk_extend_size_copy > 0) - { - if (wstate->btws_pages_alloced < wstate->btws_pages_written) - { - bulk_index_extend_forknum = MAIN_FORKNUM; - Assert(polar_bt_check_bulk_extend(wstate)); - smgrtruncate(RelationGetSmgr(wstate->index), &bulk_index_extend_forknum, 1, &(wstate->btws_pages_alloced)); - } - } + smgr_bulk_finish(wstate->bulkstate); } /* @@ -2127,39 +1963,3 @@ _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2, if (btspool2) tuplesort_end(btspool2->sortstate); } - -/* - * POLAR: check if page content is expected. - * return false means something error, true OK. - */ -pg_attribute_unused() -static bool -polar_bt_check_bulk_extend(BTWriteState *wstate) -{ - PGAlignedBlock pg; - - MemSet(pg.data, 0, BLCKSZ); - /* wstate->btws_pages_alloced always >= 1 */ - - /* - * btws_pages_alloced page must be PageInit(). These pages are not - * zeroed-filled. - */ - smgrread(RelationGetSmgr(wstate->index), MAIN_FORKNUM, wstate->btws_pages_alloced - 1, pg.data); - if (PageIsNew(pg.data)) - return false; - - if (wstate->btws_pages_alloced < wstate->btws_pages_written) - { - memset(pg.data, 0, BLCKSZ); - smgrread(RelationGetSmgr(wstate->index), MAIN_FORKNUM, wstate->btws_pages_alloced, pg.data); - - /* - * The pages between btws_pages_alloced and btws_pages_written are - * bulk extend. These pages are zeroed-filled. 
- */ - if (!PageIsNew(pg.data)) - return false; - } - return true; -} diff --git a/src/backend/access/spgist/spginsert.c b/src/backend/access/spgist/spginsert.c index e7b748700d0..f97f9b0bac0 100644 --- a/src/backend/access/spgist/spginsert.c +++ b/src/backend/access/spgist/spginsert.c @@ -25,7 +25,7 @@ #include "catalog/index.h" #include "miscadmin.h" #include "storage/bufmgr.h" -#include "storage/smgr.h" +#include "storage/bulk_write.h" #include "utils/memutils.h" #include "utils/rel.h" @@ -155,49 +155,27 @@ spgbuild(Relation heap, Relation index, IndexInfo *indexInfo) void spgbuildempty(Relation index) { - Page page; + BulkWriteState *bulkstate; + BulkWriteBuffer buf; - /* Construct metapage. */ - page = (Page) palloc_io_aligned(BLCKSZ, 0); - SpGistInitMetapage(page); + bulkstate = smgr_bulk_start_rel(index, INIT_FORKNUM); - /* - * Write the page and log it unconditionally. This is important - * particularly for indexes created on tablespaces and databases whose - * creation happened after the last redo pointer as recovery removes any - * of their existing content when the corresponding create records are - * replayed. - */ - PageSetChecksumInplace(page, SPGIST_METAPAGE_BLKNO); - smgrwrite(RelationGetSmgr(index), INIT_FORKNUM, SPGIST_METAPAGE_BLKNO, - (char *) page, true); - log_newpage(&(RelationGetSmgr(index))->smgr_rnode.node, INIT_FORKNUM, - SPGIST_METAPAGE_BLKNO, page, true); + /* Construct metapage. */ + buf = smgr_bulk_get_buf(bulkstate); + SpGistInitMetapage((Page) buf); + smgr_bulk_write(bulkstate, SPGIST_METAPAGE_BLKNO, buf, true); /* Likewise for the root page. */ - SpGistInitPage(page, SPGIST_LEAF); - - PageSetChecksumInplace(page, SPGIST_ROOT_BLKNO); - smgrwrite(RelationGetSmgr(index), INIT_FORKNUM, SPGIST_ROOT_BLKNO, - (char *) page, true); - log_newpage(&(RelationGetSmgr(index))->smgr_rnode.node, INIT_FORKNUM, - SPGIST_ROOT_BLKNO, page, true); + buf = smgr_bulk_get_buf(bulkstate); + SpGistInitPage((Page) buf, SPGIST_LEAF); + smgr_bulk_write(bulkstate, SPGIST_ROOT_BLKNO, buf, true); /* Likewise for the null-tuples root page. */ - SpGistInitPage(page, SPGIST_LEAF | SPGIST_NULLS); - - PageSetChecksumInplace(page, SPGIST_NULL_BLKNO); - smgrwrite(RelationGetSmgr(index), INIT_FORKNUM, SPGIST_NULL_BLKNO, - (char *) page, true); - log_newpage(&(RelationGetSmgr(index))->smgr_rnode.node, INIT_FORKNUM, - SPGIST_NULL_BLKNO, page, true); + buf = smgr_bulk_get_buf(bulkstate); + SpGistInitPage((Page) buf, SPGIST_LEAF | SPGIST_NULLS); + smgr_bulk_write(bulkstate, SPGIST_NULL_BLKNO, buf, true); - /* - * An immediate sync is required even if we xlog'd the pages, because the - * writes did not go through shared buffers and therefore a concurrent - * checkpoint may have moved the redo pointer past our xlog record. 
- */ - smgrimmedsync(RelationGetSmgr(index), INIT_FORKNUM); + smgr_bulk_finish(bulkstate); } /* diff --git a/src/backend/access/spgist/spgutils.c b/src/backend/access/spgist/spgutils.c index 99fbe655318..bd993421251 100644 --- a/src/backend/access/spgist/spgutils.c +++ b/src/backend/access/spgist/spgutils.c @@ -16,6 +16,7 @@ #include "postgres.h" #include "access/amvalidate.h" +#include "access/hio.h" #include "access/htup_details.h" #include "access/reloptions.h" #include "access/spgist_private.h" @@ -428,7 +429,7 @@ SpGistNewBuffer(Relation index) if (needLock) LockRelationForExtension(index, ExclusiveLock); - buffer = ReadBuffer(index, P_NEW); + buffer = polar_index_add_blocks(index); LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); if (needLock) diff --git a/src/backend/access/transam/generic_xlog.c b/src/backend/access/transam/generic_xlog.c index a4dddb0dcea..55503b18105 100644 --- a/src/backend/access/transam/generic_xlog.c +++ b/src/backend/access/transam/generic_xlog.c @@ -58,14 +58,17 @@ typedef struct char delta[MAX_DELTA_SIZE]; /* delta between page images */ } PageData; -/* State of generic xlog record construction */ +/* + * State of generic xlog record construction. Must be allocated at an I/O + * aligned address. + */ struct GenericXLogState { + /* Page images (properly aligned, must be first) */ + PGIOAlignedBlock images[MAX_GENERIC_XLOG_PAGES]; /* Info about each page, see above */ PageData pages[MAX_GENERIC_XLOG_PAGES]; bool isLogged; - /* Page images (properly aligned) */ - PGAlignedBlock images[MAX_GENERIC_XLOG_PAGES]; }; static void writeFragment(PageData *pageData, OffsetNumber offset, @@ -268,7 +271,9 @@ GenericXLogStart(Relation relation) GenericXLogState *state; int i; - state = (GenericXLogState *) palloc(sizeof(GenericXLogState)); + state = (GenericXLogState *) palloc_aligned(sizeof(GenericXLogState), + PG_IO_ALIGN_SIZE, + 0); state->isLogged = RelationNeedsWAL(relation); for (i = 0; i < MAX_GENERIC_XLOG_PAGES; i++) diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index 593768dc92f..1d3be6c0d1a 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -2787,8 +2787,7 @@ AbortTransaction(void) pgstat_report_wait_end(); pgstat_progress_end_command(); - /* Clean up buffer I/O and buffer context locks, too */ - AbortBufferIO(); + /* Clean up buffer context locks, too */ UnlockBuffers(); /* Reset WAL record construction state */ @@ -5145,7 +5144,6 @@ AbortSubTransaction(void) pgstat_report_wait_end(); pgstat_progress_end_command(); - AbortBufferIO(); UnlockBuffers(); /* Reset WAL record construction state */ diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index dd66766aec3..2d624cc85a6 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -154,7 +154,6 @@ bool XLOG_DEBUG = false; #endif int wal_segment_size = DEFAULT_XLOG_SEG_SIZE; -int polar_wal_init_set_size = POLAR_DEFAULT_XLOG_FILL_ZERO_SIZE; /* * Number of WAL insertion locks to use. A higher value allows more insertions @@ -3111,20 +3110,30 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli, memset(zbuffer.data, 0, XLOG_BLCKSZ); pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_WRITE); - save_errno = 0; - if (wal_init_zero) + +#ifdef __linux__ + + /* + * POLAR: use FALLOC_FL_NO_HIDE_STALE on PFS to optimize appending writes. 
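+	 * Note that the fallocate only reserves space: when wal_init_zero is
+	 * enabled, the segment is still explicitly zero-filled just below via
+	 * polar_pwrite_zeros().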
+ */ + if (polar_enable_fallocate_no_hide_stale && + polar_vfs_type(fd) == POLAR_VFS_PFS && + polar_fallocate(fd, FALLOC_FL_NO_HIDE_STALE, 0, (off_t) wal_segment_size) != 0) { - struct iovec iov[PG_IOV_MAX]; - int blocks; - static char zero_data[POLAR_MAX_XLOG_FILL_ZERO_SIZE]; + save_errno = errno; + polar_unlink(tmppath); + polar_close(fd); + errno = save_errno; - blocks = polar_wal_init_set_size / XLOG_BLCKSZ; - polar_wal_init_set_size = INTALIGN(blocks) * XLOG_BLCKSZ; + elog(ERROR, "fallocate failed \"%s\": %m", tmppath); + } + /* POLAR end */ +#endif - if (polar_wal_init_set_size > POLAR_MAX_XLOG_FILL_ZERO_SIZE) - memset(zero_data, 0, POLAR_MAX_XLOG_FILL_ZERO_SIZE); - else - memset(zero_data, 0, polar_wal_init_set_size); + save_errno = 0; + if (wal_init_zero) + { + ssize_t rc; /* * Zero-fill the file. With this setting, we do this the hard way to @@ -3135,29 +3144,10 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli, * indirect blocks are down on disk. Therefore, fdatasync(2) or * O_DSYNC will be sufficient to sync future writes to the log file. */ + rc = polar_pwrite_zeros(fd, wal_segment_size, 0); - /* Prepare to write out a lot of copies of our zero buffer at once. */ - for (int i = 0; i < lengthof(iov); ++i) - { - iov[i].iov_base = zero_data; - iov[i].iov_len = polar_wal_init_set_size; - } - - /* Loop, writing as many blocks as we can for each system call. */ - blocks = wal_segment_size / polar_wal_init_set_size; - for (int i = 0; i < blocks;) - { - int iovcnt = Min(blocks - i, lengthof(iov)); - off_t offset = i * polar_wal_init_set_size; - - if (pg_pwritev_with_retry(fd, iov, iovcnt, offset) < 0) - { - save_errno = errno; - break; - } - - i += iovcnt; - } + if (rc < 0) + save_errno = errno; } else { @@ -4610,7 +4600,7 @@ XLOGShmemSize(void) /* xlblocks array */ size = add_size(size, mul_size(sizeof(XLogRecPtr), XLOGbuffers)); /* extra alignment padding for XLOG I/O buffers */ - size = add_size(size, XLOG_BLCKSZ); + size = add_size(size, Max(XLOG_BLCKSZ, PG_IO_ALIGN_SIZE)); /* and the buffers themselves */ size = add_size(size, mul_size(XLOG_BLCKSZ, XLOGbuffers)); diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c index 4e26c40810d..b63d4fe9c59 100644 --- a/src/backend/access/transam/xlogutils.c +++ b/src/backend/access/transam/xlogutils.c @@ -34,6 +34,7 @@ #include "utils/rel.h" /* POLAR */ +#include "access/hio.h" #include "access/polar_logindex_redo.h" #include "storage/bufpage.h" #include "storage/buf_internals.h" @@ -45,6 +46,7 @@ /* GUC variable */ bool ignore_invalid_pages = false; +int polar_recovery_bulk_extend_size = 512; /* * Are we doing recovery from XLOG? @@ -478,33 +480,21 @@ XLogReadBufferForRedoExtended(XLogReaderState *record, * exceed one file size. */ static Buffer -polar_xlog_relation_bulk_extend_within_segment(SMgrRelationData *smgr, - RelFileNode rnode, - ReadBufferMode mode, - ForkNumber forknum, - int num_bulk_block_once) +polar_recovery_add_blocks(SMgrRelation smgr, ReadBufferMode mode, + ForkNumber forknum, BlockNumber blockNum) { - char *bulk_buf_block = NULL; - BlockNumber first_block_num_extended = InvalidBlockNumber; + BlockNumber first_block = InvalidBlockNumber; Buffer buffer = InvalidBuffer; - - Assert(num_bulk_block_once > 0); + int block_count = 0; PG_TRY(); { int index = 0; - /* - * POLAR: acquire relation file lock to avoid extend the same block - * concurrently which may result in EOF error when proc1 read block - * extended and modified by proc2. 
- */ - polar_acquire_relfile_lock(rnode, forknum, ExclusiveLock); - polar_smgr_init_bulk_extend(smgr, forknum); - first_block_num_extended = smgr->polar_nblocks_faked_for_bulk_extend[forknum]; - bulk_buf_block = (char *) palloc_io_aligned(num_bulk_block_once * BLCKSZ, MCXT_ALLOC_ZERO); + first_block = smgr->polar_nblocks_faked_for_bulk_extend[forknum]; + block_count = polar_get_bulk_extend_size(first_block, polar_recovery_bulk_extend_size); do { @@ -518,9 +508,8 @@ polar_xlog_relation_bulk_extend_within_segment(SMgrRelationData *smgr, * read transaction arrives now, and this buffer will be * evicted,it will not launch any write to disk. So, it is * Safe. By the way, maybe the most safe algo is to do this - * after polar_smgrbulkextend, but it's ok to leave it here - * currently. The last buffer will hold on buffer content - * exclusive lock. + * after smgrzeroextend, but it's ok to leave it here + * currently. */ if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK) { @@ -528,87 +517,24 @@ polar_xlog_relation_bulk_extend_within_segment(SMgrRelationData *smgr, } ReleaseBuffer(buffer); } - buffer = ReadBufferWithoutRelcache(rnode, forknum, P_NEW, mode, NULL, true); - - } while (index < num_bulk_block_once); + buffer = ReadBufferWithoutRelcache(smgr->smgr_rnode.node, forknum, P_NEW, mode, NULL, true); + } while (index < block_count); } PG_CATCH(); { polar_smgr_clear_bulk_extend(smgr, forknum); - - /* POLAR: release relation file lock after extend */ - polar_release_relfile_lock(rnode, forknum, ExclusiveLock); - PG_RE_THROW(); } PG_END_TRY(); polar_smgr_clear_bulk_extend(smgr, forknum); - if (bulk_buf_block == NULL) - elog(FATAL, "Block buffer is NULL in recovery bulk extend"); - /* - * In polar_smgrbulkextend, if first_block_num_extended is a new segment - * start block, then, it will open a new file, and write it from 0 to - * num_bulk_block_once * 8K. else, will just return an existing file - * handler to write into. + * in smgrzeroextend, if first_block is a new segment start block, then, + * it will open a new file, and write it from 0 to block_count * 8K. else, + * will just return an existing file handler to write into. */ - polar_smgrbulkextend(smgr, forknum, first_block_num_extended, num_bulk_block_once, bulk_buf_block, false); - - /* POLAR: release relation file lock after extend */ - polar_release_relfile_lock(rnode, forknum, ExclusiveLock); - - pfree(bulk_buf_block); - return buffer; -} - -/* - * POLAR: - * polar_xlog_relation_bulk_extend_blocks -- to extend relation file size more once, - * worked only on polar store env, with recoverying mode. - * - * Returns: the target block's buffer. - * - * Params: - * current_total_blocks: the total number of rel blocks currently, which is the - * start position of PwriteExtend. - * target_blkno: the block id to be replayed, target_blkno - current_total_blocks is - * the total blocks to be extended. 
- */ -static Buffer -polar_xlog_relation_bulk_extend_blocks(SMgrRelationData *smgr, - RelFileNode rnode, - ReadBufferMode mode, - ForkNumber forknum, - BlockNumber current_total_blocks, - BlockNumber target_blkno, - int one_bulk_max_size) -{ - int left_blocks_seg = 0; - int num_bulk_block = 0; - int left_blocks = target_blkno + 1 - current_total_blocks; - Buffer buffer = InvalidBuffer; - - do - { - if (buffer != InvalidBuffer) - { - if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK) - { - LockBuffer(buffer, BUFFER_LOCK_UNLOCK); - } - ReleaseBuffer(buffer); - } - left_blocks_seg = (BlockNumber) RELSEG_SIZE - - (current_total_blocks % (BlockNumber) RELSEG_SIZE); - num_bulk_block = Min(left_blocks, left_blocks_seg); - num_bulk_block = Min(num_bulk_block, one_bulk_max_size); - buffer = polar_xlog_relation_bulk_extend_within_segment(smgr, - rnode, mode, forknum, num_bulk_block); - current_total_blocks += num_bulk_block; - left_blocks -= num_bulk_block; - } while (left_blocks > 0); + smgrzeroextend(smgr, forknum, first_block, block_count, false); return buffer; } @@ -650,8 +576,6 @@ XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum, BlockNumber lastblock = InvalidBlockNumber; Buffer buffer; SMgrRelation smgr; - int one_bulk_max_size = 0; - bool can_bulk = false; Assert(blkno != P_NEW); @@ -703,66 +627,32 @@ XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum, Assert(InRecovery || POLAR_IN_LOGINDEX_PARALLEL_REPLAY()); buffer = InvalidBuffer; - /* POLAR: bulk block extend opt start */ - one_bulk_max_size = polar_recovery_bulk_extend_size; - can_bulk = false; - if (InHotStandby) - { - can_bulk = true; - } - else if (InRecovery && polar_enable_primary_recovery_bulk_extend) - { - /* - * can_bulk set true when RW crashes recovery and page in xlog - * will not exist in heap table. The latter will occur in insert - * and truncate relfilenode. In online promote, can_bulk will not - * be true because RO type has been RW when xlog replay. - * InRecovery here is always false in online promote. - */ - can_bulk = true; - } - - /* get last blocks */ - if (lastblock == InvalidBlockNumber) - lastblock = smgrnblocks(smgr, forknum); - /* avoid small table bloat */ - if (lastblock < polar_min_bulk_extend_table_size) - can_bulk = false; - - if (can_bulk && polar_enable_shared_storage_mode && - one_bulk_max_size > 0) - { - buffer = polar_xlog_relation_bulk_extend_blocks(smgr, - rnode, mode, forknum, lastblock, blkno, - one_bulk_max_size); - } - else + do { - do + if (buffer != InvalidBuffer) { - if (buffer != InvalidBuffer) - { - if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK) - LockBuffer(buffer, BUFFER_LOCK_UNLOCK); - ReleaseBuffer(buffer); - } + if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK) + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + ReleaseBuffer(buffer); + } - /* - * POLAR: acquire relation file lock to avoid extend the same - * block concurrently which may result in EOF error when proc1 - * read block extended and modified by proc2. - */ - polar_acquire_relfile_lock(rnode, forknum, ExclusiveLock); + /* + * POLAR: acquire relation file lock to avoid extend the same + * block concurrently which may result in EOF error when proc1 + * read block extended and modified by proc2. 
+ */ + polar_acquire_relfile_lock(rnode, forknum, ExclusiveLock); - buffer = ReadBufferWithoutRelcache(rnode, forknum, - P_NEW, mode, NULL, true); + if ((InHotStandby || InRecovery) && + polar_recovery_bulk_extend_size > 0) + buffer = polar_recovery_add_blocks(smgr, mode, forknum, blkno); + else + buffer = ReadBufferWithoutRelcache(rnode, forknum, P_NEW, mode, NULL, true); - /* POLAR: release relation file lock after extend */ - polar_release_relfile_lock(rnode, forknum, ExclusiveLock); - } - while (BufferGetBlockNumber(buffer) < blkno); + /* POLAR: release relation file lock after extend */ + polar_release_relfile_lock(rnode, forknum, ExclusiveLock); } - /* POLAR: bulk block extend opt end */ + while (BufferGetBlockNumber(buffer) < blkno); /* Handle the corner case that P_NEW returns non-consecutive pages */ if (BufferGetBlockNumber(buffer) != blkno) @@ -1374,3 +1264,23 @@ WALReadRaiseError(WALReadError *errinfo) errinfo->wre_req))); } } + +int +polar_get_recovery_bulk_extend_size(BlockNumber target_block, BlockNumber nblocks) +{ + int bulk_extend_size = polar_recovery_bulk_extend_size; + + Assert(target_block >= nblocks); + + /* Avoid small table bloat */ + if (nblocks < polar_recovery_bulk_extend_size) + bulk_extend_size = 1; + + /* Avoid acceed maximum possible length */ + bulk_extend_size = Min(MaxBlockNumber - nblocks, bulk_extend_size); + + /* Extend the relation to blockNum + 1 at least */ + bulk_extend_size = Max(target_block - nblocks + 1, bulk_extend_size); + + return bulk_extend_size; +} diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c index c8250205919..270f53c323f 100644 --- a/src/backend/catalog/storage.c +++ b/src/backend/catalog/storage.c @@ -28,6 +28,7 @@ #include "catalog/storage.h" #include "catalog/storage_xlog.h" #include "miscadmin.h" +#include "storage/bulk_write.h" #include "storage/freespace.h" #include "storage/smgr.h" #include "utils/hsearch.h" @@ -474,14 +475,14 @@ void RelationCopyStorage(SMgrRelation src, SMgrRelation dst, ForkNumber forkNum, char relpersistence) { - PGAlignedBlock buf; - Page page; bool use_wal; bool copying_initfork; BlockNumber nblocks; BlockNumber blkno; - - page = (Page) buf.data; + BulkWriteState *bulkstate; + void *buffer; + int block_count; + int max_block_count; /* * The init fork for an unlogged relation in many respects has to be @@ -500,64 +501,67 @@ RelationCopyStorage(SMgrRelation src, SMgrRelation dst, use_wal = XLogIsNeeded() && (relpersistence == RELPERSISTENCE_PERMANENT || copying_initfork); + max_block_count = Max(1, polar_bulk_read_size); + buffer = palloc_aligned(max_block_count * BLCKSZ, PG_IO_ALIGN_SIZE, 0); + + bulkstate = smgr_bulk_start_smgr(dst, forkNum, use_wal, relpersistence); + nblocks = smgrnblocks(src, forkNum); - for (blkno = 0; blkno < nblocks; blkno++) + for (blkno = 0; blkno < nblocks; blkno += block_count) { + BulkWriteBuffer buf; + /* If we got a cancel signal during the copy of the data, quit */ CHECK_FOR_INTERRUPTS(); - smgrread(src, forkNum, blkno, buf.data); + block_count = Min(max_block_count, nblocks - blkno); + + polar_smgrbulkread(src, forkNum, blkno, block_count, buffer); - if (!PageIsVerifiedExtended(page, blkno, - PIV_LOG_WARNING | PIV_REPORT_STAT)) + for (int i = 0; i < block_count; i++) { - /* - * For paranoia's sake, capture the file path before invoking the - * ereport machinery. This guards against the possibility of a - * relcache flush caused by, e.g., an errcontext callback. 
- * (errcontext callbacks shouldn't be risking any such thing, but - * people have been known to forget that rule.) - */ - char *relpath = relpathbackend(src->smgr_rnode.node, - src->smgr_rnode.backend, - forkNum); - - ereport(ERROR, - (errcode(ERRCODE_DATA_CORRUPTED), - errmsg("invalid page in block %u of relation %s", - blkno, relpath))); - } + BlockNumber cur_blkno = blkno + i; + Page page = (Page) ((char *) buffer + i * BLCKSZ); - /* - * WAL-log the copied page. Unfortunately we don't know what kind of a - * page this is, so we have to log the full page including any unused - * space. - */ - if (use_wal) - log_newpage(&dst->smgr_rnode.node, forkNum, blkno, page, false); + if (!PageIsVerifiedExtended(page, cur_blkno, + PIV_LOG_WARNING | PIV_REPORT_STAT)) + { + /* + * For paranoia's sake, capture the file path before invoking + * the ereport machinery. This guards against the possibility + * of a relcache flush caused by, e.g., an errcontext + * callback. (errcontext callbacks shouldn't be risking any + * such thing, but people have been known to forget that + * rule.) + */ + char *relpath = relpathbackend(src->smgr_rnode.node, + src->smgr_rnode.backend, + forkNum); + + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("invalid page in block %u of relation %s", + cur_blkno, relpath))); + } - PageSetChecksumInplace(page, blkno); + buf = smgr_bulk_get_buf(bulkstate); - /* - * Now write the page. We say skipFsync = true because there's no - * need for smgr to schedule an fsync for this write; we'll do it - * ourselves below. - */ - smgrextend(dst, forkNum, blkno, buf.data, true); + memcpy(buf, page, BLCKSZ); + + /* + * Queue the page for WAL-logging and writing out. Unfortunately + * we don't know what kind of a page this is, so we have to log + * the full page including any unused space. + */ + smgr_bulk_write(bulkstate, cur_blkno, buf, false); + } } + Assert(blkno == nblocks); - /* - * When we WAL-logged rel pages, we must nonetheless fsync them. The - * reason is that since we're copying outside shared buffers, a CHECKPOINT - * occurring during the copy has no way to flush the previously written - * data to disk (indeed it won't know the new rel even exists). A crash - * later on would replay WAL from the checkpoint, therefore it wouldn't - * replay our earlier WAL entries. If we do not fsync those pages here, - * they might still not be on disk when the crash occurs. 
- */ - if (use_wal || copying_initfork) - smgrimmedsync(dst, forkNum); + smgr_bulk_finish(bulkstate); + + pfree(buffer); } /* diff --git a/src/backend/postmaster/autovacuum.c b/src/backend/postmaster/autovacuum.c index eaf155d8084..297f1399319 100644 --- a/src/backend/postmaster/autovacuum.c +++ b/src/backend/postmaster/autovacuum.c @@ -533,7 +533,6 @@ AutoVacLauncherMain(int argc, char *argv[]) */ LWLockReleaseAll(); pgstat_report_wait_end(); - AbortBufferIO(); UnlockBuffers(); /* this is probably dead code, but let's be safe: */ if (AuxProcessResourceOwner) diff --git a/src/backend/postmaster/bgwriter.c b/src/backend/postmaster/bgwriter.c index 18c7dfb78c0..14f519cf22a 100644 --- a/src/backend/postmaster/bgwriter.c +++ b/src/backend/postmaster/bgwriter.c @@ -187,7 +187,6 @@ BackgroundWriterMain(void) */ LWLockReleaseAll(); ConditionVariableCancelSleep(); - AbortBufferIO(); UnlockBuffers(); ReleaseAuxProcessResources(false); AtEOXact_Buffers(false); diff --git a/src/backend/postmaster/checkpointer.c b/src/backend/postmaster/checkpointer.c index a27962c3e38..ef3f4d49806 100644 --- a/src/backend/postmaster/checkpointer.c +++ b/src/backend/postmaster/checkpointer.c @@ -282,7 +282,6 @@ CheckpointerMain(void) LWLockReleaseAll(); ConditionVariableCancelSleep(); pgstat_report_wait_end(); - AbortBufferIO(); UnlockBuffers(); ReleaseAuxProcessResources(false); AtEOXact_Buffers(false); diff --git a/src/backend/postmaster/polar_parallel_bgwriter.c b/src/backend/postmaster/polar_parallel_bgwriter.c index 6557be3192f..94e9233b0a0 100644 --- a/src/backend/postmaster/polar_parallel_bgwriter.c +++ b/src/backend/postmaster/polar_parallel_bgwriter.c @@ -342,7 +342,6 @@ polar_parallel_bgwriter_worker_main(Datum main_arg) */ LWLockReleaseAll(); ConditionVariableCancelSleep(); - AbortBufferIO(); UnlockBuffers(); ReleaseAuxProcessResources(false); AtEOXact_Buffers(false); diff --git a/src/backend/postmaster/walwriter.c b/src/backend/postmaster/walwriter.c index 18c17623811..99210711779 100644 --- a/src/backend/postmaster/walwriter.c +++ b/src/backend/postmaster/walwriter.c @@ -174,7 +174,6 @@ WalWriterMain(void) LWLockReleaseAll(); ConditionVariableCancelSleep(); pgstat_report_wait_end(); - AbortBufferIO(); UnlockBuffers(); ReleaseAuxProcessResources(false); AtEOXact_Buffers(false); diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c index 3110a2dcc86..07e7494d3b1 100644 --- a/src/backend/replication/walsender.c +++ b/src/backend/replication/walsender.c @@ -990,8 +990,8 @@ polar_logical_read_xlog_page_bulk(XLogReaderState *state, pfree(state->bulk_read_buffer); state->bulk_read_buffer_size = polar_logical_repl_xlog_bulk_read_size; - state->bulk_read_buffer = palloc_extended(state->bulk_read_buffer_size * XLOG_BLCKSZ, - MCXT_ALLOC_NO_OOM); + state->bulk_read_buffer = palloc_aligned(state->bulk_read_buffer_size * XLOG_BLCKSZ, + PG_IO_ALIGN_SIZE, MCXT_ALLOC_NO_OOM); /* * Fail to allocate bulk buffer, turn to use original read buffer diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c index 01165d36eb7..50c6b23cc4a 100644 --- a/src/backend/storage/buffer/buf_init.c +++ b/src/backend/storage/buffer/buf_init.c @@ -19,6 +19,7 @@ #include "storage/proc.h" /* POLAR */ +#include "common/file_utils.h" #include "storage/polar_copybuf.h" #include "storage/polar_fd.h" #include "storage/polar_flush.h" @@ -63,6 +64,48 @@ CkptSortItem *CkptBufferIds; * multiple times. Check the PrivateRefCount infrastructure in bufmgr.c. 
*/ +static Size +polar_zero_buffer_shmem_size() +{ + /* -1 indicates a request for auto-tune. */ + if (polar_zero_buffers == -1) + { + /* Request according to NBuffers, which is in [16, INT_MAX / 2) */ + polar_zero_buffers = 4; + + if (NBuffers >= 1024) + polar_zero_buffers = 32; + + if (NBuffers >= 16384) + polar_zero_buffers = 512; + } + + /* 0 disables the zero buffer. */ + if (polar_zero_buffers == 0) + return 0; + + polar_zero_buffer_size = polar_zero_buffers * BLCKSZ; + + return polar_zero_buffer_size + PG_IO_ALIGN_SIZE; +} + +static void +polar_zero_buffer_init() +{ + bool found; + + if (polar_zero_buffer_size == 0) + return; + + polar_zero_buffer = (char *) + TYPEALIGN(PG_IO_ALIGN_SIZE, + ShmemInitStruct("Zero Buffer Blocks", + polar_zero_buffer_size + PG_IO_ALIGN_SIZE, + &found)); + + if (!found) + MemSet(polar_zero_buffer, 0, polar_zero_buffer_size); +} /* * Initialize shared buffer pool @@ -84,10 +127,11 @@ InitBufferPool(void) NBuffers * sizeof(BufferDescPadded), &foundDescs); + /* Align buffer pool on IO page size boundary. */ BufferBlocks = (char *) - TYPEALIGN(POLAR_BUFFER_ALIGN_LEN, + TYPEALIGN(PG_IO_ALIGN_SIZE, ShmemInitStruct("Buffer Blocks", - ((NBuffers * (Size) BLCKSZ) + POLAR_BUFFER_ALIGN_LEN), + NBuffers * (Size) BLCKSZ + PG_IO_ALIGN_SIZE, &foundBufs)); /* Align condition variables to cacheline boundary. */ @@ -169,6 +213,9 @@ InitBufferPool(void) /* POLAR: init copy buffer pool */ polar_init_copy_buffer_pool(); + /* POLAR: init global zero buffer */ + polar_zero_buffer_init(); + /* Initialize per-backend file flush context */ WritebackContextInit(&BackendWritebackContext, &backend_flush_after); @@ -211,5 +258,8 @@ BufferShmemSize(void) /* POLAR: add copy buffer shared memory size */ size = add_size(size, polar_copy_buffer_shmem_size()); + /* POLAR: size of global zero buffer */ + size = add_size(size, polar_zero_buffer_shmem_size()); + return size; } diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index d60e504d6cd..a07bbbec1b1 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -188,23 +188,6 @@ int checkpoint_flush_after = 0; int bgwriter_flush_after = 0; int backend_flush_after = 0; -/* local state for StartBufferIO and related functions */ -static BufferDesc *InProgressBuf = NULL; -static bool IsForInput; - -/* - * POLAR: bulk io local state for StartBufferIO/TerminateBufferIO/AbortBufferIO and related functions. - * - * notice: bulk read io may be mixed with temporary write io, for flushing dirty evicted page. - * So polar_bulk_io_is_for_input[] is required for error recovery. - */ -bool polar_bulk_io_is_in_progress = false; -int polar_bulk_io_in_progress_count = 0; -BufferDesc **polar_bulk_io_in_progress_buf = NULL; -static bool *polar_bulk_io_is_for_input = NULL; - -/* POLAR end */ - /* local state for LockBufferForCleanup */ static BufferDesc *PinCountWaitBuf = NULL; @@ -250,6 +233,9 @@ static PrivateRefCountEntry *GetPrivateRefCountEntry(Buffer buffer, bool do_move static inline int32 GetPrivateRefCount(Buffer buffer); static void ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref); +/* POLAR */ +bool polar_has_partial_write; +int polar_bulk_read_size = 16; static bool polar_apply_io_locked_page(BufferDesc *bufHdr, XLogRecPtr replay_from, XLogRecPtr checkpoint_lsn, SMgrRelation smgr, @@ -862,6 +848,19 @@ ReadBufferWithoutRelcache(RelFileNode rnode, ForkNumber forkNum, mode, strategy, &hit); } +/* + * Convenience wrapper for ReadBuffer_common, exported for outer usage. 
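+ * Unlike the Relation-based ReadBuffer variants, it takes an SMgrRelation
+ * and an explicit relpersistence, so callers without a Relation handle can
+ * still go through the regular ReadBuffer_common() path.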
+ */ +Buffer +polar_read_buffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, + BlockNumber blockNum, ReadBufferMode mode, + BufferAccessStrategy strategy) +{ + bool hit; + + return ReadBuffer_common(smgr, relpersistence, forkNum, + blockNum, mode, strategy, &hit); +} /* * ReadBuffer_common -- common logic for all ReadBuffer variants @@ -1071,7 +1070,7 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, /* new buffers are zero-filled */ MemSet((char *) bufBlock, 0, BLCKSZ); /* don't set checksum for all-zero page */ - smgrextend(smgr, forkNum, blockNum, (char *) bufBlock, false); + smgrzeroextend(smgr, forkNum, blockNum, 1, false); /* * NB: we're *not* doing a ScheduleBufferTagForWriteback here; @@ -1096,7 +1095,7 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, if (track_io_timing) INSTR_TIME_SET_CURRENT(io_start); - smgrread(smgr, forkNum, blockNum, (char *) bufBlock); + smgrread(smgr, forkNum, blockNum, bufBlock); if (track_io_timing) { @@ -2914,7 +2913,6 @@ InitBufferPoolAccess(void) static void AtProcExit_Buffers(int code, Datum arg) { - AbortBufferIO(); UnlockBuffers(); CheckForBufferLeaks(); @@ -4055,7 +4053,7 @@ RelationCopyStorageUsingBuffer(RelFileNode srcnode, bool use_wal; BlockNumber nblocks; BlockNumber blkno; - PGAlignedBlock buf; + PGIOAlignedBlock buf; BufferAccessStrategy bstrategy_src; BufferAccessStrategy bstrategy_dst; @@ -5038,14 +5036,7 @@ polar_start_buffer_io_extend(BufferDesc *buf, { uint32 buf_state; - /* POLAR: bulk io */ - if (!polar_bulk_io_is_in_progress) - { - /* single io */ - Assert(!InProgressBuf); - } - /* POLAR end */ - + ResourceOwnerEnlargeBufferIOs(CurrentResourceOwner); for (;;) { @@ -5085,27 +5076,8 @@ polar_start_buffer_io_extend(BufferDesc *buf, buf_state |= BM_IO_IN_PROGRESS; UnlockBufHdr(buf, buf_state); - /* POLAR: bulk io */ - if (!polar_bulk_io_is_in_progress) - { - /* single io */ - InProgressBuf = buf; - IsForInput = forInput; - } - else - { - /* bulk io */ - polar_bulk_io_in_progress_buf[polar_bulk_io_in_progress_count] = buf; - - /* - * bulk read io may be mixed with temporary write io, for flushing - * dirty evicted page. So polar_bulk_io_is_for_input[] is required for - * error recovery. 
- */ - polar_bulk_io_is_for_input[polar_bulk_io_in_progress_count] = forInput; - polar_bulk_io_in_progress_count++; - } - /* POLAR end */ + ResourceOwnerRememberBufferIO(CurrentResourceOwner, + BufferDescriptorGetBuffer(buf)); return true; } @@ -5131,32 +5103,6 @@ TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits) { uint32 buf_state; - /* POLAR:bulk io */ - - /* - * Because assert will be ignored during release mode, we only use - * Assert() - */ - /* ---------------- - * single io: - * if (!polar_bulk_io_is_in_progress) - { Assert(buf == InProgressBuf); } - * ---------------- - */ - Assert(polar_bulk_io_is_in_progress || buf == InProgressBuf); - /* ---------------- - * bulk io: - * if (polar_bulk_io_is_in_progress) - * { - * Assert(polar_bulk_io_in_progress_count > 0); - * Assert(buf == polar_bulk_io_in_progress_buf[polar_bulk_io_in_progress_count - 1]); - * } - * ---------------- - */ - Assert(!polar_bulk_io_is_in_progress || polar_bulk_io_in_progress_count > 0); - Assert(!polar_bulk_io_is_in_progress || buf == polar_bulk_io_in_progress_buf[polar_bulk_io_in_progress_count - 1]); - /* POLAR end */ - buf_state = LockBufHdr(buf); Assert(buf_state & BM_IO_IN_PROGRESS); @@ -5168,26 +5114,14 @@ TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits) buf_state |= set_flag_bits; UnlockBufHdr(buf, buf_state); - /* POLAR: bulk io */ - if (!polar_bulk_io_is_in_progress) - { - /* single io */ - InProgressBuf = NULL; - } - else - { - /* bulk io */ - polar_bulk_io_in_progress_count--; - } - /* POLAR end */ - - InProgressBuf = NULL; + ResourceOwnerForgetBufferIO(CurrentResourceOwner, + BufferDescriptorGetBuffer(buf)); ConditionVariableBroadcast(BufferDescriptorGetIOCV(buf)); } /* - * AbortBufferIO: Clean up any active buffer I/O after an error. + * AbortBufferIO: Clean up active buffer I/O after an error. * * All LWLocks we might have held have been released, * but we haven't yet released buffer pins, so the buffer is still pinned. @@ -5196,71 +5130,42 @@ TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits) * possible the error condition wasn't related to the I/O. */ void -AbortBufferIO(void) +AbortBufferIO(Buffer buf) { - BufferDesc *buf = InProgressBuf; + BufferDesc *buf_hdr = GetBufferDescriptor(buf - 1); + uint32 buf_state; -polar_bulk_read: + buf_state = LockBufHdr(buf_hdr); + Assert(buf_state & (BM_IO_IN_PROGRESS | BM_TAG_VALID)); - /* - * POLAR: deal with local buffer(buf->buf_id < 0) local buffer doesn't - * need to release source, just decrease io_in_progress_count - */ - if (buf && buf->buf_id < 0) - polar_bulk_io_in_progress_count--; - else if (buf) + if (!(buf_state & BM_VALID)) { - uint32 buf_state; - - buf_state = LockBufHdr(buf); - Assert(buf_state & BM_IO_IN_PROGRESS); - if (IsForInput) - { - Assert(!(buf_state & BM_DIRTY)); - - /* We'd better not think buffer is valid yet */ - Assert(!(buf_state & BM_VALID)); - UnlockBufHdr(buf, buf_state); - } - else - { - Assert(buf_state & BM_DIRTY); - UnlockBufHdr(buf, buf_state); - /* Issue notice if this is not the first failure... 
*/ - if (buf_state & BM_IO_ERROR) - { - /* Buffer is pinned, so we can read tag without spinlock */ - char *path; - - path = relpathperm(buf->tag.rnode, buf->tag.forkNum); - ereport(WARNING, - (errcode(ERRCODE_IO_ERROR), - errmsg("could not write block %u of %s", - buf->tag.blockNum, path), - errdetail("Multiple failures --- write error might be permanent."))); - pfree(path); - } - } - TerminateBufferIO(buf, false, BM_IO_ERROR); + Assert(!(buf_state & BM_DIRTY)); + UnlockBufHdr(buf_hdr, buf_state); } - - /* POLAR: bulk io recovery */ - if (polar_bulk_io_is_in_progress) + else { - if (polar_bulk_io_in_progress_count > 0) - { - buf = polar_bulk_io_in_progress_buf[polar_bulk_io_in_progress_count - 1]; - IsForInput = polar_bulk_io_is_for_input[polar_bulk_io_in_progress_count - 1]; + Assert(buf_state & BM_DIRTY); + UnlockBufHdr(buf_hdr, buf_state); - /* - * In TerminateBufferIO(), polar_bulk_io_in_progress_count was - * reduced by 1. - */ - goto polar_bulk_read; + /* Issue notice if this is not the first failure... */ + if (buf_state & BM_IO_ERROR) + { + /* Buffer is pinned, so we can read tag without spinlock */ + char *path; + + path = relpathperm(buf_hdr->tag.rnode, buf_hdr->tag.forkNum); + ereport(WARNING, + (errcode(ERRCODE_IO_ERROR), + errmsg("could not write block %u of %s", + buf_hdr->tag.blockNum, path), + errdetail("Multiple failures --- write error might be permanent."))); + pfree(path); } - polar_bulk_io_is_in_progress = false; } + TerminateBufferIO(buf_hdr, false, BM_IO_ERROR); + /* * POLAR: we must reset read_min_lsn where ERROR, otherwise bgwriter * cannot clean hashtable or logindex anymore diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c index e01dddd1108..7d5a7b2db03 100644 --- a/src/backend/storage/buffer/freelist.c +++ b/src/backend/storage/buffer/freelist.c @@ -202,8 +202,6 @@ have_free_buffer(void) * * To ensure that no one else can pin the buffer before we do, we must * return the buffer with the buffer header spinlock still held. - * POLAR: if reading bulk non-first page and most buffers are pinned. return NULL - * instead of log(error). */ BufferDesc * StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state) @@ -358,14 +356,6 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state) * infinite loop. */ UnlockBufHdr(buf, local_buf_state); - - /* - * POLAR: bulk read, alloc not-first page, if failed just ok, - * return NULL. - */ - if (polar_bulk_io_is_in_progress && polar_bulk_io_in_progress_count > 0) - return NULL; - elog(ERROR, "no unpinned buffers available"); } UnlockBufHdr(buf, local_buf_state); diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c index fd0b44c3e0f..39b2b2a9908 100644 --- a/src/backend/storage/buffer/localbuf.c +++ b/src/backend/storage/buffer/localbuf.c @@ -154,12 +154,6 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum, *foundPtr = true; else { - /* POLAR: bulk read. the same as StartBufferIO */ - if (polar_bulk_io_is_in_progress) - { - polar_bulk_io_in_progress_buf[polar_bulk_io_in_progress_count] = bufHdr; - polar_bulk_io_in_progress_count++; - } /* Previous read attempt must have failed; try again */ *foundPtr = false; } @@ -281,15 +275,6 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum, pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state); *foundPtr = false; - - /* POLAR: Bulk read. 
the same as StartBufferIO */ - if (polar_bulk_io_is_in_progress) - { - polar_bulk_io_in_progress_buf[polar_bulk_io_in_progress_count] = bufHdr; - polar_bulk_io_in_progress_count++; - } - /* POLAR end */ - return bufHdr; } @@ -543,8 +528,11 @@ GetLocalBufferStorage(void) /* And don't overflow MaxAllocSize, either */ num_bufs = Min(num_bufs, MaxAllocSize / BLCKSZ); - cur_block = (char *) MemoryContextAllocIOAligned(LocalBufferContext, - num_bufs * BLCKSZ, 0); + /* Buffers should be I/O aligned. */ + cur_block = (char *) + TYPEALIGN(PG_IO_ALIGN_SIZE, + MemoryContextAlloc(LocalBufferContext, + num_bufs * BLCKSZ + PG_IO_ALIGN_SIZE)); next_buf_in_block = 0; num_bufs_in_block = num_bufs; } diff --git a/src/backend/storage/buffer/polar_bufmgr.c b/src/backend/storage/buffer/polar_bufmgr.c index ae33fde212b..d202ff18557 100644 --- a/src/backend/storage/buffer/polar_bufmgr.c +++ b/src/backend/storage/buffer/polar_bufmgr.c @@ -1188,20 +1188,20 @@ polar_bulk_read_buffer_common(Relation reln, char relpersistence, ForkNumber for { SMgrRelation smgr = reln->rd_smgr; BufferDesc *bufHdr; - BufferDesc *first_buf_hdr; Block bufBlock; bool found; bool isLocalBuf = SmgrIsTemp(smgr); int actual_bulk_io_count; int index; char *buf_read; + BufferDesc *buffers[MAX_BUFFERS_TO_READ_BY]; + bool checksum_fail[MAX_BUFFERS_TO_READ_BY] = {false}; /* POLAR: start lsn to do replay */ XLogRecPtr checkpoint_redo_lsn = InvalidXLogRecPtr; XLogRecPtr replay_from; polar_redo_action redo_action; uint32 repeat_read_times = 0; - bool *checksum_fail; polar_pgstat_count_bulk_read_calls(reln); @@ -1215,28 +1215,6 @@ polar_bulk_read_buffer_common(Relation reln, char relpersistence, ForkNumber for mode, strategy, hit); } - Assert(!polar_bulk_io_is_in_progress); - Assert(0 == polar_bulk_io_in_progress_count); - - /* bulk read begin */ - polar_bulk_io_is_in_progress = true; - - /* - * Alloc buffer for polar_bulk_io_in_progress_buf and - * polar_bulk_io_is_for_input on demand. If bulk read is called once, - * there is a great possibility that bulk read will be called later. - * polar_bulk_io_in_progress_buf and polar_bulk_io_is_for_input will be - * not freed, until backend exit. - */ - if (NULL == polar_bulk_io_in_progress_buf) - { - Assert(NULL == polar_bulk_io_is_for_input); - polar_bulk_io_in_progress_buf = MemoryContextAlloc(TopMemoryContext, - POLAR_MAX_BULK_IO_SIZE * sizeof(polar_bulk_io_in_progress_buf[0])); - polar_bulk_io_is_for_input = MemoryContextAlloc(TopMemoryContext, - POLAR_MAX_BULK_IO_SIZE * sizeof(polar_bulk_io_is_for_input[0])); - } - *hit = false; /* Make sure we will have room to remember the buffer pin */ @@ -1259,11 +1237,6 @@ polar_bulk_read_buffer_common(Relation reln, char relpersistence, ForkNumber for } else { - /* - * lookup the buffer. IO_IN_PROGRESS is set if the requested block is - * not currently in memory. If not found, - * polar_bulk_io_in_progress_count will be added by 1. 
- */ bufHdr = BufferAlloc(smgr, relpersistence, forkNum, firstBlockNum, strategy, &found); if (found) @@ -1291,10 +1264,6 @@ polar_bulk_read_buffer_common(Relation reln, char relpersistence, ForkNumber for false, found); - Assert(0 == polar_bulk_io_in_progress_count); - /* important, mark bulk_io end */ - polar_bulk_io_is_in_progress = false; - return BufferDescriptorGetBuffer(bufHdr); } @@ -1312,14 +1281,7 @@ polar_bulk_read_buffer_common(Relation reln, char relpersistence, ForkNumber for */ Assert(!(pg_atomic_read_u32(&bufHdr->state) & BM_VALID)); /* spinlock not needed */ - Assert(1 == polar_bulk_io_in_progress_count); - Assert(bufHdr == polar_bulk_io_in_progress_buf[polar_bulk_io_in_progress_count - 1]); - - /* - * Hold the first block bufHdr, after TerminateBufferIO(), - * polar_bulk_io_in_progress_buf is freed. - */ - first_buf_hdr = bufHdr; + buffers[0] = bufHdr; /* * Make sure than single bulk_read will not read blocks across files. @@ -1340,9 +1302,6 @@ polar_bulk_read_buffer_common(Relation reln, char relpersistence, ForkNumber for /* * lookup the buffer. IO_IN_PROGRESS is set if the requested block is * not currently in memory. - * - * If not found, polar_bulk_io_in_progress_count will be added by 1 by - * StartBufferIO(). */ if (isLocalBuf) bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, &found); @@ -1350,32 +1309,30 @@ polar_bulk_read_buffer_common(Relation reln, char relpersistence, ForkNumber for bufHdr = BufferAlloc(smgr, relpersistence, forkNum, blockNum, strategy, &found); + Assert(bufHdr); + /* * For extra block, don't update pgBufferUsage.shared_blks_hit or * pgBufferUsage.shared_blks_read, also the blocks are not used now. */ - /* bufHdr == NULL, all buffers are pinned. */ - if (found || bufHdr == NULL) + if (found) { /* * important: this buffer is the upper boundary, it should be * excluded. */ - if (bufHdr != NULL) - { - ReleaseBuffer(BufferDescriptorGetBuffer(bufHdr)); - } + ReleaseBuffer(BufferDescriptorGetBuffer(bufHdr)); break; } Assert(!(pg_atomic_read_u32(&bufHdr->state) & BM_VALID)); /* spinlock not needed */ + buffers[index] = bufHdr; } - Assert(index == polar_bulk_io_in_progress_count); + actual_bulk_io_count = index; /* - * Until now, as to {blockNum + [0, polar_bulk_io_in_progress_count)} - * block buffers, IO_IN_PROGRESS flag is set and io_in_progress_lock is - * holded. + * Until now, as to {blockNum + [0, actual_bulk_io_count)} block buffers, + * IO_IN_PROGRESS flag is set and io_in_progress_lock is holded. * * Other proc(include backend sql exec、start xlog replay) which read * there buffers, would be blocked on io_in_progress_lock. @@ -1384,21 +1341,7 @@ polar_bulk_read_buffer_common(Relation reln, char relpersistence, ForkNumber for * lock on io_in_progress_lock. */ - /* - * polar_bulk_io_in_progress_count will be reduced by TerminateBufferIO(), - * For safety, its copy actual_bulk_io_count is used. - */ - actual_bulk_io_count = polar_bulk_io_in_progress_count; - - /* for eliminating palloc and memcpy */ - if (1 == actual_bulk_io_count) - buf_read = isLocalBuf ? 
- (char *) LocalBufHdrGetBlock(first_buf_hdr) : - (char *) BufHdrGetBlock(first_buf_hdr); - else - buf_read = (char *) palloc_io_aligned(actual_bulk_io_count * BLCKSZ, MCXT_ALLOC_ZERO); - - checksum_fail = (bool *) palloc0(actual_bulk_io_count * sizeof(bool)); + buf_read = (char *) palloc_aligned(actual_bulk_io_count * BLCKSZ, PG_IO_ALIGN_SIZE, MCXT_ALLOC_ZERO); repeat_read: @@ -1475,25 +1418,15 @@ polar_bulk_read_buffer_common(Relation reln, char relpersistence, ForkNumber for } } - /* - * notice: 1. buffers must be processed by TerminateBufferIO() from back - * to front. a) TerminateBufferIO() release - * polar_bulk_io_in_progress_buf[] in decrement order. b) For better - * performance, LWLockRelease() release io_in_progress_lock in decrement - * order. 2. polar_bulk_io_in_progress_count was reduced by - * TerminateBufferIO(). a) polar_bulk_io_in_progress_count must not be - * used here. - */ for (index = actual_bulk_io_count - 1; index >= 0; index--) { BlockNumber blockNum = firstBlockNum + index; - bufHdr = polar_bulk_io_in_progress_buf[index]; + bufHdr = buffers[index]; bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr); /* need copy page content from aligned_buf_read to block shared_buffer */ - if (actual_bulk_io_count != 1) - memcpy((char *) bufBlock, buf_read + index * BLCKSZ, BLCKSZ); + memcpy((char *) bufBlock, buf_read + index * BLCKSZ, BLCKSZ); if (unlikely(polar_trace_logindex_messages <= DEBUG3)) { @@ -1552,8 +1485,6 @@ polar_bulk_read_buffer_common(Relation reln, char relpersistence, ForkNumber for buf_state |= BM_VALID; pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state); - /* bulk io */ - polar_bulk_io_in_progress_count--; } else TerminateBufferIO(bufHdr, false, BM_VALID); @@ -1589,21 +1520,9 @@ polar_bulk_read_buffer_common(Relation reln, char relpersistence, ForkNumber for false, found); - /* - * notice: polar_bulk_io_in_progress_count was reduced by - * TerminateBufferIO(). polar_bulk_io_in_progress_count must not be used - * here. - */ - if (actual_bulk_io_count != 1) - pfree(buf_read); - - pfree(checksum_fail); - - Assert(0 == polar_bulk_io_in_progress_count); - /* important, mark bulk_io end */ - polar_bulk_io_is_in_progress = false; + pfree(buf_read); - return BufferDescriptorGetBuffer(first_buf_hdr); + return BufferDescriptorGetBuffer(buffers[0]); } bool polar_is_future_page(BufferDesc *buf_hdr) diff --git a/src/backend/storage/file/buffile.c b/src/backend/storage/file/buffile.c index 56b88594cc8..1b1d556fd4a 100644 --- a/src/backend/storage/file/buffile.c +++ b/src/backend/storage/file/buffile.c @@ -95,6 +95,11 @@ struct BufFile off_t curOffset; /* offset part of current pos */ int pos; /* next read/write position in buffer */ int nbytes; /* total # of valid bytes in buffer */ + + /* + * XXX Should ideally us PGIOAlignedBlock, but might need a way to avoid + * wasting per-file alignment padding when some users create many files. + */ PGAlignedBlock buffer; }; diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c index 2fe8c0334c9..198540106f0 100644 --- a/src/backend/storage/file/fd.c +++ b/src/backend/storage/file/fd.c @@ -96,7 +96,6 @@ #include "common/pg_prng.h" #include "miscadmin.h" #include "pgstat.h" -#include "port/pg_iovec.h" #include "portability/mem.h" #include "postmaster/startup.h" #include "storage/fd.h" @@ -169,6 +168,11 @@ bool data_sync_retry = false; /* How SyncDataDirectory() should do its job. 
*/ int recovery_init_sync_method = RECOVERY_INIT_SYNC_METHOD_FSYNC; +/* POLAR: GUC */ +bool polar_enable_fallocate_no_hide_stale; + +/* POLAR end */ + /* Debugging.... */ #ifdef FDDEBUG @@ -2080,16 +2084,16 @@ FileClose(File file) * to read into. */ int -FilePrefetch(File file, off_t offset, int amount, uint32 wait_event_info) +FilePrefetch(File file, off_t offset, off_t amount, uint32 wait_event_info) { #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_WILLNEED) int returnCode; Assert(FileIsValid(file)); - DO_DB(elog(LOG, "FilePrefetch: %d (%s) " INT64_FORMAT " %d", + DO_DB(elog(LOG, "FilePrefetch: %d (%s) " INT64_FORMAT " " INT64_FORMAT, file, VfdCache[file].fileName, - (int64) offset, amount)); + (int64) offset, (int64) amount)); returnCode = FileAccess(file); if (returnCode < 0) @@ -2130,16 +2134,16 @@ FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info) pgstat_report_wait_end(); } -int -FileRead(File file, char *buffer, int amount, off_t offset, +ssize_t +FileRead(File file, void *buffer, size_t amount, off_t offset, uint32 wait_event_info) { - int returnCode; + ssize_t returnCode; Vfd *vfdP; Assert(FileIsValid(file)); - DO_DB(elog(LOG, "FileRead: %d (%s) " INT64_FORMAT " %d %p", + DO_DB(elog(LOG, "FileRead: %d (%s) " INT64_FORMAT " %zu %p", file, VfdCache[file].fileName, (int64) offset, amount, buffer)); @@ -2188,16 +2192,16 @@ FileRead(File file, char *buffer, int amount, off_t offset, return returnCode; } -int -FileWrite(File file, char *buffer, int amount, off_t offset, +ssize_t +FileWrite(File file, const void *buffer, size_t amount, off_t offset, uint32 wait_event_info) { - int returnCode; + ssize_t returnCode; Vfd *vfdP; Assert(FileIsValid(file)); - DO_DB(elog(LOG, "FileWrite: %d (%s) " INT64_FORMAT " %d %p", + DO_DB(elog(LOG, "FileWrite: %d (%s) " INT64_FORMAT " %zu %p", file, VfdCache[file].fileName, (int64) offset, amount, buffer)); @@ -2307,6 +2311,100 @@ FileSync(File file, uint32 wait_event_info) return returnCode; } +/* + * Zero a region of the file. + * + * Returns 0 on success, -1 otherwise. In the latter case errno is set to the + * appropriate error. + */ +int +FileZero(File file, off_t offset, off_t amount, uint32 wait_event_info, bool bulkwrite) +{ + int returnCode; + ssize_t written; + + Assert(FileIsValid(file)); + + DO_DB(elog(LOG, "FileZero: %d (%s) " INT64_FORMAT " " INT64_FORMAT, + file, VfdCache[file].fileName, + (int64) offset, (int64) amount)); + + returnCode = FileAccess(file); + if (returnCode < 0) + return returnCode; + + pgstat_report_wait_start(wait_event_info); + if (bulkwrite) + written = polar_pwrite_zeros(VfdCache[file].fd, amount, offset); + else + written = pg_pwrite_zeros(VfdCache[file].fd, amount, offset); + pgstat_report_wait_end(); + + if (written < 0) + return -1; + else if (written != amount) + { + /* if errno is unset, assume problem is no disk space */ + if (errno == 0) + errno = ENOSPC; + return -1; + } + + return 0; +} + +/* + * Try to reserve file space with posix_fallocate(). If posix_fallocate() is + * not implemented on the operating system or fails with EINVAL / EOPNOTSUPP, + * use FileZero() instead. + * + * Note that at least glibc() implements posix_fallocate() in userspace if not + * implemented by the filesystem. That's not the case for all environments + * though. + * + * Returns 0 on success, -1 otherwise. In the latter case errno is set to the + * appropriate error. 
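
/*
 * Illustration (standalone sketch, not part of the patch): the strategy
 * documented for FileFallocate()/FileZero() above -- reserve space with
 * posix_fallocate(), retry on EINTR, and fall back to writing zeros when the
 * filesystem does not support it (EINVAL/EOPNOTSUPP). Assumes posix_fallocate()
 * is available; the in-tree code goes through the vfd layer and
 * pg_pwrite_zeros()/polar_pwrite_zeros() rather than raw pwrite().
 */
#include <errno.h>
#include <fcntl.h>
#include <unistd.h>

static int
demo_zero_range(int fd, off_t offset, off_t amount)
{
	static const char zeros[8192];	/* one reusable zero-filled chunk */

	while (amount > 0)
	{
		size_t		chunk = (amount < (off_t) sizeof(zeros)) ? (size_t) amount : sizeof(zeros);
		ssize_t		written = pwrite(fd, zeros, chunk, offset);

		if (written < 0)
		{
			if (errno == EINTR)
				continue;
			return -1;
		}
		offset += written;
		amount -= written;
	}
	return 0;
}

static int
demo_reserve_range(int fd, off_t offset, off_t amount)
{
	int			rc;

	do
	{
		rc = posix_fallocate(fd, offset, amount);
	} while (rc == EINTR);

	if (rc == 0)
		return 0;

	/* "real" failure: posix_fallocate() returns the error code, errno is unset */
	if (rc != EINVAL && rc != EOPNOTSUPP)
	{
		errno = rc;
		return -1;
	}

	/* not supported by this filesystem: fall back to writing zeros */
	return demo_zero_range(fd, offset, amount);
}
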
+ */ +int +FileFallocate(File file, off_t offset, off_t amount, uint32 wait_event_info) +{ +#ifdef HAVE_POSIX_FALLOCATE + int returnCode; + + Assert(FileIsValid(file)); + + DO_DB(elog(LOG, "FileFallocate: %d (%s) " INT64_FORMAT " " INT64_FORMAT, + file, VfdCache[file].fileName, + (int64) offset, (int64) amount)); + + returnCode = FileAccess(file); + if (returnCode < 0) + return -1; + +retry: + pgstat_report_wait_start(wait_event_info); + returnCode = polar_posix_fallocate(VfdCache[file].fd, offset, amount); + pgstat_report_wait_end(); + + if (returnCode == 0) + return 0; + else if (returnCode == EINTR) + goto retry; + + /* for compatibility with %m printing etc */ + errno = returnCode; + + /* + * Return in cases of a "real" failure, if fallocate is not supported, + * fall through to the FileZero() backed implementation. + */ + if (returnCode != EINVAL && returnCode != EOPNOTSUPP) + return -1; +#endif + + return FileZero(file, offset, amount, wait_event_info, true); +} + off_t FileSize(File file) { @@ -3846,67 +3944,3 @@ data_sync_elevel(int elevel) { return data_sync_retry ? elevel : PANIC; } - -/* - * A convenience wrapper for pg_pwritev() that retries on partial write. If an - * error is returned, it is unspecified how much has been written. - */ -ssize_t -pg_pwritev_with_retry(int fd, const struct iovec *iov, int iovcnt, off_t offset) -{ - struct iovec iov_copy[PG_IOV_MAX]; - ssize_t sum = 0; - ssize_t part; - - /* We'd better have space to make a copy, in case we need to retry. */ - if (iovcnt > PG_IOV_MAX) - { - errno = EINVAL; - return -1; - } - - for (;;) - { - /* Write as much as we can. */ - part = polar_pwritev(fd, iov, iovcnt, offset); - if (part < 0) - return -1; - -#ifdef SIMULATE_SHORT_WRITE - part = Min(part, 4096); -#endif - - /* Count our progress. */ - sum += part; - offset += part; - - /* Step over iovecs that are done. */ - while (iovcnt > 0 && iov->iov_len <= part) - { - part -= iov->iov_len; - ++iov; - --iovcnt; - } - - /* Are they all done? */ - if (iovcnt == 0) - { - /* We don't expect the kernel to write more than requested. */ - Assert(part == 0); - break; - } - - /* - * Move whatever's left to the front of our mutable copy and adjust - * the leading iovec. 
- */ - Assert(iovcnt > 0); - memmove(iov_copy, iov, sizeof(*iov) * iovcnt); - Assert(iov->iov_len > part); - iov_copy[0].iov_base = (char *) iov_copy[0].iov_base + part; - iov_copy[0].iov_len -= part; - iov = iov_copy; - } - - return sum; -} diff --git a/src/backend/storage/file/polar_fd.c b/src/backend/storage/file/polar_fd.c index 1f16b79f156..a668752ff7f 100644 --- a/src/backend/storage/file/polar_fd.c +++ b/src/backend/storage/file/polar_fd.c @@ -104,7 +104,12 @@ vfs_mgr polar_vfs[] = .vfs_fsync = pg_fsync, .vfs_unlink = unlink, .vfs_rename = rename, - .vfs_fallocate = posix_fallocate, + .vfs_posix_fallocate = posix_fallocate, +#ifdef __linux__ + .vfs_fallocate = fallocate, +#else + .vfs_fallocate = NULL, +#endif .vfs_ftruncate = ftruncate, .vfs_truncate = truncate, .vfs_opendir = opendir, @@ -125,6 +130,7 @@ vfs_mgr polar_vfs[] = #else .vfs_posix_fadvise = NULL, #endif + .vfs_type = polar_bufferio_vfs_type, }, { .vfs_env_init = NULL, @@ -149,6 +155,7 @@ vfs_mgr polar_vfs[] = .vfs_fsync = NULL, .vfs_unlink = NULL, .vfs_rename = NULL, + .vfs_posix_fallocate = NULL, .vfs_fallocate = NULL, .vfs_ftruncate = NULL, .vfs_truncate = NULL, @@ -162,6 +169,7 @@ vfs_mgr polar_vfs[] = .vfs_sync_file_range = NULL, .vfs_posix_fadvise = NULL, .vfs_mmap = NULL, + .vfs_type = NULL, } }; diff --git a/src/backend/storage/page/bufpage.c b/src/backend/storage/page/bufpage.c index 9d14151b572..d72d4502652 100644 --- a/src/backend/storage/page/bufpage.c +++ b/src/backend/storage/page/bufpage.c @@ -1525,7 +1525,10 @@ PageSetChecksumCopy(Page page, BlockNumber blkno) * and second to avoid wasting space in processes that never call this. */ if (pageCopy == NULL) - pageCopy = MemoryContextAllocIOAligned(TopMemoryContext, BLCKSZ, 0); + pageCopy = MemoryContextAllocAligned(TopMemoryContext, + BLCKSZ, + PG_IO_ALIGN_SIZE, + 0); memcpy(pageCopy, (char *) page, BLCKSZ); ((PageHeader) pageCopy)->pd_checksum = pg_checksum_page(pageCopy, blkno); diff --git a/src/backend/storage/smgr/Makefile b/src/backend/storage/smgr/Makefile index 7b629f08d25..a36313f2c39 100644 --- a/src/backend/storage/smgr/Makefile +++ b/src/backend/storage/smgr/Makefile @@ -13,6 +13,7 @@ top_builddir = ../../../.. include $(top_builddir)/src/Makefile.global OBJS = \ + bulk_write.o \ md.o \ smgr.o diff --git a/src/backend/storage/smgr/bulk_write.c b/src/backend/storage/smgr/bulk_write.c new file mode 100644 index 00000000000..dbda0e26982 --- /dev/null +++ b/src/backend/storage/smgr/bulk_write.c @@ -0,0 +1,275 @@ +/*------------------------------------------------------------------------- + * + * bulk_write.c + * Efficiently and reliably populate a new relation + * + * The assumption is that no other backends access the relation while we are + * loading it, so we can take some shortcuts. Do not mix operations through + * the regular buffer manager and the bulk loading interface! + * + * We bypass the buffer manager to avoid the locking overhead, and call + * smgrextend() directly. A downside is that the pages will need to be + * re-read into shared buffers on first use after the build finishes. That's + * usually a good tradeoff for large relations, and for small relations, the + * overhead isn't very significant compared to creating the relation in the + * first place. + * + * The pages are WAL-logged if needed. To save on WAL header overhead, we + * WAL-log several pages in one record. 
+ * + * One tricky point is that because we bypass the buffer manager, we need to + * register the relation for fsyncing at the next checkpoint ourselves, and + * make sure that the relation is correctly fsync'd by us or the checkpointer + * even if a checkpoint happens concurrently. + * + * NOTE: + * fsync is removed from PolarDB for we use buffer pool to cache those pages. + * + * Portions Copyright (c) 2024, Alibaba Group Holding Limited + * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/storage/smgr/bulk_write.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/xloginsert.h" +#include "access/xlogrecord.h" +#include "storage/bufmgr.h" +#include "storage/bufpage.h" +#include "storage/bulk_write.h" +#include "storage/proc.h" +#include "storage/smgr.h" +#include "utils/rel.h" + +#define MAX_PENDING_WRITES 512 + +typedef struct PendingWrite +{ + BulkWriteBuffer buf; + BlockNumber blkno; + bool page_std; +} PendingWrite; + +/* + * Bulk writer state for one relation fork. + */ +struct BulkWriteState +{ + /* Information about the target relation we're writing */ + SMgrRelation smgr; + ForkNumber forknum; + bool use_wal; + char relpersistence; + + /* We keep several writes queued, and WAL-log them in batches */ + int npending; + PendingWrite pending_writes[MAX_PENDING_WRITES]; + + /* Current size of the relation */ + BlockNumber pages_written; + + MemoryContext memcxt; +}; + +/* GUCs */ +int polar_bulk_write_maxpages; + +static void smgr_bulk_flush(BulkWriteState *bulkstate); + +/* + * Start a bulk write operation on a relation fork. + */ +BulkWriteState * +smgr_bulk_start_rel(Relation rel, ForkNumber forknum) +{ + return smgr_bulk_start_smgr(RelationGetSmgr(rel), + forknum, + RelationNeedsWAL(rel) || forknum == INIT_FORKNUM, + rel->rd_rel->relpersistence); +} + +/* + * Start a bulk write operation on a relation fork. + * + * This is like smgr_bulk_start_rel, but can be used without a relcache entry. + */ +BulkWriteState * +smgr_bulk_start_smgr(SMgrRelation smgr, ForkNumber forknum, bool use_wal, char relpersistence) +{ + BulkWriteState *state; + + state = palloc(sizeof(BulkWriteState)); + state->smgr = smgr; + state->forknum = forknum; + state->use_wal = use_wal; + state->relpersistence = relpersistence; + + state->npending = 0; + state->pages_written = 0; + + /* + * Remember the memory context. We will use it to allocate all the + * buffers later. + */ + state->memcxt = CurrentMemoryContext; + + return state; +} + +/* + * Finish bulk write operation. + */ +void +smgr_bulk_finish(BulkWriteState *bulkstate) +{ + /* WAL-log and flush any remaining pages */ + smgr_bulk_flush(bulkstate); +} + +static int +buffer_cmp(const void *a, const void *b) +{ + const PendingWrite *bufa = (const PendingWrite *) a; + const PendingWrite *bufb = (const PendingWrite *) b; + + /* We should not see duplicated writes for the same block */ + Assert(bufa->blkno != bufb->blkno); + if (bufa->blkno > bufb->blkno) + return 1; + else + return -1; +} + +/* + * Finish all the pending writes. 
+ */ +static void +smgr_bulk_flush(BulkWriteState *bulkstate) +{ + int npending = bulkstate->npending; + PendingWrite *pending_writes = bulkstate->pending_writes; + BlockNumber nblocks; + + if (npending == 0) + return; + + if (npending > 1) + qsort(pending_writes, npending, sizeof(PendingWrite), buffer_cmp); + + nblocks = bulkstate->pending_writes[npending - 1].blkno + 1; + + /* + * Before we alloc buffers from buffer pool for those pages, extend the + * underlying file first. + */ + if (nblocks > bulkstate->pages_written) + { + smgrzeroextend(bulkstate->smgr, bulkstate->forknum, bulkstate->pages_written, + nblocks - bulkstate->pages_written, true); + bulkstate->pages_written = nblocks; + } + + for (int i = 0; i < npending;) + { + int nbatch = 0; + BlockNumber blknos[XLR_MAX_BLOCK_ID]; + Page pages[XLR_MAX_BLOCK_ID]; + Buffer buffers[XLR_MAX_BLOCK_ID]; + bool page_std = true; + + /* + * Accumulate XLR_MAX_BLOCK_ID pages at most per round. For + * log_newpages takes those count of pages into one record. Also to + * reduce the usage of LWLock to avoid "too many LWLocks taken" ERROR. + */ + do + { + BlockNumber blkno = pending_writes[i].blkno; + Page cached_page = pending_writes[i].buf->data; + Page page; + Buffer buffer; + + buffer = polar_read_buffer_common(bulkstate->smgr, bulkstate->relpersistence, + bulkstate->forknum, blkno, RBM_ZERO_AND_LOCK, NULL); + page = BufferGetPage(buffer); + + memcpy(page, cached_page, BLCKSZ); + pfree(cached_page); + + MarkBufferDirty(buffer); + + /* + * If any of the pages use !page_std, we log them all as such. + * That's a bit wasteful, but in practice, a mix of standard and + * non-standard page layout is rare. None of the built-in AMs do + * that. + */ + if (!pending_writes[i].page_std) + page_std = false; + + blknos[nbatch] = blkno; + pages[nbatch] = page; + buffers[nbatch] = buffer; + + i++; + nbatch++; + } while (i < npending && nbatch < XLR_MAX_BLOCK_ID); + + /* + * log_newpages takes pages from buffer pool, it will do PageSetLSN + * for those pages. After the logging stuff, we can mark dirty and + * release those buffers. + */ + if (bulkstate->use_wal) + log_newpages(&bulkstate->smgr->smgr_rnode.node, bulkstate->forknum, + nbatch, blknos, pages, page_std); + + for (int j = 0; j < nbatch; j++) + UnlockReleaseBuffer(buffers[j]); + } + + bulkstate->npending = 0; +} + +/* + * Queue write of 'buf'. + * + * NB: this takes ownership of 'buf'! + * + * You are only allowed to write a given block once as part of one bulk write + * operation. + */ +void +smgr_bulk_write(BulkWriteState *bulkstate, BlockNumber blocknum, BulkWriteBuffer buf, bool page_std) +{ + PendingWrite *w; + + w = &bulkstate->pending_writes[bulkstate->npending++]; + w->buf = buf; + w->blkno = blocknum; + w->page_std = page_std; + + if (bulkstate->npending >= polar_bulk_write_maxpages) + smgr_bulk_flush(bulkstate); +} + +/* + * Allocate a new buffer which can later be written with smgr_bulk_write(). + * + * There is no function to free the buffer. When you pass it to + * smgr_bulk_write(), it takes ownership and frees it when it's no longer + * needed. + * + * This is currently implemented as a simple palloc, but could be implemented + * using a ring buffer or larger chunks in the future, so don't rely on it. 
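
/*
 * Illustration (sketch, not part of the patch): how a caller is expected to
 * drive the API defined in this file -- smgr_bulk_start_rel(),
 * smgr_bulk_get_buf() (defined just below), smgr_bulk_write() and
 * smgr_bulk_finish(). The helper and its page contents are hypothetical; the
 * real callers are the access methods converted elsewhere in this patch.
 */
#include "postgres.h"

#include "storage/bufpage.h"
#include "storage/bulk_write.h"
#include "utils/rel.h"

static void
demo_bulk_build(Relation rel, BlockNumber npages)
{
	BulkWriteState *bulkstate = smgr_bulk_start_rel(rel, MAIN_FORKNUM);

	for (BlockNumber blkno = 0; blkno < npages; blkno++)
	{
		BulkWriteBuffer buf = smgr_bulk_get_buf(bulkstate);

		/* fill the page; smgr_bulk_write() takes ownership of buf */
		PageInit((Page) buf->data, BLCKSZ, 0);
		smgr_bulk_write(bulkstate, blkno, buf, true);
	}

	/* WAL-log and write out anything still queued */
	smgr_bulk_finish(bulkstate);
}
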
+ */ +BulkWriteBuffer +smgr_bulk_get_buf(BulkWriteState *bulkstate) +{ + return MemoryContextAllocAligned(bulkstate->memcxt, BLCKSZ, PG_IO_ALIGN_SIZE, 0); +} diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c index c986868dd81..dc4c0e6f1aa 100644 --- a/src/backend/storage/smgr/md.c +++ b/src/backend/storage/smgr/md.c @@ -45,6 +45,7 @@ /* POLAR */ #include "access/polar_logindex_redo.h" #include "storage/polar_fd.h" +#include "utils/guc.h" /* * The magnetic disk storage manager keeps track of open file @@ -125,6 +126,17 @@ static MemoryContext MdCxt; /* context for all MdfdVec objects */ #define EXTENSION_DONT_OPEN (1 << 5) +/* POLAR: GUCs */ +int polar_zero_extend_method = POLAR_ZERO_EXTEND_BULKWRITE; + +const struct config_enum_entry polar_zero_extend_method_options[] = { + {"none", POLAR_ZERO_EXTEND_NONE, false}, + {"bulkwrite", POLAR_ZERO_EXTEND_BULKWRITE, false}, + {"fallocate", POLAR_ZERO_EXTEND_FALLOCATE, false}, + {NULL, 0, false} +}; + + /* local routines */ static void mdunlinkfork(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo); @@ -457,13 +469,13 @@ mdunlinkfork(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo) */ void mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, - char *buffer, bool skipFsync) + const void *buffer, bool skipFsync) { off_t seekpos; int nbytes; MdfdVec *v; - AssertPointerAlignment(buffer, POLAR_BUFFER_ALIGN_LEN); + AssertPointerAlignment(buffer, PG_IO_ALIGN_SIZE); /* POLAR: bulk extend */ if (reln->polar_flag_for_bulk_extend[forknum]) @@ -492,13 +504,29 @@ mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, relpath(reln->smgr_rnode, forknum), InvalidBlockNumber))); + TRACE_POSTGRESQL_SMGR_MD_EXTEND_START(forknum, blocknum, + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + reln->smgr_rnode.backend); + v = _mdfd_getseg(reln, forknum, blocknum, skipFsync, EXTENSION_CREATE); seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE); - if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_EXTEND)) != BLCKSZ) + nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_EXTEND); + + TRACE_POSTGRESQL_SMGR_MD_EXTEND_DONE(forknum, blocknum, + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + reln->smgr_rnode.backend, + nbytes, + BLCKSZ); + + if (nbytes != BLCKSZ) { if (nbytes < 0) ereport(ERROR, @@ -521,6 +549,262 @@ mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE)); } +/* + * mdzeroextend() -- Add new zeroed out blocks to the specified relation. + * + * Similar to mdextend(), except the relation can be extended by multiple + * blocks at once and the added blocks will be filled with zeroes. + */ +void +mdzeroextend(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, int nblocks, bool skipFsync) +{ + MdfdVec *v; + BlockNumber curblocknum = blocknum; + int remblocks = nblocks; + + Assert(nblocks > 0); + + /* POLAR: bulk extend */ + if (reln->polar_flag_for_bulk_extend[forknum]) + { + Assert(reln->polar_nblocks_faked_for_bulk_extend[forknum] == blocknum); + reln->polar_nblocks_faked_for_bulk_extend[forknum] += nblocks; + return; + } + /* POLAR end */ + + /* This assert is too expensive to have on normally ... 
*/ +#ifdef CHECK_WRITE_VS_EXTEND + Assert(blocknum >= mdnblocks(reln, forknum)); +#endif + + /* + * If a relation manages to grow to 2^32-1 blocks, refuse to extend it any + * more --- we mustn't create a block whose number actually is + * InvalidBlockNumber or larger. + */ + if ((uint64) blocknum + nblocks >= (uint64) InvalidBlockNumber) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("cannot extend file \"%s\" beyond %u blocks", + relpath(reln->smgr_rnode, forknum), + InvalidBlockNumber))); + + while (remblocks > 0) + { + BlockNumber segstartblock = curblocknum % ((BlockNumber) RELSEG_SIZE); + off_t seekpos = (off_t) BLCKSZ * segstartblock; + int numblocks; + + if (segstartblock + remblocks > RELSEG_SIZE) + numblocks = RELSEG_SIZE - segstartblock; + else + numblocks = remblocks; + + TRACE_POSTGRESQL_SMGR_MD_ZEROEXTEND_START(forknum, blocknum, + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + reln->smgr_rnode.backend); + + v = _mdfd_getseg(reln, forknum, curblocknum, skipFsync, EXTENSION_CREATE); + + Assert(segstartblock < RELSEG_SIZE); + Assert(segstartblock + numblocks <= RELSEG_SIZE); + + /* + * If available and useful, use posix_fallocate() (via + * FileFallocate()) to extend the relation. That's often more + * efficient than using write(), as it commonly won't cause the kernel + * to allocate page cache space for the extended pages. + * + * However, we don't use FileFallocate() for small extensions, as it + * defeats delayed allocation on some filesystems. Not clear where + * that decision should be made though? For now just use a cutoff of + * 8, anything between 4 and 8 worked OK in some local testing. + */ + if (polar_zero_extend_method == POLAR_ZERO_EXTEND_FALLOCATE) + { + int ret; + + ret = FileFallocate(v->mdfd_vfd, + seekpos, (off_t) BLCKSZ * numblocks, + WAIT_EVENT_DATA_FILE_EXTEND); + if (ret != 0) + { + ereport(ERROR, + errcode_for_file_access(), + errmsg("could not extend file \"%s\" with FileFallocate(): %m", + FilePathName(v->mdfd_vfd)), + errhint("Check free disk space.")); + } + } + else if (polar_zero_extend_method == POLAR_ZERO_EXTEND_BULKWRITE) + { + int ret; + + /* + * Even if we don't want to use fallocate, we can still extend a + * bit more efficiently than writing each 8kB block individually. + * polar_pwrite_zeros() (via FileZero()) uses bulk zero buffers to + * avoid multiple writes or needing a zeroed buffer for the whole + * length of the extension. + */ + ret = FileZero(v->mdfd_vfd, + seekpos, (off_t) BLCKSZ * numblocks, + WAIT_EVENT_DATA_FILE_EXTEND, true); + if (ret < 0) + ereport(ERROR, + errcode_for_file_access(), + errmsg("could not extend file \"%s\": %m", + FilePathName(v->mdfd_vfd)), + errhint("Check free disk space.")); + } + else if (polar_zero_extend_method == POLAR_ZERO_EXTEND_NONE) + { + int ret; + + /* + * Even if we don't want to use fallocate, we can still extend a + * bit more efficiently than writing each 8kB block individually. + * pg_pwrite_zeroes() (via FileZero()) uses + * pg_pwritev_with_retry() to avoid multiple writes or needing a + * zeroed buffer for the whole length of the extension. 
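
/*
 * Illustration (standalone sketch, not part of the patch): the
 * while (remblocks > 0) loop above splits one logical extension at segment
 * boundaries. With the default RELSEG_SIZE of 131072 blocks (1 GB at
 * BLCKSZ = 8192), zero-extending 10 blocks starting at block 131070 becomes
 * 2 blocks at the tail of segment 0 plus 8 blocks at the head of segment 1.
 */
#include <stdio.h>

#define DEMO_RELSEG_SIZE 131072	/* blocks per segment file (assumed default) */

static void
demo_segment_split(unsigned int curblock, int remblocks)
{
	while (remblocks > 0)
	{
		unsigned int segstart = curblock % DEMO_RELSEG_SIZE;
		int			nthisseg = remblocks;

		if (segstart + (unsigned int) remblocks > DEMO_RELSEG_SIZE)
			nthisseg = (int) (DEMO_RELSEG_SIZE - segstart);

		printf("segment %u: %d block(s) starting at in-segment block %u\n",
			   curblock / DEMO_RELSEG_SIZE, nthisseg, segstart);

		remblocks -= nthisseg;
		curblock += (unsigned int) nthisseg;
	}
}
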
+ */ + ret = FileZero(v->mdfd_vfd, + seekpos, (off_t) BLCKSZ * numblocks, + WAIT_EVENT_DATA_FILE_EXTEND, false); + if (ret < 0) + ereport(ERROR, + errcode_for_file_access(), + errmsg("could not extend file \"%s\": %m", + FilePathName(v->mdfd_vfd)), + errhint("Check free disk space.")); + } + else + elog(ERROR, "Invalid polar_zero_extend_method %d", polar_zero_extend_method); + + TRACE_POSTGRESQL_SMGR_MD_ZEROEXTEND_DONE(forknum, blocknum, + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + reln->smgr_rnode.backend, + BLCKSZ * numblocks, + BLCKSZ * numblocks); + + if (!skipFsync && !SmgrIsTemp(reln)) + register_dirty_segment(reln, forknum, v); + + Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE)); + + remblocks -= numblocks; + curblocknum += numblocks; + } +} + +/* + * polar_mdbulkextend() -- Add blocks to the specified relation. + * + * The semantics are nearly the same as mdwrite(): write at the + * specified position. However, this is to be used for the case of + * extending a relation (i.e., blocknum is at or beyond the current + * EOF). Note that we assume writing a block beyond current EOF + * causes intervening file space to become filled with zeroes. + */ +void +polar_mdbulkextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + int nblocks, const void *buffer, bool skipFsync) +{ + MdfdVec *v; + BlockNumber curblocknum = blocknum; + int remblocks = nblocks; + + AssertPointerAlignment(buffer, PG_IO_ALIGN_SIZE); + + Assert(nblocks > 0); + + /* This assert is too expensive to have on normally ... */ +#ifdef CHECK_WRITE_VS_EXTEND + Assert(blocknum >= mdnblocks(reln, forknum)); +#endif + + /* + * If a relation manages to grow to 2^32-1 blocks, refuse to extend it any + * more --- we mustn't create a block whose number actually is + * InvalidBlockNumber or larger. 
+ */ + if ((uint64) blocknum + nblocks >= (uint64) InvalidBlockNumber) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("cannot extend file \"%s\" beyond %u blocks", + relpath(reln->smgr_rnode, forknum), + InvalidBlockNumber))); + + while (remblocks > 0) + { + BlockNumber segstartblock = curblocknum % ((BlockNumber) RELSEG_SIZE); + off_t seekpos = (off_t) BLCKSZ * segstartblock; + int numblocks; + int nbytes; + int amount; + + if (segstartblock + remblocks > RELSEG_SIZE) + numblocks = RELSEG_SIZE - segstartblock; + else + numblocks = remblocks; + + amount = BLCKSZ * numblocks; + + TRACE_POSTGRESQL_SMGR_MD_EXTEND_START(forknum, blocknum, + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + reln->smgr_rnode.backend); + + v = _mdfd_getseg(reln, forknum, curblocknum, skipFsync, EXTENSION_CREATE); + + seekpos = (off_t) BLCKSZ * segstartblock; + + Assert(segstartblock < RELSEG_SIZE); + Assert(segstartblock + numblocks <= RELSEG_SIZE); + + nbytes = FileWrite(v->mdfd_vfd, buffer, amount, seekpos, WAIT_EVENT_DATA_FILE_EXTEND); + + TRACE_POSTGRESQL_SMGR_MD_EXTEND_DONE(forknum, blocknum, + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + reln->smgr_rnode.backend, + nbytes, + amount); + + if (nbytes != amount) + { + if (nbytes < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not extend file \"%s\": %m", + FilePathName(v->mdfd_vfd)), + errhint("Check free disk space."))); + /* short write: complain appropriately */ + ereport(ERROR, + (errcode(ERRCODE_DISK_FULL), + errmsg("could not extend file \"%s\": wrote only %d of %d bytes at block %u", + FilePathName(v->mdfd_vfd), + nbytes, amount, blocknum), + errhint("Check free disk space."))); + } + + if (!skipFsync && !SmgrIsTemp(reln)) + register_dirty_segment(reln, forknum, v); + + remblocks -= numblocks; + curblocknum += numblocks; + buffer = (char *) buffer + amount; + } +} + /* * mdopenfork() -- Open one fork of the specified relation. * @@ -690,13 +974,13 @@ mdwriteback(SMgrRelation reln, ForkNumber forknum, */ void mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, - char *buffer) + void *buffer) { off_t seekpos; int nbytes; MdfdVec *v; - AssertPointerAlignment(buffer, POLAR_BUFFER_ALIGN_LEN); + AssertPointerAlignment(buffer, PG_IO_ALIGN_SIZE); TRACE_POSTGRESQL_SMGR_MD_READ_START(forknum, blocknum, reln->smgr_rnode.node.spcNode, @@ -752,6 +1036,96 @@ mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, } } +/* + * polar_mdbulkread() -- Read the specified continuous blocks from a relation. 
+ */ +void +polar_mdbulkread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + int nblocks, void *buffer) +{ + MdfdVec *v; + BlockNumber curblocknum = blocknum; + int remblocks = nblocks; + + AssertPointerAlignment(buffer, PG_IO_ALIGN_SIZE); + + while (remblocks > 0) + { + BlockNumber segstartblock = curblocknum % ((BlockNumber) RELSEG_SIZE); + off_t seekpos = (off_t) BLCKSZ * segstartblock; + int numblocks; + int nbytes; + int amount; + + if (segstartblock + remblocks > RELSEG_SIZE) + numblocks = RELSEG_SIZE - segstartblock; + else + numblocks = remblocks; + + amount = BLCKSZ * numblocks; + + TRACE_POSTGRESQL_SMGR_MD_READ_START(forknum, curblocknum, + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + reln->smgr_rnode.backend); + + v = _mdfd_getseg(reln, forknum, curblocknum, false, + EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY); + + seekpos = (off_t) BLCKSZ * segstartblock; + + Assert(segstartblock < RELSEG_SIZE); + Assert(segstartblock + numblocks <= RELSEG_SIZE); + + nbytes = FileRead(v->mdfd_vfd, buffer, amount, seekpos, WAIT_EVENT_DATA_FILE_READ); + + TRACE_POSTGRESQL_SMGR_MD_READ_DONE(forknum, curblocknum, + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + reln->smgr_rnode.backend, + nbytes, + amount); + + if (nbytes != amount) + { + if (nbytes < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read block %u in file \"%s\": %m", + blocknum, FilePathName(v->mdfd_vfd)))); + + /* + * Short read: we are at or past EOF, or we read a partial block + * at EOF. Normally this is an error; upper levels should never + * try to read a nonexistent block. However, if + * zero_damaged_pages is ON or we are InRecovery, we should + * instead return zeroes without complaining. This allows, for + * example, the case of trying to update a block that was later + * truncated away. + */ + if (zero_damaged_pages || InRecovery) + { + /* only zero damaged_pages */ + int damaged_pages_start_offset = nbytes - nbytes % BLCKSZ; + + MemSet((char *) buffer + damaged_pages_start_offset, 0, amount - damaged_pages_start_offset); + } + else + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("could not bulk read block %u in file \"%s\": read only %d of %d bytes", + blocknum, FilePathName(v->mdfd_vfd), + nbytes, amount))); + } + + remblocks -= numblocks; + curblocknum += numblocks; + buffer = (char *) buffer + amount; + } +} + /* * mdwrite() -- Write the supplied block at the appropriate location. * @@ -761,13 +1135,13 @@ mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, */ void mdwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, - char *buffer, bool skipFsync) + const void *buffer, bool skipFsync) { off_t seekpos; int nbytes; MdfdVec *v; - AssertPointerAlignment(buffer, POLAR_BUFFER_ALIGN_LEN); + AssertPointerAlignment(buffer, PG_IO_ALIGN_SIZE); /* This assert is too expensive to have on normally ... */ #ifdef CHECK_WRITE_VS_EXTEND @@ -818,6 +1192,93 @@ mdwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, register_dirty_segment(reln, forknum, v); } +/* + * polar_mdbulkwrite() -- Write the supplied continuous blocks at the appropriate location. + * + * This is to be used only for updating already-existing blocks of a + * relation (ie, those before the current EOF). To extend a relation, + * use mdextend(). 
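
/*
 * Illustration (standalone sketch, not part of the patch): the short-read
 * handling in polar_mdbulkread() above keeps every fully-read block and zeroes
 * only the trailing, partially-read part of the bulk buffer. For example,
 * requesting 3 blocks (24576 bytes at BLCKSZ = 8192) but reading back 20000
 * bytes keeps blocks 0 and 1 and zeroes the buffer from offset 16384 onward.
 */
#include <string.h>

#define DEMO_BLCKSZ 8192		/* stand-in for BLCKSZ */

static void
demo_zero_damaged_tail(char *buffer, int amount, int nbytes)
{
	/* first byte of the first block that was not read in full */
	int			start = nbytes - nbytes % DEMO_BLCKSZ;

	memset(buffer + start, 0, amount - start);
}
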
+ */ +void +polar_mdbulkwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + int nblocks, const void *buffer, bool skipFsync) +{ + MdfdVec *v; + BlockNumber curblocknum = blocknum; + int remblocks = nblocks; + + AssertPointerAlignment(buffer, PG_IO_ALIGN_SIZE); + + /* This assert is too expensive to have on normally ... */ +#ifdef CHECK_WRITE_VS_EXTEND + Assert(blocknum < mdnblocks(reln, forknum)); +#endif + + while (remblocks > 0) + { + BlockNumber segstartblock = curblocknum % ((BlockNumber) RELSEG_SIZE); + off_t seekpos = (off_t) BLCKSZ * segstartblock; + int numblocks; + int nbytes; + int amount; + + if (segstartblock + remblocks > RELSEG_SIZE) + numblocks = RELSEG_SIZE - segstartblock; + else + numblocks = remblocks; + + amount = BLCKSZ * numblocks; + + TRACE_POSTGRESQL_SMGR_MD_WRITE_START(forknum, curblocknum, + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + reln->smgr_rnode.backend); + + v = _mdfd_getseg(reln, forknum, curblocknum, skipFsync, + EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY); + + seekpos = (off_t) BLCKSZ * segstartblock; + + Assert(segstartblock < RELSEG_SIZE); + Assert(segstartblock + numblocks <= RELSEG_SIZE); + + nbytes = FileWrite(v->mdfd_vfd, buffer, amount, seekpos, WAIT_EVENT_DATA_FILE_WRITE); + + TRACE_POSTGRESQL_SMGR_MD_WRITE_DONE(forknum, curblocknum, + reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, + reln->smgr_rnode.node.relNode, + reln->smgr_rnode.backend, + nbytes, + amount); + + if (nbytes != amount) + { + if (nbytes < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write block %u in file \"%s\": %m", + blocknum, FilePathName(v->mdfd_vfd)))); + /* short write: complain appropriately */ + ereport(ERROR, + (errcode(ERRCODE_DISK_FULL), + errmsg("could not write block %u in file \"%s\": wrote only %d of %d bytes", + blocknum, + FilePathName(v->mdfd_vfd), + nbytes, BLCKSZ), + errhint("Check free disk space."))); + } + + if (!skipFsync && !SmgrIsTemp(reln)) + register_dirty_segment(reln, forknum, v); + + remblocks -= numblocks; + curblocknum += numblocks; + buffer = (char *) buffer + amount; + } +} + /* * mdnblocks() -- Get the number of blocks stored in a relation. * @@ -979,6 +1440,49 @@ mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) } } +/* + * mdregistersync() -- Mark whole relation as needing fsync + */ +void +mdregistersync(SMgrRelation reln, ForkNumber forknum) +{ + int segno; + int min_inactive_seg; + + /* + * NOTE: mdnblocks makes sure we have opened all active segments, so that + * the loop below will get them all! + */ + mdnblocks(reln, forknum); + + min_inactive_seg = segno = reln->md_num_open_segs[forknum]; + + /* + * Temporarily open inactive segments, then close them after sync. There + * may be some inactive segments left opened after error, but that is + * harmless. We don't bother to clean them up and take a risk of further + * trouble. The next mdclose() will soon close them. + */ + while (_mdfd_openseg(reln, forknum, segno, 0) != NULL) + segno++; + + while (segno > 0) + { + MdfdVec *v = &reln->md_seg_fds[forknum][segno - 1]; + + register_dirty_segment(reln, forknum, v); + + /* Close inactive segments immediately */ + if (segno > min_inactive_seg) + { + FileClose(v->mdfd_vfd); + _fdvec_resize(reln, forknum, segno - 1); + } + + segno--; + } +} + /* * mdimmedsync() -- Immediately sync a relation to stable storage. 
* @@ -998,7 +1502,7 @@ mdimmedsync(SMgrRelation reln, ForkNumber forknum) /* * NOTE: mdnblocks makes sure we have opened all active segments, so that - * fsync loop will get them all! + * the loop below will get them all! */ mdnblocks(reln, forknum); @@ -1328,7 +1832,7 @@ _mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, */ if (nblocks < ((BlockNumber) RELSEG_SIZE)) { - char *zerobuf = palloc_io_aligned(BLCKSZ, MCXT_ALLOC_ZERO); + char *zerobuf = palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, MCXT_ALLOC_ZERO); mdextend(reln, forknum, nextsegno * ((BlockNumber) RELSEG_SIZE) - 1, @@ -1484,141 +1988,3 @@ mdfiletagmatches(const FileTag *ftag, const FileTag *candidate) */ return ftag->rnode.dbNode == candidate->rnode.dbNode; } - -/* - * polar_mdbulkextend() -- Add a block to the specified relation. - * - * The semantics are nearly the same as mdwrite(): write at the - * specified position. However, this is to be used for the case of - * extending a relation (i.e., blocknum is at or beyond the current - * EOF). Note that we assume writing a block beyond current EOF - * causes intervening file space to become filled with zeroes. - */ -void -polar_mdbulkextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, - int blockCount, char *buffer, bool skipFsync) -{ - off_t seekpos; - int nbytes; - MdfdVec *v; - uint64 newblocknum = blocknum + blockCount; - - AssertPointerAlignment(buffer, POLAR_BUFFER_ALIGN_LEN); - - /* This assert is too expensive to have on normally ... */ -#ifdef CHECK_WRITE_VS_EXTEND - Assert(blocknum >= mdnblocks(reln, forknum)); -#endif - - /* - * If a relation manages to grow to 2^32-1 blocks, refuse to extend it any - * more --- we mustn't create a block whose number actually is - * InvalidBlockNumber. - */ - if (newblocknum >= InvalidBlockNumber) - ereport(ERROR, - (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), - errmsg("cannot extend file \"%s\" beyond %u blocks", - relpath(reln->smgr_rnode, forknum), - InvalidBlockNumber))); - - v = _mdfd_getseg(reln, forknum, blocknum, skipFsync, EXTENSION_CREATE); - - seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); - - Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE); - - if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ * blockCount, seekpos, WAIT_EVENT_DATA_FILE_EXTEND)) != BLCKSZ * blockCount) - { - if (nbytes < 0) - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not extend file \"%s\": %m", - FilePathName(v->mdfd_vfd)), - errhint("Check free disk space."))); - /* short write: complain appropriately */ - ereport(ERROR, - (errcode(ERRCODE_DISK_FULL), - errmsg("could not extend file \"%s\": wrote only %d of %d bytes at block %u", - FilePathName(v->mdfd_vfd), - nbytes, BLCKSZ * blockCount, blocknum), - errhint("Check free disk space."))); - } - - if (!skipFsync && !SmgrIsTemp(reln)) - register_dirty_segment(reln, forknum, v); -} - -/* - * POLAR: bulk read - * - * polar_mdbulkread() -- Read the specified continuous blocks from a relation. - * - * Caller must ensure that the blockcount does not exceed the length of the relation file. 
- */ -void -polar_mdbulkread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, - int blockCount, char *buffer) -{ - off_t seekpos; - int nbytes; - MdfdVec *v; - int amount = blockCount * BLCKSZ; - - AssertPointerAlignment(buffer, POLAR_BUFFER_ALIGN_LEN); - - TRACE_POSTGRESQL_SMGR_MD_READ_START(forknum, blocknum, - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, - reln->smgr_rnode.backend); - - v = _mdfd_getseg(reln, forknum, blocknum, false, - EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY); - - seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); - - Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE); - Assert(seekpos + (off_t) amount <= (off_t) BLCKSZ * RELSEG_SIZE); - - nbytes = FileRead(v->mdfd_vfd, buffer, amount, seekpos, WAIT_EVENT_DATA_FILE_READ); - - TRACE_POSTGRESQL_SMGR_MD_READ_DONE(forknum, blocknum, - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, - reln->smgr_rnode.backend, - nbytes, - amount); - - if (nbytes != amount) - { - if (nbytes < 0) - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not read block %u in file \"%s\": %m", - blocknum, FilePathName(v->mdfd_vfd)))); - - /* - * Short read: we are at or past EOF, or we read a partial block at - * EOF. Normally this is an error; upper levels should never try to - * read a nonexistent block. However, if zero_damaged_pages is ON or - * we are InRecovery, we should instead return zeroes without - * complaining. This allows, for example, the case of trying to - * update a block that was later truncated away. - */ - if (zero_damaged_pages || InRecovery || POLAR_IN_LOGINDEX_PARALLEL_REPLAY()) - { - /* only zero damaged_pages */ - int damaged_pages_start_offset = nbytes - nbytes % BLCKSZ; - - MemSet((char *) buffer + damaged_pages_start_offset, 0, amount - damaged_pages_start_offset); - } - else - ereport(ERROR, - (errcode(ERRCODE_DATA_CORRUPTED), - errmsg("could not bulk read block %u in file \"%s\": read only %d of %d bytes", - blocknum, FilePathName(v->mdfd_vfd), - nbytes, amount))); - } -} diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c index bdf159e75dc..d0b69d3238c 100644 --- a/src/backend/storage/smgr/smgr.c +++ b/src/backend/storage/smgr/smgr.c @@ -6,6 +6,7 @@ * All file system operations in POSTGRES dispatch through these * routines. 
* + * Portions Copyright (c) 2024, Alibaba Group Holding Limited * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -53,24 +54,31 @@ typedef struct f_smgr void (*smgr_unlink) (RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo); void (*smgr_extend) (SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, char *buffer, bool skipFsync); + BlockNumber blocknum, const void *buffer, bool skipFsync); + void (*smgr_zeroextend) (SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, int nblocks, bool skipFsync); bool (*smgr_prefetch) (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum); void (*smgr_read) (SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, char *buffer); + BlockNumber blocknum, void *buffer); void (*smgr_write) (SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, char *buffer, bool skipFsync); + BlockNumber blocknum, const void *buffer, bool skipFsync); void (*smgr_writeback) (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber nblocks); BlockNumber (*smgr_nblocks) (SMgrRelation reln, ForkNumber forknum); void (*smgr_truncate) (SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks); void (*smgr_immedsync) (SMgrRelation reln, ForkNumber forknum); - /* POLAR: bulk io */ - void (*polar_smgr_bulkextend) (SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, int blockCount, char *buffer, bool skipFsync); + void (*smgr_registersync) (SMgrRelation reln, ForkNumber forknum); + /* POLAR: bulk read */ void (*polar_smgr_bulkread) (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, - int blockCount, char *buffer); + int nblocks, void *buffer); + /* POLAR: bulk write */ + void (*polar_smgr_bulkwrite) (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + int nblocks, const void *buffer, bool skipFsync); + /* POLAR: bulk extend */ + void (*polar_smgr_bulkextend) (SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, int nblocks, const void *buffer, bool skipFsync); /* POLAR end */ } f_smgr; @@ -85,6 +93,7 @@ static const f_smgr smgrsw[] = { .smgr_exists = mdexists, .smgr_unlink = mdunlink, .smgr_extend = mdextend, + .smgr_zeroextend = mdzeroextend, .smgr_prefetch = mdprefetch, .smgr_read = mdread, .smgr_write = mdwrite, @@ -92,9 +101,13 @@ static const f_smgr smgrsw[] = { .smgr_nblocks = mdnblocks, .smgr_truncate = mdtruncate, .smgr_immedsync = mdimmedsync, - /* POLAR: extend io */ - .polar_smgr_bulkextend = polar_mdbulkextend, + .smgr_registersync = mdregistersync, + /* POLAR: bulk read */ .polar_smgr_bulkread = polar_mdbulkread, + /* POLAR: bulk write */ + .polar_smgr_bulkwrite = polar_mdbulkwrite, + /* POLAR: extend batch */ + .polar_smgr_bulkextend = polar_mdbulkextend, /* POLAR end */ } }; @@ -544,7 +557,7 @@ smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo) */ void smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, - char *buffer, bool skipFsync) + const void *buffer, bool skipFsync) { smgrsw[reln->smgr_which].smgr_extend(reln, forknum, blocknum, buffer, skipFsync); @@ -573,23 +586,27 @@ smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, /* POLAR end */ } -/* POLAR: bulk extend */ +/* + * smgrzeroextend() -- Add new zeroed out blocks to a file. + * + * Similar to smgrextend(), except the relation can be extended by + * multiple blocks at once and the added blocks will be filled with + * zeroes. 
+ */ void -polar_smgrbulkextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, - int blockCount, char *buffer, bool skipFsync) +smgrzeroextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + int nblocks, bool skipFsync) { - Assert(blockCount >= 1); - Assert(!reln->polar_flag_for_bulk_extend[forknum]); - - smgrsw[reln->smgr_which].polar_smgr_bulkextend(reln, forknum, blocknum, blockCount, buffer, skipFsync); + smgrsw[reln->smgr_which].smgr_zeroextend(reln, forknum, blocknum, + nblocks, skipFsync); /* - * Normally we expect this to increase nblocks by one, but if the cached - * value isn't as expected, just invalidate it so the next call asks the - * kernel. nblock should be blocknum + blockCount in bulkExtend + * Normally we expect this to increase the fork size by nblocks, but if + * the cached value isn't as expected, just invalidate it so the next call + * asks the kernel. */ if (reln->smgr_cached_nblocks[forknum] == blocknum) - reln->smgr_cached_nblocks[forknum] = blocknum + blockCount; + reln->smgr_cached_nblocks[forknum] = blocknum + nblocks; else reln->smgr_cached_nblocks[forknum] = InvalidBlockNumber; @@ -597,24 +614,41 @@ polar_smgrbulkextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum * POLAR RSC: update new blocknum into entry. */ if (POLAR_RSC_SHOULD_UPDATE(reln, forknum)) - polar_rsc_update_entry(reln, forknum, blocknum + blockCount); + polar_rsc_update_entry(reln, forknum, blocknum + nblocks); + /* POLAR end */ } /* - * POLAR: bulk read - * - * polar_smgrbulkread() -- read multi particular block from a relation into the supplied - * buffer. + * polar_smgrbulkextend() -- Add new blocks to a file. * - * This routine is called from the buffer manager in order to - * instantiate pages in the shared buffer cache. All storage managers - * return pages in the format that POSTGRES expects. + * The semantics are nearly the same as smgrwrite(): write at the + * specified position. However, this is to be used for the case of + * extending a relation (i.e., blocknum is at or beyond the current + * EOF). Note that we assume writing nblocks beyond current EOF + * causes intervening file space to become filled with zeroes. */ void -polar_smgrbulkread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, - int blockCount, char *buffer) +polar_smgrbulkextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + int nblocks, const void *buffer, bool skipFsync) { - smgrsw[reln->smgr_which].polar_smgr_bulkread(reln, forknum, blocknum, blockCount, buffer); + smgrsw[reln->smgr_which].polar_smgr_bulkextend(reln, forknum, blocknum, nblocks, buffer, skipFsync); + + /* + * Normally we expect this to increase the fork size by nblocks, but if + * the cached value isn't as expected, just invalidate it so the next call + * asks the kernel. + */ + if (reln->smgr_cached_nblocks[forknum] == blocknum) + reln->smgr_cached_nblocks[forknum] = blocknum + nblocks; + else + reln->smgr_cached_nblocks[forknum] = InvalidBlockNumber; + + /* + * POLAR RSC: update new blocknum into entry. 
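/*
 * Illustrative sketch, not part of this patch: how a caller might use the
 * new smgrzeroextend() entry point shown above to grow a fork by a batch of
 * zero-filled blocks, instead of issuing one smgrextend() per block with a
 * zeroed buffer.  The helper name and the "nblocks" parameter are invented
 * for this example.
 */
static void
extend_fork_with_zeroes(SMgrRelation reln, ForkNumber forknum, int nblocks)
{
	BlockNumber	first_block = smgrnblocks(reln, forknum);

	/* One call appends nblocks zeroed blocks at the current end of the fork. */
	smgrzeroextend(reln, forknum, first_block, nblocks, false);
}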
+ */ + if (POLAR_RSC_SHOULD_UPDATE(reln, forknum)) + polar_rsc_update_entry(reln, forknum, blocknum + nblocks); + /* POLAR end */ } /* @@ -640,11 +674,26 @@ smgrprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) */ void smgrread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, - char *buffer) + void *buffer) { smgrsw[reln->smgr_which].smgr_read(reln, forknum, blocknum, buffer); } +/* + * polar_smgrbulkread() -- read multi particular blocks from a relation into + * the supplied buffers. + * + * This routine is called from the buffer manager in order to + * instantiate pages in the shared buffer cache. All storage managers + * return pages in the format that POSTGRES expects. + */ +void +polar_smgrbulkread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + int nblocks, void *buffer) +{ + smgrsw[reln->smgr_which].polar_smgr_bulkread(reln, forknum, blocknum, nblocks, buffer); +} + /* * smgrwrite() -- Write the supplied buffer out. * @@ -662,12 +711,42 @@ smgrread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, */ void smgrwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, - char *buffer, bool skipFsync) + const void *buffer, bool skipFsync) { smgrsw[reln->smgr_which].smgr_write(reln, forknum, blocknum, buffer, skipFsync); } +/* + * polar_smgrbulkwrite() -- Write the supplied buffers out. + * + * This is to be used only for updating already-existing blocks of a + * relation (ie, those before the current EOF). To extend a relation, + * use polar_smgrbulkextend(). + * + * This is not a synchronous write -- the block is not necessarily + * on disk at return, only dumped out to the kernel. However, + * provisions will be made to fsync the write before the next checkpoint. + * + * NB: The mechanism to ensure fsync at next checkpoint assumes that there is + * something that prevents a concurrent checkpoint from "racing ahead" of the + * write. One way to prevent that is by holding a lock on the buffer; the + * buffer manager's writes are protected by that. The bulk writer facility + * in bulk_write.c checks the redo pointer and calls smgrimmedsync() if a + * checkpoint happened; that relies on the fact that no other backend can be + * concurrently modifying the page. + * + * skipFsync indicates that the caller will make other provisions to + * fsync the relation, so we needn't bother. Temporary relations also + * do not require fsync. + */ +void +polar_smgrbulkwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + int nblocks, const void *buffer, bool skipFsync) +{ + smgrsw[reln->smgr_which].polar_smgr_bulkwrite(reln, forknum, blocknum, nblocks, + buffer, skipFsync); +} /* * smgrwriteback() -- Trigger kernel writeback for the supplied range of @@ -847,6 +926,24 @@ smgrtruncate(SMgrRelation reln, ForkNumber *forknum, int nforks, BlockNumber *nb } } +/* + * smgrregistersync() -- Request a relation to be sync'd at next checkpoint + * + * This can be used after calling smgrwrite() or smgrextend() with skipFsync = + * true, to register the fsyncs that were skipped earlier. + * + * Note: be mindful that a checkpoint could already have happened between the + * smgrwrite or smgrextend calls and this! In that case, the checkpoint + * already missed fsyncing this relation, and you should use smgrimmedsync + * instead. Most callers should use the bulk loading facility in bulk_write.c + * which handles all that. 
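/*
 * Illustrative sketch, not part of this patch: the write-then-register
 * pattern that smgrregistersync() (below) is intended for.  Pages are
 * written with skipFsync = true and a single deferred sync request is
 * registered at the end.  The helper name and parameters are invented for
 * this example; as the comment below notes, most callers should instead go
 * through bulk_write.c, which also copes with an intervening checkpoint.
 */
static void
write_batch_then_register_sync(SMgrRelation reln, ForkNumber forknum,
							   BlockNumber first, int npages,
							   const PGIOAlignedBlock *pages)
{
	for (int i = 0; i < npages; i++)
		smgrwrite(reln, forknum, first + i, pages[i].data, true);

	/* One fsync request for the fork instead of one per smgrwrite() call. */
	smgrregistersync(reln, forknum);
}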
+ */ +void +smgrregistersync(SMgrRelation reln, ForkNumber forknum) +{ + smgrsw[reln->smgr_which].smgr_registersync(reln, forknum); +} + /* * smgrimmedsync() -- Force the specified relation to stable storage. * diff --git a/src/backend/utils/activity/pgstat_relation.c b/src/backend/utils/activity/pgstat_relation.c index 6dddd180afe..44541f42a76 100644 --- a/src/backend/utils/activity/pgstat_relation.c +++ b/src/backend/utils/activity/pgstat_relation.c @@ -819,9 +819,6 @@ pgstat_relation_flush_cb(PgStat_EntryRef *entry_ref, bool nowait) tabentry->polar_bulk_read_calls += lstats->t_counts.polar_t_bulk_read_calls; tabentry->polar_bulk_read_calls_IO += lstats->t_counts.polar_t_bulk_read_calls_IO; tabentry->polar_bulk_read_blocks_IO += lstats->t_counts.polar_t_bulk_read_blocks_IO; - - /* POLAR: create index bulk extend */ - tabentry->polar_bulk_create_index_extends_times += lstats->t_counts.polar_t_bulk_create_index_extends_times; /* POLAR end */ diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index de70701465e..884dee08be4 100755 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -117,13 +117,16 @@ #include "utils/xml.h" /* POLAR */ +#include "access/hio.h" #include "access/multixact.h" #include "access/polar_logindex.h" #include "access/polar_logindex_redo.h" #include "access/subtrans.h" #include "access/slru.h" #include "commands/tablecmds.h" +#include "common/file_utils.h" #include "common/username.h" +#include "storage/bulk_write.h" #include "storage/polar_fd.h" #include "storage/polar_rsc.h" #include "storage/polar_xlogbuf.h" @@ -715,6 +718,7 @@ extern const struct config_enum_entry dynamic_shared_memory_options[]; /* POLAR enum GUC options start */ extern const struct config_enum_entry polar_release_assert_level_options[]; +extern const struct config_enum_entry polar_zero_extend_method_options[]; const struct config_enum_entry polar_session_id_display_options[] = { {"proxy", POLAR_SID_DISPLAY_PROXY, false}, @@ -862,20 +866,8 @@ bool polar_enable_alloc_checkinterrupts; bool polar_enable_sync_ddl; -/* POLAR: bulk io */ -int polar_recovery_bulk_extend_size = 0; -int polar_min_bulk_extend_table_size = 0; -bool polar_enable_primary_recovery_bulk_extend = false; -int polar_bulk_extend_size = 0; -int polar_bulk_read_size = 0; -int polar_index_bulk_extend_size = 0; -int polar_index_create_bulk_extend_size = 0; - /* POLAR end */ -/* POLAR: partial write */ -bool polar_has_partial_write; - bool polar_disable_escape_inside_gbk_character; /* @@ -1074,8 +1066,8 @@ const char *const config_group_names[] = gettext_noop("PolarDB Buffer Management"), /* POLAR_PROXY */ gettext_noop("PolarDB Proxy"), - /* POLAR_BULK_READ_EXTEND */ - gettext_noop("PolarDB bulk read/extend"), + /* POLAR I/O management */ + gettext_noop("PolarDB I/O Management"), /* POLAR end */ /* DEVELOPER_OPTIONS */ @@ -1233,7 +1225,7 @@ static const unit_conversion time_unit_conversion_table[] = /******** option records follow ********/ static struct config_bool ConfigureNamesBool[] = { - /* POLAR boolean GUCs start */ + /* POLAR bool GUCs start */ { {"polar_enable_persisted_logical_slot", PGC_POSTMASTER, UNGROUPED, gettext_noop("Enable persisted logical slot on shared storage."), @@ -1664,6 +1656,17 @@ static struct config_bool ConfigureNamesBool[] = NULL, NULL, NULL }, + { + {"polar_enable_fallocate_no_hide_stale", PGC_USERSET, POLAR_IO_MANAGEMENT, + gettext_noop("Allow using FALLOC_FL_NO_HIDE_STALE during file extension."), + NULL, + POLAR_GUC_IS_CHANGABLE | POLAR_GUC_IS_INVISIBLE + }, + 
&polar_enable_fallocate_no_hide_stale, + true, + NULL, NULL, NULL + }, + /* * POLAR: enable to send SIGSTOP rather than SIGQUIT to all peers when * backend exit abnormally, this is set with -T parameter when start @@ -1724,20 +1727,6 @@ static struct config_bool ConfigureNamesBool[] = NULL, NULL, NULL }, - /* - * POLAR: bulk io - */ - { - {"polar_enable_primary_recovery_bulk_extend", PGC_SIGHUP, POLAR_BULK_READ_EXTEND, - gettext_noop("A switch to control whether to use xlog bulk extend opt during recovery on primary."), - NULL, - GUC_NO_RESET_ALL | POLAR_GUC_IS_INVISIBLE | POLAR_GUC_IS_CHANGABLE - }, - &polar_enable_primary_recovery_bulk_extend, - true, - NULL, NULL, NULL - }, - { {"polar_enable_async_lock_replay_debug", PGC_SIGHUP, REPLICATION_STANDBY, gettext_noop("Enable async lock replay debug logging."), @@ -1793,7 +1782,7 @@ static struct config_bool ConfigureNamesBool[] = NULL, NULL, NULL }, - /* POLAR boolean GUCs end */ + /* POLAR bool GUCs end */ { {"enable_seqscan", PGC_USERSET, QUERY_TUNING_METHOD, @@ -3067,8 +3056,6 @@ static struct config_bool ConfigureNamesBool[] = }, - /* POLAR bool GUCs end */ - /* End-of-list marker */ { {NULL, 0, 0, NULL, NULL}, NULL, false, NULL, NULL, NULL @@ -3078,7 +3065,7 @@ static struct config_bool ConfigureNamesBool[] = static struct config_int ConfigureNamesInt[] = { - /* POLAR integer GUCs start */ + /* POLAR int GUCs start */ { /* see max_connections */ @@ -3701,89 +3688,76 @@ static struct config_int ConfigureNamesInt[] = GUC_UNIT_BLOCKS | POLAR_GUC_IS_VISIBLE | POLAR_GUC_IS_CHANGABLE }, &polar_ring_buffer_vacuum_size, - 0, 0, (10 * 1024 * 1024 * 1024L) / BLCKSZ, + 128 * 1024 * 1024 / BLCKSZ, 0, (10 * 1024 * 1024 * 1024L) / BLCKSZ, NULL, NULL, NULL }, { - {"polar_recovery_bulk_extend_size", PGC_SIGHUP, POLAR_BULK_READ_EXTEND, - gettext_noop("Sets the size for bulk file extension while replaying xlog on standby (0 turns this feature off)."), - NULL, + {"polar_bulk_read_size", PGC_USERSET, POLAR_IO_MANAGEMENT, + gettext_noop("Size of bulk read."), + gettext_noop("0 turns this feature off."), GUC_UNIT_BLOCKS | POLAR_GUC_IS_INVISIBLE | POLAR_GUC_IS_CHANGABLE }, - &polar_recovery_bulk_extend_size, - 512, 0, 2048, + &polar_bulk_read_size, + 16, 0, MAX_BUFFERS_TO_READ_BY, NULL, NULL, NULL }, { - {"polar_min_bulk_extend_table_size", PGC_USERSET, POLAR_BULK_READ_EXTEND, - gettext_noop("Sets the minimum amount of table data for bulk extend," - "bulk extend is enabled only when the table size >= polar_min_bulk_extend_table_size."), - NULL, + {"polar_heap_bulk_extend_size", PGC_USERSET, POLAR_IO_MANAGEMENT, + gettext_noop("Size of bulk extend for heap table."), + gettext_noop("0 turns this feature off."), GUC_UNIT_BLOCKS | POLAR_GUC_IS_INVISIBLE | POLAR_GUC_IS_CHANGABLE }, - &polar_min_bulk_extend_table_size, - (8 * 1024 * 1024) / BLCKSZ, 0, INT_MAX / 2, + &polar_heap_bulk_extend_size, + 512, 0, MAX_BUFFERS_TO_EXTEND_BY, NULL, NULL, NULL }, { - {"polar_bulk_extend_size", PGC_USERSET, POLAR_BULK_READ_EXTEND, - gettext_noop("Sets the size of preallocate file, 0 (turning this feature off)."), - NULL, + {"polar_index_bulk_extend_size", PGC_USERSET, POLAR_IO_MANAGEMENT, + gettext_noop("Size of bulk extend for index table."), + gettext_noop("0 turns this feature off."), GUC_UNIT_BLOCKS | POLAR_GUC_IS_INVISIBLE | POLAR_GUC_IS_CHANGABLE }, - &polar_bulk_extend_size, - 512, 0, INT_MAX / 2, + &polar_index_bulk_extend_size, + 128, 0, MAX_BUFFERS_TO_EXTEND_BY, NULL, NULL, NULL }, { - {"polar_bulk_read_size", PGC_USERSET, POLAR_BULK_READ_EXTEND, - gettext_noop("Sets 
size of bulk read, 0 (turning this feature off). polar_bulk_read_size= 16 means 128KB."), - NULL, + {"polar_recovery_bulk_extend_size", PGC_SIGHUP, POLAR_IO_MANAGEMENT, + gettext_noop("Size for bulk extend during recovery."), + gettext_noop("0 turns this feature off."), GUC_UNIT_BLOCKS | POLAR_GUC_IS_INVISIBLE | POLAR_GUC_IS_CHANGABLE }, - &polar_bulk_read_size, - 16, 0, POLAR_MAX_BULK_IO_SIZE, - NULL, NULL, NULL - }, - - { - {"polar_xlog_page_buffers", PGC_POSTMASTER, RESOURCES_MEM, - gettext_noop("Sets the size of xlog buffer used by multi processes."), - NULL, - GUC_UNIT_MB | POLAR_GUC_IS_INVISIBLE | POLAR_GUC_IS_CHANGABLE - }, - &polar_xlog_page_buffers, - 0, 0, INT_MAX / 2, + &polar_recovery_bulk_extend_size, + 512, 0, MAX_BUFFERS_TO_EXTEND_BY, NULL, NULL, NULL }, { - {"polar_index_bulk_extend_size", PGC_SIGHUP, POLAR_BULK_READ_EXTEND, - gettext_noop("Sets the size of preallocate file for index, 0 (turning this feature off)."), + {"polar_bulk_write_maxpages", PGC_USERSET, POLAR_IO_MANAGEMENT, + gettext_noop("Max cached pages in bulk write."), NULL, GUC_UNIT_BLOCKS | POLAR_GUC_IS_INVISIBLE | POLAR_GUC_IS_CHANGABLE }, - /* POLAR: default 1MB */ - &polar_index_bulk_extend_size, - 128, 0, INT_MAX / 2, + &polar_bulk_write_maxpages, + 128, 1, 512, NULL, NULL, NULL }, { - {"polar_index_create_bulk_extend_size", PGC_SIGHUP, POLAR_BULK_READ_EXTEND, - gettext_noop("Sets the size of preallocate file for index create, 0 (turning this feature off)."), + {"polar_xlog_page_buffers", PGC_POSTMASTER, RESOURCES_MEM, + gettext_noop("Sets the size of xlog buffer used by multi processes."), NULL, - GUC_UNIT_BLOCKS | POLAR_GUC_IS_INVISIBLE | POLAR_GUC_IS_CHANGABLE + GUC_UNIT_MB | POLAR_GUC_IS_INVISIBLE | POLAR_GUC_IS_CHANGABLE }, - /* POLAR: default 4MB */ - &polar_index_create_bulk_extend_size, - 512, 0, INT_MAX / 2, + &polar_xlog_page_buffers, + 0, 0, INT_MAX / 2, NULL, NULL, NULL }, + /* POLAR int GUCs end */ { @@ -5415,16 +5389,17 @@ static struct config_int ConfigureNamesInt[] = 30000, 0, INT_MAX, NULL, NULL, NULL }, + { - {"polar_wal_init_set_size", PGC_SIGHUP, WAL_SETTINGS, - gettext_noop("Set the size of each data block written when initializing the zero wal file."), - NULL, - GUC_UNIT_BYTE | POLAR_GUC_IS_INVISIBLE | POLAR_GUC_IS_CHANGABLE + {"polar_zero_buffers", PGC_POSTMASTER, RESOURCES_MEM, + gettext_noop("Set the size of zero buffer, which is used to init zero wal and data file."), + gettext_noop("A value of -1 indicates a request for auto-tune, 0 disables it."), + GUC_UNIT_BLOCKS | POLAR_GUC_IS_INVISIBLE | POLAR_GUC_IS_CHANGABLE }, - &polar_wal_init_set_size, - POLAR_DEFAULT_XLOG_FILL_ZERO_SIZE, POLAR_MIN_XLOG_FILL_ZERO_SIZE, POLAR_MAX_XLOG_FILL_ZERO_SIZE, - NULL, NULL, NULL + &polar_zero_buffers, + -1, -1, 4096, }, + { {"polar_instance_spec_mem", PGC_SIGHUP, DEVELOPER_OPTIONS, gettext_noop("PolarDB instance specification for memory."), @@ -6765,6 +6740,17 @@ static struct config_enum ConfigureNamesEnum[] = NULL, NULL, NULL }, + { + {"polar_zero_extend_method", PGC_USERSET, POLAR_IO_MANAGEMENT, + gettext_noop("Selects the method of zero extend to use."), + NULL, + GUC_NOT_IN_SAMPLE | GUC_NO_SHOW_ALL | POLAR_GUC_IS_INVISIBLE | POLAR_GUC_IS_CHANGABLE + }, + &polar_zero_extend_method, + POLAR_ZERO_EXTEND_FALLOCATE, polar_zero_extend_method_options, + NULL, NULL, NULL + }, + /* POLAR enum GUCs end */ { @@ -7183,6 +7169,8 @@ static const char *const map_old_guc_names[] = { /* POLAR */ "smgr_shared_relations", "polar_rsc_shared_relations", /* RSC old style GUC */ "smgr_pool_sweep_times", 
"polar_rsc_pool_sweep_times", /* RSC old style GUC */ + "polar_bulk_extend_size", "polar_heap_bulk_extend_size", /* bulk extend old style + * GUC */ /* POLAR */ NULL diff --git a/src/backend/utils/mmgr/mcxt.c b/src/backend/utils/mmgr/mcxt.c index 29eeaf70f62..08f52a2aa79 100644 --- a/src/backend/utils/mmgr/mcxt.c +++ b/src/backend/utils/mmgr/mcxt.c @@ -1434,25 +1434,12 @@ MemoryContextAllocAligned(MemoryContext context, return aligned; } -void * -MemoryContextAllocIOAligned(MemoryContext context, Size size, int flags) -{ - /* FIXME: don't hardcode page size */ - return MemoryContextAllocAligned(context, size, POLAR_BUFFER_ALIGN_LEN, flags); -} - void * palloc_aligned(Size size, Size alignto, int flags) { return MemoryContextAllocAligned(CurrentMemoryContext, size, alignto, flags); } -void * -palloc_io_aligned(Size size, int flags) -{ - return MemoryContextAllocIOAligned(CurrentMemoryContext, size, flags); -} - /* * POLAR: Like palloc, * but allow to work well in the critical section temporarily. diff --git a/src/backend/utils/probes.d b/src/backend/utils/probes.d index 3ebbcf88ebe..b4be30ec717 100644 --- a/src/backend/utils/probes.d +++ b/src/backend/utils/probes.d @@ -86,6 +86,10 @@ provider postgresql { probe smgr__md__read__done(ForkNumber, BlockNumber, Oid, Oid, Oid, int, int, int); probe smgr__md__write__start(ForkNumber, BlockNumber, Oid, Oid, Oid, int); probe smgr__md__write__done(ForkNumber, BlockNumber, Oid, Oid, Oid, int, int, int); + probe smgr__md__extend__start(ForkNumber, BlockNumber, Oid, Oid, Oid, int); + probe smgr__md__extend__done(ForkNumber, BlockNumber, Oid, Oid, Oid, int, int, int); + probe smgr__md__zeroextend__start(ForkNumber, BlockNumber, Oid, Oid, Oid, int); + probe smgr__md__zeroextend__done(ForkNumber, BlockNumber, Oid, Oid, Oid, int, int, int); probe wal__insert(unsigned char, unsigned char); probe wal__switch(); diff --git a/src/backend/utils/resowner/resowner.c b/src/backend/utils/resowner/resowner.c index 559401dda9d..cde8d3b8c91 100644 --- a/src/backend/utils/resowner/resowner.c +++ b/src/backend/utils/resowner/resowner.c @@ -121,6 +121,7 @@ typedef struct ResourceOwnerData /* We have built-in support for remembering: */ ResourceArray bufferarr; /* owned buffers */ + ResourceArray bufferioarr; /* in-progress buffer IO */ ResourceArray catrefarr; /* catcache references */ ResourceArray catlistrefarr; /* catcache-list pins */ ResourceArray relrefarr; /* relcache references */ @@ -441,6 +442,7 @@ ResourceOwnerCreate(ResourceOwner parent, const char *name) } ResourceArrayInit(&(owner->bufferarr), BufferGetDatum(InvalidBuffer)); + ResourceArrayInit(&(owner->bufferioarr), BufferGetDatum(InvalidBuffer)); ResourceArrayInit(&(owner->catrefarr), PointerGetDatum(NULL)); ResourceArrayInit(&(owner->catlistrefarr), PointerGetDatum(NULL)); ResourceArrayInit(&(owner->relrefarr), PointerGetDatum(NULL)); @@ -516,6 +518,24 @@ ResourceOwnerReleaseInternal(ResourceOwner owner, if (phase == RESOURCE_RELEASE_BEFORE_LOCKS) { + /* + * Abort failed buffer IO. AbortBufferIO()->TerminateBufferIO() calls + * ResourceOwnerForgetBufferIOs(), so we just have to iterate till + * there are none. + * + * Needs to be before we release buffer pins. + * + * During a commit, there shouldn't be any in-progress IO. + */ + while (ResourceArrayGetAny(&(owner->bufferioarr), &foundres)) + { + Buffer res = DatumGetBuffer(foundres); + + if (isCommit) + elog(PANIC, "lost track of buffer IO on buffer %u", res); + AbortBufferIO(res); + } + /* * Release buffer pins. 
Note that ReleaseBuffer will remove the * buffer entry from our array, so we just have to iterate till there @@ -741,6 +761,7 @@ ResourceOwnerDelete(ResourceOwner owner) /* And it better not own any resources, either */ Assert(owner->bufferarr.nitems == 0); + Assert(owner->bufferioarr.nitems == 0); Assert(owner->catrefarr.nitems == 0); Assert(owner->catlistrefarr.nitems == 0); Assert(owner->relrefarr.nitems == 0); @@ -770,6 +791,7 @@ ResourceOwnerDelete(ResourceOwner owner) /* And free the object. */ ResourceArrayFree(&(owner->bufferarr)); + ResourceArrayFree(&(owner->bufferioarr)); ResourceArrayFree(&(owner->catrefarr)); ResourceArrayFree(&(owner->catlistrefarr)); ResourceArrayFree(&(owner->relrefarr)); @@ -971,6 +993,43 @@ ResourceOwnerForgetBuffer(ResourceOwner owner, Buffer buffer) buffer, owner->name); } +/* + * Make sure there is room for at least one more entry in a ResourceOwner's + * buffer array. + * + * This is separate from actually inserting an entry because if we run out + * of memory, it's critical to do so *before* acquiring the resource. + */ +void +ResourceOwnerEnlargeBufferIOs(ResourceOwner owner) +{ + /* We used to allow pinning buffers without a resowner, but no more */ + Assert(owner != NULL); + ResourceArrayEnlarge(&(owner->bufferioarr)); +} + +/* + * Remember that a buffer IO is owned by a ResourceOwner + * + * Caller must have previously done ResourceOwnerEnlargeBufferIOs() + */ +void +ResourceOwnerRememberBufferIO(ResourceOwner owner, Buffer buffer) +{ + ResourceArrayAdd(&(owner->bufferioarr), BufferGetDatum(buffer)); +} + +/* + * Forget that a buffer IO is owned by a ResourceOwner + */ +void +ResourceOwnerForgetBufferIO(ResourceOwner owner, Buffer buffer) +{ + if (!ResourceArrayRemove(&(owner->bufferioarr), BufferGetDatum(buffer))) + elog(PANIC, "buffer IO %d is not owned by resource owner %s", + buffer, owner->name); +} + /* * Remember that a Local Lock is owned by a ResourceOwner * diff --git a/src/backend/utils/sort/logtape.c b/src/backend/utils/sort/logtape.c index fb0d395f44a..4e4ca750deb 100644 --- a/src/backend/utils/sort/logtape.c +++ b/src/backend/utils/sort/logtape.c @@ -252,7 +252,7 @@ ltsWriteBlock(LogicalTapeSet *lts, long blocknum, void *buffer) */ while (blocknum > lts->nBlocksWritten) { - PGAlignedBlock zerobuf; + PGIOAlignedBlock zerobuf; MemSet(zerobuf.data, 0, sizeof(zerobuf)); diff --git a/src/bin/pg_basebackup/pg_basebackup.c b/src/bin/pg_basebackup/pg_basebackup.c index b0045746b0b..8864460c7cf 100644 --- a/src/bin/pg_basebackup/pg_basebackup.c +++ b/src/bin/pg_basebackup/pg_basebackup.c @@ -2965,7 +2965,7 @@ main(int argc, char **argv) BaseBackup(compression_algorithm, compression_detail, compressloc, &client_compress); - polar_vfs_destory_fe(ftype, polar_disk_name); + polar_vfs_destroy_fe(ftype, polar_disk_name); success = true; return 0; } diff --git a/src/bin/pg_basebackup/walmethods.c b/src/bin/pg_basebackup/walmethods.c index d363c11f20b..98926a64bac 100644 --- a/src/bin/pg_basebackup/walmethods.c +++ b/src/bin/pg_basebackup/walmethods.c @@ -213,33 +213,27 @@ dir_open_for_write(const char *pathname, const char *temp_suffix, size_t pad_to_ /* Do pre-padding on non-compressed files */ if (pad_to_size && dir_data->compression_algorithm == PG_COMPRESSION_NONE) { - char *data; - int bytes; - int write_once_bytes; + ssize_t rc; - write_once_bytes = polar_is_write_pfs ? 
MAX_SEND_SIZE : XLOG_BLCKSZ; - data = (char *) pg_malloc0(write_once_bytes); - for (bytes = 0; bytes < pad_to_size; bytes += write_once_bytes) + rc = polar_pwrite_zeros(fd, pad_to_size, 0); + + if (rc < 0) { - errno = 0; - if (polar_write(fd, data, write_once_bytes) != write_once_bytes) - { - /* If write didn't set errno, assume problem is no disk space */ - dir_data->lasterrno = errno ? errno : ENOSPC; - pg_free(data); - polar_close(fd); - return NULL; - } + dir_data->lasterrno = errno; + polar_close(fd); + return NULL; } + /* + * pg_pwrite() (called via polar_pwrite_zeros()) may have moved the + * file position, so reset it (see win32pwrite.c). + */ if (polar_lseek(fd, 0, SEEK_SET) != 0) { dir_data->lasterrno = errno; - pg_free(data); polar_close(fd); return NULL; } - pg_free(data); } /* diff --git a/src/bin/pg_checksums/pg_checksums.c b/src/bin/pg_checksums/pg_checksums.c index 7987c734317..d843328458b 100644 --- a/src/bin/pg_checksums/pg_checksums.c +++ b/src/bin/pg_checksums/pg_checksums.c @@ -188,7 +188,7 @@ skipfile(const char *fn) static void scan_file(const char *fn, int segmentno) { - PGAlignedBlock buf; + PGIOAlignedBlock buf; PageHeader header = (PageHeader) buf.data; int f; BlockNumber blockno; @@ -686,7 +686,7 @@ main(int argc, char *argv[]) printf(_("Checksums disabled in cluster\n")); } - polar_vfs_destory_simple_fe(); + polar_vfs_destroy_simple_fe(); return 0; } diff --git a/src/bin/pg_controldata/pg_controldata.c b/src/bin/pg_controldata/pg_controldata.c index 6cf3b455650..ed900116490 100644 --- a/src/bin/pg_controldata/pg_controldata.c +++ b/src/bin/pg_controldata/pg_controldata.c @@ -183,7 +183,7 @@ main(int argc, char *argv[]) "Either the file is corrupt, or it has a different layout than this program\n" "is expecting. The results below are untrustworthy.\n\n")); - polar_vfs_destory_simple_fe(); + polar_vfs_destroy_simple_fe(); /* set wal segment size */ WalSegSz = ControlFile->xlog_seg_size; diff --git a/src/bin/pg_ctl/pg_ctl.c b/src/bin/pg_ctl/pg_ctl.c index 7b1b3a9a54f..22e60239703 100644 --- a/src/bin/pg_ctl/pg_ctl.c +++ b/src/bin/pg_ctl/pg_ctl.c @@ -2296,7 +2296,7 @@ get_control_dbstate(void) /* POLAR: umount */ if (pg_config) - polar_vfs_destory_simple_fe(); + polar_vfs_destroy_simple_fe(); return ret; } diff --git a/src/bin/pg_resetwal/pg_resetwal.c b/src/bin/pg_resetwal/pg_resetwal.c index a6ff206f289..a040f5053c4 100644 --- a/src/bin/pg_resetwal/pg_resetwal.c +++ b/src/bin/pg_resetwal/pg_resetwal.c @@ -520,7 +520,7 @@ main(int argc, char *argv[]) if (polar_real_datadir) free(polar_real_datadir); - polar_vfs_destory_simple_fe(); + polar_vfs_destroy_simple_fe(); printf(_("Write-ahead log reset\n")); return 0; diff --git a/src/bin/pg_rewind/local_source.c b/src/bin/pg_rewind/local_source.c index 2e50485c395..83b37a1e91c 100644 --- a/src/bin/pg_rewind/local_source.c +++ b/src/bin/pg_rewind/local_source.c @@ -77,7 +77,7 @@ static void local_queue_fetch_file(rewind_source *source, const char *path, size_t len) { const char *datadir = ((local_source *) source)->datadir; - PGAlignedBlock buf; + PGIOAlignedBlock buf; char srcpath[MAXPGPATH]; int srcfd; size_t written_len; @@ -129,7 +129,7 @@ local_queue_fetch_range(rewind_source *source, const char *path, off_t off, size_t len) { const char *datadir = ((local_source *) source)->datadir; - PGAlignedBlock buf; + PGIOAlignedBlock buf; char srcpath[MAXPGPATH]; int srcfd; off_t begin = off; diff --git a/src/bin/pg_upgrade/file.c b/src/bin/pg_upgrade/file.c index e16c2e009eb..7365bdef2e1 100644 --- 
a/src/bin/pg_upgrade/file.c +++ b/src/bin/pg_upgrade/file.c @@ -177,8 +177,8 @@ rewriteVisibilityMap(const char *fromfile, const char *tofile, { int src_fd; int dst_fd; - PGAlignedBlock buffer; - PGAlignedBlock new_vmbuf; + PGIOAlignedBlock buffer; + PGIOAlignedBlock new_vmbuf; ssize_t totalBytesRead = 0; ssize_t src_filesize; int rewriteVmBytesPerPage; diff --git a/src/bin/pg_waldump/pg_waldump.c b/src/bin/pg_waldump/pg_waldump.c index c07dc28809e..7f6af3935fd 100644 --- a/src/bin/pg_waldump/pg_waldump.c +++ b/src/bin/pg_waldump/pg_waldump.c @@ -1196,13 +1196,13 @@ main(int argc, char **argv) XLogReaderFree(xlogreader_state); if (polar_disk_name && polar_storage_cluster_name) - polar_vfs_destory_fe(ftype, polar_disk_name); + polar_vfs_destroy_fe(ftype, polar_disk_name); return EXIT_SUCCESS; bad_argument: if (polar_disk_name && polar_storage_cluster_name) - polar_vfs_destory_fe(ftype, polar_disk_name); + polar_vfs_destroy_fe(ftype, polar_disk_name); pg_log_error_hint("Try \"%s --help\" for more information.", progname); return EXIT_FAILURE; } diff --git a/src/common/file_utils.c b/src/common/file_utils.c index ccb9133c2ee..68ddc694c99 100644 --- a/src/common/file_utils.c +++ b/src/common/file_utils.c @@ -28,11 +28,16 @@ #ifdef FRONTEND #include "common/logging.h" #endif +#include "port/pg_iovec.h" /* POLAR */ #include "storage/polar_fd.h" /* POLAR end */ +int polar_zero_buffer_size = 0; +int polar_zero_buffers = -1; +void *polar_zero_buffer = NULL; + #ifdef FRONTEND /* Define PG_FLUSH_DATA_WORKS if we have an implementation for pg_flush_data */ @@ -498,3 +503,203 @@ get_dirent_type(const char *path, return result; } + +/* + * Compute what remains to be done after a possibly partial vectored read or + * write. The part of 'source' beginning after 'transferred' bytes is copied + * to 'destination', and its length is returned. 'source' and 'destination' + * may point to the same array, for in-place adjustment. A return value of + * zero indicates completion (for callers without a cheaper way to know that). + */ +int +compute_remaining_iovec(struct iovec *destination, + const struct iovec *source, + int iovcnt, + size_t transferred) +{ + Assert(iovcnt > 0); + + /* Skip wholly transferred iovecs. */ + while (source->iov_len <= transferred) + { + transferred -= source->iov_len; + source++; + iovcnt--; + + /* All iovecs transferred? */ + if (iovcnt == 0) + { + /* + * We don't expect the kernel to transfer more than we asked it + * to, or something is out of sync. + */ + Assert(transferred == 0); + return 0; + } + } + + /* Copy the remaining iovecs to the front of the array. */ + if (source != destination) + memmove(destination, source, sizeof(*source) * iovcnt); + + /* Adjust leading iovec, which may have been partially transferred. */ + Assert(destination->iov_len > transferred); + destination->iov_base = (char *) destination->iov_base + transferred; + destination->iov_len -= transferred; + + return iovcnt; +} + +/* + * pg_pwritev_with_retry + * + * Convenience wrapper for pg_pwritev() that retries on partial write. If an + * error is returned, it is unspecified how much has been written. + */ +ssize_t +pg_pwritev_with_retry(int fd, const struct iovec *iov, int iovcnt, off_t offset) +{ + struct iovec iov_copy[PG_IOV_MAX]; + ssize_t sum = 0; + ssize_t part; + + /* We'd better have space to make a copy, in case we need to retry. */ + if (iovcnt > PG_IOV_MAX) + { + errno = EINVAL; + return -1; + } + + do + { + /* Write as much as we can. 
*/ + part = polar_pwritev(fd, iov, iovcnt, offset); + if (part < 0) + return -1; + +#ifdef SIMULATE_SHORT_WRITE + part = Min(part, 4096); +#endif + + /* Count our progress. */ + sum += part; + offset += part; + + /* + * See what is left. On the first loop we used the caller's array, + * but in later loops we'll use our local copy that we are allowed to + * mutate. + */ + iovcnt = compute_remaining_iovec(iov_copy, iov, iovcnt, part); + iov = iov_copy; + } while (iovcnt > 0); + + return sum; +} + +/* + * pg_pwrite_zeros + * + * Writes zeros to file worth "size" bytes at "offset" (from the start of the + * file), using vectored I/O. + * + * Returns the total amount of data written. On failure, a negative value + * is returned with errno set. + */ +ssize_t +pg_pwrite_zeros(int fd, size_t size, off_t offset) +{ + static const PGIOAlignedBlock zbuffer = {{0}}; /* worth BLCKSZ */ + void *zerobuf_addr = unconstify(PGIOAlignedBlock *, &zbuffer)->data; + struct iovec iov[PG_IOV_MAX]; + size_t remaining_size = size; + ssize_t total_written = 0; + + /* Loop, writing as many blocks as we can for each system call. */ + while (remaining_size > 0) + { + int iovcnt = 0; + ssize_t written; + + for (; iovcnt < PG_IOV_MAX && remaining_size > 0; iovcnt++) + { + size_t this_iov_size; + + iov[iovcnt].iov_base = zerobuf_addr; + + if (remaining_size < BLCKSZ) + this_iov_size = remaining_size; + else + this_iov_size = BLCKSZ; + + iov[iovcnt].iov_len = this_iov_size; + remaining_size -= this_iov_size; + } + + written = pg_pwritev_with_retry(fd, iov, iovcnt, offset); + + if (written < 0) + return written; + + offset += written; + total_written += written; + } + + Assert(total_written == size); + + return total_written; +} + +/* + * polar_pwrite_zeros + * + * Writes zeros to file worth "size" bytes at "offset" (from the start of the + * file), using bulk I/O. + * + * Returns the total amount of data written. On failure, a negative value + * is returned with errno set. + * + * If there is no valid global zero buffer, it will fallback to pg_pwrite_zeros. + */ +ssize_t +polar_pwrite_zeros(int fd, size_t size, off_t offset) +{ + size_t remaining_size = size; + ssize_t total_written = 0; + +#ifdef FRONTEND + if (polar_zero_buffer_size == 0) + { +#define FRONTEND_ZERO_BUFFER_SIZE (1024 * 1024) + /* In frontend, we malloc a fixed size of 1MB and never free */ + polar_zero_buffer = (void *) TYPEALIGN(PG_IO_ALIGN_SIZE, + malloc(FRONTEND_ZERO_BUFFER_SIZE + PG_IO_ALIGN_SIZE)); + polar_zero_buffer_size = FRONTEND_ZERO_BUFFER_SIZE; + } +#else + if (polar_zero_buffer_size == 0) + return pg_pwrite_zeros(fd, size, offset); + + Assert(polar_zero_buffer); +#endif + + /* Loop, writing as many blocks as we can for each system call. 
*/ + while (remaining_size > 0) + { + ssize_t written; + size_t amount = Min(remaining_size, polar_zero_buffer_size); + + written = polar_pwrite(fd, polar_zero_buffer, amount, offset); + + if (written != amount) + return -1; + + remaining_size -= written; + offset += written; + total_written += written; + } + + Assert(total_written == size); + + return total_written; +} diff --git a/src/include/access/hio.h b/src/include/access/hio.h index bb90c6fad81..73b5f98c0e7 100644 --- a/src/include/access/hio.h +++ b/src/include/access/hio.h @@ -32,6 +32,9 @@ typedef struct BulkInsertStateData Buffer current_buf; /* current insertion target page */ } BulkInsertStateData; +/* GUCs */ +extern PGDLLIMPORT int polar_index_bulk_extend_size; +extern PGDLLIMPORT int polar_heap_bulk_extend_size; extern void RelationPutHeapTuple(Relation relation, Buffer buffer, HeapTuple tuple, bool token); @@ -40,4 +43,7 @@ extern Buffer RelationGetBufferForTuple(Relation relation, Size len, BulkInsertStateData *bistate, Buffer *vmbuffer, Buffer *vmbuffer_other); +extern int polar_get_bulk_extend_size(BlockNumber first_block, int bulk_extend_size); +extern Buffer polar_index_add_blocks(Relation relation); + #endif /* HIO_H */ diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index d4ff4843eb4..ff836f7769b 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -58,12 +58,6 @@ extern PGDLLIMPORT bool track_wal_io_timing; extern PGDLLIMPORT int wal_decode_buffer_size; extern PGDLLIMPORT int CheckPointSegments; -extern int polar_wal_init_set_size; - -/* xlog init zero file write size */ -#define POLAR_DEFAULT_XLOG_FILL_ZERO_SIZE 1024 * 1024 -#define POLAR_MIN_XLOG_FILL_ZERO_SIZE XLOG_BLCKSZ -#define POLAR_MAX_XLOG_FILL_ZERO_SIZE 4 * 1024 * 1024 /* Archive modes */ typedef enum ArchiveMode diff --git a/src/include/access/xlogutils.h b/src/include/access/xlogutils.h index 0e47962daa4..a77b5270464 100644 --- a/src/include/access/xlogutils.h +++ b/src/include/access/xlogutils.h @@ -14,6 +14,9 @@ #include "access/xlogreader.h" #include "storage/bufmgr.h" +/* POLAR */ +#include "storage/polar_bufmgr.h" + /* * Prior to 8.4, all activity during recovery was carried out by the startup * process. This local variable continues to be used in many parts of the @@ -56,9 +59,8 @@ extern PGDLLIMPORT HotStandbyState standbyState; #define InHotStandby (standbyState >= STANDBY_SNAPSHOT_PENDING) - -/* POLAR */ -#include "storage/polar_bufmgr.h" +/* GUCs */ +extern PGDLLIMPORT int polar_recovery_bulk_extend_size; extern bool XLogHaveInvalidPages(void); extern void XLogCheckInvalidPages(void); @@ -118,4 +120,6 @@ extern void XLogReadDetermineTimeline(XLogReaderState *state, extern void WALReadRaiseError(WALReadError *errinfo); +extern int polar_get_recovery_bulk_extend_size(BlockNumber target_block, BlockNumber nblocks); + #endif diff --git a/src/include/c.h b/src/include/c.h index 8ae7471d72f..5a4f094e78b 100644 --- a/src/include/c.h +++ b/src/include/c.h @@ -1247,34 +1247,45 @@ extern void polar_exceptional_condition(const char *conditionName, /* * Use this, not "char buf[BLCKSZ]", to declare a field or local variable - * holding a page buffer, if that page might be accessed as a page and not - * just a string of bytes. Otherwise the variable might be under-aligned, - * causing problems on alignment-picky hardware. 
(In some places, we use - * this to declare buffers even though we only pass them to read() and - * write(), because copying to/from aligned buffers is usually faster than - * using unaligned buffers.) We include both "double" and "int64" in the - * union to ensure that the compiler knows the value must be MAXALIGN'ed - * (cf. configure's computation of MAXIMUM_ALIGNOF). + * holding a page buffer, if that page might be accessed as a page. Otherwise + * the variable might be under-aligned, causing problems on alignment-picky + * hardware. We include both "double" and "int64" in the union to ensure that + * the compiler knows the value must be MAXALIGN'ed (cf. configure's + * computation of MAXIMUM_ALIGNOF). */ typedef union PGAlignedBlock { #ifdef pg_attribute_aligned - pg_attribute_aligned(4096) -#else - __declspec(align(4096)) + pg_attribute_aligned(PG_IO_ALIGN_SIZE) #endif char data[BLCKSZ]; double force_align_d; int64 force_align_i64; } PGAlignedBlock; +/* + * Use this to declare a field or local variable holding a page buffer, if that + * page might be accessed as a page or passed to an SMgr I/O function. If + * allocating using the MemoryContext API, the aligned allocation functions + * should be used with PG_IO_ALIGN_SIZE. This alignment may be more efficient + * for I/O in general, but may be strictly required on some platforms when + * using direct I/O. + */ +typedef union PGIOAlignedBlock +{ +#ifdef pg_attribute_aligned + pg_attribute_aligned(PG_IO_ALIGN_SIZE) +#endif + char data[BLCKSZ]; + double force_align_d; + int64 force_align_i64; +} PGIOAlignedBlock; + /* Same, but for an XLOG_BLCKSZ-sized buffer */ typedef union PGAlignedXLogBlock { #ifdef pg_attribute_aligned - pg_attribute_aligned(4096) -#else - __declspec(align(4096)) + pg_attribute_aligned(PG_IO_ALIGN_SIZE) #endif char data[XLOG_BLCKSZ]; double force_align_d; diff --git a/src/include/common/file_utils.h b/src/include/common/file_utils.h index 9e423ed9027..743edcdd655 100644 --- a/src/include/common/file_utils.h +++ b/src/include/common/file_utils.h @@ -24,6 +24,12 @@ typedef enum PGFileType PGFILETYPE_LNK } PGFileType; +struct iovec; /* avoid including port/pg_iovec.h here */ + +extern int polar_zero_buffer_size; +extern int polar_zero_buffers; +extern void *polar_zero_buffer; + #ifdef FRONTEND extern int fsync_fname(const char *fname, bool isdir); extern void fsync_pgdata(const char *pg_data, int serverVersion); @@ -38,4 +44,18 @@ extern PGFileType get_dirent_type(const char *path, bool look_through_symlinks, int elevel); +extern int compute_remaining_iovec(struct iovec *destination, + const struct iovec *source, + int iovcnt, + size_t transferred); + +extern ssize_t pg_pwritev_with_retry(int fd, + const struct iovec *iov, + int iovcnt, + off_t offset); + +extern ssize_t pg_pwrite_zeros(int fd, size_t size, off_t offset); + +extern ssize_t polar_pwrite_zeros(int fd, size_t size, off_t offset); + #endif /* FILE_UTILS_H */ diff --git a/src/include/pg_config_manual.h b/src/include/pg_config_manual.h index a1910756a88..9f612eaac30 100644 --- a/src/include/pg_config_manual.h +++ b/src/include/pg_config_manual.h @@ -271,6 +271,12 @@ */ #define PG_CACHE_LINE_SIZE 128 +/* + * Assumed alignment requirement for direct I/O. 4K corresponds to common + * sector and memory page size. 
+ */ +#define PG_IO_ALIGN_SIZE 4096 + /* *------------------------------------------------------------------------ * The following symbols are for enabling debugging code, not for diff --git a/src/include/pgstat.h b/src/include/pgstat.h index 18257077e0d..cec3e1490a2 100644 --- a/src/include/pgstat.h +++ b/src/include/pgstat.h @@ -196,9 +196,6 @@ typedef struct PgStat_TableCounts PgStat_Counter polar_t_bulk_read_calls_IO; PgStat_Counter polar_t_bulk_read_blocks_IO; /* POLAR end */ - - /* bulk create index extend times */ - PgStat_Counter polar_t_bulk_create_index_extends_times; } PgStat_TableCounts; /* ---------- @@ -257,7 +254,7 @@ typedef struct PgStat_TableXactStatus * ------------------------------------------------------------ */ -#define PGSTAT_FILE_FORMAT_ID 0x01A5BCA9 +#define PGSTAT_FILE_FORMAT_ID 0x01A5BCA8 typedef struct PgStat_ArchiverStats { @@ -412,10 +409,6 @@ typedef struct PgStat_StatTabEntry /* bulk read calls, IO read blocks counts */ PgStat_Counter polar_bulk_read_blocks_IO; /* POLAR end */ - - /* POLAR: bulk extend */ - PgStat_Counter polar_bulk_create_index_extends_times; - /* POLAR end */ } PgStat_StatTabEntry; typedef struct PgStat_WalStats @@ -685,13 +678,6 @@ polar_stat_wait_obj_and_time_clear(void) /* POLAR: end */ -/* POLAR: bulk create index extend stats */ -#define polar_pgstat_count_bulk_create_index_extend_times(rel) \ - do { \ - if ((rel)->pgstat_info != NULL) \ - (rel)->pgstat_info->t_counts.polar_t_bulk_create_index_extends_times++; \ - } while (0) -/* POLAR end */ extern void pgstat_count_heap_insert(Relation rel, PgStat_Counter n); extern void pgstat_count_heap_update(Relation rel, bool hot); diff --git a/src/include/polar_vfs/polar_directio.h b/src/include/polar_vfs/polar_directio.h index d1e043f87b2..e2c3f900e6f 100644 --- a/src/include/polar_vfs/polar_directio.h +++ b/src/include/polar_vfs/polar_directio.h @@ -17,8 +17,7 @@ * limitations under the License. 
* * IDENTIFICATION - * src/include/polar_vfs/polar_directio.h - * + * src/include/polar_vfs/polar_directio.h * *------------------------------------------------------------------------- */ @@ -46,10 +45,9 @@ extern char *polar_directio_buffer; extern const vfs_mgr polar_vfs_dio; #define POLAR_ACCESS_MODE_MASK 0x3 -#define POLAR_DIRECTIO_ALIGN_LEN POLAR_BUFFER_ALIGN_LEN -#define POLAR_DIRECTIO_ALIGN_DOWN(LEN) TYPEALIGN_DOWN(POLAR_DIRECTIO_ALIGN_LEN, LEN) -#define POLAR_DIRECTIO_ALIGN(LEN) TYPEALIGN(POLAR_DIRECTIO_ALIGN_LEN, LEN) -#define POLAR_DIECRTIO_IS_ALIGNED(LEN) !((uintptr_t)(LEN) & (uintptr_t)(POLAR_DIRECTIO_ALIGN_LEN - 1)) +#define POLAR_DIRECTIO_ALIGN_DOWN(LEN) TYPEALIGN_DOWN(PG_IO_ALIGN_SIZE, LEN) +#define POLAR_DIRECTIO_ALIGN(LEN) TYPEALIGN(PG_IO_ALIGN_SIZE, LEN) +#define POLAR_DIRECTIO_IS_ALIGNED(LEN) !((uintptr_t)(LEN) & (uintptr_t)(PG_IO_ALIGN_SIZE - 1)) extern int polar_directio_open(const char *path, int flags, mode_t mode); extern ssize_t polar_directio_read(int fd, void *buf, size_t len); diff --git a/src/include/polar_vfs/polar_vfs_fe.h b/src/include/polar_vfs/polar_vfs_fe.h index 3a70e9a10b0..8f315dd2a08 100644 --- a/src/include/polar_vfs/polar_vfs_fe.h +++ b/src/include/polar_vfs/polar_vfs_fe.h @@ -44,12 +44,12 @@ extern char *polar_storage_cluster_name; extern int polar_mkdir_p(char *path, int omode); extern void polar_vfs_init_fe(bool is_pfs, char *fname, char *storage_cluster_name, char *polar_disk_name, int flag); -extern void polar_vfs_destory_fe(char *ftype, char *polar_disk_name); +extern void polar_vfs_destroy_fe(char *ftype, char *polar_disk_name); extern bool polar_in_shared_storage_mode_fe(char *pgconfig); extern bool polar_in_localfs_mode_fe(char *pgconfig); extern bool polar_in_replica_mode_fe(const char *pgconfig); extern void polar_vfs_init_simple_fe(char *pgconfig, char *pg_datadir, int flag); -extern void polar_vfs_destory_simple_fe(void); +extern void polar_vfs_destroy_simple_fe(void); extern int polar_vfs_state_backup_current(void); extern int polar_vfs_state_restore_current(int index); extern int polar_vfs_state_backup(bool is_shared, bool is_localfs, int hostid, diff --git a/src/include/portability/instr_time.h b/src/include/portability/instr_time.h index ca09a1608f8..451a01fa6bc 100644 --- a/src/include/portability/instr_time.h +++ b/src/include/portability/instr_time.h @@ -141,6 +141,9 @@ typedef struct timespec instr_time; #define INSTR_TIME_GET_MICROSEC(t) \ (((uint64) (t).tv_sec * (uint64) 1000000) + (uint64) ((t).tv_nsec / 1000)) +#define INSTR_TIME_GET_NANOSEC(t) \ + (((uint64) (t).tv_sec * (uint64) 1000000000) + (uint64) ((t).tv_nsec)) + #else /* !HAVE_CLOCK_GETTIME */ /* Use gettimeofday() */ diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h index e23e252b7e5..fb04b3c678b 100644 --- a/src/include/storage/buf_internals.h +++ b/src/include/storage/buf_internals.h @@ -243,7 +243,7 @@ typedef struct BufferDesc * platform with either 32 or 128 byte line sizes, it's good to align to * boundaries and avoid false sharing. */ -#define BUFFERDESC_PAD_TO_SIZE (SIZEOF_VOID_P == 8 ? 64 : 1) +#define BUFFERDESC_PAD_TO_SIZE (SIZEOF_VOID_P == 8 ? 
128 : 1) typedef union BufferDescPadded { @@ -251,6 +251,9 @@ typedef union BufferDescPadded char pad[BUFFERDESC_PAD_TO_SIZE]; } BufferDescPadded; +StaticAssertDecl(sizeof(BufferDesc) <= BUFFERDESC_PAD_TO_SIZE, + "padding size is too small to fit BufferDesc"); + #define GetBufferDescriptor(id) (&BufferDescriptors[(id)].bufferdesc) #define GetLocalBufferDescriptor(id) (&LocalBufferDescriptors[(id)]) diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h index df531edfff3..ed2d2617d37 100644 --- a/src/include/storage/bufmgr.h +++ b/src/include/storage/bufmgr.h @@ -77,6 +77,9 @@ extern PGDLLIMPORT bool track_io_timing; extern PGDLLIMPORT int effective_io_concurrency; extern PGDLLIMPORT int maintenance_io_concurrency; +#define MAX_BUFFERS_TO_READ_BY 64 +#define MAX_BUFFERS_TO_EXTEND_BY 1024 + extern PGDLLIMPORT int checkpoint_flush_after; extern PGDLLIMPORT int backend_flush_after; extern PGDLLIMPORT int bgwriter_flush_after; @@ -89,12 +92,6 @@ extern PGDLLIMPORT int NLocBuffer; extern PGDLLIMPORT Block *LocalBufferBlockPointers; extern PGDLLIMPORT int32 *LocalRefCount; -/* POLAR: bulk read */ -extern bool polar_bulk_io_is_in_progress; -extern int polar_bulk_io_in_progress_count; - -/* POLAR end */ - /* upper limit for effective_io_concurrency */ #define MAX_IO_CONCURRENCY 1000 @@ -255,7 +252,7 @@ extern bool ConditionalLockBufferForCleanup(Buffer buffer); extern bool IsBufferCleanupOK(Buffer buffer); extern bool HoldingBufferPinThatDelaysRecovery(void); -extern void AbortBufferIO(void); +extern void AbortBufferIO(Buffer buffer); extern void BufmgrCommit(void); extern bool BgBufferSync(struct WritebackContext *wb_context, int flags); @@ -281,7 +278,10 @@ extern bool StartBufferIO(BufferDesc *buf, bool forInput); /* POLAR: bulk read */ extern int polar_get_buffer_access_strategy_ring_size(BufferAccessStrategy strategy); -extern BufferDesc **polar_bulk_io_in_progress_buf; + +extern Buffer polar_read_buffer_common(struct SMgrRelationData *smgr, char relpersistence, ForkNumber forkNum, + BlockNumber blockNum, ReadBufferMode mode, + BufferAccessStrategy strategy); /* inline functions */ @@ -326,6 +326,9 @@ TestForOldSnapshot(Snapshot snapshot, Relation relation, Page page) } /* POLAR */ +extern PGDLLIMPORT bool polar_has_partial_write; +extern PGDLLIMPORT int polar_bulk_read_size; + extern void polar_lock_buffer_for_cleanup_ext(Buffer buffer, bool fresh_check); extern void polar_lock_buffer_ext(Buffer buffer, int mode, bool fresh_check); extern bool polar_conditional_lock_buffer_ext(Buffer buffer, bool fresh_check); diff --git a/src/include/storage/bulk_write.h b/src/include/storage/bulk_write.h new file mode 100644 index 00000000000..73e4f08dde8 --- /dev/null +++ b/src/include/storage/bulk_write.h @@ -0,0 +1,44 @@ +/*------------------------------------------------------------------------- + * + * bulk_write.h + * Efficiently and reliably populate a new relation + * + * + * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/storage/bulk_write.h + * + *------------------------------------------------------------------------- + */ +#ifndef BULK_WRITE_H +#define BULK_WRITE_H + +#include "storage/smgr.h" +#include "utils/rel.h" + +/* GUCs */ +extern PGDLLIMPORT int polar_bulk_write_maxpages; + +/* Bulk writer state, contents are private to bulk_write.c */ +typedef struct BulkWriteState BulkWriteState; + +/* + * Temporary buffer to hold a page to until it's written out. 
Use + * smgr_bulk_get_buf() to reserve one of these. This is a separate typedef to + * distinguish it from other block-sized buffers passed around in the system. + */ +typedef PGIOAlignedBlock *BulkWriteBuffer; + +/* forward declared from smgr.h */ +struct SMgrRelationData; + +extern BulkWriteState *smgr_bulk_start_rel(Relation rel, ForkNumber forknum); +extern BulkWriteState *smgr_bulk_start_smgr(struct SMgrRelationData *smgr, ForkNumber forknum, bool use_wal, char relpersistence); + +extern BulkWriteBuffer smgr_bulk_get_buf(BulkWriteState *bulkstate); +extern void smgr_bulk_write(BulkWriteState *bulkstate, BlockNumber blocknum, BulkWriteBuffer buf, bool page_std); + +extern void smgr_bulk_finish(BulkWriteState *bulkstate); + +#endif /* BULK_WRITE_H */ diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h index 69549b000fa..b5673adac80 100644 --- a/src/include/storage/fd.h +++ b/src/include/storage/fd.h @@ -51,8 +51,6 @@ typedef enum RecoveryInitSyncMethod RECOVERY_INIT_SYNC_METHOD_SYNCFS } RecoveryInitSyncMethod; -struct iovec; /* avoid including port/pg_iovec.h here */ - typedef int File; @@ -61,6 +59,11 @@ extern PGDLLIMPORT int max_files_per_process; extern PGDLLIMPORT bool data_sync_retry; extern PGDLLIMPORT int recovery_init_sync_method; +/* POLAR: GUC */ +extern PGDLLIMPORT bool polar_enable_fallocate_no_hide_stale; + +/* POLAR end */ + /* * This is private to fd.c, but exported for save/restore_backend_variables() */ @@ -84,9 +87,10 @@ extern PGDLLIMPORT int max_safe_fds; * to the appropriate Windows flag in src/port/open.c. We simulate it with * fcntl(F_NOCACHE) on macOS inside fd.c's open() wrapper. We use the name * PG_O_DIRECT rather than defining O_DIRECT in that case (probably not a good - * idea on a Unix). + * idea on a Unix). We can only use it if the compiler will correctly align + * PGIOAlignedBlock for us, though. 
*/ -#if defined(O_DIRECT) +#if defined(O_DIRECT) && defined(pg_attribute_aligned) #define PG_O_DIRECT O_DIRECT #elif defined(F_NOCACHE) #define PG_O_DIRECT 0x80000000 @@ -104,10 +108,13 @@ extern File PathNameOpenFile(const char *fileName, int fileFlags); extern File PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode); extern File OpenTemporaryFile(bool interXact); extern void FileClose(File file); -extern int FilePrefetch(File file, off_t offset, int amount, uint32 wait_event_info); -extern int FileRead(File file, char *buffer, int amount, off_t offset, uint32 wait_event_info); -extern int FileWrite(File file, char *buffer, int amount, off_t offset, uint32 wait_event_info); +extern int FilePrefetch(File file, off_t offset, off_t amount, uint32 wait_event_info); +extern ssize_t FileRead(File file, void *buffer, size_t amount, off_t offset, uint32 wait_event_info); +extern ssize_t FileWrite(File file, const void *buffer, size_t amount, off_t offset, uint32 wait_event_info); extern int FileSync(File file, uint32 wait_event_info); +extern int FileZero(File file, off_t offset, off_t amount, uint32 wait_event_info, bool bulkwrite); +extern int FileFallocate(File file, off_t offset, off_t amount, uint32 wait_event_info); + extern off_t FileSize(File file); extern int FileTruncate(File file, off_t offset, uint32 wait_event_info); extern void FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info); @@ -178,10 +185,6 @@ extern int pg_fsync_no_writethrough(int fd); extern int pg_fsync_writethrough(int fd); extern int pg_fdatasync(int fd); extern void pg_flush_data(int fd, off_t offset, off_t amount); -extern ssize_t pg_pwritev_with_retry(int fd, - const struct iovec *iov, - int iovcnt, - off_t offset); extern int pg_truncate(const char *path, off_t length); extern void fsync_fname(const char *fname, bool isdir); extern int fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel); diff --git a/src/include/storage/md.h b/src/include/storage/md.h index 781f43ec910..89552788fd7 100644 --- a/src/include/storage/md.h +++ b/src/include/storage/md.h @@ -27,19 +27,22 @@ extern void mdcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo); extern bool mdexists(SMgrRelation reln, ForkNumber forknum); extern void mdunlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo); extern void mdextend(SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, char *buffer, bool skipFsync); + BlockNumber blocknum, const void *buffer, bool skipFsync); +extern void mdzeroextend(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, int nblocks, bool skipFsync); extern bool mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum); extern void mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, - char *buffer); + void *buffer); extern void mdwrite(SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, char *buffer, bool skipFsync); + BlockNumber blocknum, const void *buffer, bool skipFsync); extern void mdwriteback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber nblocks); extern BlockNumber mdnblocks(SMgrRelation reln, ForkNumber forknum); extern void mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks); extern void mdimmedsync(SMgrRelation reln, ForkNumber forknum); +extern void mdregistersync(SMgrRelation reln, ForkNumber forknum); extern void ForgetDatabaseSyncRequests(Oid dbid); extern void DropRelationFiles(RelFileNode *delrels, int ndelrels, bool 
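/*
 * Illustrative sketch, not part of this patch: one plausible way the
 * FileFallocate()/FileZero() primitives declared above could be combined to
 * zero-extend a segment -- prefer fallocate() and fall back to writing
 * zeroes when the filesystem does not support it.  The helper name, the
 * exact fallback condition, and the bulkwrite = false choice are assumptions
 * of this example.
 */
static void
zero_extend_segment(File file, off_t offset, off_t amount)
{
	int			ret;

	ret = FileFallocate(file, offset, amount, WAIT_EVENT_DATA_FILE_EXTEND);
	if (ret != 0 && errno == EOPNOTSUPP)
		ret = FileZero(file, offset, amount, WAIT_EVENT_DATA_FILE_EXTEND, false);

	if (ret != 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not extend file \"%s\": %m",
						FilePathName(file))));
}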
isRedo); @@ -49,9 +52,14 @@ extern int mdsyncfiletag(const FileTag *ftag, char *path); extern int mdunlinkfiletag(const FileTag *ftag, char *path); extern bool mdfiletagmatches(const FileTag *ftag, const FileTag *candidate); -/* POLAR: bulk io */ -extern void polar_mdbulkextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, - int blockCount, char *buffer, bool skipFsync); +/* POLAR */ extern void polar_mdbulkread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, - int blockCount, char *buffer); + int nblocks, void *buffer); +extern void polar_mdbulkwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + int nblocks, const void *buffer, bool skipFsync); +extern void polar_mdbulkextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + int nblocks, const void *buffer, bool skipFsync); + +/* POLAR end */ + #endif /* MD_H */ diff --git a/src/include/storage/polar_copybuf.h b/src/include/storage/polar_copybuf.h index 1841c620dee..9bc0cf30c5e 100644 --- a/src/include/storage/polar_copybuf.h +++ b/src/include/storage/polar_copybuf.h @@ -19,7 +19,7 @@ * limitations under the License. * * IDENTIFICATION - * src/include/storage/polar_copybuf.h + * src/include/storage/polar_copybuf.h * *------------------------------------------------------------------------- */ @@ -72,6 +72,9 @@ typedef union CopyBufferDescPadded char pad[COPYBUFFERDESC_PAD_TO_SIZE]; } CopyBufferDescPadded; +StaticAssertDecl(sizeof(CopyBufferDesc) <= COPYBUFFERDESC_PAD_TO_SIZE, + "padding size is too small to fit CopyBufferDesc"); + #define CopyBufHdrGetBlock(copy_buf_hdr) \ ((Block) (polar_copy_buffer_blocks + ((Size) (copy_buf_hdr)->buf_id) * BLCKSZ)) diff --git a/src/include/storage/polar_fd.h b/src/include/storage/polar_fd.h index 46d0458b934..cd84c17471d 100644 --- a/src/include/storage/polar_fd.h +++ b/src/include/storage/polar_fd.h @@ -18,7 +18,7 @@ * limitations under the License. 
* * IDENTIFICATION - * src/include/storage/polar_fd.h + * src/include/storage/polar_fd.h * *------------------------------------------------------------------------- */ @@ -156,7 +156,8 @@ typedef struct vfs_mgr int (*vfs_fsync) (int fd); int (*vfs_unlink) (const char *path); int (*vfs_rename) (const char *oldpath, const char *newpath); - int (*vfs_fallocate) (int fd, off_t offset, off_t len); + int (*vfs_posix_fallocate) (int fd, off_t offset, off_t len); + int (*vfs_fallocate) (int fd, int mode, off_t offset, off_t len); int (*vfs_ftruncate) (int fd, off_t len); int (*vfs_truncate) (const char *path, off_t len); DIR *(*vfs_opendir) (const char *path); @@ -170,10 +171,17 @@ typedef struct vfs_mgr int (*vfs_sync_file_range) (int fd, off_t offset, off_t nbytes, unsigned int flags); int (*vfs_posix_fadvise) (int fd, off_t offset, off_t len, int advice); int (*vfs_umount) (char *ftype, const char *pbdname); + PolarVFSKind (*vfs_type) (int fd); } vfs_mgr; extern vfs_mgr polar_vfs[]; +static inline PolarVFSKind +polar_bufferio_vfs_type(int fd) +{ + return POLAR_VFS_LOCAL_BIO; +} + extern ssize_t polar_read_line(int fd, void *buffer, size_t len); extern int polar_copy_file(char *fromfile, char *tofile, bool skiperr); extern void polar_copydir(char *fromdir, char *todir, bool recurse, bool clean, bool skip_file_err); @@ -327,9 +335,15 @@ polar_rename(const char *oldfile, const char *newfile) } static inline int -polar_fallocate(int fd, off_t offset, off_t len) +polar_posix_fallocate(int fd, off_t offset, off_t len) +{ + return polar_vfs[polar_vfs_switch].vfs_posix_fallocate(fd, offset, len); +} + +static inline int +polar_fallocate(int fd, int mode, off_t offset, off_t len) { - return polar_vfs[polar_vfs_switch].vfs_fallocate(fd, offset, len); + return polar_vfs[polar_vfs_switch].vfs_fallocate(fd, mode, offset, len); } static inline int @@ -486,4 +500,10 @@ polar_umount(char *ftype, const char *pbdname) return rc; } +static inline PolarVFSKind +polar_vfs_type(int fd) +{ + return polar_vfs[polar_vfs_switch].vfs_type(fd); +} + #endif diff --git a/src/include/storage/polar_xlogbuf.h b/src/include/storage/polar_xlogbuf.h index 548fc687107..0b4f031ebe6 100644 --- a/src/include/storage/polar_xlogbuf.h +++ b/src/include/storage/polar_xlogbuf.h @@ -97,6 +97,9 @@ typedef union polar_xlog_buffer_desc_padded char pad[XLOGBUFFERDESC_PAD_TO_SIZE]; } polar_xlog_buffer_desc_padded; +StaticAssertDecl(sizeof(polar_xlog_buffer_desc) <= XLOGBUFFERDESC_PAD_TO_SIZE, + "padding size is too small to fit polar_xlog_buffer_desc"); + typedef struct polar_xlog_buffer_ctl_t { polar_xlog_buffer_desc_padded *buffer_descriptors; diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h index a415f3ac0c0..d40ad2ee183 100644 --- a/src/include/storage/smgr.h +++ b/src/include/storage/smgr.h @@ -105,13 +105,15 @@ extern void smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo); extern void smgrdosyncall(SMgrRelation *rels, int nrels); extern void smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo); extern void smgrextend(SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, char *buffer, bool skipFsync); + BlockNumber blocknum, const void *buffer, bool skipFsync); +extern void smgrzeroextend(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, int nblocks, bool skipFsync); extern bool smgrprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum); extern void smgrread(SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, char *buffer); + BlockNumber blocknum, void 
*buffer); extern void smgrwrite(SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, char *buffer, bool skipFsync); + BlockNumber blocknum, const void *buffer, bool skipFsync); extern void smgrwriteback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber nblocks); extern BlockNumber smgrnblocks(SMgrRelation reln, ForkNumber forknum); @@ -119,16 +121,25 @@ extern BlockNumber smgrnblocks_cached(SMgrRelation reln, ForkNumber forknum); extern void smgrtruncate(SMgrRelation reln, ForkNumber *forknum, int nforks, BlockNumber *nblocks); extern void smgrimmedsync(SMgrRelation reln, ForkNumber forknum); +extern void smgrregistersync(SMgrRelation reln, ForkNumber forknum); extern void AtEOXact_SMgr(void); extern bool ProcessBarrierSmgrRelease(void); -/* POLAR: bulk io */ +/* POLAR */ +#define POLAR_ZERO_EXTEND_NONE 0 +#define POLAR_ZERO_EXTEND_BULKWRITE 1 +#define POLAR_ZERO_EXTEND_FALLOCATE 2 + +extern PGDLLIMPORT int polar_zero_extend_method; + +extern void polar_smgrbulkread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + int nblocks, void *buffer); +extern void polar_smgrbulkwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + int nblocks, const void *buffer, bool skipFsync); extern void polar_smgrbulkextend(SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, int blockCount, char *buffer, bool skipFsync); + BlockNumber blocknum, int nblocks, const void *buffer, bool skipFsync); extern void polar_smgr_init_bulk_extend(SMgrRelation reln, ForkNumber forknum); extern void polar_smgr_clear_bulk_extend(SMgrRelation reln, ForkNumber forknum); -extern void polar_smgrbulkread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, - int blockCount, char *buffer); /* POLAR end */ diff --git a/src/include/utils/guc.h b/src/include/utils/guc.h index 8716c95a335..dfb8b63789a 100755 --- a/src/include/utils/guc.h +++ b/src/include/utils/guc.h @@ -272,10 +272,6 @@ extern struct config_generic *polar_parameter_check_name_internal(const char *gu /* POLAR end */ -/* POLAR */ -#define POLAR_MAX_BULK_IO_SIZE 64 -/* POLAR end */ - /* GUC vars that are actually declared in guc.c, rather than elsewhere */ extern PGDLLIMPORT bool Debug_print_plan; extern PGDLLIMPORT bool Debug_print_parse; @@ -615,22 +611,8 @@ extern bool polar_enable_track_lock_timing; extern bool polar_enable_track_network_stat; extern bool polar_enable_track_network_timing; -/* POLAR: bulk io */ -extern int polar_recovery_bulk_extend_size; -extern int polar_min_bulk_extend_table_size; -extern bool polar_enable_primary_recovery_bulk_extend; -extern int polar_bulk_extend_size; -extern int polar_bulk_read_size; - -extern int polar_index_bulk_extend_size; - - -extern int polar_index_create_bulk_extend_size; - /* POLAR end */ -/* POLAR: partial write */ -extern bool polar_has_partial_write; extern bool polar_find_in_string_list(const char *itemname, const char *stringlist); diff --git a/src/include/utils/guc_tables.h b/src/include/utils/guc_tables.h index 5d7e6f3e66e..f3aae6a0bae 100644 --- a/src/include/utils/guc_tables.h +++ b/src/include/utils/guc_tables.h @@ -100,7 +100,7 @@ enum config_group POLAR_REL_SIZE_CACHE, POLAR_BUFFER_MANAGEMENT, POLAR_PROXY, - POLAR_BULK_READ_EXTEND, + POLAR_IO_MANAGEMENT, /* POLAR end */ DEVELOPER_OPTIONS }; diff --git a/src/include/utils/palloc.h b/src/include/utils/palloc.h index b050ab53db9..2100597c2df 100644 --- a/src/include/utils/palloc.h +++ b/src/include/utils/palloc.h @@ -75,13 +75,11 @@ extern void 
*MemoryContextAllocExtended(MemoryContext context, Size size, int flags); extern void *MemoryContextAllocAligned(MemoryContext context, Size size, Size alignto, int flags); -extern void *MemoryContextAllocIOAligned(MemoryContext context, Size size, int flags); extern void *palloc(Size size); extern void *palloc0(Size size); extern void *palloc_extended(Size size, int flags); extern void *palloc_aligned(Size size, Size alignto, int flags); -extern void *palloc_io_aligned(Size size, int flags); extern pg_nodiscard void *repalloc(void *pointer, Size size); extern void pfree(void *pointer); diff --git a/src/include/utils/resowner_private.h b/src/include/utils/resowner_private.h index d01cccc27c1..61a72ed52f5 100644 --- a/src/include/utils/resowner_private.h +++ b/src/include/utils/resowner_private.h @@ -30,6 +30,11 @@ extern void ResourceOwnerEnlargeBuffers(ResourceOwner owner); extern void ResourceOwnerRememberBuffer(ResourceOwner owner, Buffer buffer); extern void ResourceOwnerForgetBuffer(ResourceOwner owner, Buffer buffer); +/* support for IO-in-progress management */ +extern void ResourceOwnerEnlargeBufferIOs(ResourceOwner owner); +extern void ResourceOwnerRememberBufferIO(ResourceOwner owner, Buffer buffer); +extern void ResourceOwnerForgetBufferIO(ResourceOwner owner, Buffer buffer); + /* support for local lock management */ extern void ResourceOwnerRememberLock(ResourceOwner owner, LOCALLOCK *locallock); extern void ResourceOwnerForgetLock(ResourceOwner owner, LOCALLOCK *locallock); diff --git a/src/polar_vfs/polar_bufferio.c b/src/polar_vfs/polar_bufferio.c index f37090d26a0..ae8b99b0067 100644 --- a/src/polar_vfs/polar_bufferio.c +++ b/src/polar_vfs/polar_bufferio.c @@ -57,7 +57,12 @@ const vfs_mgr polar_vfs_bio = #endif .vfs_unlink = unlink, .vfs_rename = rename, - .vfs_fallocate = posix_fallocate, + .vfs_posix_fallocate = posix_fallocate, +#ifdef __linux__ + .vfs_fallocate = fallocate, +#else + .vfs_fallocate = NULL, +#endif .vfs_ftruncate = ftruncate, .vfs_truncate = truncate, .vfs_opendir = opendir, @@ -68,4 +73,5 @@ const vfs_mgr polar_vfs_bio = .vfs_mgr_func = NULL, .vfs_chmod = chmod, .vfs_mmap = mmap, + .vfs_type = polar_bufferio_vfs_type, }; diff --git a/src/polar_vfs/polar_directio.c b/src/polar_vfs/polar_directio.c index a16fc0c1bb3..186743d05ee 100644 --- a/src/polar_vfs/polar_directio.c +++ b/src/polar_vfs/polar_directio.c @@ -40,6 +40,12 @@ polar_directio_fsync(int fd) #endif } +static inline PolarVFSKind +polar_directio_vfs_type(int fd) +{ + return POLAR_VFS_LOCAL_DIO; +} + /* * Local file system interface with O_DIRECT flag. 
* It use original file system interface to do other jobs @@ -73,7 +79,12 @@ const vfs_mgr polar_vfs_dio = .vfs_fsync = polar_directio_fsync, .vfs_unlink = unlink, .vfs_rename = rename, - .vfs_fallocate = posix_fallocate, + .vfs_posix_fallocate = posix_fallocate, +#ifdef __linux__ + .vfs_fallocate = fallocate, +#else + .vfs_fallocate = NULL, +#endif .vfs_ftruncate = ftruncate, .vfs_truncate = truncate, .vfs_opendir = opendir, @@ -84,6 +95,7 @@ const vfs_mgr polar_vfs_dio = .vfs_mgr_func = NULL, .vfs_chmod = chmod, .vfs_mmap = mmap, + .vfs_type = polar_directio_vfs_type, }; /* @@ -125,9 +137,9 @@ polar_directio_write(int fd, const void *buf, size_t len) if (offset < 0) return res; - if (POLAR_DIECRTIO_IS_ALIGNED(buf) && - POLAR_DIECRTIO_IS_ALIGNED(len) && - POLAR_DIECRTIO_IS_ALIGNED(offset)) + if (POLAR_DIRECTIO_IS_ALIGNED(buf) && + POLAR_DIRECTIO_IS_ALIGNED(len) && + POLAR_DIRECTIO_IS_ALIGNED(offset)) return write(fd, buf, len); res = polar_directio_pwrite(fd, buf, len, offset); @@ -148,9 +160,9 @@ polar_directio_read(int fd, void *buf, size_t len) if (offset < 0) return res; - if (POLAR_DIECRTIO_IS_ALIGNED(buf) && - POLAR_DIECRTIO_IS_ALIGNED(len) && - POLAR_DIECRTIO_IS_ALIGNED(offset)) + if (POLAR_DIRECTIO_IS_ALIGNED(buf) && + POLAR_DIRECTIO_IS_ALIGNED(len) && + POLAR_DIRECTIO_IS_ALIGNED(offset)) return read(fd, buf, len); res = polar_directio_pread(fd, buf, len, offset); @@ -187,9 +199,9 @@ polar_directio_pread(int fd, void *buffer, size_t len, off_t offset) off_t nleft; ssize_t cplen; - if (POLAR_DIECRTIO_IS_ALIGNED(buffer) && - POLAR_DIECRTIO_IS_ALIGNED(len) && - POLAR_DIECRTIO_IS_ALIGNED(offset)) + if (POLAR_DIRECTIO_IS_ALIGNED(buffer) && + POLAR_DIRECTIO_IS_ALIGNED(len) && + POLAR_DIRECTIO_IS_ALIGNED(offset)) return pread(fd, buffer, len, offset); from = (char *) buffer; @@ -208,19 +220,19 @@ polar_directio_pread(int fd, void *buffer, size_t len, off_t offset) nleft > 0) { off = head_start; - res = pread(fd, buf, POLAR_DIRECTIO_ALIGN_LEN, off); + res = pread(fd, buf, PG_IO_ALIGN_SIZE, off); if (res < 0) return res; - else if (res <= (offset & (POLAR_DIRECTIO_ALIGN_LEN - 1))) + else if (res <= (offset & (PG_IO_ALIGN_SIZE - 1))) return count; else { - cplen = Min(res - (offset & (POLAR_DIRECTIO_ALIGN_LEN - 1)), len); + cplen = Min(res - (offset & (PG_IO_ALIGN_SIZE - 1)), len); cplen = Min(nleft, cplen); } - memcpy(from, buf + (offset & (POLAR_DIRECTIO_ALIGN_LEN - 1)), cplen); + memcpy(from, buf + (offset & (PG_IO_ALIGN_SIZE - 1)), cplen); from += cplen; count += cplen; nleft -= cplen; @@ -259,13 +271,13 @@ polar_directio_pread(int fd, void *buffer, size_t len, off_t offset) nleft > 0) { off = tail_start; - res = pread(fd, buf, POLAR_DIRECTIO_ALIGN_LEN, off); + res = pread(fd, buf, PG_IO_ALIGN_SIZE, off); if (res < 0) return res; else { - cplen = Min(res, ((offset + len) & (POLAR_DIRECTIO_ALIGN_LEN - 1))); + cplen = Min(res, ((offset + len) & (PG_IO_ALIGN_SIZE - 1))); cplen = Min(nleft, cplen); } @@ -289,9 +301,9 @@ polar_directio_preadv(int fd, const struct iovec *iov, int iovcnt, off_t offset) for (i = 0; i < iovcnt; i++) { - if (aligned && (!POLAR_DIECRTIO_IS_ALIGNED(iov[i].iov_base) || - !POLAR_DIECRTIO_IS_ALIGNED(iov[i].iov_len) || - !POLAR_DIECRTIO_IS_ALIGNED(offset))) + if (aligned && (!POLAR_DIRECTIO_IS_ALIGNED(iov[i].iov_base) || + !POLAR_DIRECTIO_IS_ALIGNED(iov[i].iov_len) || + !POLAR_DIRECTIO_IS_ALIGNED(offset))) aligned = false; bytes += iov[i].iov_len; @@ -341,15 +353,15 @@ polar_directio_pwrite(int fd, const void *buffer, size_t len, off_t offset) #define 
POLAR_DIRECTIO_PWRITE_SECTION(start, len) \ do \ { \ - MemSet(buf, 0x0, POLAR_DIRECTIO_ALIGN_LEN); \ - res = pread(fd, buf, POLAR_DIRECTIO_ALIGN_LEN, off); \ + MemSet(buf, 0x0, PG_IO_ALIGN_SIZE); \ + res = pread(fd, buf, PG_IO_ALIGN_SIZE, off); \ if (res < 0) \ return res; \ memcpy(buf + start, from, len); \ - res = pwrite(fd, buf, POLAR_DIRECTIO_ALIGN_LEN, off); \ + res = pwrite(fd, buf, PG_IO_ALIGN_SIZE, off); \ if (res < 0) \ return res; \ - Assert(res == POLAR_DIRECTIO_ALIGN_LEN); \ + Assert(res == PG_IO_ALIGN_SIZE); \ from += len; \ count += len; \ nleft -= len; \ @@ -369,9 +381,9 @@ polar_directio_pwrite(int fd, const void *buffer, size_t len, off_t offset) bool need_truncate = false; struct stat stat_buf; - if (POLAR_DIECRTIO_IS_ALIGNED(buffer) && - POLAR_DIECRTIO_IS_ALIGNED(len) && - POLAR_DIECRTIO_IS_ALIGNED(offset)) + if (POLAR_DIRECTIO_IS_ALIGNED(buffer) && + POLAR_DIRECTIO_IS_ALIGNED(len) && + POLAR_DIRECTIO_IS_ALIGNED(offset)) return pwrite(fd, buffer, len, offset); from = (char *) buffer; @@ -389,7 +401,7 @@ polar_directio_pwrite(int fd, const void *buffer, size_t len, off_t offset) * Whether we should truncate file to expected size or not. stat_buf * constains the original file's states including size. */ - if (!POLAR_DIECRTIO_IS_ALIGNED(offset + len)) + if (!POLAR_DIRECTIO_IS_ALIGNED(offset + len)) { res = fstat(fd, &stat_buf); if (res < 0) @@ -403,8 +415,8 @@ polar_directio_pwrite(int fd, const void *buffer, size_t len, off_t offset) nleft > 0) { off = head_start; - cplen = Min(nleft, POLAR_DIRECTIO_ALIGN_LEN - (offset & (POLAR_DIRECTIO_ALIGN_LEN - 1))); - POLAR_DIRECTIO_PWRITE_SECTION((offset & (POLAR_DIRECTIO_ALIGN_LEN - 1)), cplen); + cplen = Min(nleft, PG_IO_ALIGN_SIZE - (offset & (PG_IO_ALIGN_SIZE - 1))); + POLAR_DIRECTIO_PWRITE_SECTION((offset & (PG_IO_ALIGN_SIZE - 1)), cplen); } /* write the middle sections */ @@ -436,7 +448,7 @@ polar_directio_pwrite(int fd, const void *buffer, size_t len, off_t offset) nleft > 0) { off = tail_start; - cplen = Min(nleft, (offset + len) & (POLAR_DIRECTIO_ALIGN_LEN - 1)); + cplen = Min(nleft, (offset + len) & (PG_IO_ALIGN_SIZE - 1)); POLAR_DIRECTIO_PWRITE_SECTION(0, cplen); } @@ -463,9 +475,9 @@ polar_directio_pwritev(int fd, const struct iovec *iov, int iovcnt, off_t offset for (i = 0; i < iovcnt; i++) { - if (aligned && (!POLAR_DIECRTIO_IS_ALIGNED(iov[i].iov_base) || - !POLAR_DIECRTIO_IS_ALIGNED(iov[i].iov_len) || - !POLAR_DIECRTIO_IS_ALIGNED(offset))) + if (aligned && (!POLAR_DIRECTIO_IS_ALIGNED(iov[i].iov_base) || + !POLAR_DIRECTIO_IS_ALIGNED(iov[i].iov_len) || + !POLAR_DIRECTIO_IS_ALIGNED(offset))) aligned = false; bytes += iov[i].iov_len; diff --git a/src/polar_vfs/polar_pfsd.c b/src/polar_vfs/polar_pfsd.c index 84e25c3b226..5dd4015fb97 100644 --- a/src/polar_vfs/polar_pfsd.c +++ b/src/polar_vfs/polar_pfsd.c @@ -42,6 +42,12 @@ static ssize_t polar_pfsd_pwritev(int fd, const struct iovec *iov, int iovcnt, o int max_pfsd_io_size = PFSD_DEFAULT_MAX_IOSIZE; +static inline PolarVFSKind +polar_pfsd_vfs_type(int fd) +{ + return POLAR_VFS_PFS; +} + /* * Pfsd file system interface. * It use original pfsd's file access interface. 
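For readers following the new fd.h primitives introduced earlier in this patch, here is a rough, hypothetical sketch (not part of the patch) of how an smgr-level zero-extend could choose between FileFallocate() and FileZero() based on the polar_zero_extend_method GUC declared in smgr.h. The function name, the wait event, and the meaning of FileZero()'s final bulk-write flag are assumptions for illustration only; the real call sites live in md.c and are not shown in these hunks.

#include "postgres.h"
#include "storage/fd.h"
#include "storage/smgr.h"
#include "utils/wait_event.h"

/* Sketch only: pick a zero-extend primitive per polar_zero_extend_method. */
static void
zero_extend_segment(File seg_fd, off_t seekpos, int nblocks)
{
	off_t		nbytes = (off_t) nblocks * BLCKSZ;
	int			ret;

	if (polar_zero_extend_method == POLAR_ZERO_EXTEND_FALLOCATE)
		/* Reserve space without writing zeros; cheap where fallocate works. */
		ret = FileFallocate(seg_fd, seekpos, nbytes,
							WAIT_EVENT_DATA_FILE_EXTEND);
	else
		/* Write explicit zeros; last argument is an assumed bulk-write hint. */
		ret = FileZero(seg_fd, seekpos, nbytes,
					   WAIT_EVENT_DATA_FILE_EXTEND,
					   polar_zero_extend_method == POLAR_ZERO_EXTEND_BULKWRITE);

	if (ret != 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not extend file: %m")));
}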
@@ -74,7 +80,8 @@ const vfs_mgr polar_vfs_pfsd = .vfs_fsync = pfsd_fsync, .vfs_unlink = pfsd_unlink, .vfs_rename = pfsd_rename, - .vfs_fallocate = pfsd_posix_fallocate, + .vfs_posix_fallocate = pfsd_posix_fallocate, + .vfs_fallocate = pfsd_fallocate, .vfs_ftruncate = pfsd_ftruncate, .vfs_truncate = pfsd_truncate, .vfs_opendir = pfsd_opendir, @@ -85,6 +92,7 @@ const vfs_mgr polar_vfs_pfsd = .vfs_mgr_func = NULL, .vfs_chmod = pfsd_chmod, .vfs_mmap = NULL, + .vfs_type = polar_pfsd_vfs_type, #else .vfs_env_init = NULL, .vfs_env_destroy = NULL, @@ -108,6 +116,7 @@ const vfs_mgr polar_vfs_pfsd = .vfs_fsync = NULL, .vfs_unlink = NULL, .vfs_rename = NULL, + .vfs_posix_fallocate = NULL, .vfs_fallocate = NULL, .vfs_ftruncate = NULL, .vfs_truncate = NULL, @@ -119,6 +128,7 @@ const vfs_mgr polar_vfs_pfsd = .vfs_mgr_func = NULL, .vfs_chmod = NULL, .vfs_mmap = NULL, + .vfs_type = NULL, #endif }; diff --git a/src/polar_vfs/polar_vfs.c b/src/polar_vfs/polar_vfs.c index a361e612e8e..66d1524be9f 100644 --- a/src/polar_vfs/polar_vfs.c +++ b/src/polar_vfs/polar_vfs.c @@ -238,11 +238,11 @@ polar_vfs_init(void) if (localfs_mode) { - if (!POLAR_DIECRTIO_IS_ALIGNED(polar_max_direct_io_size)) + if (!POLAR_DIRECTIO_IS_ALIGNED(polar_max_direct_io_size)) elog(FATAL, "polar_max_direct_io_size is not aligned!"); else if (polar_directio_buffer == NULL && posix_memalign((void **) &polar_directio_buffer, - POLAR_DIRECTIO_ALIGN_LEN, + PG_IO_ALIGN_SIZE, polar_max_direct_io_size) != 0) { elog(ERROR, "posix_memalign alloc polar_directio_buffer failed!"); diff --git a/src/polar_vfs/polar_vfs_fe.c b/src/polar_vfs/polar_vfs_fe.c index 762e94f8acb..8377920e13f 100644 --- a/src/polar_vfs/polar_vfs_fe.c +++ b/src/polar_vfs/polar_vfs_fe.c @@ -105,7 +105,12 @@ vfs_mgr polar_vfs[] = .vfs_fsync = fsync, .vfs_unlink = unlink, .vfs_rename = rename, - .vfs_fallocate = posix_fallocate, + .vfs_posix_fallocate = posix_fallocate, +#ifdef __linux__ + .vfs_fallocate = fallocate, +#else + .vfs_fallocate = NULL, +#endif .vfs_ftruncate = ftruncate, .vfs_truncate = truncate, .vfs_opendir = opendir, @@ -116,6 +121,7 @@ vfs_mgr polar_vfs[] = .vfs_mgr_func = polar_get_local_vfs_mgr, .vfs_chmod = chmod, .vfs_mmap = mmap, + .vfs_type = NULL, }, { .vfs_env_init = NULL, @@ -139,6 +145,7 @@ vfs_mgr polar_vfs[] = .vfs_fsync = NULL, .vfs_unlink = NULL, .vfs_rename = NULL, + .vfs_posix_fallocate = NULL, .vfs_fallocate = NULL, .vfs_ftruncate = NULL, .vfs_truncate = NULL, @@ -150,6 +157,7 @@ vfs_mgr polar_vfs[] = .vfs_mgr_func = NULL, .vfs_chmod = NULL, .vfs_mmap = NULL, + .vfs_type = NULL, } }; @@ -407,14 +415,14 @@ polar_vfs_init_fe(bool is_pfs, char *fname, char *storage_cluster_name, char *po if (localfs_mode) { - if (!POLAR_DIECRTIO_IS_ALIGNED(polar_max_direct_io_size)) + if (!POLAR_DIRECTIO_IS_ALIGNED(polar_max_direct_io_size)) { fprintf(stderr, "polar_max_direct_io_size is not aligned!\n"); exit(EXIT_FAILURE); } else if (polar_directio_buffer == NULL && posix_memalign((void **) &polar_directio_buffer, - POLAR_DIRECTIO_ALIGN_LEN, + PG_IO_ALIGN_SIZE, polar_max_direct_io_size) != 0) { fprintf(stderr, "posix_memalign alloc polar_directio_buffer failed!\n"); @@ -457,10 +465,10 @@ polar_vfs_init_fe(bool is_pfs, char *fname, char *storage_cluster_name, char *po * Unmount polar file system for frontend. */ void -polar_vfs_destory_fe(char *ftype, char *disk_name) +polar_vfs_destroy_fe(char *ftype, char *disk_name) { /* - * Do not destory polar vfs when instance is not in shared storage mode. 
+ * Do not destroy polar vfs when instance is not in shared storage mode. */ if (localfs_mode || !polar_enable_shared_storage_mode) return; @@ -713,11 +721,11 @@ polar_vfs_init_simple_fe(char *pgconfig, char *pg_datadir, int flag) } void -polar_vfs_destory_simple_fe(void) +polar_vfs_destroy_simple_fe(void) { if (polar_disk_name != NULL) { - polar_vfs_destory_fe(polar_datadir, polar_disk_name); + polar_vfs_destroy_fe(polar_datadir, polar_disk_name); pg_free(polar_disk_name); polar_disk_name = NULL; } diff --git a/src/polar_vfs/polar_vfs_interface.c b/src/polar_vfs/polar_vfs_interface.c index 1b2aaa17b35..278e0a9ba18 100644 --- a/src/polar_vfs/polar_vfs_interface.c +++ b/src/polar_vfs/polar_vfs_interface.c @@ -94,7 +94,8 @@ static int vfs_access(const char *path, int mode); static int vfs_fsync(int file); static int vfs_unlink(const char *fname); static int vfs_rename(const char *oldfile, const char *newfile); -static int vfs_fallocate(int file, off_t offset, off_t len); +static int vfs_posix_fallocate(int file, off_t offset, off_t len); +static int vfs_fallocate(int file, int mode, off_t offset, off_t len); static int vfs_ftruncate(int file, off_t len); static int vfs_truncate(const char *path, off_t len); @@ -114,6 +115,8 @@ static int vfs_chmod(const char *path, mode_t mode); static inline const char *polar_vfs_file_type_and_path(const char *path, int *kind); static void *vfs_mmap(void *start, size_t length, int prot, int flags, int file, off_t offset); +static PolarVFSKind vfs_type(int fd); + static const vfs_mgr *const vfs[POLAR_VFS_KIND_SIZE] = { /* Local file system interface. */ @@ -171,6 +174,7 @@ static const vfs_mgr vfs_interface = .vfs_fsync = vfs_fsync, .vfs_unlink = vfs_unlink, .vfs_rename = vfs_rename, + .vfs_posix_fallocate = vfs_posix_fallocate, .vfs_fallocate = vfs_fallocate, .vfs_ftruncate = vfs_ftruncate, .vfs_truncate = vfs_truncate, @@ -182,6 +186,7 @@ static const vfs_mgr vfs_interface = .vfs_mgr_func = vfs_get_mgr, .vfs_chmod = vfs_chmod, .vfs_mmap = vfs_mmap, + .vfs_type = vfs_type, }; bool localfs_mode = false; @@ -840,7 +845,40 @@ vfs_rename(const char *oldfile, const char *newfile) } static int -vfs_fallocate(int file, off_t offset, off_t len) +vfs_posix_fallocate(int file, off_t offset, off_t len) +{ + vfs_vfd *vfdP = NULL; + int rc = 0; + int save_errno; + + VFS_HOLD_INTERRUPTS(); + + CHECK_FD_REENTRANT_BEGIN(); + POLAR_VFS_FD_MASK_RMOVE(file); + vfdP = vfs_find_file(file); + + if (unlikely(polar_vfs_debug)) + elog(LOG, "vfs_posix_fallocate from %s", vfdP->file_name); + + if (polar_vfs_io_before_hook) + polar_vfs_io_before_hook(vfdP, 0, VFS_FALLOCATE); + + rc = vfs[vfdP->kind]->vfs_posix_fallocate(vfdP->fd, offset, len); + save_errno = errno; + + if (polar_vfs_io_after_hook) + polar_vfs_io_after_hook(vfdP, 0, VFS_FALLOCATE); + + CHECK_FD_REENTRANT_END(); + + VFS_RESUME_INTERRUPTS(); + + errno = save_errno; + return rc; +} + +static int +vfs_fallocate(int file, int mode, off_t offset, off_t len) { vfs_vfd *vfdP = NULL; int rc = 0; @@ -852,12 +890,13 @@ vfs_fallocate(int file, off_t offset, off_t len) POLAR_VFS_FD_MASK_RMOVE(file); vfdP = vfs_find_file(file); - elog(LOG, "vfs_fallocate from %s", vfdP->file_name); + if (unlikely(polar_vfs_debug)) + elog(LOG, "vfs_fallocate from %s", vfdP->file_name); if (polar_vfs_io_before_hook) polar_vfs_io_before_hook(vfdP, 0, VFS_FALLOCATE); - rc = vfs[vfdP->kind]->vfs_fallocate(vfdP->fd, offset, len); + rc = vfs[vfdP->kind]->vfs_fallocate(vfdP->fd, mode, offset, len); save_errno = errno; if (polar_vfs_io_after_hook) @@ -1272,3 
+1311,14 @@ vfs_mmap(void *start, size_t length, int prot, int flags, int file, off_t offset return vfs[vfdP->kind]->vfs_mmap(start, length, prot, flags, vfdP->fd, offset); } + +static PolarVFSKind +vfs_type(int fd) +{ + vfs_vfd *vfdP = NULL; + + POLAR_VFS_FD_MASK_RMOVE(fd); + vfdP = vfs_find_file(fd); + + return vfs[vfdP->kind]->vfs_type(vfdP->fd); +} diff --git a/src/test/modules/test_bulkio/.gitignore b/src/test/modules/test_bulkio/.gitignore new file mode 100644 index 00000000000..5dcb3ff9723 --- /dev/null +++ b/src/test/modules/test_bulkio/.gitignore @@ -0,0 +1,4 @@ +# Generated subdirectories +/log/ +/results/ +/tmp_check/ diff --git a/src/test/modules/test_bulkio/Makefile b/src/test/modules/test_bulkio/Makefile new file mode 100644 index 00000000000..4daf0189666 --- /dev/null +++ b/src/test/modules/test_bulkio/Makefile @@ -0,0 +1,20 @@ +# src/test/modules/test_bulkio/Makefile + +MODULE_big = test_bulkio +OBJS = test_bulkio.o $(WIN32RES) +PGFILEDESC = "test_bulkio - test code for bulk IO interface" + +EXTENSION = test_bulkio +DATA = test_bulkio--1.0.sql +REGRESS = test_bulkio + +ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +subdir = src/test/modules/test_bulkio +top_builddir = ../../../.. +include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif diff --git a/src/test/modules/test_bulkio/expected/test_bulkio.out b/src/test/modules/test_bulkio/expected/test_bulkio.out new file mode 100644 index 00000000000..dbb150a0e64 --- /dev/null +++ b/src/test/modules/test_bulkio/expected/test_bulkio.out @@ -0,0 +1,10 @@ +CREATE EXTENSION test_bulkio; +-- The default RELSEG_SIZE is 128GB on this PolarDB version, skip test with +-- polar_zero_extend_method = 'none' or 'bulkwrite' +set polar_zero_extend_method = 'fallocate'; +SELECT test_bulkio(); + test_bulkio +------------- + +(1 row) + diff --git a/src/test/modules/test_bulkio/sql/test_bulkio.sql b/src/test/modules/test_bulkio/sql/test_bulkio.sql new file mode 100644 index 00000000000..21ecae8211d --- /dev/null +++ b/src/test/modules/test_bulkio/sql/test_bulkio.sql @@ -0,0 +1,6 @@ +CREATE EXTENSION test_bulkio; + +-- The default RELSEG_SIZE is 128GB on this PolarDB version, skip test with +-- polar_zero_extend_method = 'none' or 'bulkwrite' +set polar_zero_extend_method = 'fallocate'; +SELECT test_bulkio(); diff --git a/src/test/modules/test_bulkio/test_bulkio--1.0.sql b/src/test/modules/test_bulkio/test_bulkio--1.0.sql new file mode 100644 index 00000000000..fffe8a339f3 --- /dev/null +++ b/src/test/modules/test_bulkio/test_bulkio--1.0.sql @@ -0,0 +1,8 @@ +/* src/test/modules/test_bulkio/test_bulkio--1.0.sql */ + +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "CREATE EXTENSION test_bulkio" to load this file. \quit + +CREATE FUNCTION test_bulkio() +RETURNS VOID STRICT +AS 'MODULE_PATHNAME' LANGUAGE C; diff --git a/src/test/modules/test_bulkio/test_bulkio.c b/src/test/modules/test_bulkio/test_bulkio.c new file mode 100644 index 00000000000..16b5372e663 --- /dev/null +++ b/src/test/modules/test_bulkio/test_bulkio.c @@ -0,0 +1,131 @@ +/*------------------------------------------------------------------------- + * + * test_bulkio.c + * Test module for bulk IO interface + * + * Copyright (c) 2024, Alibaba Group Holding Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * IDENTIFICATION + * src/test/modules/test_bulkio/test_bulkio.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "fmgr.h" +#include "miscadmin.h" +#include "storage/smgr.h" + +PG_MODULE_MAGIC; + +static int blk_range = 20; +static ForkNumber forknum = MAIN_FORKNUM; +PGIOAlignedBlock write_buffers[32]; +PGIOAlignedBlock read_buffers[32]; + +static void +test_bulkread(SMgrRelation smgr, BlockNumber begin_blkno) +{ + MemSet(read_buffers, 0, BLCKSZ * blk_range); + + for (int i = 0; i < blk_range; i++) + { + smgrextend(smgr, forknum, begin_blkno + i, &write_buffers[i], true); + smgrread(smgr, forknum, begin_blkno + i, &read_buffers[i]); + + /* Cross validation */ + if (memcmp(&write_buffers[i], &read_buffers[i], BLCKSZ) != 0) + elog(ERROR, "bulkio test read failed"); + } + + MemSet(read_buffers, 0, BLCKSZ * blk_range); + polar_smgrbulkread(smgr, forknum, begin_blkno, blk_range, &read_buffers); + + if (memcmp(&write_buffers, &read_buffers, BLCKSZ * blk_range) != 0) + elog(ERROR, "bulkio test bulk read failed"); + + smgrtruncate(smgr, &forknum, 1, &begin_blkno); +} + +static void +test_bulkwrite(SMgrRelation smgr, BlockNumber begin_blkno) +{ + smgrzeroextend(smgr, forknum, begin_blkno, blk_range, true); + + polar_smgrbulkwrite(smgr, forknum, begin_blkno, blk_range, &write_buffers, true); + + MemSet(read_buffers, 0, BLCKSZ * blk_range); + polar_smgrbulkread(smgr, forknum, begin_blkno, blk_range, &read_buffers); + + if (memcmp(&write_buffers, &read_buffers, BLCKSZ * blk_range) != 0) + elog(ERROR, "bulkio test bulk write failed"); + + smgrtruncate(smgr, &forknum, 1, &begin_blkno); +} + +static void +test_bulkextend(SMgrRelation smgr, BlockNumber begin_blkno) +{ + polar_smgrbulkextend(smgr, forknum, begin_blkno, blk_range, &write_buffers, true); + + MemSet(read_buffers, 0, BLCKSZ * blk_range); + polar_smgrbulkread(smgr, forknum, begin_blkno, blk_range, &read_buffers); + + if (memcmp(&write_buffers, &read_buffers, BLCKSZ * blk_range) != 0) + elog(ERROR, "bulkio test bulk extend failed"); + + smgrtruncate(smgr, &forknum, 1, &begin_blkno); +} + +static void +test_bulkio_aux(SMgrRelation smgr, BlockNumber begin_blkno) +{ + BlockNumber nblocks = smgrnblocks(smgr, forknum); + + if (begin_blkno - nblocks > 0) + smgrzeroextend(smgr, forknum, nblocks, begin_blkno - nblocks, true); + + for (int i = 0; i < blk_range; i++) + MemSet(&write_buffers[i], begin_blkno + i, BLCKSZ); + + test_bulkread(smgr, begin_blkno); + test_bulkwrite(smgr, begin_blkno); + test_bulkextend(smgr, begin_blkno); +} + +PG_FUNCTION_INFO_V1(test_bulkio); +Datum +test_bulkio(PG_FUNCTION_ARGS) +{ + BlockNumber zero_blkno = 0; + RelFileNode perf_rlocator = {MyDatabaseTableSpace, MyDatabaseId, 1}; + SMgrRelation smgr = smgropen(perf_rlocator, InvalidBackendId); + + if (!smgrexists(smgr, forknum)) + smgrcreate(smgr, forknum, false); + else + smgrtruncate(smgr, &forknum, 1, &zero_blkno); + + test_bulkio_aux(smgr, 0); + test_bulkio_aux(smgr, 1 * RELSEG_SIZE - 10); + test_bulkio_aux(smgr, 2 * RELSEG_SIZE - 10); + test_bulkio_aux(smgr, 3 * RELSEG_SIZE - 
10); + + smgrdounlinkall(&smgr, 1, false); + smgrclose(smgr); + + PG_RETURN_VOID(); +} diff --git a/src/test/modules/test_bulkio/test_bulkio.control b/src/test/modules/test_bulkio/test_bulkio.control new file mode 100644 index 00000000000..47db048caa2 --- /dev/null +++ b/src/test/modules/test_bulkio/test_bulkio.control @@ -0,0 +1,4 @@ +comment = 'Test code for bulk IO interface' +default_version = '1.0' +module_pathname = '$libdir/test_bulkio' +relocatable = true diff --git a/src/test/modules/test_polar_directio/test_directio.c b/src/test/modules/test_polar_directio/test_directio.c index 1ea2f99bf34..4f01c15c0e8 100644 --- a/src/test/modules/test_polar_directio/test_directio.c +++ b/src/test/modules/test_polar_directio/test_directio.c @@ -89,7 +89,7 @@ test_directio(PG_FUNCTION_ARGS) if (polar_directio_buffer == NULL && posix_memalign((void **) &polar_directio_buffer, - POLAR_DIRECTIO_ALIGN_LEN, + PG_IO_ALIGN_SIZE, polar_max_direct_io_size) != 0) elog(PANIC, "posix_memalign alloc polar_directio_buffer failed!"); @@ -442,7 +442,7 @@ test_aligned_buffer_offset_len(int directio_fd, int bufferio_fd) for (i = 0; i < SUBAPI_LOOP; i++) { len = POLAR_DIRECTIO_ALIGN(random() % polar_max_direct_io_size); - Assert(0 == posix_memalign((void **) &buffer, POLAR_DIRECTIO_ALIGN_LEN, len)); + Assert(0 == posix_memalign((void **) &buffer, PG_IO_ALIGN_SIZE, len)); MemSet(buffer, 0x4, len); Assert(0 == polar_stat(directio_file, &stat_buf)); offset = POLAR_DIRECTIO_ALIGN_DOWN(random() % stat_buf.st_size); diff --git a/src/test/polar_pl/Makefile b/src/test/polar_pl/Makefile index 52bb7ab580f..5fbc74c99b6 100644 --- a/src/test/polar_pl/Makefile +++ b/src/test/polar_pl/Makefile @@ -6,6 +6,9 @@ # #------------------------------------------------------------------------- +export enable_fault_injector +export with_ssl + EXTRA_INSTALL = external/polar_monitor EXTRA_INSTALL += contrib/pg_stat_statements @@ -17,8 +20,6 @@ subdir = src/test/polar_pl top_builddir = ../../.. include $(top_builddir)/src/Makefile.global -export with_ssl - check: $(prove_check) diff --git a/src/test/polar_pl/t/011_polar_bulk_extend.pl b/src/test/polar_pl/t/011_polar_bulk_extend.pl deleted file mode 100644 index 36a6015460e..00000000000 --- a/src/test/polar_pl/t/011_polar_bulk_extend.pl +++ /dev/null @@ -1,104 +0,0 @@ -# 011_polar_bulk_extend.pl -# In this cases, we will check bulk extending in 100 MB table. -# -# Copyright (c) 2024, Alibaba Group Holding Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -# IDENTIFICATION -# src/test/polar_pl/t/011_polar_bulk_extend.pl - -use strict; -use warnings; -use PostgreSQL::Test::Cluster; -use PostgreSQL::Test::Utils; -use Test::More; - -plan tests => 4; - -# Start Server -my $node_primary = PostgreSQL::Test::Cluster->new('primary'); -$node_primary->polar_init_primary; -$node_primary->start; - -# Create extension polar_monitor -$node_primary->safe_psql('postgres', - 'CREATE EXTENSION IF NOT EXISTS polar_monitor;'); - - -# Create table -$node_primary->safe_psql('postgres', - q[create table bulk_extend_tbl(id int8, value int8);]); - - -# Close the feature -$node_primary->safe_psql('postgres', - 'alter system set polar_bulk_extend_size = 0;'); -$node_primary->safe_psql('postgres', - 'alter system set polar_min_bulk_extend_table_size = 0;'); -$node_primary->reload; - -# Load Data -$node_primary->safe_psql('postgres', - q[INSERT INTO bulk_extend_tbl select generate_series,generate_series from generate_series(0, 12800*185 + 184);] -); - -is( $node_primary->safe_psql( - 'postgres', - 'select heap_bulk_extend_times = 0 from polar_pg_stat_bulk_extend_all_tables where relname=\'bulk_extend_tbl\';' - ), - 't', - 'heap_bulk_extend_times should be 0'); - -is( $node_primary->safe_psql( - 'postgres', - 'select heap_bulk_extend_blocks = 0 from polar_pg_stat_bulk_extend_all_tables where relname=\'bulk_extend_tbl\';' - ), - 't', - 'heap_bulk_extend_blocks should be 0'); - -# Reset table -$node_primary->safe_psql('postgres', q[drop table bulk_extend_tbl;]); - -# Open the feature -$node_primary->safe_psql('postgres', - 'alter system set polar_bulk_extend_size = 512;'); -$node_primary->reload; - -$node_primary->safe_psql('postgres', - q[create table bulk_extend_tbl(id int8, value int8);]); - -# Load Data -$node_primary->safe_psql('postgres', - q[INSERT INTO bulk_extend_tbl select generate_series,generate_series from generate_series(0, 12800*185 + 184);] -); - -my $bulk_extend_times = $node_primary->safe_psql('postgres', - 'select heap_bulk_extend_times > 20 from polar_pg_stat_bulk_extend_all_tables where relname=\'bulk_extend_tbl\';' -); - -my $bulk_extend_blocks = $node_primary->safe_psql('postgres', - 'select heap_bulk_extend_blocks > 10000 from polar_pg_stat_bulk_extend_all_tables where relname=\'bulk_extend_tbl\';' -); - -# For stable cases, we use > 20/ > 10000 instead of =25/=13312. And we print acutal values -is($bulk_extend_times, 't', - "heap_bulk_extend_times should be 25 > 20. But actual heap_bulk_extend_times is $bulk_extend_times." -); -is($bulk_extend_blocks, 't', - "heap_bulk_extend_blocks should be 13312 > 10000. But actual heap_bulk_extend_blocks is $bulk_extend_blocks." -); -print - "The actual heap_bulk_extend_times is $bulk_extend_times, actual heap_bulk_extend_blocks is $bulk_extend_blocks"; - -$node_primary->stop; diff --git a/src/test/polar_pl/t/012_polar_create_index_bulk_extend.pl b/src/test/polar_pl/t/012_polar_create_index_bulk_extend.pl deleted file mode 100644 index 6ef57a43aaf..00000000000 --- a/src/test/polar_pl/t/012_polar_create_index_bulk_extend.pl +++ /dev/null @@ -1,93 +0,0 @@ -use strict; -# 012_polar_create_index_bulk_extend.pl -# create index bulk extend test -# -# Copyright (c) 2024, Alibaba Group Holding Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# IDENTIFICATION -# src/test/polar_pl/t/012_polar_create_index_bulk_extend.pl - -use warnings; -use PostgreSQL::Test::Cluster; -use PostgreSQL::Test::Utils; -use Test::More; - -plan tests => 2; - -# Start Server -my $node_primary = PostgreSQL::Test::Cluster->new('primary'); -$node_primary->polar_init_primary; -$node_primary->start; - -# Create extension polar_monitor -$node_primary->safe_psql('postgres', - 'CREATE EXTENSION IF NOT EXISTS polar_monitor;'); - - -# Create table -$node_primary->safe_psql('postgres', - q[create table bulk_extend_tbl(id int8, value int8);]); - - -# Close the feature -$node_primary->safe_psql('postgres', - 'alter system set polar_index_create_bulk_extend_size = 0;'); -$node_primary->safe_psql('postgres', - 'alter system set polar_min_bulk_extend_table_size = 0;'); -$node_primary->reload; - -# Load Data -$node_primary->safe_psql('postgres', - q[INSERT INTO bulk_extend_tbl select generate_series,generate_series from generate_series(0, 12800*185 + 184);] -); - -# Create index -$node_primary->safe_psql('postgres', - q[CREATE INDEX bulk_extend_idx on bulk_extend_tbl(id);]); - -is( $node_primary->safe_psql( - 'postgres', - 'select idx_create_extend_times = 0 from polar_pg_stat_all_index_extend_stats where relname=\'bulk_extend_tbl\';' - ), - 't', - 'idx_create_extend_times should be 0'); - -# Reset table -$node_primary->safe_psql('postgres', q[drop table bulk_extend_tbl;]); - -# Open the feature -$node_primary->safe_psql('postgres', - 'alter system set polar_index_create_bulk_extend_size = 512;'); -$node_primary->reload; - -$node_primary->safe_psql('postgres', - q[create table bulk_extend_tbl(id int8, value int8);]); - -# Load Data -$node_primary->safe_psql('postgres', - q[INSERT INTO bulk_extend_tbl select generate_series,generate_series from generate_series(0, 12800*185 + 184);] -); - -$node_primary->safe_psql('postgres', - q[CREATE INDEX bulk_extend_idx on bulk_extend_tbl(id);]); - -is( $node_primary->safe_psql( - 'postgres', - 'select idx_create_extend_times = 13 from polar_pg_stat_all_index_extend_stats where relname=\'bulk_extend_tbl\';' - ), - 't', - 'idx_create_extend_times should be 13'); - -$node_primary->stop; diff --git a/src/test/polar_pl/t/013_polar_index_bulk_extend.pl b/src/test/polar_pl/t/013_polar_index_bulk_extend.pl deleted file mode 100644 index 3cc0aea793b..00000000000 --- a/src/test/polar_pl/t/013_polar_index_bulk_extend.pl +++ /dev/null @@ -1,77 +0,0 @@ -# 013_polar_index_bulk_extend.pl -# polar index insert bulk extend test -# -# Copyright (c) 2024, Alibaba Group Holding Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -# IDENTIFICATION -# src/test/polar_pl/t/013_polar_index_bulk_extend.pl - -use strict; -use warnings; -use PostgreSQL::Test::Cluster; -use PostgreSQL::Test::Utils; -use Test::More; - -plan tests => 2; - -my $node = PostgreSQL::Test::Cluster->new('primary'); -$node->polar_init_primary; -$node->start; - -# Set the min bulk extend table size to 0, so the index bulk -# extend always hits. -$node->safe_psql('postgres', - 'alter system set polar_min_bulk_extend_table_size = 0;'); - -# Set the index bulk extend size to 256 (2MB), the index -# size will larger than 2MB. -$node->safe_psql('postgres', - 'alter system set polar_index_bulk_extend_size = 256;'); -$node->reload; - -$node->safe_psql( - 'postgres', - q[create table test_index_bulk_extend(test1 int); - create index test_index on test_index_bulk_extend(test1);]); - -$node->safe_psql('postgres', - q[insert into test_index_bulk_extend values(1);]); - -# 2 * 1024 * 1024 = 2097152 = 2MB -is( $node->safe_psql( - 'postgres', - "select pg_indexes_size('test_index_bulk_extend') > 2097152;"), - 't', - 'index bulk extend 2MB'); - -$node->safe_psql('postgres', q[truncate test_index_bulk_extend;]); - -# Set the index bulk extend size to 512 (4MB), the index -# size will larger than 4MB. -$node->safe_psql('postgres', - 'alter system set polar_index_bulk_extend_size = 512;'); -$node->reload; - -$node->safe_psql('postgres', - q[insert into test_index_bulk_extend values(1);]); - -# 4 * 1024 * 1024 = 4194304 = 4MB -is( $node->safe_psql( - 'postgres', - "select pg_indexes_size('test_index_bulk_extend') > 4194304;"), - 't', - 'index bulk extend 4MB'); - -$node->stop; diff --git a/src/test/polar_pl/t/038_bulk_write.pl b/src/test/polar_pl/t/038_bulk_write.pl new file mode 100644 index 00000000000..26512ef3a25 --- /dev/null +++ b/src/test/polar_pl/t/038_bulk_write.pl @@ -0,0 +1,149 @@ +#!/usr/bin/perl + +# 038_bulk_write.pl +# +# Copyright (c) 2024, Alibaba Group Holding Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# IDENTIFICATION +# src/test/polar_pl/t/038_bulk_write.pl + +use strict; +use warnings; +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; + +my $node_primary; +my $node_replica; +my $node_standby; + +sub test_index_create() +{ + $node_primary->safe_psql('postgres', + 'CREATE TABLE t(id int, ir int4range)'); + $node_primary->safe_psql('postgres', + 'INSERT INTO t SELECT i, int4range(i, i+100) FROM generate_series(1,10000) AS i' + ); + $node_primary->safe_psql('postgres', 'CREATE INDEX ON t(id)'); + $node_primary->safe_psql('postgres', + 'CREATE INDEX ON t USING SPGIST (ir)'); + $node_primary->safe_psql('postgres', 'VACUUM FULL t'); + is( $node_primary->safe_psql( + 'postgres', 'SET enable_seqscan = off; SELECT count(*) FROM t'), + 10000, + "btree index is ok"); + $node_primary->safe_psql('postgres', 'TRUNCATE t'); + $node_primary->safe_psql('postgres', + 'INSERT INTO t SELECT i, int4range(i, i+100) FROM generate_series(1,10000) AS i' + ); + is( $node_primary->safe_psql( + 'postgres', 'SET enable_seqscan = off; SELECT count(*) FROM t'), + 10000, + "btree index is ok"); + + $node_primary->wait_for_catchup($node_replica); + is( $node_replica->safe_psql( + 'postgres', 'SET enable_seqscan = off; SELECT count(*) FROM t'), + 10000, + "btree index is ok in replica"); + + $node_primary->wait_for_catchup($node_standby); + is( $node_standby->safe_psql( + 'postgres', 'SET enable_seqscan = off; SELECT count(*) FROM t'), + 10000, + "btree index is ok in standby"); + + $node_primary->safe_psql('postgres', 'DROP TABLE t'); + $node_primary->safe_psql('postgres', + 'CREATE TABLE t(id int, ir int4range)'); + $node_primary->safe_psql('postgres', + 'INSERT INTO t SELECT i, int4range(i, i+100) FROM generate_series(1,10000) AS i' + ); + $node_primary->safe_psql('postgres', 'CREATE INDEX ON t(id)'); + $node_primary->stop('immediate'); + $node_primary->start; + is( $node_primary->safe_psql( + 'postgres', 'SET enable_seqscan = off; SELECT count(*) FROM t'), + 10000, + "btree index is ok after crash"); + + $node_primary->safe_psql('postgres', 'DROP TABLE t'); +} + +$node_primary = PostgreSQL::Test::Cluster->new('primary'); +$node_primary->polar_init_primary; + +$node_replica = PostgreSQL::Test::Cluster->new('replica'); +$node_replica->polar_init_replica($node_primary); + +$node_standby = PostgreSQL::Test::Cluster->new('standby'); +$node_standby->polar_init_standby($node_primary); + +$node_primary->start; + +$node_primary->polar_create_slot($node_replica->name); +$node_primary->polar_create_slot($node_standby->name); + +$node_replica->start; +$node_standby->start; + +# test bulk write with polar_zero_extend_method = none +$node_primary->append_conf('postgresql.conf', + "polar_zero_extend_method = none"); +$node_primary->reload; + +foreach my $maxpages (1, 2, 16, 128, 512) +{ + print("test maxpages=$maxpages\n"); + $node_primary->append_conf('postgresql.conf', + "polar_bulk_write_maxpages = $maxpages"); + $node_primary->reload; + test_index_create; +} + +# test bulk write with polar_zero_extend_method = bulkwrite +$node_primary->append_conf('postgresql.conf', + "polar_zero_extend_method = bulkwrite"); +$node_primary->reload; + +foreach my $maxpages (1, 2, 16, 128, 512) +{ + print("test maxpages=$maxpages\n"); + $node_primary->append_conf('postgresql.conf', + "polar_bulk_write_maxpages = $maxpages"); + $node_primary->reload; + test_index_create; +} + +# test bulk write with polar_zero_extend_method = fallocate +$node_primary->append_conf('postgresql.conf', + "polar_zero_extend_method = 
fallocate"); +$node_primary->reload; + +foreach my $maxpages (1, 2, 16, 128, 512) +{ + print("test maxpages=$maxpages\n"); + $node_primary->append_conf('postgresql.conf', + "polar_bulk_write_maxpages = $maxpages"); + $node_primary->reload; + test_index_create; +} + +# done with the node +$node_primary->stop; +$node_replica->stop; +$node_standby->stop; + +done_testing(); diff --git a/src/test/polar_pl/t/044_polar_zero_buffers.pl b/src/test/polar_pl/t/044_polar_zero_buffers.pl new file mode 100644 index 00000000000..1f7989c34ee --- /dev/null +++ b/src/test/polar_pl/t/044_polar_zero_buffers.pl @@ -0,0 +1,84 @@ +#!/usr/bin/perl +# 044_polar_zero_buffers.pl +# Test polar_pwrite_zeros with different size of GUC polar_zero_buffers. +# +# Copyright (c) 2024, Alibaba Group Holding Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# IDENTIFICATION +# src/test/polar_pl/t/044_polar_zero_buffers.pl + +use strict; +use warnings; +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; + +# Initialize primary node +my $node_primary = PostgreSQL::Test::Cluster->new('primary'); +$node_primary->init(allows_streaming => 1); +$node_primary->start; +my $backup_name = 'my_backup'; + +# Take backup +$node_primary->backup($backup_name); + +# Create streaming standby linking to primary +my $node_standby = PostgreSQL::Test::Cluster->new('standby'); +$node_standby->init_from_backup($node_primary, $backup_name, + has_streaming => 1); +$node_standby->start; + +sub test_polar_pwrite_zeros +{ + $node_primary->safe_psql('postgres', 'CREATE TABLE t(id int)'); + $node_primary->safe_psql('postgres', + 'INSERT INTO t SELECT generate_series(1,1000000)'); + $node_primary->safe_psql('postgres', 'CREATE INDEX ON t(id)'); + is( $node_primary->safe_psql( + 'postgres', 'SET enable_indexscan = off; SELECT count(*) FROM t'), + 1000000, + "heap is ok on primary"); + is( $node_primary->safe_psql( + 'postgres', 'SET enable_seqscan = off; SELECT count(*) FROM t'), + 1000000, + "btree index is ok on primary"); + + $node_primary->wait_for_catchup($node_standby); + + is( $node_standby->safe_psql( + 'postgres', 'SET enable_indexscan = off; SELECT count(*) FROM t'), + 1000000, + "heap is ok on standby"); + is( $node_standby->safe_psql( + 'postgres', 'SET enable_seqscan = off; SELECT count(*) FROM t'), + 1000000, + "btree index is ok on standby"); + + $node_primary->safe_psql('postgres', 'DROP TABLE t'); +} + +foreach my $zero_buffers (-1, 0, 1, 3, 16, 512) +{ + print("test zero_buffers=$zero_buffers\n"); + $node_primary->append_conf('postgresql.conf', + "polar_zero_buffers = $zero_buffers"); + $node_primary->restart; + $node_standby->append_conf('postgresql.conf', + "polar_zero_buffers = $zero_buffers"); + $node_standby->restart; + test_polar_pwrite_zeros; +} + +done_testing(); diff --git a/src/test/polar_pl/t/045_bulk_extend.pl b/src/test/polar_pl/t/045_bulk_extend.pl new file mode 100644 index 00000000000..6ac33ec1643 --- /dev/null +++ b/src/test/polar_pl/t/045_bulk_extend.pl @@ -0,0 +1,220 @@ +#!/usr/bin/perl +# 
045_bulk_extend.pl +# Test bulk extend for heap_tbl table and btree index. +# +# Copyright (c) 2024, Alibaba Group Holding Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# IDENTIFICATION +# src/test/polar_pl/t/045_bulk_extend.pl + +use strict; +use warnings; +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; + +# Initialize primary node +my $node_primary = PostgreSQL::Test::Cluster->new('primary'); +$node_primary->init(allows_streaming => 1); +$node_primary->append_conf('postgresql.conf', + "polar_heap_bulk_extend_size = 0"); +$node_primary->append_conf('postgresql.conf', + "polar_index_bulk_extend_size = 0"); +$node_primary->append_conf('postgresql.conf', + "polar_recovery_bulk_extend_size = 0"); +$node_primary->append_conf('postgresql.conf', "max_connections = 10"); +$node_primary->append_conf('postgresql.conf', "shared_buffers = 16MB"); +$node_primary->append_conf('postgresql.conf', "enable_seqscan = off"); +$node_primary->start; +my $backup_name = 'my_backup'; + +# Take backup +$node_primary->backup($backup_name); + +# Create streaming standby linking to primary +my $node_standby = PostgreSQL::Test::Cluster->new('standby'); +$node_standby->init_from_backup($node_primary, $backup_name, + has_streaming => 1); +$node_standby->start; + +$node_primary->safe_psql('postgres', 'CREATE EXTENSION amcheck'); +$node_primary->safe_psql('postgres', 'CREATE EXTENSION bloom'); + +$node_primary->safe_psql('postgres', 'CREATE TABLE heap_tbl(id int)'); +$node_primary->safe_psql('postgres', + 'CREATE INDEX btree_idx ON heap_tbl(id)'); +$node_primary->safe_psql('postgres', + 'CREATE TABLE misc_tbl(id int4, arr int4[], gp point, sp point, m int4)'); +$node_primary->safe_psql('postgres', + 'CREATE INDEX gin_idx ON misc_tbl USING gin(arr)'); +$node_primary->safe_psql('postgres', + 'CREATE INDEX gist_idx ON misc_tbl USING gist(gp)'); +$node_primary->safe_psql('postgres', + 'CREATE INDEX spgist_idx ON misc_tbl USING spgist(sp)'); +$node_primary->safe_psql('postgres', + 'CREATE INDEX bloom_idx ON misc_tbl USING bloom(m, id)'); + +my ($base_heap_size, $base_btree_size, $base_gin_size, + $base_gist_size, $base_spgist_size, $base_bloom_size); + +sub bulk_extend_sanity_check +{ + my $node = shift; + my $extend_size = shift; + + my $heap_size = $node->safe_psql('postgres', + "SELECT pg_relation_size('heap_tbl')/8192"); + my $btree_size = $node->safe_psql('postgres', + "SELECT pg_relation_size('btree_idx')/8192"); + my $gin_size = + $node->safe_psql('postgres', "SELECT pg_relation_size('gin_idx')/8192"); + my $gist_size = $node->safe_psql('postgres', + "SELECT pg_relation_size('gist_idx')/8192"); + my $spgist_size = $node->safe_psql('postgres', + "SELECT pg_relation_size('spgist_idx')/8192"); + my $bloom_size = $node->safe_psql('postgres', + "SELECT pg_relation_size('bloom_idx')/8192"); + + if ($extend_size == 0) + { + if ($node eq $node_primary) + { + $base_heap_size = $node->safe_psql('postgres', + "SELECT pg_relation_size('heap_tbl')/8192"); + $base_btree_size = 
$node->safe_psql('postgres', + "SELECT pg_relation_size('btree_idx')/8192"); + $base_gin_size = $node->safe_psql('postgres', + "SELECT pg_relation_size('gin_idx')/8192"); + $base_gist_size = $node->safe_psql('postgres', + "SELECT pg_relation_size('gist_idx')/8192"); + $base_spgist_size = $node->safe_psql('postgres', + "SELECT pg_relation_size('spgist_idx')/8192"); + $base_bloom_size = $node->safe_psql('postgres', + "SELECT pg_relation_size('bloom_idx')/8192"); + } + } + else + { + ok(($heap_size - $base_heap_size) < $extend_size, + 'no waste in heap bulk extend'); + print("heap_size: $heap_size, base_heap_size: $base_heap_size\n"); + ok(($btree_size - $base_btree_size) < $extend_size, + 'no waste in btree bulk extend'); + print("btree_size: $btree_size, base_btree_size: $base_btree_size\n"); + ok(($gin_size - $base_gin_size) < $extend_size, + 'no waste in gin bulk extend'); + print("gin_size: $gin_size, base_gin_size: $base_gin_size\n"); + ok(($gist_size - $base_gist_size) < $extend_size, + 'no waste in gist bulk extend'); + print("gist_size: $gist_size, base_gist_size: $base_gist_size\n"); + ok(($spgist_size - $base_spgist_size) < $extend_size, + 'no waste in spgist bulk extend'); + print( + "spgist_size: $spgist_size, base_spgist_size: $base_spgist_size\n" + ); + ok(($bloom_size - $base_bloom_size) < $extend_size, + 'no waste in bloom bulk extend'); + print("bloom_size: $bloom_size, base_bloom_size: $base_bloom_size\n"); + } + + # heap and btree got amcheck, use it + $node->safe_psql('postgres', "SELECT verify_heapam('heap_tbl')"); + if ($node eq $node_primary) + { + $node->safe_psql('postgres', + "SELECT bt_index_parent_check('btree_idx', 't', 't')"); + } + else + { + $node->safe_psql('postgres', + "SELECT bt_index_check('btree_idx', 't')"); + } + + is( $node->safe_psql( + 'postgres', + 'WITH rand AS (SELECT (floor(random() * 99998) + 3)::int v) + SELECT count(*) FROM misc_tbl, rand + WHERE arr @> array[1, rand.v::int]' + ), + 1, + 'gin index check'); + is( $node->safe_psql( + 'postgres', + 'WITH rand AS (SELECT ceil(random() * 100000)::int v) + SELECT count(*) FROM misc_tbl, rand + WHERE gp <@ box(point(rand.v*10,rand.v*10), point((rand.v+1)*10, (rand.v+1)*10))' + ), + 1, + 'gist index check'); + is( $node->safe_psql( + 'postgres', + 'WITH rand AS (SELECT ceil(random() * 100000)::int v) + SELECT count(*) FROM misc_tbl, rand + WHERE sp <@ box(point(rand.v*10,rand.v*10), point((rand.v+1)*10, (rand.v+1)*10))' + ), + 1, + 'spgist index check'); + is( $node->safe_psql( + 'postgres', + 'WITH rand AS (SELECT (ceil(random() * 99999) + 1)::int v) + SELECT count(*) FROM misc_tbl, rand + WHERE m = rand.v%10 and id = rand.v' + ), + 1, + 'bloom index check'); +} + +sub test_bulk_extend +{ + my $extend_size = shift; + + print("Test bulk extend with size of $extend_size blocks\n"); + + $node_primary->safe_psql('postgres', 'TRUNCATE TABLE heap_tbl'); + $node_primary->safe_psql('postgres', 'TRUNCATE TABLE misc_tbl'); + + $node_primary->safe_psql('postgres', + 'INSERT INTO heap_tbl SELECT generate_series(1,1000000)'); + $node_primary->safe_psql( + 'postgres', + "INSERT INTO misc_tbl + SELECT g, array[1, 2, g], point(g*10+1, g*10+1), point(g*10+1, g*10+1), g%10 + FROM generate_series(1, 100000) g" + ); + + bulk_extend_sanity_check($node_primary, $extend_size); + + $node_primary->stop('immediate'); + $node_primary->start; + bulk_extend_sanity_check($node_primary, $extend_size); + + $node_primary->wait_for_catchup($node_standby); + bulk_extend_sanity_check($node_standby, $extend_size); +} + +foreach my 
+foreach my $extend_size (0, 1, 3, 16, 512)
+{
+	print("test extend_size=$extend_size\n");
+	$node_primary->append_conf('postgresql.conf',
+		"polar_heap_bulk_extend_size = $extend_size");
+	$node_primary->append_conf('postgresql.conf',
+		"polar_index_bulk_extend_size = $extend_size");
+	$node_primary->append_conf('postgresql.conf',
+		"polar_recovery_bulk_extend_size = $extend_size");
+	$node_primary->reload;
+	test_bulk_extend $extend_size;
+}
+
+done_testing();
diff --git a/src/test/regress/expected/create_index.out b/src/test/regress/expected/create_index.out
index d55aec3a1d0..80f4bfe7cbb 100644
--- a/src/test/regress/expected/create_index.out
+++ b/src/test/regress/expected/create_index.out
@@ -2,6 +2,9 @@ --
 -- CREATE_INDEX
 -- Create ancillary data structures (i.e. indices)
 --
+-- Disable bulk extend to keep table sizes unchanged and thus keep plans stable.
+set polar_heap_bulk_extend_size = 0;
+set polar_index_bulk_extend_size = 0;
 -- directory paths are passed to us in environment variables
 \getenv abs_srcdir PG_ABS_SRCDIR
 --
diff --git a/src/test/regress/expected/polar/polar_index_bulk_extend_for_coverage.out b/src/test/regress/expected/polar/polar_index_bulk_extend_for_coverage.out
deleted file mode 100644
index 747e20f0a1e..00000000000
--- a/src/test/regress/expected/polar/polar_index_bulk_extend_for_coverage.out
+++ /dev/null
@@ -1,58 +0,0 @@
--- This regression test case is used for coverage testing.
--- The feature index bulk extend test is in test/polar_pl.
--- Index bulk extend is used in big table index insert,
--- the regression test cases don't create an big table.
-ALTER SYSTEM SET polar_index_bulk_extend_size = 512;
-ALTER SYSTEM SET polar_min_bulk_extend_table_size = 0;
-SELECT pg_reload_conf();
- pg_reload_conf 
-----------------
- t
-(1 row)
-
-SELECT pg_sleep(2);
- pg_sleep 
-----------
- 
-(1 row)
-
-show polar_index_bulk_extend_size;
- polar_index_bulk_extend_size 
-------------------------------
- 4MB
-(1 row)
-
-show polar_min_bulk_extend_table_size;
- polar_min_bulk_extend_table_size 
-----------------------------------
- 0
-(1 row)
-
-CREATE TABLE test_index_bulk_extend(test1 int, test2 int);
-CREATE INDEX test_index_bulk on test_index_bulk_extend(test1);
-INSERT INTO test_index_bulk_extend values(generate_series(1, 10000), generate_series(1, 10000));
-SELECT * FROM test_index_bulk_extend ORDER BY test1 limit 10;
- test1 | test2 
--------+-------
-     1 |     1
-     2 |     2
-     3 |     3
-     4 |     4
-     5 |     5
-     6 |     6
-     7 |     7
-     8 |     8
-     9 |     9
-    10 |    10
-(10 rows)
-
-DROP INDEX test_index_bulk;
-DROP TABLE test_index_bulk_extend;
-ALTER SYSTEM RESET polar_index_bulk_extend_size;
-ALTER SYSTEM RESET polar_min_bulk_extend_table_size;
-SELECT pg_reload_conf();
- pg_reload_conf 
-----------------
- t
-(1 row)
-
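A unit detail that the deleted expected output above makes visible: setting polar_index_bulk_extend_size = 512 is reported back by SHOW as 4MB, which is consistent with the GUC being measured in blocks of the default 8kB size. A small sketch of that arithmetic, under that assumption (not part of the patch):

# Assumption suggested by the expected output above: the GUC's unit is
# blocks, and a block is 8kB by default.
my $block_size = 8192;
my $setting_in_blocks = 512;
my $shown_mb = $setting_in_blocks * $block_size / (1024 * 1024);
print("512 blocks * 8kB = ${shown_mb}MB\n");    # prints 4MB, matching SHOW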
diff --git a/src/test/regress/expected/test_setup.out b/src/test/regress/expected/test_setup.out
index 391b36d1318..5900541141b 100644
--- a/src/test/regress/expected/test_setup.out
+++ b/src/test/regress/expected/test_setup.out
@@ -1,6 +1,9 @@
 --
 -- TEST_SETUP --- prepare environment expected by regression test scripts
 --
+-- Disable bulk extend to keep table sizes unchanged and thus keep plans stable.
+set polar_heap_bulk_extend_size = 0;
+set polar_index_bulk_extend_size = 0;
 -- directory paths and dlsuffix are passed to us in environment variables
 \getenv abs_srcdir PG_ABS_SRCDIR
 \getenv libdir PG_LIBDIR
diff --git a/src/test/regress/polar_check_schedule b/src/test/regress/polar_check_schedule
index 69c82eb9253..cc27a7895ad 100644
--- a/src/test/regress/polar_check_schedule
+++ b/src/test/regress/polar_check_schedule
@@ -20,4 +20,4 @@ polar_dir: polar
 test: force_unlogged_logged force_trans_ro_non_sup
 test: polar_parallel_bgwriter
 test: polar_invalid_memory_alloc_1 polar_shm_unused
-test: polar_support_gbk_encoding polar_copy_into_gbk polar_index_bulk_extend_for_coverage
+test: polar_support_gbk_encoding polar_copy_into_gbk
diff --git a/src/test/regress/sql/create_index.sql b/src/test/regress/sql/create_index.sql
index d8fded3d930..362531b6d51 100644
--- a/src/test/regress/sql/create_index.sql
+++ b/src/test/regress/sql/create_index.sql
@@ -3,6 +3,10 @@
 -- Create ancillary data structures (i.e. indices)
 --
 
+-- Disable bulk extend to keep table sizes unchanged and thus keep plans stable.
+set polar_heap_bulk_extend_size = 0;
+set polar_index_bulk_extend_size = 0;
+
 -- directory paths are passed to us in environment variables
 \getenv abs_srcdir PG_ABS_SRCDIR
 
diff --git a/src/test/regress/sql/polar/polar_index_bulk_extend_for_coverage.sql b/src/test/regress/sql/polar/polar_index_bulk_extend_for_coverage.sql
deleted file mode 100644
index e57144cb0a1..00000000000
--- a/src/test/regress/sql/polar/polar_index_bulk_extend_for_coverage.sql
+++ /dev/null
@@ -1,19 +0,0 @@
--- This regression test case is used for coverage testing.
--- The feature index bulk extend test is in test/polar_pl.
--- Index bulk extend is used in big table index insert,
--- the regression test cases don't create an big table.
-ALTER SYSTEM SET polar_index_bulk_extend_size = 512;
-ALTER SYSTEM SET polar_min_bulk_extend_table_size = 0;
-SELECT pg_reload_conf();
-SELECT pg_sleep(2);
-show polar_index_bulk_extend_size;
-show polar_min_bulk_extend_table_size;
-CREATE TABLE test_index_bulk_extend(test1 int, test2 int);
-CREATE INDEX test_index_bulk on test_index_bulk_extend(test1);
-INSERT INTO test_index_bulk_extend values(generate_series(1, 10000), generate_series(1, 10000));
-SELECT * FROM test_index_bulk_extend ORDER BY test1 limit 10;
-DROP INDEX test_index_bulk;
-DROP TABLE test_index_bulk_extend;
-ALTER SYSTEM RESET polar_index_bulk_extend_size;
-ALTER SYSTEM RESET polar_min_bulk_extend_table_size;
-SELECT pg_reload_conf();
\ No newline at end of file
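The deleted coverage script needed ALTER SYSTEM, pg_reload_conf() and a pg_sleep(2) before the new value became visible, whereas the session-level SET used by the new regression setup takes effect immediately in the issuing backend. A sketch of the simpler flow in TAP terms, reusing $node_primary from the test above (not part of the patch):

# Session-level SET is visible immediately in this backend, and only here,
# so no reload or sleep is needed.
my $val = $node_primary->safe_psql('postgres',
	"SET polar_index_bulk_extend_size = 0; SHOW polar_index_bulk_extend_size;");
print("session-level setting: $val\n");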
diff --git a/src/test/regress/sql/test_setup.sql b/src/test/regress/sql/test_setup.sql
index 02c0c84c3ad..2a0f3b5959b 100644
--- a/src/test/regress/sql/test_setup.sql
+++ b/src/test/regress/sql/test_setup.sql
@@ -2,6 +2,10 @@
 -- TEST_SETUP --- prepare environment expected by regression test scripts
 --
 
+-- Disable bulk extend to keep table sizes unchanged and thus keep plans stable.
+set polar_heap_bulk_extend_size = 0;
+set polar_index_bulk_extend_size = 0;
+
 -- directory paths and dlsuffix are passed to us in environment variables
 \getenv abs_srcdir PG_ABS_SRCDIR
 \getenv libdir PG_LIBDIR
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index 0eff305b4cd..3fee816fb01 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -323,6 +323,8 @@ BuildAccumulator
 BuiltinScript
 BulkInsertState
 BulkInsertStateData
+BulkWriteBuffer
+BulkWriteState
 CACHESIGN
 CAC_state
 CCFastEqualFN
@@ -1699,6 +1701,7 @@ PGEventResultDestroy
 PGFInfoFunction
 PGFileType
 PGFunction
+PGIOAlignedBlock
 PGLZ_HistEntry
 PGLZ_Strategy
 PGMessageField
@@ -1986,6 +1989,7 @@ PendingFsyncEntry
 PendingRelDelete
 PendingRelSync
 PendingUnlinkEntry
+PendingWrite
 PendingWriteback
 PerLockTagEntry
 PerlInterpreter
diff --git a/src/tools/polar_copyright_check.pl b/src/tools/polar_copyright_check.pl
index 98b010f9f44..19e8dcfe2fa 100755
--- a/src/tools/polar_copyright_check.pl
+++ b/src/tools/polar_copyright_check.pl
@@ -59,7 +59,10 @@
 my $invalid_comment_body = 'Invalid comment body';
 
-my @common_typo = ('poalr', 'wirte', 'wrod', 'confict');
+my @common_typo = (
+	'poalr', 'wirte', 'wrod', 'confict',
+	'enalbe', 'cleard', 'recognisable', 'exsits',
+	'conficts', 'sucess');
 my @standard_comment_prefix = ('\/\* POLAR px\:', '\/\* POLAR\:');
 
 #------ the max diff line before checking PG community files ------
@@ -552,6 +555,14 @@ sub c_apache_license_format_check
 		return 0;
 	}
 
+	# identification indent
+	if ($lines[$start_line] !~ m/^ \*\t .+/)
+	{
+		print
+		  " $logger_error $invalid_comment_body: invalid identification path indent.\n";
+		return 0;
+	}
+
 	# trim the identification and check
 	my $trim = substr($lines[$start_line], 2);
 	$trim = lstrip($trim);
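The new check requires the IDENTIFICATION path line of a C file header to be indented as " *", then a tab, then at least one space before the path. A standalone sketch of what that regex accepts and rejects (not part of the patch; the file path is made up for illustration):

my $regex = qr/^ \*\t .+/;

# " *", a tab, then a space before the path: accepted.
my $good = " *\t  contrib/polar_example/polar_example.c";
# Spaces instead of a tab after " *": rejected by the new check.
my $bad = " *    contrib/polar_example/polar_example.c";

print(($good =~ $regex) ? "good line passes\n" : "good line fails\n");
print(($bad =~ $regex)  ? "bad line passes\n"  : "bad line fails\n");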