-
Notifications
You must be signed in to change notification settings - Fork 666
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Adds citus_pause_node udf #7089
Changes from 59 commits
9d46cd8
304a00e
7ccbb84
b178a75
20ae610
46ab6f1
8928c0f
6da0baa
afa7bf6
515627e
d42f557
4ed78f1
24380f8
28cda81
ed40dfe
63311e5
1c05eeb
339a47a
4c3341e
b69c36a
c41f93e
1a1b633
3220bd9
6f2ddf4
cc403bf
a05d5fc
29c5b0c
9e79cd6
997a5d7
bb62b84
3edefdc
b471bb0
88695c6
dd72cf0
18c55a4
86e0831
be2e653
d10eb05
de83b01
3fbe5e4
1a5cf9d
d9cecba
48a5450
458edd8
eda5539
7ac5f21
dd90ec7
4030ec9
af29e2d
6d171e9
22303ad
850df01
ecc675b
c4d694e
1e4699c
fcc6903
796a740
43abc8b
820ef06
4af29ed
8fb4d2f
463845e
c42413b
365b762
5ded545
2ac620d
d355193
3784889
43d961f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -9,7 +9,6 @@ | |
#include "funcapi.h" | ||
#include "utils/plancache.h" | ||
|
||
|
||
#include "access/genam.h" | ||
#include "access/heapam.h" | ||
#include "access/htup.h" | ||
|
@@ -102,8 +101,8 @@ | |
static int32 GetNextGroupId(void); | ||
static int GetNextNodeId(void); | ||
static void InsertPlaceholderCoordinatorRecord(void); | ||
static void InsertNodeRow(int nodeid, char *nodename, int32 nodeport, NodeMetadata | ||
*nodeMetadata); | ||
static void InsertNodeRow(int nodeid, char *nodename, int32 nodeport, | ||
NodeMetadata *nodeMetadata); | ||
static void DeleteNodeRow(char *nodename, int32 nodeport); | ||
static void BlockDistributedQueriesOnMetadataNodes(void); | ||
static WorkerNode * TupleToWorkerNode(TupleDesc tupleDescriptor, HeapTuple heapTuple); | ||
|
@@ -134,6 +133,13 @@ | |
static void EnsureParentSessionHasExclusiveLockOnPgDistNode(pid_t parentSessionPid); | ||
static void SetNodeMetadata(MetadataSyncContext *context, bool localOnly); | ||
static void EnsureTransactionalMetadataSyncMode(void); | ||
static void LockShardsInWorkerPlacementList(WorkerNode *workerNode, LOCKMODE | ||
lockMode); | ||
static BackgroundWorkerHandle * CheckBackgroundWorkerToObtainLocks(int32 lock_cooldown); | ||
static BackgroundWorkerHandle * LockPlacementsWithBackgroundWorkersInPrimaryNode( | ||
WorkerNode *workerNode, bool force, int32 lock_cooldown); | ||
|
||
/* Function definitions go here */ | ||
|
||
/* declarations for dynamic loading */ | ||
PG_FUNCTION_INFO_V1(citus_set_coordinator_host); | ||
|
@@ -152,6 +158,7 @@ | |
PG_FUNCTION_INFO_V1(citus_activate_node); | ||
PG_FUNCTION_INFO_V1(master_activate_node); | ||
PG_FUNCTION_INFO_V1(citus_update_node); | ||
PG_FUNCTION_INFO_V1(citus_pause_node_within_txn); | ||
PG_FUNCTION_INFO_V1(master_update_node); | ||
PG_FUNCTION_INFO_V1(get_shard_id_for_distribution_column); | ||
PG_FUNCTION_INFO_V1(citus_nodename_for_nodeid); | ||
|
@@ -160,7 +167,6 @@ | |
PG_FUNCTION_INFO_V1(citus_is_coordinator); | ||
PG_FUNCTION_INFO_V1(citus_internal_mark_node_not_synced); | ||
|
||
|
||
/* | ||
* DefaultNodeMetadata creates a NodeMetadata struct with the fields set to | ||
* sane defaults, e.g. nodeRack = WORKER_DEFAULT_RACK. | ||
|
@@ -544,7 +550,8 @@ | |
"metadata is not allowed"), | ||
errhint("You can force disabling node, SELECT " | ||
"citus_disable_node('%s', %d, " | ||
"synchronous:=true);", workerNode->workerName, | ||
"synchronous:=true);", | ||
workerNode->workerName, | ||
nodePort), | ||
errdetail("Citus uses the first worker node in the " | ||
"metadata for certain internal operations when " | ||
|
@@ -693,8 +700,7 @@ | |
else | ||
{ | ||
ereport(ERROR, (errmsg( | ||
"only the 'shouldhaveshards' property can be set using this function" | ||
))); | ||
"only the 'shouldhaveshards' property can be set using this function"))); | ||
} | ||
|
||
TransactionModifiedNodeMetadata = true; | ||
|
@@ -1160,6 +1166,100 @@ | |
} | ||
|
||
|
||
/* | ||
* Acquires shard metadata locks on all shards residing in the given worker node | ||
* | ||
* TODO: This function is not compatible with query from any node feature. | ||
* To ensure proper behavior, it is essential to acquire locks on placements across all nodes | ||
* rather than limiting it to just the coordinator (or the specific node from which this function is called) | ||
*/ | ||
void | ||
LockShardsInWorkerPlacementList(WorkerNode *workerNode, LOCKMODE lockMode) | ||
{ | ||
List *placementList = AllShardPlacementsOnNodeGroup(workerNode->groupId); | ||
LockShardsInPlacementListMetadata(placementList, lockMode); | ||
} | ||
|
||
|
||
/* | ||
* This function is used to start a background worker to kill backends holding conflicting | ||
* locks with this backend. It returns NULL if the background worker could not be started. | ||
*/ | ||
BackgroundWorkerHandle * | ||
CheckBackgroundWorkerToObtainLocks(int32 lock_cooldown) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Function needs a comment There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Added comment |
||
{ | ||
BackgroundWorkerHandle *handle = StartLockAcquireHelperBackgroundWorker(MyProcPid, | ||
lock_cooldown); | ||
if (!handle) | ||
{ | ||
/* | ||
* We failed to start a background worker, which probably means that we exceeded | ||
* max_worker_processes, and this is unlikely to be resolved by retrying. We do not want | ||
* to repeatedly throw an error because if citus_update_node is called to complete a | ||
* failover then finishing is the only way to bring the cluster back up. Therefore we | ||
* give up on killing other backends and simply wait for the lock. We do set | ||
* lock_timeout to lock_cooldown, because we don't want to wait forever to get a lock. | ||
*/ | ||
SetLockTimeoutLocally(lock_cooldown); | ||
ereport(WARNING, (errmsg( | ||
"could not start background worker to kill backends with conflicting" | ||
" locks to force the update. Degrading to acquiring locks " | ||
"with a lock time out."), | ||
errhint( | ||
"Increasing max_worker_processes might help."))); | ||
} | ||
return handle; | ||
} | ||
|
||
|
||
/* | ||
* This function is used to lock shards in a primary node. | ||
* If force is true, we start a background worker to kill backends holding | ||
* conflicting locks with this backend. | ||
* | ||
* If the node is a primary node we block reads and writes. | ||
* | ||
* This lock has two purposes: | ||
* | ||
* - Ensure buggy code in Citus doesn't cause failures when the | ||
* nodename/nodeport of a node changes mid-query | ||
* | ||
* - Provide fencing during failover, after this function returns all | ||
* connections will use the new node location. | ||
* | ||
* Drawback: | ||
* | ||
* - This function blocks until all previous queries have finished. This | ||
* means that long-running queries will prevent failover. | ||
* | ||
* In case of node failure said long-running queries will fail in the end | ||
* anyway as they will be unable to commit successfully on the failed | ||
* machine. To cause quick failure of these queries use force => true | ||
* during the invocation of citus_update_node to terminate conflicting | ||
* backends proactively. | ||
* | ||
* It might be worth blocking reads to a secondary for the same reasons, | ||
* though we currently only query secondaries on follower clusters | ||
* where these locks will have no effect. | ||
*/ | ||
BackgroundWorkerHandle * | ||
LockPlacementsWithBackgroundWorkersInPrimaryNode(WorkerNode *workerNode, bool force, int32 | ||
lock_cooldown) | ||
{ | ||
BackgroundWorkerHandle *handle = NULL; | ||
|
||
if (NodeIsPrimary(workerNode)) | ||
{ | ||
if (force) | ||
{ | ||
handle = CheckBackgroundWorkerToObtainLocks(lock_cooldown); | ||
} | ||
LockShardsInWorkerPlacementList(workerNode, AccessExclusiveLock); | ||
} | ||
return handle; | ||
} | ||
|
||
|
||
/* | ||
* citus_update_node moves the requested node to a different nodename and nodeport. It | ||
* locks to ensure no queries are running concurrently; and is intended for customers who | ||
|
@@ -1188,8 +1288,6 @@ | |
int32 lock_cooldown = PG_GETARG_INT32(4); | ||
|
||
char *newNodeNameString = text_to_cstring(newNodeName); | ||
List *placementList = NIL; | ||
BackgroundWorkerHandle *handle = NULL; | ||
|
||
WorkerNode *workerNodeWithSameAddress = FindWorkerNodeAnyCluster(newNodeNameString, | ||
newNodePort); | ||
|
@@ -1226,64 +1324,9 @@ | |
EnsureTransactionalMetadataSyncMode(); | ||
} | ||
|
||
/* | ||
* If the node is a primary node we block reads and writes. | ||
* | ||
* This lock has two purposes: | ||
* | ||
* - Ensure buggy code in Citus doesn't cause failures when the | ||
* nodename/nodeport of a node changes mid-query | ||
* | ||
* - Provide fencing during failover, after this function returns all | ||
* connections will use the new node location. | ||
* | ||
* Drawback: | ||
* | ||
* - This function blocks until all previous queries have finished. This | ||
* means that long-running queries will prevent failover. | ||
* | ||
* In case of node failure said long-running queries will fail in the end | ||
* anyway as they will be unable to commit successfully on the failed | ||
* machine. To cause quick failure of these queries use force => true | ||
* during the invocation of citus_update_node to terminate conflicting | ||
* backends proactively. | ||
* | ||
* It might be worth blocking reads to a secondary for the same reasons, | ||
* though we currently only query secondaries on follower clusters | ||
* where these locks will have no effect. | ||
*/ | ||
if (NodeIsPrimary(workerNode)) | ||
{ | ||
/* | ||
* before acquiring the locks check if we want a background worker to help us to | ||
* aggressively obtain the locks. | ||
*/ | ||
if (force) | ||
{ | ||
handle = StartLockAcquireHelperBackgroundWorker(MyProcPid, lock_cooldown); | ||
if (!handle) | ||
{ | ||
/* | ||
* We failed to start a background worker, which probably means that we exceeded | ||
* max_worker_processes, and this is unlikely to be resolved by retrying. We do not want | ||
* to repeatedly throw an error because if citus_update_node is called to complete a | ||
* failover then finishing is the only way to bring the cluster back up. Therefore we | ||
* give up on killing other backends and simply wait for the lock. We do set | ||
* lock_timeout to lock_cooldown, because we don't want to wait forever to get a lock. | ||
*/ | ||
SetLockTimeoutLocally(lock_cooldown); | ||
ereport(WARNING, (errmsg( | ||
"could not start background worker to kill backends with conflicting" | ||
" locks to force the update. Degrading to acquiring locks " | ||
"with a lock time out."), | ||
errhint( | ||
"Increasing max_worker_processes might help."))); | ||
} | ||
} | ||
|
||
placementList = AllShardPlacementsOnNodeGroup(workerNode->groupId); | ||
LockShardsInPlacementListMetadata(placementList, AccessExclusiveLock); | ||
} | ||
BackgroundWorkerHandle *handle = LockPlacementsWithBackgroundWorkersInPrimaryNode( | ||
workerNode, force, | ||
lock_cooldown); | ||
|
||
/* | ||
* if we have planned statements such as prepared statements, we should clear the cache so that | ||
|
@@ -1330,6 +1373,34 @@ | |
} | ||
|
||
|
||
/*
 * citus_pause_node_within_txn obtains locks for all the shards in a worker
 * placement list, which temporarily blocks writes to that node.
 *
 * Once the transaction is committed, the acquired locks will be automatically
 * released. Therefore, it is essential to invoke this function within a
 * transaction. This function proves beneficial when there is a need to
 * temporarily disable writes to a specific node within a transaction.
 *
 * Errors with ERRCODE_NO_DATA_FOUND when no node with the given id exists.
 */
Datum
citus_pause_node_within_txn(PG_FUNCTION_ARGS)
{
	CheckCitusVersion(ERROR);

	/* id of the node to pause, as stored in pg_dist_node */
	int32 nodeId = PG_GETARG_INT32(0);

	/* when true, terminate backends holding conflicting locks instead of waiting */
	bool force = PG_GETARG_BOOL(1);

	/* cooldown used as lock_timeout fallback — presumably milliseconds (SQL default is 10000); TODO confirm */
	int32 lock_cooldown = PG_GETARG_INT32(2);

	WorkerNode *workerNode = FindNodeAnyClusterByNodeId(nodeId);
	if (workerNode == NULL)
	{
		ereport(ERROR, (errcode(ERRCODE_NO_DATA_FOUND),
						errmsg("node %u not found", nodeId)));
	}

	/* only locks placements if the node is a primary; worker handle is intentionally ignored */
	LockPlacementsWithBackgroundWorkersInPrimaryNode(workerNode, force, lock_cooldown);

	PG_RETURN_VOID();
}
|
||
|
||
/* | ||
* master_update_node is a wrapper function for old UDF name. | ||
*/ | ||
|
@@ -1947,7 +2018,8 @@ | |
ereport(ERROR, (errmsg("cannot remove or disable the node " | ||
"%s:%d because because it contains " | ||
"the only shard placement for " | ||
"shard " UINT64_FORMAT, workerNode->workerName, | ||
"shard " UINT64_FORMAT, | ||
workerNode->workerName, | ||
workerNode->workerPort, placement->shardId), | ||
errdetail("One of the table(s) that prevents the operation " | ||
"complete successfully is %s", | ||
|
@@ -2499,7 +2571,8 @@ | |
if (!valueBool && workerNode->groupId == COORDINATOR_GROUP_ID) | ||
{ | ||
ereport(ERROR, (errmsg("cannot change \"%s\" field of the " | ||
"coordinator node", field))); | ||
"coordinator node", | ||
field))); | ||
} | ||
} | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,6 @@ | ||
-- citus--12.1-1--12.0-1

-- drop the UDF that was introduced in 12.1-1 so the downgraded catalog matches 12.0-1
DROP FUNCTION pg_catalog.citus_pause_node_within_txn(int,bool,int);
-- we have modified the relevant upgrade script to include any_value changes
-- we don't need to upgrade this downgrade path for any_value changes
-- since if we are doing a Citus downgrade, not PG downgrade, then it would be no-op.
|
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
-- citus_pause_node_within_txn: C UDF that locks all shard placements on the
-- given node for the duration of the calling transaction, blocking queries on
-- that node until commit/abort. STRICT: returns NULL on any NULL argument
-- without calling the C function.
CREATE FUNCTION pg_catalog.citus_pause_node_within_txn(node_id int,
                                                       force bool DEFAULT false,
                                                       lock_cooldown int DEFAULT 10000)
  RETURNS void
  LANGUAGE C STRICT
AS 'MODULE_PATHNAME', $$citus_pause_node_within_txn$$;

COMMENT ON FUNCTION pg_catalog.citus_pause_node_within_txn(node_id int,
                                                           force bool,
                                                           lock_cooldown int)
  IS 'pauses the node with the given id by locking its shard placements, preventing queries from being executed on that node until the transaction ends';

-- superuser-only by default; access must be granted explicitly
REVOKE ALL ON FUNCTION pg_catalog.citus_pause_node_within_txn(int,bool,int) FROM PUBLIC;
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
More code from citus_update_node should be included here. Basically everything starting at the big comment at line 1246. Because citus_pause_node_within_txn should support the
force
argument as well.Also the comment about why we don't run this on secondaries should be included.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Added the code you requested