whatyouhide · harunzengin · Sep 1, 2023 · Sep 1, 2023 · Sep 1, 2023 · Sep 1, 2023
diff --git a/lib/xandra/cluster.ex b/lib/xandra/cluster.ex
@@ -508,12 +508,19 @@ defmodule Xandra.Cluster do
   end
 
   defp with_conn_and_retrying(cluster, options, fun) do
-    RetryStrategy.run_with_retrying(options, fn -> with_conn(cluster, fun) end)
+    # Check out all connected hosts up front and hand them to the cluster-level
+    # retry loop, which decides which connection `fun` runs on.
+    case Pool.checkout(cluster) do
+      {:ok, connected_hosts} ->
+        RetryStrategy.run_cluster_with_retrying(options, connected_hosts, fun)
+
+      {:error, :empty} ->
+        reason = {:cluster, :not_connected}
+        {:error, ConnectionError.new("checkout from cluster #{inspect(cluster)}", reason)}
+    end
   end
 
   defp with_conn(cluster, fun) do
     case Pool.checkout(cluster) do
-      {:ok, pool} ->
+      {:ok, [{pool, _host} | _connected_hosts]} ->
         fun.(pool)
 
       {:error, :empty} ->

diff --git a/lib/xandra/cluster/pool.ex b/lib/xandra/cluster/pool.ex
@@ -60,7 +60,8 @@ defmodule Xandra.Cluster.Pool do
     :gen_statem.stop(pid, reason, timeout)
   end
 
-  @spec checkout(:gen_statem.server_ref()) :: {:ok, pid()} | {:error, :empty}
+  @spec checkout(:gen_statem.server_ref()) ::
+          {:ok, [{pid(), Host.t()}, ...]} | {:error, :empty}
   def checkout(pid) do
     :gen_statem.call(pid, :checkout)
   end
@@ -333,6 +334,33 @@ defmodule Xandra.Cluster.Pool do
     {:keep_state, data}
   end
 
+  # Test-only hook. For each `{host, status}` pair it feeds the host to the
+  # load-balancing policy and stores a placeholder peer entry for it.
+  def handle_event(:info, {:add_test_hosts, hosts_with_status}, _state, %__MODULE__{} = data) do
+    new_data =
+      Enum.reduce(hosts_with_status, data, fn {%Host{} = host, status}, acc ->
+        lb_module = acc.load_balancing_module
+        added_state = lb_module.host_added(acc.load_balancing_state, host)
+        # `status` selects the policy callback to run next, e.g. `:up` -> `host_up/2`.
+        status_callback = String.to_existing_atom("host_" <> Atom.to_string(status))
+        lb_state = apply(lb_module, status_callback, [added_state, host])
+
+        peername = Host.to_peername(host)
+
+        # The spawned function returns immediately; only its pid is kept as a placeholder.
+        peer = %{
+          host: host,
+          status: status,
+          pool_pid: Process.spawn(fn -> nil end, []),
+          pool_ref: make_ref()
+        }
+
+        %{acc | load_balancing_state: lb_state, peers: Map.put(acc.peers, peername, peer)}
+      end)
+
+    {:keep_state, new_data}
+  end
+
   # Sent by the connection itself.
   def handle_event(
         :info,
@@ -412,14 +440,19 @@ defmodule Xandra.Cluster.Pool do
         data.load_balancing_module.query_plan(lb_state)
       end)
 
-    # Find the first host in the plan for which we have a pool.
+    # Find all connected hosts, skipping query-plan entries that have no known peer.
+    connected_hosts =
+      for planned_host <- query_plan,
+          %{pool_pid: pid, host: host} <- [Map.get(data.peers, Host.to_peername(planned_host))],
+          is_pid(pid) and not is_nil(host),
+          do: {pid, host}
+
+
     reply =
-      query_plan
-      |> Stream.map(fn %Host{} = host -> Map.fetch(data.peers, Host.to_peername(host)) end)
-      |> Enum.find_value(_default = {:error, :empty}, fn
-        {:ok, %{pool_pid: pid}} when is_pid(pid) -> {:ok, pid}
-        _other -> nil
-      end)
+      case connected_hosts do
+        [] -> {:error, :empty}
+        connected_hosts -> {:ok, connected_hosts}
+      end
 
     {data, {:reply, from, reply}}
   end

diff --git a/lib/xandra/retry_strategy.ex b/lib/xandra/retry_strategy.ex
@@ -16,18 +16,26 @@ defmodule Xandra.RetryStrategy do
   When a query fails and a retry strategy module was passed as an option, Xandra
   will:
 
-    1. invoke the `c:new/1` callback with the options passed to the failing
-       function to initialize the given retry strategy
+    1. invoke the `c:new/1` callback with options passed to the failing function
+       to initialize the given retry strategy
 
-    1. ask the retry strategy whether to retry or error out (`c:retry/3`) until
+    2. ask the retry strategy whether to retry or error out (`c:retry/3`) until
        either the query succeeds or `c:retry/3` says to error out
 
   The `c:new/1` and `c:retry/3` callbacks will be invoked in the same
   process that executed the original query.
 
-  If `c:retry/3` says to retry a query, such query will be retried on a
-  different Xandra connection than the one the query was first executed
-  through. For more information, see the documentation for `c:retry/3`.
+  A retry strategy is invoked at two levels, distinguished by the
+  `:execution_level` key in the options passed to `c:new/1` and `c:retry/3`:
+  the `:cluster` level and the `:xandra` level. At the `:cluster` level, you can
+  select a `:target_connection` from the list of `:connected_hosts`, for instance
+  to retry on a different node. The `:connected_hosts` option is a list of
+  `{connection_pid, host}` tuples, where `connection_pid` is the Xandra connection
+  pid and `host` is a `Host.t()` describing the node.
+
+  If `c:retry/3` says to retry a query at the `:cluster` level, the query is retried
+  on the connection returned under the `:target_connection` key of the new options.
+  If that key is not set, a random connection from `:connected_hosts` is used.
 
   ## Examples
 
@@ -75,12 +83,54 @@ defmodule Xandra.RetryStrategy do
         end
       end
 
+  A particularly useful application is to retry queries on different hosts
+  when using `Xandra.Cluster`. We can even choose not to execute on certain `Host.t()`s
+  (for example, because they are in a different datacenter). The following example retries
+  on each of the remaining `:connected_hosts` in turn after the first one has failed:
+
+      defmodule AllNodesStrategy do
+        @behaviour Xandra.RetryStrategy
+
+        alias Xandra.Cluster.Host
+
+        def new(options) do
+          if options[:execution_level] == :cluster do
+            [_already_tried_node | rest_of_nodes] = options[:connected_hosts]
+
+            rest_of_nodes
+          end
+        end
+
+        def retry(_error, options, state) do
+          case options[:execution_level] do
+            :xandra ->
+              :error
+
+            :cluster ->
+              case state do
+                [] ->
+                  :error
+
+                [{conn, %Host{}} | rest_of_nodes] ->
+                  options = Keyword.put(options, :target_connection, conn)
+                  {:retry, options, rest_of_nodes}
+              end
+          end
+        end
+      end
   """
 
+  alias Xandra.Cluster.Host
+
   @type state :: term
 
   @doc """
   Initializes the state of a retry strategy based on the given `options`.
+
+  At the `:cluster` execution level, `options[:connected_hosts]` is a list of
+  `{connection_pid, host}` tuples, where `host` is a `Host.t()`. Save whatever part of
+  it your retry logic needs in the returned state, so that `c:retry/3` can select the
+  next host. See the "Examples" section of the module documentation for an example.
   """
   @callback new(options :: keyword) :: state
 
@@ -105,25 +155,31 @@ defmodule Xandra.RetryStrategy do
   third argument. This process will continue until either the query is executed
   successfully or this callback returns `:error`.
 
-  Note that when `{:retry, new_options, new_state}` is returned, the query will
-  be executed again *on a different Xandra connection*. This behaviour is
-  particularly useful with pooled connections and especially when using
-  `Xandra.Cluster` as the pool, since it will mean that there's a chance the
-  retried query will be executed on a different node altogether.
+  Note that with `execution_level: :cluster`, if the returned options contain a
+  `:target_connection` pid, the query is retried on that `Xandra` connection. To select
+  a connection pid, use the `:connected_hosts` key in `options`.
+
+  With `execution_level: :xandra`, the query is retried on the exact same connection.
   """
   @callback retry(error :: term, options :: keyword, state) ::
               :error | {:retry, new_options :: keyword, new_state :: state}
 
   @doc false
-  @spec run_with_retrying(keyword, (-> result)) :: result when result: var
+  @spec run_with_retrying(keyword, (-> result)) :: result
+        when result: var
   def run_with_retrying(options, fun) do
+    options = Keyword.put(options, :execution_level, :xandra)
+
     case Keyword.pop(options, :retry_strategy) do
-      {nil, _options} -> fun.()
-      {retry_strategy, options} -> run_with_retrying(options, retry_strategy, fun)
+      {nil, _options} ->
+        fun.()
+
+      {retry_strategy, options} ->
+        run_with_retrying(options, retry_strategy, fun)
     end
   end
 
-  defp run_with_retrying(options, retry_strategy, fun) do
+  def run_with_retrying(options, retry_strategy, fun) do
     with {:error, reason} <- fun.() do
       {retry_state, options} =
         Keyword.pop_lazy(options, :retrying_state, fn ->
@@ -146,4 +202,63 @@ defmodule Xandra.RetryStrategy do
       end
     end
   end
+
+  # Cluster-level entry point: runs `fun` on the first connection in `connected_hosts`
+  # and, when a `:retry_strategy` option is given, lets that strategy handle failures.
+  # Hidden from the generated docs like `run_with_retrying/2`.
+  @doc false
+  @spec run_cluster_with_retrying(Keyword.t(), [{pid(), Host.t()}, ...], (pid() -> result)) ::
+          result
+        when result: var
+  def run_cluster_with_retrying(options, connected_hosts, fun) do
+    # The first host in the list is always the one tried first.
+    [{conn, _host} | _connected_hosts] = connected_hosts
+
+    # `:execution_level` and `:connected_hosts` are there for the strategy callbacks;
+    # `:target_connection` tells the retry loop which connection to start with.
+    options =
+      Keyword.merge(options,
+        execution_level: :cluster,
+        connected_hosts: connected_hosts,
+        target_connection: conn
+      )
+
+    case Keyword.pop(options, :retry_strategy) do
+      {nil, _options} ->
+        # No strategy configured: a single attempt, result returned as-is.
+        fun.(conn)
+
+      {retry_strategy, options} ->
+        run_cluster_with_retrying(options, connected_hosts, retry_strategy, fun)
+    end
+  end
+
+  # Runs `fun` on the connection selected by `:target_connection` and, for as long as
+  # `fun` returns `{:error, reason}`, asks `retry_strategy` whether to retry.
+  defp run_cluster_with_retrying(options, connected_hosts, retry_strategy, fun) do
+    # `:target_connection` is consumed here, so the strategy callbacks only see it
+    # again if the strategy itself puts it into the options it returns.
+    {target, options} = Keyword.pop(options, :target_connection)
+    conn = resolve_target_connection(target, connected_hosts, retry_strategy)
+
+    with {:error, reason} <- fun.(conn) do
+      # The strategy state is created lazily, on the first failure only.
+      {retry_state, options} =
+        Keyword.pop_lazy(options, :retrying_state, fn ->
+          retry_strategy.new(options)
+        end)
+
+      case retry_strategy.retry(reason, options, retry_state) do
+        :error ->
+          {:error, reason}
+
+        {:retry, new_options, new_retry_state} ->
+          # Strategies that do not pick a connection get a random connected host.
+          new_options =
+            new_options
+            |> Keyword.put(:retrying_state, new_retry_state)
+            |> Keyword.put_new(:target_connection, :random)
+
+          run_cluster_with_retrying(new_options, connected_hosts, retry_strategy, fun)
+
+        other ->
+          raise ArgumentError,
+                "invalid return value #{inspect(other)} from " <>
+                  "retry strategy #{inspect(retry_strategy)} " <>
+                  "with state #{inspect(retry_state)}"
+      end
+    end
+  end
+
+  # Turns the `:target_connection` option into the pid the next attempt runs on.
+  defp resolve_target_connection(conn, _connected_hosts, _retry_strategy) when is_pid(conn),
+    do: conn
+
+  defp resolve_target_connection(:random, connected_hosts, _retry_strategy) do
+    {conn, _host} = Enum.random(connected_hosts)
+    conn
+  end
+
+  # Anything else can only come from the options returned by the strategy, so fail
+  # with a message that names it instead of an opaque `CaseClauseError`.
+  defp resolve_target_connection(other, _connected_hosts, retry_strategy) do
+    raise ArgumentError,
+          "invalid :target_connection #{inspect(other)} from " <>
+            "retry strategy #{inspect(retry_strategy)}, expected a pid"
+  end
 end