From 283fefd992f89e8def6eb1d63d394ffddcd61ca8 Mon Sep 17 00:00:00 2001 From: Philip Sampaio Date: Mon, 19 Aug 2024 23:15:15 -0300 Subject: [PATCH 01/20] Refactor handle of query to its own module The idea is to make easier to compose the query based on options. --- lib/req_athena.ex | 65 +++++++++++++--------------------- lib/req_athena/query.ex | 57 +++++++++++++++++++++++++++++ test/req_athena/query_test.exs | 36 +++++++++++++++++++ test/req_athena_test.exs | 5 ++- 4 files changed, 121 insertions(+), 42 deletions(-) create mode 100644 lib/req_athena/query.ex create mode 100644 test/req_athena/query_test.exs diff --git a/lib/req_athena.ex b/lib/req_athena.ex index 6e57e22..9234f53 100644 --- a/lib/req_athena.ex +++ b/lib/req_athena.ex @@ -38,7 +38,7 @@ defmodule ReqAthena do * `:database` - Required. The AWS Athena database name. - * `:output_location` - Conditional. The S3 url location to output AWS Athena query results. + * `:output_location` - Conditional. The S3 URL location to output AWS Athena query results. * `:workgroup` - Conditional. The AWS Athena workgroup. 
@@ -126,11 +126,15 @@ defmodule ReqAthena do end end + defp put_request_body(request, query, cache_query) when is_binary(query) do + put_request_body(request, %ReqAthena.Query{query: query}, cache_query) + end + defp put_request_body(request, {query, []}, cache_query) do - put_request_body(request, query, cache_query) + put_request_body(request, %ReqAthena.Query{query: query}, cache_query) end - defp put_request_body(request, {query, _params}, cache_query) do + defp put_request_body(request, {query, params}, cache_query) do hash = if cache_query do query |> :erlang.md5() |> Base.encode16() @@ -138,16 +142,12 @@ defmodule ReqAthena do :os.system_time() |> to_string() end - statement_name = "query_" <> hash + query = %ReqAthena.Query{query: query, params: params, statement_name: "query_" <> hash} - request - |> put_request_body("PREPARE #{statement_name} FROM #{query}", cache_query) - |> Request.put_private(:athena_parameterized?, true) - |> Request.put_private(:athena_statement_name, statement_name) + put_request_body(request, query, cache_query) end - defp put_request_body(request, query, cache_query) - when is_binary(query) do + defp put_request_body(request, %ReqAthena.Query{} = query, cache_query) do output_config = case {request.options[:output_location], request.options[:workgroup]} do {output, workgroup} when is_empty(output) and is_empty(workgroup) -> @@ -166,13 +166,14 @@ defmodule ReqAthena do body = Map.merge(output_config, %{ QueryExecutionContext: %{Database: fetch_option!(request, :database)}, - QueryString: query + QueryString: ReqAthena.Query.to_query_string(query) }) client_request_token = generate_client_request_token(body, cache_query) body = Map.put(body, :ClientRequestToken, client_request_token) %{request | body: Jason.encode!(body)} + |> Request.put_private(:athena_query, query) end defp generate_client_request_token(parameters, cache_query) do @@ -191,9 +192,9 @@ defmodule ReqAthena do defp handle_athena_result({request, %{status: 200} = 
response}) do action = Request.get_private(request, :athena_action) - parameterized? = Request.get_private(request, :athena_parameterized?, false) + query = Request.get_private(request, :athena_query) - case {action, parameterized?} do + case {action, ReqAthena.Query.to_prepare?(query)} do {"StartQueryExecution", _} -> get_query_state(request, response) @@ -267,26 +268,26 @@ defmodule ReqAthena do end end - @athena_keys ~w(athena_action athena_parameterized? athena_wait_count)a + @athena_keys ~w(athena_action athena_query athena_wait_count)a defp execute_prepared_query(request) do - {_, params} = fetch_option!(request, :athena) - statement_name = Req.Request.get_private(request, :athena_statement_name) - athena = "EXECUTE #{statement_name} USING " <> Enum.map_join(params, ", ", &encode_value/1) - {_, private} = Map.split(request.private, @athena_keys) + {ours_private, theirs_private} = Map.split(request.private, @athena_keys) + + %ReqAthena.Query{prepared: false} = query = ours_private.athena_query + prepared_query = %ReqAthena.Query{query | prepared: true} request = %{ request - | private: private, + | private: theirs_private, current_request_steps: Keyword.keys(request.request_steps) } - Request.halt(request, Req.post!(request, athena: athena)) + Request.halt(request, Req.post!(request, athena: prepared_query)) end defp decode_result(request, response) do body = Jason.decode!(response.body) - statement_name = Request.get_private(request, :athena_statement_name) + query = Request.get_private(request, :athena_query) query_execution_id = Request.get_private(request, :athena_query_execution_id) output_location = Request.get_private(request, :athena_output_location) @@ -301,7 +302,7 @@ defmodule ReqAthena do %ReqAthena.Result{ query_execution_id: query_execution_id, output_location: output_location, - statement_name: statement_name, + statement_name: query.statement_name, rows: decode_rows(rows, columns_info), columns: decode_column_labels(column_labels), metadata: 
columns_info @@ -311,7 +312,7 @@ defmodule ReqAthena do %ReqAthena.Result{ query_execution_id: query_execution_id, output_location: output_location, - statement_name: statement_name + statement_name: query.statement_name } body -> @@ -408,24 +409,6 @@ defmodule ReqAthena do defp now, do: NaiveDateTime.utc_now() |> NaiveDateTime.to_erl() - defp encode_value(value) when is_binary(value), do: "'#{value}'" - defp encode_value(%Date{} = value), do: to_string(value) |> encode_value() - - defp encode_value(%DateTime{} = value) do - value - |> DateTime.to_naive() - |> encode_value() - end - - defp encode_value(%NaiveDateTime{} = value) do - value - |> NaiveDateTime.truncate(:millisecond) - |> to_string() - |> encode_value() - end - - defp encode_value(value), do: value - defp decode_value(nil, _), do: nil @integer_types ~w(bigint smallint integer) diff --git a/lib/req_athena/query.ex b/lib/req_athena/query.ex new file mode 100644 index 0000000..dfe0df6 --- /dev/null +++ b/lib/req_athena/query.ex @@ -0,0 +1,57 @@ +defmodule ReqAthena.Query do + @moduledoc false + # This module represents a query and its attributes. + + defstruct query: nil, params: nil, statement_name: nil, prepared: false + + @doc """ + Returns if this query is using params or not. + """ + def parameterized?(%__MODULE__{} = query), do: List.wrap(query.params) != [] + + @doc """ + Returns if this query is using params and if it was not prepared. + + This is useful to determine if the query is going to perform an "EXECUTE" or + a "PREPARE" command. + """ + def to_prepare?(%__MODULE__{} = query), do: parameterized?(query) and query.prepared == false + + @doc """ + Builds the final query to send to the Athena service. 
+ """ + def to_query_string(%__MODULE__{} = query) do + cond do + query.prepared -> + "EXECUTE #{query.statement_name} USING " <> + Enum.map_join(query.params, ", ", &encode_value/1) + + parameterized?(query) -> + if is_nil(query.statement_name), + do: raise(":statement_name is required for a parameterized query") + + "PREPARE #{query.statement_name} FROM #{query.query}" + + true -> + query.query + end + end + + defp encode_value(value) when is_binary(value), do: "'#{value}'" + defp encode_value(%Date{} = value), do: to_string(value) |> encode_value() + + defp encode_value(%DateTime{} = value) do + value + |> DateTime.to_naive() + |> encode_value() + end + + defp encode_value(%NaiveDateTime{} = value) do + value + |> NaiveDateTime.truncate(:millisecond) + |> to_string() + |> encode_value() + end + + defp encode_value(value), do: value +end diff --git a/test/req_athena/query_test.exs b/test/req_athena/query_test.exs new file mode 100644 index 0000000..d7042b2 --- /dev/null +++ b/test/req_athena/query_test.exs @@ -0,0 +1,36 @@ +defmodule ReqAthena.QueryTest do + use ExUnit.Case, async: true + + alias ReqAthena.Query + + describe "to_query_string/1" do + test "simple query without params" do + query = %Query{query: "SELECT name, id FROM users"} + + assert Query.to_query_string(query) == "SELECT name, id FROM users" + end + + test "query with params unprepared" do + query = %Query{ + query: "SELECT name, id FROM users WHERE id > ?", + params: [420], + statement_name: "test_statement" + } + + assert Query.to_query_string(query) == + "PREPARE test_statement FROM SELECT name, id FROM users WHERE id > ?" 
+ end + + test "query with params and prepared" do + query = %Query{ + query: "SELECT name, id FROM users WHERE id > ?", + params: [420], + prepared: true, + statement_name: "test_statement" + } + + assert Query.to_query_string(query) == + "EXECUTE test_statement USING 420" + end + end +end diff --git a/test/req_athena_test.exs b/test/req_athena_test.exs index f5f6bad..d81a872 100644 --- a/test/req_athena_test.exs +++ b/test/req_athena_test.exs @@ -226,8 +226,11 @@ defmodule ReqAthenaTest do results = %{ "GetQueryResults" => fn request -> + query = Req.Request.get_private(request, :athena_query) + to_prepare? = ReqAthena.Query.to_prepare?(query) + data = - if Req.Request.get_private(request, :athena_parameterized?) do + if to_prepare? do %{"ResultSet" => %{"Output" => ""}} else %{ From 42f1f619de66cab90577ec97818eb8bb6100f57a Mon Sep 17 00:00:00 2001 From: Philip Sampaio Date: Wed, 21 Aug 2024 16:46:27 -0300 Subject: [PATCH 02/20] Introduce "UNLOAD" command to the query builder The "UNLOAD" command allows us to specify the format of the output, so we can tell Athena to save our results as Parquet. This will be useful for loading them with Explorer. --- lib/req_athena/query.ex | 64 ++++++++++++++++++++++++++++++++-- test/req_athena/query_test.exs | 37 ++++++++++++++++++++ 2 files changed, 98 insertions(+), 3 deletions(-) diff --git a/lib/req_athena/query.ex b/lib/req_athena/query.ex index dfe0df6..874b8c0 100644 --- a/lib/req_athena/query.ex +++ b/lib/req_athena/query.ex @@ -2,7 +2,7 @@ defmodule ReqAthena.Query do @moduledoc false # This module represents a query and its attributes. - defstruct query: nil, params: nil, statement_name: nil, prepared: false + defstruct query: nil, params: nil, statement_name: nil, prepared: false, unload: nil @doc """ Returns if this query is using params or not. 
@@ -30,10 +30,10 @@ defmodule ReqAthena.Query do if is_nil(query.statement_name), do: raise(":statement_name is required for a parameterized query") - "PREPARE #{query.statement_name} FROM #{query.query}" + "PREPARE #{query.statement_name} FROM #{maybe_around_unload(query)}" true -> - query.query + maybe_around_unload(query) end end @@ -54,4 +54,62 @@ defmodule ReqAthena.Query do end defp encode_value(value), do: value + + defp maybe_around_unload(%{query: query_string, unload: [_ | _] = opts}) + when is_binary(query_string) do + # UNLOAD works only with SELECT + if query_string =~ ~r/^[\s]*select/i do + {to, props} = Keyword.pop!(opts, :to) + + props = + Enum.intersperse( + for( + {key, value} <- props, + not is_nil(value), + do: [Atom.to_string(key), " = ", encode_value(value)] + ), + ", " + ) + + IO.iodata_to_binary([ + "UNLOAD (", + query_string, + ")", + "\n", + "TO ", + encode_value(to), + "\n", + "WITH (", + props, + ")" + ]) + else + query_string + end + end + + defp maybe_around_unload(%{query: query_string}), do: query_string + + @doc """ + Add attributes required by the "UNLOAD" command. 
+ + See: https://docs.aws.amazon.com/athena/latest/ug/unload.html + """ + def with_unload(%__MODULE__{} = query, opts) do + opts = + Keyword.validate!(opts, + to: nil, + format: "PARQUET", + compression: "SNAPPY", + compression_level: nil, + field_delimiter: nil, + partitioned_by: nil + ) + + if opts[:to] in ["", nil] do + raise "`:to` is required by UNLOAD" + end + + %{query | unload: opts} + end end diff --git a/test/req_athena/query_test.exs b/test/req_athena/query_test.exs index d7042b2..f065067 100644 --- a/test/req_athena/query_test.exs +++ b/test/req_athena/query_test.exs @@ -33,4 +33,41 @@ defmodule ReqAthena.QueryTest do "EXECUTE test_statement USING 420" end end + + describe "with_unload/2" do + test "unload attributes" do + query = %Query{query: "SELECT name, id FROM users"} + query = Query.with_unload(query, to: "s3://my-bucket/my-dir") + + assert Query.to_query_string(query) == + "UNLOAD (SELECT name, id FROM users)\nTO 's3://my-bucket/my-dir'\nWITH (compression = 'SNAPPY', format = 'PARQUET')" + end + + test "unload attributes and a prepare statement does use unload command" do + query = %Query{ + query: "SELECT name, id FROM users WHERE id > ?", + params: [420], + statement_name: "test_statement" + } + + query = Query.with_unload(query, to: "s3://my-bucket/my-dir") + + assert Query.to_query_string(query) == + "PREPARE test_statement FROM UNLOAD (SELECT name, id FROM users WHERE id > ?)\nTO 's3://my-bucket/my-dir'\nWITH (compression = 'SNAPPY', format = 'PARQUET')" + end + + test "unload attributes and an execute command does not use the unload command" do + query = %Query{ + query: "SELECT name, id FROM users WHERE id > ?", + params: [420], + prepared: true, + statement_name: "test_statement" + } + + query = Query.with_unload(query, to: "s3://my-bucket/my-dir") + + assert Query.to_query_string(query) == + "EXECUTE test_statement USING 420" + end + end end From 670a179767c1c172f1c614a5b05eef3592e88ecb Mon Sep 17 00:00:00 2001 From: Philip Sampaio 
Date: Wed, 21 Aug 2024 23:55:30 -0300 Subject: [PATCH 03/20] Add initial integration with Explorer See: https://github.com/livebook-dev/req_athena/issues/36 --- lib/req_athena.ex | 85 +++++++++++++++++++-- lib/req_athena/query.ex | 11 ++- mix.exs | 1 + mix.lock | 4 + test/integration_test.exs | 68 +++++++++++------ test/req_athena/query_test.exs | 34 +++++++++ test/req_athena_test.exs | 135 ++++++++++++++++----------------- 7 files changed, 237 insertions(+), 101 deletions(-) diff --git a/lib/req_athena.ex b/lib/req_athena.ex index 9234f53..e220ec3 100644 --- a/lib/req_athena.ex +++ b/lib/req_athena.ex @@ -2,8 +2,13 @@ defmodule ReqAthena do @moduledoc """ `Req` plugin for [AWS Athena](https://docs.aws.amazon.com/athena/latest/APIReference/Welcome.html). - ReqAthena makes it easy to make Athena queries. Query results are decoded into the `ReqAthena.Result` struct. - The struct implements the `Table.Reader` protocol and thus can be efficiently traversed by rows or columns. + ReqAthena makes it easy to make Athena queries and save the results into S3 buckets. + + By default, `ReqAthena` will save results using the Apache Parquet format, and return a + `Explorer.DataFrame` as a lazy frame pointing to all the partition files. These partitions + are sorted independently, but we cannot guarantee ordering as a whole. + + See the limitations in the [`UNLOAD` command docs](https://docs.aws.amazon.com/athena/latest/ug/unload.html#unload-considerations-and-limitations). """ require Logger @@ -19,8 +24,11 @@ defmodule ReqAthena do athena output_location cache_query + no_explorer )a + @credential_keys ~w(access_key_id secret_access_key region token)a + defguardp is_empty(value) when value in [nil, ""] @doc """ @@ -39,15 +47,23 @@ defmodule ReqAthena do * `:database` - Required. The AWS Athena database name. * `:output_location` - Conditional. The S3 URL location to output AWS Athena query results. 
+ Results will be saved as Parquet and loaded with Explorer only if this option is given. * `:workgroup` - Conditional. The AWS Athena workgroup. * `:cache_query` - Optional. Forces a non-cached result from AWS Athena. - * `:athena` - Required. The query to execute. It can be a plain sql string or + * `:no_explorer` - Disable output as an Explorer dataframe. Defaults to `nil`, which + enables Explorer integration by default. + + * `:athena` - Required. The query to execute. It can be a plain SQL string or a `{query, params}` tuple, where `query` can contain `?` placeholders and `params` is a list of corresponding values. + There is a limitation of Athena that requires the `:output_location` to be empty + for every query. So we append "results" to the `:output_location`, so the partition + files are saved there. + Conditional fields must always be defined, and can be one of the fields or both. If you want to set any of these options when attaching the plugin, pass them as the second argument. @@ -163,6 +179,18 @@ defmodule ReqAthena do %{WorkGroup: workgroup, ResultConfiguration: %{OutputLocation: output}} end + query = + if not (!!request.options[:no_explorer]) and is_binary(request.options[:output_location]) do + ReqAthena.Query.with_unload( + query, + # We need to add this "subdirectory" because Athena expects the results directory + # to be empty. 
+ to: Path.join(request.options[:output_location], "results") + ) + else + query + end + body = Map.merge(output_config, %{ QueryExecutionContext: %{Database: fetch_option!(request, :database)}, @@ -205,12 +233,59 @@ defmodule ReqAthena do execute_prepared_query(request) {"GetQueryResults", _} -> - decode_result(request, response) + if ReqAthena.Query.is_select(query) and not is_nil(query.unload) do + build_explorer_lazy_frame(request, response) + else + decode_result(request, response) + end end end defp handle_athena_result(request_response), do: request_response + defp build_explorer_lazy_frame(request, response) do + body = Jason.decode!(response.body) + + result = + if Map.has_key?(body, "ResultSet") do + manifest_csv_location = + Request.get_private(request, :athena_output_location) <> "-manifest.csv" + + aws_credentials = + for key <- @credential_keys, + value = request.options[key], + not is_nil(value), + do: {key, value} + + # This private field is only meant to be used in tests. + fetcher_and_builder = + Request.get_private(request, :athena_dataframe_builder, &fetch_and_build_dataframe/2) + + fetcher_and_builder.(manifest_csv_location, aws_credentials) + else + body + end + + Request.halt(request, %{response | body: result}) + end + + @doc false + def fetch_and_build_dataframe(manifest_csv_location, aws_credentials) do + # TODO: Should we handle errors here? 
+ manifest_df = + Explorer.DataFrame.from_csv!(manifest_csv_location, + header: false, + config: aws_credentials + ) + + manifest_df[0] + |> Explorer.Series.to_list() + |> Enum.map(fn parquet_location -> + Explorer.DataFrame.from_parquet!(parquet_location, lazy: true, config: aws_credentials) + end) + |> Explorer.DataFrame.concat_rows() + end + defp get_query_state(request, response) do response = %{request | body: response.body} @@ -379,8 +454,6 @@ defmodule ReqAthena do ) end - @credential_keys ~w(access_key_id secret_access_key region token)a - defp maybe_put_aws_credentials(request) do case aws_credentials() do :undefined -> diff --git a/lib/req_athena/query.ex b/lib/req_athena/query.ex index 874b8c0..1658615 100644 --- a/lib/req_athena/query.ex +++ b/lib/req_athena/query.ex @@ -55,10 +55,17 @@ defmodule ReqAthena.Query do defp encode_value(value), do: value - defp maybe_around_unload(%{query: query_string, unload: [_ | _] = opts}) + def is_select(%{query: query_string}) + when is_binary(query_string) do + query_string =~ ~r/^[\s]*select/i + end + + def can_use_unload?(_), do: false + + defp maybe_around_unload(%{query: query_string, unload: [_ | _] = opts} = query) when is_binary(query_string) do # UNLOAD works only with SELECT - if query_string =~ ~r/^[\s]*select/i do + if is_select(query) do {to, props} = Keyword.pop!(opts, :to) props = diff --git a/mix.exs b/mix.exs index 7c15780..4861a0e 100644 --- a/mix.exs +++ b/mix.exs @@ -43,6 +43,7 @@ defmodule ReqAthena.MixProject do [ {:req, "~> 0.5.0"}, {:aws_signature, "~> 0.3.0"}, + {:explorer, "~> 0.9.0"}, {:aws_credentials, "~> 0.2", optional: true}, {:table, "~> 0.1.1", optional: true}, {:tzdata, "~> 1.1.1", only: :test}, diff --git a/mix.lock b/mix.lock index c847fdc..6887fcc 100644 --- a/mix.lock +++ b/mix.lock @@ -6,7 +6,9 @@ "earmark_parser": {:hex, :earmark_parser, "1.4.41", "ab34711c9dc6212dda44fcd20ecb87ac3f3fce6f0ca2f28d4a00e4154f8cd599", [:mix], [], "hexpm", 
"a81a04c7e34b6617c2792e291b5a2e57ab316365c2644ddc553bb9ed863ebefa"}, "eini": {:hex, :eini_beam, "2.2.4", "02143b1dce4dda4243248e7d9b3d8274b8d9f5a666445e3d868e2cce79e4ff22", [:rebar3], [], "hexpm", "12de479d144b19e09bb92ba202a7ea716739929afdf9dff01ad802e2b1508471"}, "ex_doc": {:hex, :ex_doc, "0.34.2", "13eedf3844ccdce25cfd837b99bea9ad92c4e511233199440488d217c92571e8", [:mix], [{:earmark_parser, "~> 1.4.39", [hex: :earmark_parser, repo: "hexpm", optional: false]}, {:makeup_c, ">= 0.1.0", [hex: :makeup_c, repo: "hexpm", optional: true]}, {:makeup_elixir, "~> 0.14 or ~> 1.0", [hex: :makeup_elixir, repo: "hexpm", optional: false]}, {:makeup_erlang, "~> 0.1 or ~> 1.0", [hex: :makeup_erlang, repo: "hexpm", optional: false]}, {:makeup_html, ">= 0.1.0", [hex: :makeup_html, repo: "hexpm", optional: true]}], "hexpm", "5ce5f16b41208a50106afed3de6a2ed34f4acfd65715b82a0b84b49d995f95c1"}, + "explorer": {:hex, :explorer, "0.9.1", "9c6f175dfd2fa2f432d5fe9a86b81875438a9a1110af5b952c284842bee434e4", [:mix], [{:adbc, "~> 0.1", [hex: :adbc, repo: "hexpm", optional: true]}, {:aws_signature, "~> 0.3", [hex: :aws_signature, repo: "hexpm", optional: false]}, {:castore, "~> 1.0", [hex: :castore, repo: "hexpm", optional: true]}, {:flame, "~> 0.3", [hex: :flame, repo: "hexpm", optional: true]}, {:fss, "~> 0.1", [hex: :fss, repo: "hexpm", optional: false]}, {:nx, "~> 0.4", [hex: :nx, repo: "hexpm", optional: true]}, {:rustler, "~> 0.34.0", [hex: :rustler, repo: "hexpm", optional: true]}, {:rustler_precompiled, "~> 0.7", [hex: :rustler_precompiled, repo: "hexpm", optional: false]}, {:table, "~> 0.1.2", [hex: :table, repo: "hexpm", optional: false]}, {:table_rex, "~> 3.1.1 or ~> 4.0.0", [hex: :table_rex, repo: "hexpm", optional: false]}], "hexpm", "d88ec0e78f904c5eaf0b37c4a0ce4632de133515f3740a29fbddd2c0d0a78e77"}, "finch": {:hex, :finch, "0.18.0", "944ac7d34d0bd2ac8998f79f7a811b21d87d911e77a786bc5810adb75632ada4", [:mix], [{:castore, "~> 0.1 or ~> 1.0", [hex: :castore, repo: "hexpm", optional: 
false]}, {:mime, "~> 1.0 or ~> 2.0", [hex: :mime, repo: "hexpm", optional: false]}, {:mint, "~> 1.3", [hex: :mint, repo: "hexpm", optional: false]}, {:nimble_options, "~> 0.4 or ~> 1.0", [hex: :nimble_options, repo: "hexpm", optional: false]}, {:nimble_pool, "~> 0.2.6 or ~> 1.0", [hex: :nimble_pool, repo: "hexpm", optional: false]}, {:telemetry, "~> 0.4 or ~> 1.0", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "69f5045b042e531e53edc2574f15e25e735b522c37e2ddb766e15b979e03aa65"}, + "fss": {:hex, :fss, "0.1.1", "9db2344dbbb5d555ce442ac7c2f82dd975b605b50d169314a20f08ed21e08642", [:mix], [], "hexpm", "78ad5955c7919c3764065b21144913df7515d52e228c09427a004afe9c1a16b0"}, "hackney": {:hex, :hackney, "1.18.1", "f48bf88f521f2a229fc7bae88cf4f85adc9cd9bcf23b5dc8eb6a1788c662c4f6", [:rebar3], [{:certifi, "~> 2.9.0", [hex: :certifi, repo: "hexpm", optional: false]}, {:idna, "~> 6.1.0", [hex: :idna, repo: "hexpm", optional: false]}, {:metrics, "~> 1.0.0", [hex: :metrics, repo: "hexpm", optional: false]}, {:mimerl, "~> 1.1", [hex: :mimerl, repo: "hexpm", optional: false]}, {:parse_trans, "3.3.1", [hex: :parse_trans, repo: "hexpm", optional: false]}, {:ssl_verify_fun, "~> 1.1.0", [hex: :ssl_verify_fun, repo: "hexpm", optional: false]}, {:unicode_util_compat, "~> 0.7.0", [hex: :unicode_util_compat, repo: "hexpm", optional: false]}], "hexpm", "a4ecdaff44297e9b5894ae499e9a070ea1888c84afdd1fd9b7b2bc384950128e"}, "hpax": {:hex, :hpax, "1.0.0", "28dcf54509fe2152a3d040e4e3df5b265dcb6cb532029ecbacf4ce52caea3fd2", [:mix], [], "hexpm", "7f1314731d711e2ca5fdc7fd361296593fc2542570b3105595bb0bc6d0fad601"}, "idna": {:hex, :idna, "6.1.1", "8a63070e9f7d0c62eb9d9fcb360a7de382448200fbbd1b106cc96d3d8099df8d", [:rebar3], [{:unicode_util_compat, "~> 0.7.0", [hex: :unicode_util_compat, repo: "hexpm", optional: false]}], "hexpm", "92376eb7894412ed19ac475e4a86f7b413c1b9fbb5bd16dccd57934157944cea"}, @@ -25,8 +27,10 @@ "nimble_pool": {:hex, :nimble_pool, "1.1.0", 
"bf9c29fbdcba3564a8b800d1eeb5a3c58f36e1e11d7b7fb2e084a643f645f06b", [:mix], [], "hexpm", "af2e4e6b34197db81f7aad230c1118eac993acc0dae6bc83bac0126d4ae0813a"}, "parse_trans": {:hex, :parse_trans, "3.3.1", "16328ab840cc09919bd10dab29e431da3af9e9e7e7e6f0089dd5a2d2820011d8", [:rebar3], [], "hexpm", "07cd9577885f56362d414e8c4c4e6bdf10d43a8767abb92d24cbe8b24c54888b"}, "req": {:hex, :req, "0.5.6", "8fe1eead4a085510fe3d51ad854ca8f20a622aae46e97b302f499dfb84f726ac", [:mix], [{:brotli, "~> 0.3.1", [hex: :brotli, repo: "hexpm", optional: true]}, {:ezstd, "~> 1.0", [hex: :ezstd, repo: "hexpm", optional: true]}, {:finch, "~> 0.17", [hex: :finch, repo: "hexpm", optional: false]}, {:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: false]}, {:mime, "~> 2.0.6 or ~> 2.1", [hex: :mime, repo: "hexpm", optional: false]}, {:nimble_csv, "~> 1.0", [hex: :nimble_csv, repo: "hexpm", optional: true]}, {:plug, "~> 1.0", [hex: :plug, repo: "hexpm", optional: true]}], "hexpm", "cfaa8e720945d46654853de39d368f40362c2641c4b2153c886418914b372185"}, + "rustler_precompiled": {:hex, :rustler_precompiled, "0.7.2", "097f657e401f02e7bc1cab808cfc6abdc1f7b9dc5e5adee46bf2fd8fdcce9ecf", [:mix], [{:castore, "~> 0.1 or ~> 1.0", [hex: :castore, repo: "hexpm", optional: false]}, {:rustler, "~> 0.23", [hex: :rustler, repo: "hexpm", optional: true]}], "hexpm", "7663faaeadc9e93e605164dcf9e69168e35f2f8b7f2b9eb4e400d1a8e0fe2999"}, "ssl_verify_fun": {:hex, :ssl_verify_fun, "1.1.7", "354c321cf377240c7b8716899e182ce4890c5938111a1296add3ec74cf1715df", [:make, :mix, :rebar3], [], "hexpm", "fe4c190e8f37401d30167c8c405eda19469f34577987c76dde613e838bbc67f8"}, "table": {:hex, :table, "0.1.2", "87ad1125f5b70c5dea0307aa633194083eb5182ec537efc94e96af08937e14a8", [:mix], [], "hexpm", "7e99bc7efef806315c7e65640724bf165c3061cdc5d854060f74468367065029"}, + "table_rex": {:hex, :table_rex, "4.0.0", "3c613a68ebdc6d4d1e731bc973c233500974ec3993c99fcdabb210407b90959b", [:mix], [], "hexpm", 
"c35c4d5612ca49ebb0344ea10387da4d2afe278387d4019e4d8111e815df8f55"}, "telemetry": {:hex, :telemetry, "1.2.1", "68fdfe8d8f05a8428483a97d7aab2f268aaff24b49e0f599faa091f1d4e7f61c", [:rebar3], [], "hexpm", "dad9ce9d8effc621708f99eac538ef1cbe05d6a874dd741de2e689c47feafed5"}, "tzdata": {:hex, :tzdata, "1.1.1", "20c8043476dfda8504952d00adac41c6eda23912278add38edc140ae0c5bcc46", [:mix], [{:hackney, "~> 1.17", [hex: :hackney, repo: "hexpm", optional: false]}], "hexpm", "a69cec8352eafcd2e198dea28a34113b60fdc6cb57eb5ad65c10292a6ba89787"}, "unicode_util_compat": {:hex, :unicode_util_compat, "0.7.0", "bc84380c9ab48177092f43ac89e4dfa2c6d62b40b8bd132b1059ecc7232f9a78", [:rebar3], [], "hexpm", "25eee6d67df61960cf6a794239566599b09e17e668d3700247bc498638152521"}, diff --git a/test/integration_test.exs b/test/integration_test.exs index 458de3f..e916722 100644 --- a/test/integration_test.exs +++ b/test/integration_test.exs @@ -53,31 +53,49 @@ defmodule IntegrationTest do assert query_response.status == 200 - assert query_response.body.columns == [ - "id", - "type", - "tags", - "members", - "timestamp", - "visible" - ] - - refute query_response.body.statement_name - assert is_binary(query_response.body.query_execution_id) + assert %Explorer.DataFrame{} = ldf = query_response.body + assert Explorer.DataFrame.lazy?(ldf) + + df = Explorer.DataFrame.collect(ldf) + + names = [ + "id", + "type", + "tags", + "members", + "timestamp", + "visible" + ] - assert query_response.body.output_location == - "#{opts[:output_location]}/#{query_response.body.query_execution_id}.csv" + values = [ + 470_454, + "relation", + [ + %{ + "key" => "source", + "value" => "©IGN 2010 dans le cadre de la cartographie réglementaire" + }, + %{"key" => "site", "value" => "geodesic"}, + %{ + "key" => "url", + "value" => + "http://geodesie.ign.fr/fiches/index.php?module=e&action=fichepdf&source=carte&sit_no=17229A" + }, + %{"key" => "name", "value" => "Mérignac A"}, + %{"key" => "network", "value" => "NTF-5"}, + %{"key" 
=> "ref", "value" => "17229A"}, + %{"key" => "type", "value" => "site"} + ], + [ + %{"ref" => 670_007_839, "role" => "", "type" => "node"}, + %{"ref" => 670_007_840, "role" => "", "type" => "node"} + ], + ~N[2017-01-21 12:51:34.000000], + true + ] - assert query_response.body.rows == [ - [ - 470_454, - "relation", - "{ref=17229A, site=geodesic, name=Mérignac A, source=©IGN 2010 dans le cadre de la cartographie réglementaire, type=site, url=http://geodesie.ign.fr/fiches/index.php?module=e&action=fichepdf&source=carte&sit_no=17229A, network=NTF-5}", - "[{type=node, ref=670007839, role=}, {type=node, ref=670007840, role=}]", - ~N[2017-01-21 12:51:34.000], - true - ] - ] + assert Explorer.DataFrame.names(df) == names + assert Explorer.DataFrame.to_rows(df) == [Map.new(Enum.zip(names, values))] end test "returns the response from AWS Athena's API with parameterized query" do @@ -86,6 +104,7 @@ defmodule IntegrationTest do secret_access_key: System.fetch_env!("AWS_SECRET_ACCESS_KEY"), region: System.fetch_env!("AWS_REGION"), database: "default", + no_explorer: true, output_location: System.fetch_env!("AWS_ATHENA_OUTPUT_LOCATION") ] @@ -121,6 +140,7 @@ defmodule IntegrationTest do secret_access_key: System.fetch_env!("AWS_SECRET_ACCESS_KEY"), region: System.fetch_env!("AWS_REGION"), database: "default", + no_explorer: true, output_location: System.fetch_env!("AWS_ATHENA_OUTPUT_LOCATION") ] @@ -265,6 +285,7 @@ defmodule IntegrationTest do secret_access_key: System.fetch_env!("AWS_SECRET_ACCESS_KEY"), region: System.fetch_env!("AWS_REGION"), database: "default", + no_explorer: true, output_location: System.fetch_env!("AWS_ATHENA_OUTPUT_LOCATION") ] @@ -301,6 +322,7 @@ defmodule IntegrationTest do secret_access_key: System.fetch_env!("AWS_SECRET_ACCESS_KEY"), region: System.fetch_env!("AWS_REGION"), database: "default", + no_explorer: true, output_location: System.fetch_env!("AWS_ATHENA_OUTPUT_LOCATION"), cache_query: false ] diff --git a/test/req_athena/query_test.exs 
b/test/req_athena/query_test.exs index f065067..d276b81 100644 --- a/test/req_athena/query_test.exs +++ b/test/req_athena/query_test.exs @@ -69,5 +69,39 @@ defmodule ReqAthena.QueryTest do assert Query.to_query_string(query) == "EXECUTE test_statement USING 420" end + + test "unload attributes and a create command does not use the unload command" do + create = """ + CREATE EXTERNAL TABLE IF NOT EXISTS planet ( + id BIGINT, + type STRING, + tags MAP, + lat DECIMAL(9,7), + lon DECIMAL(10,7), + nds ARRAY>, + members ARRAY>, + changeset BIGINT, + timestamp TIMESTAMP, + uid BIGINT, + user STRING, + version BIGINT, + visible BOOLEAN + ) + STORED AS ORCFILE + LOCATION 's3://osm-pds/planet/';\ + """ + + query = %Query{ + query: create, + params: [420], + prepared: true, + statement_name: "test_statement" + } + + query = Query.with_unload(query, to: "s3://my-bucket/my-dir") + + assert Query.to_query_string(query) == + "EXECUTE test_statement USING 420" + end end end diff --git a/test/req_athena_test.exs b/test/req_athena_test.exs index d81a872..95b0a53 100644 --- a/test/req_athena_test.exs +++ b/test/req_athena_test.exs @@ -2,7 +2,7 @@ defmodule ReqAthenaTest do use ExUnit.Case, async: true @moduletag :capture_log - test "executes a query string" do + test "executes a query string returning a data frame" do opts = [ access_key_id: "some key", secret_access_key: "dummy", @@ -11,47 +11,53 @@ defmodule ReqAthenaTest do output_location: "s3://foo" ] + request_validations = %{ + "StartQueryExecution" => fn request -> + decoded = Jason.decode!(request.body) + + assert %{ + "ClientRequestToken" => client_req_token, + "QueryExecutionContext" => %{ + "Database" => "my_awesome_database" + }, + "QueryString" => + "UNLOAD (select * from iris)\nTO 's3://foo/results'\nWITH (compression = 'SNAPPY', format = 'PARQUET')", + "ResultConfiguration" => %{"OutputLocation" => "s3://foo"} + } = decoded + + assert is_binary(client_req_token) + end + } + + me = self() + assert response = - 
Req.new(adapter: fake_athena()) + Req.new(adapter: fake_athena(request_validations)) |> Req.Request.put_header("x-auth", "my awesome auth header") + |> Req.Request.put_private(:athena_dataframe_builder, fn manifest_location, + credentials -> + assert manifest_location == "s3://foo-manifest.csv" + + assert Enum.sort(Keyword.take(opts, [:access_key_id, :secret_access_key, :region])) == + Enum.sort(credentials) + + send(me, {:explorer_built, manifest_location}) + + Explorer.DataFrame.new(id: [1, 2], name: ["Ale", "Wojtek"]) + end) |> ReqAthena.attach(opts) |> Req.post!(athena: "select * from iris") assert response.status == 200 - assert response.body == %ReqAthena.Result{ - columns: ["id", "name"], - output_location: "s3://foo", - query_execution_id: "an uuid", - rows: [[1, "Ale"], [2, "Wojtek"]], - statement_name: nil, - metadata: [ - %{ - "CaseSensitive" => false, - "CatalogName" => "hive", - "Label" => "id", - "Name" => "id", - "Nullable" => "UNKNOWN", - "Precision" => 10, - "Scale" => 0, - "SchemaName" => "", - "TableName" => "", - "Type" => "integer" - }, - %{ - "CaseSensitive" => true, - "CatalogName" => "hive", - "Label" => "name", - "Name" => "name", - "Nullable" => "UNKNOWN", - "Precision" => 2_147_483_647, - "Scale" => 0, - "SchemaName" => "", - "TableName" => "", - "Type" => "varchar" - } - ] + assert df = %Explorer.DataFrame{} = response.body + + assert Explorer.DataFrame.to_columns(df, atom_keys: true) == %{ + id: [1, 2], + name: ["Ale", "Wojtek"] } + + assert_received {:explorer_built, _output_location} end test "parses a response with a datum object missing" do @@ -60,6 +66,7 @@ defmodule ReqAthenaTest do secret_access_key: "dummy", region: "us-east-1", database: "my_awesome_database", + no_explorer: true, output_location: "s3://foo" ] @@ -308,6 +315,7 @@ defmodule ReqAthenaTest do secret_access_key: "dummy", region: "us-east-1", database: "my_awesome_database", + no_explorer: true, output_location: "s3://foo" ] @@ -375,46 +383,33 @@ defmodule 
ReqAthenaTest do output_location: "s3://foo" ] - assert response = - Req.new(adapter: fake_athena(validations)) - |> ReqAthena.attach(opts) - |> Req.post!(athena: "select * from iris") + me = self() + + response = + Req.new(adapter: fake_athena(validations)) + |> ReqAthena.attach(opts) + |> Req.Request.put_private(:athena_dataframe_builder, fn manifest_location, credentials -> + assert manifest_location == "s3://foo-manifest.csv" + + assert Enum.sort( + Keyword.take(opts, [:access_key_id, :secret_access_key, :region, :token]) + ) == + Enum.sort(credentials) + + send(me, :explorer_built) + + Explorer.DataFrame.new(id: [1, 2], name: ["Ale", "Wojtek"]) + end) + |> Req.post!(athena: "select * from iris") assert response.status == 200 - assert response.body == %ReqAthena.Result{ - columns: ["id", "name"], - output_location: "s3://foo", - query_execution_id: "an uuid", - rows: [[1, "Ale"], [2, "Wojtek"]], - statement_name: nil, - metadata: [ - %{ - "CaseSensitive" => false, - "CatalogName" => "hive", - "Label" => "id", - "Name" => "id", - "Nullable" => "UNKNOWN", - "Precision" => 10, - "Scale" => 0, - "SchemaName" => "", - "TableName" => "", - "Type" => "integer" - }, - %{ - "CaseSensitive" => true, - "CatalogName" => "hive", - "Label" => "name", - "Name" => "name", - "Nullable" => "UNKNOWN", - "Precision" => 2_147_483_647, - "Scale" => 0, - "SchemaName" => "", - "TableName" => "", - "Type" => "varchar" - } - ] + assert Explorer.DataFrame.to_columns(response.body, atom_keys: true) == %{ + id: [1, 2], + name: ["Ale", "Wojtek"] } + + assert_received :explorer_built end test "executes a query with workgroup" do From 94ef79b3db6787ac63c78f701ecc202f79d80faa Mon Sep 17 00:00:00 2001 From: Philip Sampaio Date: Thu, 29 Aug 2024 21:39:27 -0300 Subject: [PATCH 04/20] WIP: make the default output be the decoded API result --- lib/req_athena.ex | 233 +++++++++------- lib/req_athena/query.ex | 2 +- test/req_athena/query_test.exs | 12 +- test/req_athena_test.exs | 484 
+++++++++++++++++++++++++++++++++++++++++-------------------- 4 files changed, 343 insertions(+), 388 deletions(-) diff --git a/lib/req_athena.ex b/lib/req_athena.ex index e220ec3..4fb3463 100644 --- a/lib/req_athena.ex +++ b/lib/req_athena.ex @@ -4,11 +4,8 @@ defmodule ReqAthena do ReqAthena makes it easy to make Athena queries and save the results into S3 buckets. - By default, `ReqAthena` will save results using the Apache Parquet format, and return a - `Explorer.DataFrame` as a lazy frame pointing to all the partition files. These partitions - are sorted independently, but we cannot guarantee ordering as a whole. - - See the limitations in the [`UNLOAD` command docs](https://docs.aws.amazon.com/athena/latest/ug/unload.html#unload-considerations-and-limitations). + By default, `ReqAthena` will query results and use the default output format, + which is CSV. To change that, you can use the `:format` option documented below. """ require Logger @@ -24,7 +21,9 @@ defmodule ReqAthena do athena output_location cache_query - no_explorer + format + decode_body + output_compression )a @credential_keys ~w(access_key_id secret_access_key region token)a @@ -53,16 +52,32 @@ defmodule ReqAthena do * `:cache_query` - Optional. Forces a non-cached result from AWS Athena. - * `:no_explorer` - Disable output as an Explorer dataframe. Defaults to `nil`, which - enables Explorer integration by default. + * `:format` - Optional. It changes the output format. By default this is + `:none`, which means that we return the decoded result from the Athena API. + The supported formats are: `:csv`, `:explorer`, `:json` and `:textfile`. + + For `:csv`, the contents of the CSV file are the output instead of the API return. + When `:json` is used, the contents of the JSON files are going to be the output. + Notice that the body is decoded by default and to prevent that, you need to use + the `:decode_body` option, so you get the "raw" data.
+ The `:explorer` format will perform the query unloading it to Parquet files, and + then will lazy load these parquet files into an Explorer dataframe. + + There are some limitations when using the `:json` and `:explorer` format. + See more about it reading the [`UNLOAD` command docs](https://docs.aws.amazon.com/athena/latest/ug/unload.html#unload-considerations-and-limitations). + + * `:output_compression` - Optional. Sets the Parquet compression format and level + for the output when using the Explorer output format. This can be a string, like `"gzip"`, + or a tuple with `{format, level}`, like: `{"ZSTD", 4}`. By default this is `nil`, + which means that for Parquet (the format that Explorer uses) this is going to be `"gzip"`. * `:athena` - Required. The query to execute. It can be a plain SQL string or a `{query, params}` tuple, where `query` can contain `?` placeholders and `params` is a list of corresponding values. - There is a limitation of Athena that requires the `:output_location` to be empty - for every query. So we append "results" to the `:output_location`, so the partition - files are saved there. + There is a limitation of Athena that requires the `:output_location` to be present + for every query that outputs to a format other than "CSV". So we append "results" + to the `:output_location` to make the partition files be saved there. Conditional fields must always be defined, and can be one of the fields or both. @@ -179,15 +194,22 @@ defmodule ReqAthena do %{WorkGroup: workgroup, ResultConfiguration: %{OutputLocation: output}} end + output_format = Request.get_option(request, :format, :none) + query = - if not (!!request.options[:no_explorer]) and is_binary(request.options[:output_location]) do + if output_format not in [:csv, :none] and is_binary(request.options[:output_location]) do ReqAthena.Query.with_unload( query, # We need to add this "subdirectory" because Athena expects the results directory - # to be empty. 
+ # to be empty for the "UNLOAD" command. to: Path.join(request.options[:output_location], "results") ) else + if output_format in [:parquet, :orc, :avro, :json, :textfile] do + raise ArgumentError, + ":output_location needs to be defined in order to use the #{inspect(output_format)} format" + end + query end @@ -200,8 +222,7 @@ defmodule ReqAthena do client_request_token = generate_client_request_token(body, cache_query) body = Map.put(body, :ClientRequestToken, client_request_token) - %{request | body: Jason.encode!(body)} - |> Request.put_private(:athena_query, query) + Request.put_private(%{request | body: Jason.encode!(body)}, :athena_query, query) end defp generate_client_request_token(parameters, cache_query) do @@ -233,17 +254,43 @@ defmodule ReqAthena do execute_prepared_query(request) {"GetQueryResults", _} -> - if ReqAthena.Query.is_select(query) and not is_nil(query.unload) do - build_explorer_lazy_frame(request, response) - else - decode_result(request, response) + output_format = Request.get_option(request, :format, :none) + + case output_format do + :none -> + if Request.get_option(request, :decode_body, true) do + Request.halt(request, %{response | body: Jason.decode!(response.body)}) + else + Request.halt(request, response) + end + + :csv -> + get_csv_result(request, response) + + :json -> + get_json_result(request, response) + + :explorer -> + get_explorer_lazy_frame(request, response) + + other -> + raise ArgumentError, + ":format - `#{inspect(other)}` is not valid. Only :none, :csv, :json or :explorer are accepted." 
end end end defp handle_athena_result(request_response), do: request_response - defp build_explorer_lazy_frame(request, response) do + defp get_csv_result(request, response) do + Request.halt(request, response) + end + + defp get_json_result(request, response) do + Request.halt(request, response) + end + + defp get_explorer_lazy_frame(request, response) do body = Jason.decode!(response.body) result = @@ -339,7 +386,11 @@ defmodule ReqAthena do end _other_state -> - decode_result(request, response) + if Request.get_option(request, :decode_body, true) do + Request.halt(request, %{response | body: body}) + else + Request.halt(request, response) + end end end @@ -360,57 +411,57 @@ defmodule ReqAthena do Request.halt(request, Req.post!(request, athena: prepared_query)) end - defp decode_result(request, response) do - body = Jason.decode!(response.body) - query = Request.get_private(request, :athena_query) - query_execution_id = Request.get_private(request, :athena_query_execution_id) - output_location = Request.get_private(request, :athena_output_location) - - result = - case body do - %{ - "ResultSet" => %{ - "ResultSetMetadata" => %{"ColumnInfo" => columns_info}, - "Rows" => [%{"Data" => column_labels} | rows] - } - } -> - %ReqAthena.Result{ - query_execution_id: query_execution_id, - output_location: output_location, - statement_name: query.statement_name, - rows: decode_rows(rows, columns_info), - columns: decode_column_labels(column_labels), - metadata: columns_info - } - - %{"ResultSet" => _} -> - %ReqAthena.Result{ - query_execution_id: query_execution_id, - output_location: output_location, - statement_name: query.statement_name - } - - body -> - body - end - - Request.halt(request, %{response | body: result}) - end - - defp decode_column_labels(column_labels) do - Enum.map(column_labels, &Map.fetch!(&1, "VarCharValue")) - end - - defp decode_rows(rows, columns_info) do - column_types = Enum.map(columns_info, &Map.take(&1, ["Type"])) - - Enum.map(rows, fn 
%{"Data" => datums} -> - Enum.zip_with([datums, column_types], fn [datum, column_type] -> - value = datum["VarCharValue"] || "" - decode_value(value, column_type) - end) - end) - end + # defp decode_result(request, response) do + # body = Jason.decode!(response.body) + # query = Request.get_private(request, :athena_query) + # query_execution_id = Request.get_private(request, :athena_query_execution_id) + # output_location = Request.get_private(request, :athena_output_location) + + # result = + # case body do + # %{ + # "ResultSet" => %{ + # "ResultSetMetadata" => %{"ColumnInfo" => columns_info}, + # "Rows" => [%{"Data" => column_labels} | rows] + # } + # } -> + # %ReqAthena.Result{ + # query_execution_id: query_execution_id, + # output_location: output_location, + # statement_name: query.statement_name, + # rows: decode_rows(rows, columns_info), + # columns: decode_column_labels(column_labels), + # metadata: columns_info + # } + + # %{"ResultSet" => _} -> + # %ReqAthena.Result{ + # query_execution_id: query_execution_id, + # output_location: output_location, + # statement_name: query.statement_name + # } + + # body -> + # body + # end + + # Request.halt(request, %{response | body: result}) + # end + + # defp decode_column_labels(column_labels) do + # Enum.map(column_labels, &Map.fetch!(&1, "VarCharValue")) + # end + + # defp decode_rows(rows, columns_info) do + # column_types = Enum.map(columns_info, &Map.take(&1, ["Type"])) + + # Enum.map(rows, fn %{"Data" => datums} -> + # Enum.zip_with([datums, column_types], fn [datum, column_type] -> + # value = datum["VarCharValue"] || "" + # decode_value(value, column_type) + # end) + # end) + # end # TODO: Add step `put_aws_sigv4` to Req # See: https://github.com/wojtekmach/req/issues/62 @@ -482,34 +533,34 @@ defmodule ReqAthena do defp now, do: NaiveDateTime.utc_now() |> NaiveDateTime.to_erl() - defp decode_value(nil, _), do: nil + # defp decode_value(nil, _), do: nil - @integer_types ~w(bigint smallint integer) + # 
@integer_types ~w(bigint smallint integer) - defp decode_value(value, %{"Type" => type}) when type in @integer_types, - do: String.to_integer(value) + # defp decode_value(value, %{"Type" => type}) when type in @integer_types, + # do: String.to_integer(value) - @float_types ~w(double float decimal) + # @float_types ~w(double float decimal) - defp decode_value(value, %{"Type" => type}) when type in @float_types, - do: String.to_float(value) + # defp decode_value(value, %{"Type" => type}) when type in @float_types, + # do: String.to_float(value) - defp decode_value("true", %{"Type" => "boolean"}), do: true - defp decode_value("false", %{"Type" => "boolean"}), do: false - defp decode_value(value, %{"Type" => "date"}), do: Date.from_iso8601!(value) + # defp decode_value("true", %{"Type" => "boolean"}), do: true + # defp decode_value("false", %{"Type" => "boolean"}), do: false + # defp decode_value(value, %{"Type" => "date"}), do: Date.from_iso8601!(value) - defp decode_value(value, %{"Type" => "timestamp"}), do: NaiveDateTime.from_iso8601!(value) + # defp decode_value(value, %{"Type" => "timestamp"}), do: NaiveDateTime.from_iso8601!(value) - defp decode_value(value, %{"Type" => "timestamp with time zone"}) do - [d, t, tz] = String.split(value, " ", trim: true) - date = Date.from_iso8601!(d) - time = Time.from_iso8601!(t) + # defp decode_value(value, %{"Type" => "timestamp with time zone"}) do + # [d, t, tz] = String.split(value, " ", trim: true) + # date = Date.from_iso8601!(d) + # time = Time.from_iso8601!(t) - DateTime.new!(date, time, tz) - |> DateTime.truncate(:millisecond) - end + # DateTime.new!(date, time, tz) + # |> DateTime.truncate(:millisecond) + # end - defp decode_value(value, _), do: value + # defp decode_value(value, _), do: value # TODO: Use Req.Request.get_option/3 when Req 0.4.0 is out. 
defp get_option(request, key, default) when is_atom(key) do diff --git a/lib/req_athena/query.ex b/lib/req_athena/query.ex index 1658615..3ae8b0a 100644 --- a/lib/req_athena/query.ex +++ b/lib/req_athena/query.ex @@ -107,7 +107,7 @@ defmodule ReqAthena.Query do Keyword.validate!(opts, to: nil, format: "PARQUET", - compression: "SNAPPY", + compression: nil, compression_level: nil, field_delimiter: nil, partitioned_by: nil diff --git a/test/req_athena/query_test.exs b/test/req_athena/query_test.exs index d276b81..55b4e47 100644 --- a/test/req_athena/query_test.exs +++ b/test/req_athena/query_test.exs @@ -39,8 +39,16 @@ defmodule ReqAthena.QueryTest do query = %Query{query: "SELECT name, id FROM users"} query = Query.with_unload(query, to: "s3://my-bucket/my-dir") + assert query.unload[:to] == "s3://my-bucket/my-dir" + # Defaults + assert query.unload[:format] == "PARQUET" + assert is_nil(query.unload[:compression]) + assert is_nil(query.unload[:compression_level]) + assert is_nil(query.unload[:field_delimiter]) + assert is_nil(query.unload[:partitioned_by]) + assert Query.to_query_string(query) == - "UNLOAD (SELECT name, id FROM users)\nTO 's3://my-bucket/my-dir'\nWITH (compression = 'SNAPPY', format = 'PARQUET')" + "UNLOAD (SELECT name, id FROM users)\nTO 's3://my-bucket/my-dir'\nWITH (format = 'PARQUET')" end test "unload attributes and a prepare statement does use unload command" do @@ -53,7 +61,7 @@ defmodule ReqAthena.QueryTest do query = Query.with_unload(query, to: "s3://my-bucket/my-dir") assert Query.to_query_string(query) == - "PREPARE test_statement FROM UNLOAD (SELECT name, id FROM users WHERE id > ?)\nTO 's3://my-bucket/my-dir'\nWITH (compression = 'SNAPPY', format = 'PARQUET')" + "PREPARE test_statement FROM UNLOAD (SELECT name, id FROM users WHERE id > ?)\nTO 's3://my-bucket/my-dir'\nWITH (format = 'PARQUET')" end test "unload attributes and an execute command does not use the unload command" do diff --git a/test/req_athena_test.exs 
b/test/req_athena_test.exs index 95b0a53..27596c6 100644 --- a/test/req_athena_test.exs +++ b/test/req_athena_test.exs @@ -2,7 +2,7 @@ defmodule ReqAthenaTest do use ExUnit.Case, async: true @moduletag :capture_log - test "executes a query string returning a data frame" do + test "executes a query string returning the API result as it is" do opts = [ access_key_id: "some key", secret_access_key: "dummy", @@ -20,8 +20,8 @@ defmodule ReqAthenaTest do "QueryExecutionContext" => %{ "Database" => "my_awesome_database" }, - "QueryString" => - "UNLOAD (select * from iris)\nTO 's3://foo/results'\nWITH (compression = 'SNAPPY', format = 'PARQUET')", + # "UNLOAD (select * from iris)\nTO 's3://foo/results'\nWITH (compression = 'SNAPPY', format = 'PARQUET')", + "QueryString" => "select * from iris", "ResultConfiguration" => %{"OutputLocation" => "s3://foo"} } = decoded @@ -29,164 +29,106 @@ defmodule ReqAthenaTest do end } - me = self() + # me = self() assert response = Req.new(adapter: fake_athena(request_validations)) |> Req.Request.put_header("x-auth", "my awesome auth header") - |> Req.Request.put_private(:athena_dataframe_builder, fn manifest_location, - credentials -> - assert manifest_location == "s3://foo-manifest.csv" + # |> Req.Request.put_private(:athena_dataframe_builder, fn manifest_location, + # credentials -> + # assert manifest_location == "s3://foo-manifest.csv" - assert Enum.sort(Keyword.take(opts, [:access_key_id, :secret_access_key, :region])) == - Enum.sort(credentials) + # assert Enum.sort(Keyword.take(opts, [:access_key_id, :secret_access_key, :region])) == + # Enum.sort(credentials) - send(me, {:explorer_built, manifest_location}) + # send(me, {:explorer_built, manifest_location}) - Explorer.DataFrame.new(id: [1, 2], name: ["Ale", "Wojtek"]) - end) + # Explorer.DataFrame.new(id: [1, 2], name: ["Ale", "Wojtek"]) + # end) |> ReqAthena.attach(opts) |> Req.post!(athena: "select * from iris") assert response.status == 200 - assert df = %Explorer.DataFrame{} = 
response.body - - assert Explorer.DataFrame.to_columns(df, atom_keys: true) == %{ - id: [1, 2], - name: ["Ale", "Wojtek"] - } - - assert_received {:explorer_built, _output_location} - end - - test "parses a response with a datum object missing" do - opts = [ - access_key_id: "some key", - secret_access_key: "dummy", - region: "us-east-1", - database: "my_awesome_database", - no_explorer: true, - output_location: "s3://foo" - ] - - results = %{ - "GetQueryResults" => fn request -> - data = %{ - "ResultSet" => %{ - "ColumnInfos" => [ - %{ - "CaseSensitive" => false, - "CatalogName" => "hive", - "Label" => "id", - "Name" => "id", - "Nullable" => "UNKNOWN", - "Precision" => 10, - "Scale" => 0, - "SchemaName" => "", - "TableName" => "", - "Type" => "integer" - }, - %{ - "CaseSensitive" => true, - "CatalogName" => "hive", - "Label" => "name", - "Name" => "name", - "Nullable" => "UNKNOWN", - "Precision" => 2_147_483_647, - "Scale" => 0, - "SchemaName" => "", - "TableName" => "", - "Type" => "varchar" - } - ], - "ResultRows" => [ - %{"Data" => ["id", "name"]}, - %{"Data" => ["1", "Ale"]}, - %{"Data" => ["2", "Wojtek"]} - ], - "ResultSetMetadata" => %{ - "ColumnInfo" => [ - %{ - "CaseSensitive" => false, - "CatalogName" => "hive", - "Label" => "id", - "Name" => "id", - "Nullable" => "UNKNOWN", - "Precision" => 10, - "Scale" => 0, - "SchemaName" => "", - "TableName" => "", - "Type" => "integer" - }, - %{ - "CaseSensitive" => true, - "CatalogName" => "hive", - "Label" => "name", - "Name" => "name", - "Nullable" => "UNKNOWN", - "Precision" => 2_147_483_647, - "Scale" => 0, - "SchemaName" => "", - "TableName" => "", - "Type" => "varchar" - } - ] - }, - "Rows" => [ - %{"Data" => [%{"VarCharValue" => "id"}, %{"VarCharValue" => "name"}]}, - %{"Data" => [%{"VarCharValue" => "1"}, %{"VarCharValue" => "Ale"}]}, - %{"Data" => [%{"VarCharValue" => "2"}, %{}]} - ] - }, - "UpdateCount" => 0 - } - - {request, %Req.Response{status: 200, body: Jason.encode!(data)}} - end - } + assert 
response.body == + %{ + "ResultSet" => %{ + "ColumnInfos" => [ + %{ + "CaseSensitive" => false, + "CatalogName" => "hive", + "Label" => "id", + "Name" => "id", + "Nullable" => "UNKNOWN", + "Precision" => 10, + "Scale" => 0, + "SchemaName" => "", + "TableName" => "", + "Type" => "integer" + }, + %{ + "CaseSensitive" => true, + "CatalogName" => "hive", + "Label" => "name", + "Name" => "name", + "Nullable" => "UNKNOWN", + "Precision" => 2_147_483_647, + "Scale" => 0, + "SchemaName" => "", + "TableName" => "", + "Type" => "varchar" + } + ], + "ResultRows" => [ + %{"Data" => ["id", "name"]}, + %{"Data" => ["1", "Ale"]}, + %{"Data" => ["2", "Wojtek"]} + ], + "ResultSetMetadata" => %{ + "ColumnInfo" => [ + %{ + "CaseSensitive" => false, + "CatalogName" => "hive", + "Label" => "id", + "Name" => "id", + "Nullable" => "UNKNOWN", + "Precision" => 10, + "Scale" => 0, + "SchemaName" => "", + "TableName" => "", + "Type" => "integer" + }, + %{ + "CaseSensitive" => true, + "CatalogName" => "hive", + "Label" => "name", + "Name" => "name", + "Nullable" => "UNKNOWN", + "Precision" => 2_147_483_647, + "Scale" => 0, + "SchemaName" => "", + "TableName" => "", + "Type" => "varchar" + } + ] + }, + "Rows" => [ + %{"Data" => [%{"VarCharValue" => "id"}, %{"VarCharValue" => "name"}]}, + %{"Data" => [%{"VarCharValue" => "1"}, %{"VarCharValue" => "Ale"}]}, + %{"Data" => [%{"VarCharValue" => "2"}, %{"VarCharValue" => "Wojtek"}]} + ] + }, + "UpdateCount" => 0 + } - response = - Req.new(adapter: fake_athena(%{}, results)) - |> Req.Request.put_header("x-auth", "my awesome auth header") - |> ReqAthena.attach(opts) - |> Req.post!(athena: "select * from iris") + # assert df = %Explorer.DataFrame{} = response.body - assert response.status == 200 + # assert Explorer.DataFrame.to_columns(df, atom_keys: true) == %{ + # id: [1, 2], + # name: ["Ale", "Wojtek"] + # } - assert response.body == %ReqAthena.Result{ - columns: ["id", "name"], - output_location: "s3://foo", - query_execution_id: "an uuid", - rows: 
[[1, "Ale"], [2, ""]], - statement_name: nil, - metadata: [ - %{ - "CaseSensitive" => false, - "CatalogName" => "hive", - "Label" => "id", - "Name" => "id", - "Nullable" => "UNKNOWN", - "Precision" => 10, - "Scale" => 0, - "SchemaName" => "", - "TableName" => "", - "Type" => "integer" - }, - %{ - "CaseSensitive" => true, - "CatalogName" => "hive", - "Label" => "name", - "Name" => "name", - "Nullable" => "UNKNOWN", - "Precision" => 2_147_483_647, - "Scale" => 0, - "SchemaName" => "", - "TableName" => "", - "Type" => "varchar" - } - ] - } + # assert_received {:explorer_built, _output_location} end test "executes a parameterized query" do @@ -231,6 +173,72 @@ defmodule ReqAthenaTest do end } + prepared_result = + %{ + "ResultSet" => %{ + "ColumnInfos" => [ + %{ + "CaseSensitive" => false, + "CatalogName" => "hive", + "Label" => "id", + "Name" => "id", + "Nullable" => "UNKNOWN", + "Precision" => 10, + "Scale" => 0, + "SchemaName" => "", + "TableName" => "", + "Type" => "integer" + }, + %{ + "CaseSensitive" => true, + "CatalogName" => "hive", + "Label" => "name", + "Name" => "name", + "Nullable" => "UNKNOWN", + "Precision" => 2_147_483_647, + "Scale" => 0, + "SchemaName" => "", + "TableName" => "", + "Type" => "varchar" + } + ], + "ResultRows" => [%{"Data" => ["id", "name"]}, %{"Data" => ["1", "Ale"]}], + "ResultSetMetadata" => %{ + "ColumnInfo" => [ + %{ + "CaseSensitive" => false, + "CatalogName" => "hive", + "Label" => "id", + "Name" => "id", + "Nullable" => "UNKNOWN", + "Precision" => 10, + "Scale" => 0, + "SchemaName" => "", + "TableName" => "", + "Type" => "integer" + }, + %{ + "CaseSensitive" => true, + "CatalogName" => "hive", + "Label" => "name", + "Name" => "name", + "Nullable" => "UNKNOWN", + "Precision" => 2_147_483_647, + "Scale" => 0, + "SchemaName" => "", + "TableName" => "", + "Type" => "varchar" + } + ] + }, + "Rows" => [ + %{"Data" => [%{"VarCharValue" => "id"}, %{"VarCharValue" => "name"}]}, + %{"Data" => [%{"VarCharValue" => "1"}, %{"VarCharValue" => 
"Ale"}]} + ] + }, + "UpdateCount" => 0 + } + results = %{ "GetQueryResults" => fn request -> query = Req.Request.get_private(request, :athena_query) @@ -240,70 +248,7 @@ defmodule ReqAthenaTest do if to_prepare? do %{"ResultSet" => %{"Output" => ""}} else - %{ - "ResultSet" => %{ - "ColumnInfos" => [ - %{ - "CaseSensitive" => false, - "CatalogName" => "hive", - "Label" => "id", - "Name" => "id", - "Nullable" => "UNKNOWN", - "Precision" => 10, - "Scale" => 0, - "SchemaName" => "", - "TableName" => "", - "Type" => "integer" - }, - %{ - "CaseSensitive" => true, - "CatalogName" => "hive", - "Label" => "name", - "Name" => "name", - "Nullable" => "UNKNOWN", - "Precision" => 2_147_483_647, - "Scale" => 0, - "SchemaName" => "", - "TableName" => "", - "Type" => "varchar" - } - ], - "ResultRows" => [%{"Data" => ["id", "name"]}, %{"Data" => ["1", "Ale"]}], - "ResultSetMetadata" => %{ - "ColumnInfo" => [ - %{ - "CaseSensitive" => false, - "CatalogName" => "hive", - "Label" => "id", - "Name" => "id", - "Nullable" => "UNKNOWN", - "Precision" => 10, - "Scale" => 0, - "SchemaName" => "", - "TableName" => "", - "Type" => "integer" - }, - %{ - "CaseSensitive" => true, - "CatalogName" => "hive", - "Label" => "name", - "Name" => "name", - "Nullable" => "UNKNOWN", - "Precision" => 2_147_483_647, - "Scale" => 0, - "SchemaName" => "", - "TableName" => "", - "Type" => "varchar" - } - ] - }, - "Rows" => [ - %{"Data" => [%{"VarCharValue" => "id"}, %{"VarCharValue" => "name"}]}, - %{"Data" => [%{"VarCharValue" => "1"}, %{"VarCharValue" => "Ale"}]} - ] - }, - "UpdateCount" => 0 - } + prepared_result end {request, %Req.Response{status: 200, body: Jason.encode!(data)}} @@ -315,7 +260,6 @@ defmodule ReqAthenaTest do secret_access_key: "dummy", region: "us-east-1", database: "my_awesome_database", - no_explorer: true, output_location: "s3://foo" ] @@ -326,45 +270,18 @@ defmodule ReqAthenaTest do assert response.status == 200 - assert response.body == %ReqAthena.Result{ - columns: ["id", "name"], 
- output_location: "s3://foo", - query_execution_id: "an uuid", - rows: [[1, "Ale"]], - statement_name: "query_8CD6B60FAFA18EBFA8719A6EAC192624", - metadata: [ - %{ - "CaseSensitive" => false, - "CatalogName" => "hive", - "Label" => "id", - "Name" => "id", - "Nullable" => "UNKNOWN", - "Precision" => 10, - "Scale" => 0, - "SchemaName" => "", - "TableName" => "", - "Type" => "integer" - }, - %{ - "CaseSensitive" => true, - "CatalogName" => "hive", - "Label" => "name", - "Name" => "name", - "Nullable" => "UNKNOWN", - "Precision" => 2_147_483_647, - "Scale" => 0, - "SchemaName" => "", - "TableName" => "", - "Type" => "varchar" - } - ] - } + assert response.body == prepared_result end test "executes a query with session token" do + me = self() + session_token = "giant dummy session token" + token_validation = fn request -> + send(me, :token_validation) + assert Req.Request.get_header(request, "x-amz-security-token") == [ - "giant dummy session token" + session_token ] end @@ -377,42 +294,48 @@ defmodule ReqAthenaTest do opts = [ access_key_id: "some key", secret_access_key: "dummy", - token: "giant dummy session token", + token: session_token, region: "us-east-1", database: "my_awesome_database", output_location: "s3://foo" ] - me = self() + # me = self() response = Req.new(adapter: fake_athena(validations)) |> ReqAthena.attach(opts) - |> Req.Request.put_private(:athena_dataframe_builder, fn manifest_location, credentials -> - assert manifest_location == "s3://foo-manifest.csv" + # |> Req.Request.put_private(:athena_dataframe_builder, fn manifest_location, credentials -> + # assert manifest_location == "s3://foo-manifest.csv" - assert Enum.sort( - Keyword.take(opts, [:access_key_id, :secret_access_key, :region, :token]) - ) == - Enum.sort(credentials) + # assert Enum.sort( + # Keyword.take(opts, [:access_key_id, :secret_access_key, :region, :token]) + # ) == + # Enum.sort(credentials) - send(me, :explorer_built) + # send(me, :explorer_built) - Explorer.DataFrame.new(id: 
[1, 2], name: ["Ale", "Wojtek"]) - end) + # Explorer.DataFrame.new(id: [1, 2], name: ["Ale", "Wojtek"]) + # end) |> Req.post!(athena: "select * from iris") assert response.status == 200 + assert is_map(response.body) - assert Explorer.DataFrame.to_columns(response.body, atom_keys: true) == %{ - id: [1, 2], - name: ["Ale", "Wojtek"] - } + # assert Explorer.DataFrame.to_columns(response.body, atom_keys: true) == %{ + # id: [1, 2], + # name: ["Ale", "Wojtek"] + # } - assert_received :explorer_built + # assert_received :explorer_built + assert_received :token_validation + assert_received :token_validation + assert_received :token_validation end test "executes a query with workgroup" do + me = self() + validations = %{ "StartQueryExecution" => fn request -> client_req_token = @@ -429,6 +352,8 @@ defmodule ReqAthenaTest do "QueryString" => "select * from iris", "WorkGroup" => "default" } = decoded + + send(me, :start_query_validation) end } @@ -453,6 +378,7 @@ defmodule ReqAthenaTest do } } + send(me, :get_query_execution) {request, %Req.Response{status: 200, body: Jason.encode!(data)}} end } @@ -471,40 +397,10 @@ defmodule ReqAthenaTest do |> Req.post!(athena: "select * from iris") assert response.status == 200 + assert %{"ResultSet" => _} = response.body - assert response.body == %ReqAthena.Result{ - columns: ["id", "name"], - output_location: "s3://foo", - query_execution_id: "an uuid", - rows: [[1, "Ale"], [2, "Wojtek"]], - statement_name: nil, - metadata: [ - %{ - "CaseSensitive" => false, - "CatalogName" => "hive", - "Label" => "id", - "Name" => "id", - "Nullable" => "UNKNOWN", - "Precision" => 10, - "Scale" => 0, - "SchemaName" => "", - "TableName" => "", - "Type" => "integer" - }, - %{ - "CaseSensitive" => true, - "CatalogName" => "hive", - "Label" => "name", - "Name" => "name", - "Nullable" => "UNKNOWN", - "Precision" => 2_147_483_647, - "Scale" => 0, - "SchemaName" => "", - "TableName" => "", - "Type" => "varchar" - } - ] - } + assert_received 
:start_query_validation + assert_received :get_query_execution end test "raises the request when neither workgroup and output location are defined" do From dde6c2b63c7eb0ba6aeda92f40218872e2ff591a Mon Sep 17 00:00:00 2001 From: Philip Sampaio Date: Mon, 2 Sep 2024 19:00:36 -0300 Subject: [PATCH 05/20] Add tests for both cases that work --- test/req_athena_test.exs | 113 ++++++++++++++++++++++++++++++++------- 1 file changed, 93 insertions(+), 20 deletions(-) diff --git a/test/req_athena_test.exs b/test/req_athena_test.exs index 27596c6..86f594d 100644 --- a/test/req_athena_test.exs +++ b/test/req_athena_test.exs @@ -20,7 +20,6 @@ defmodule ReqAthenaTest do "QueryExecutionContext" => %{ "Database" => "my_awesome_database" }, - # "UNLOAD (select * from iris)\nTO 's3://foo/results'\nWITH (compression = 'SNAPPY', format = 'PARQUET')", "QueryString" => "select * from iris", "ResultConfiguration" => %{"OutputLocation" => "s3://foo"} } = decoded @@ -29,22 +28,9 @@ defmodule ReqAthenaTest do end } - # me = self() - assert response = Req.new(adapter: fake_athena(request_validations)) |> Req.Request.put_header("x-auth", "my awesome auth header") - # |> Req.Request.put_private(:athena_dataframe_builder, fn manifest_location, - # credentials -> - # assert manifest_location == "s3://foo-manifest.csv" - - # assert Enum.sort(Keyword.take(opts, [:access_key_id, :secret_access_key, :region])) == - # Enum.sort(credentials) - - # send(me, {:explorer_built, manifest_location}) - - # Explorer.DataFrame.new(id: [1, 2], name: ["Ale", "Wojtek"]) - # end) |> ReqAthena.attach(opts) |> Req.post!(athena: "select * from iris") @@ -120,15 +106,44 @@ defmodule ReqAthenaTest do }, "UpdateCount" => 0 } + end + + test "executes a query string returning the API result without decoding" do + opts = [ + access_key_id: "some key", + secret_access_key: "dummy", + region: "us-east-1", + database: "my_awesome_database", + output_location: "s3://foo" + ] - # assert df = %Explorer.DataFrame{} = 
response.body + request_validations = %{ + "StartQueryExecution" => fn request -> + decoded = Jason.decode!(request.body) - # assert Explorer.DataFrame.to_columns(df, atom_keys: true) == %{ - # id: [1, 2], - # name: ["Ale", "Wojtek"] - # } + assert %{ + "ClientRequestToken" => client_req_token, + "QueryExecutionContext" => %{ + "Database" => "my_awesome_database" + }, + "QueryString" => "select * from iris", + "ResultConfiguration" => %{"OutputLocation" => "s3://foo"} + } = decoded + + assert is_binary(client_req_token) + end + } + + assert response = + Req.new(adapter: fake_athena(request_validations)) + |> Req.Request.put_header("x-auth", "my awesome auth header") + |> ReqAthena.attach(opts) + |> Req.post!(athena: "select * from iris", decode_body: false) - # assert_received {:explorer_built, _output_location} + assert response.status == 200 + + assert response.body == + ~s|{"ResultSet":{"ColumnInfos":[{"CaseSensitive":false,"CatalogName":"hive","Label":"id","Name":"id","Nullable":"UNKNOWN","Precision":10,"Scale":0,"SchemaName":"","TableName":"","Type":"integer"},{"CaseSensitive":true,"CatalogName":"hive","Label":"name","Name":"name","Nullable":"UNKNOWN","Precision":2147483647,"Scale":0,"SchemaName":"","TableName":"","Type":"varchar"}],"ResultRows":[{"Data":["id","name"]},{"Data":["1","Ale"]},{"Data":["2","Wojtek"]}],"ResultSetMetadata":{"ColumnInfo":[{"CaseSensitive":false,"CatalogName":"hive","Label":"id","Name":"id","Nullable":"UNKNOWN","Precision":10,"Scale":0,"SchemaName":"","TableName":"","Type":"integer"},{"CaseSensitive":true,"CatalogName":"hive","Label":"name","Name":"name","Nullable":"UNKNOWN","Precision":2147483647,"Scale":0,"SchemaName":"","TableName":"","Type":"varchar"}]},"Rows":[{"Data":[{"VarCharValue":"id"},{"VarCharValue":"name"}]},{"Data":[{"VarCharValue":"1"},{"VarCharValue":"Ale"}]},{"Data":[{"VarCharValue":"2"},{"VarCharValue":"Wojtek"}]}]},"UpdateCount":0}| end test "executes a parameterized query" do @@ -403,6 +418,64 @@ defmodule 
ReqAthenaTest do assert_received :get_query_execution end + test "executes a query string with :explorer format" do + opts = [ + access_key_id: "some key", + secret_access_key: "dummy", + region: "us-east-1", + database: "my_awesome_database", + output_location: "s3://foo" + ] + + request_validations = %{ + "StartQueryExecution" => fn request -> + decoded = Jason.decode!(request.body) + + assert %{ + "ClientRequestToken" => client_req_token, + "QueryExecutionContext" => %{ + "Database" => "my_awesome_database" + }, + "QueryString" => + "UNLOAD (select * from iris)\nTO 's3://foo/results'\nWITH (format = 'PARQUET')", + "ResultConfiguration" => %{"OutputLocation" => "s3://foo"} + } = decoded + + assert is_binary(client_req_token) + end + } + + me = self() + + assert response = + Req.new(adapter: fake_athena(request_validations)) + |> Req.Request.put_header("x-auth", "my awesome auth header") + |> Req.Request.put_private(:athena_dataframe_builder, fn manifest_location, + credentials -> + assert manifest_location == "s3://foo-manifest.csv" + + assert Enum.sort(Keyword.take(opts, [:access_key_id, :secret_access_key, :region])) == + Enum.sort(credentials) + + send(me, {:explorer_built, manifest_location}) + + Explorer.DataFrame.new(id: [1, 2], name: ["Ale", "Wojtek"]) + end) + |> ReqAthena.attach(opts) + |> Req.post!(athena: "select * from iris", format: :explorer) + + assert response.status == 200 + + assert df = %Explorer.DataFrame{} = response.body + + assert Explorer.DataFrame.to_columns(df, atom_keys: true) == %{ + id: [1, 2], + name: ["Ale", "Wojtek"] + } + + assert_received {:explorer_built, _output_location} + end + test "raises the request when neither workgroup and output location are defined" do opts = [ access_key_id: "some key", From 3f68631e9ad32f8ea97c7cc271042b7b1437bb22 Mon Sep 17 00:00:00 2001 From: Philip Sampaio Date: Tue, 3 Sep 2024 18:10:21 -0300 Subject: [PATCH 06/20] Add "format: :explorer" decoding body to Explorer DFs --- lib/req_athena.ex | 119 
++++++++------------------------------- mix.exs | 3 +- mix.lock | 9 +-- test/req_athena_test.exs | 65 ++++++++++++++------- 4 files changed, 76 insertions(+), 120 deletions(-) diff --git a/lib/req_athena.ex b/lib/req_athena.ex index 4fb3463..5becd83 100644 --- a/lib/req_athena.ex +++ b/lib/req_athena.ex @@ -271,7 +271,7 @@ defmodule ReqAthena do get_json_result(request, response) :explorer -> - get_explorer_lazy_frame(request, response) + get_explorer_result(request, response) other -> raise ArgumentError, @@ -290,7 +290,7 @@ defmodule ReqAthena do Request.halt(request, response) end - defp get_explorer_lazy_frame(request, response) do + defp get_explorer_result(request, response) do body = Jason.decode!(response.body) result = @@ -306,9 +306,11 @@ defmodule ReqAthena do # This private field is only meant to be used in tests. fetcher_and_builder = - Request.get_private(request, :athena_dataframe_builder, &fetch_and_build_dataframe/2) + Request.get_private(request, :athena_dataframe_builder, &fetch_and_build_dataframe/3) - fetcher_and_builder.(manifest_csv_location, aws_credentials) + decode_body = Req.Request.get_option(request, :decode_body, true) + + fetcher_and_builder.(manifest_csv_location, aws_credentials, decode_body) else body end @@ -317,22 +319,32 @@ defmodule ReqAthena do end @doc false - def fetch_and_build_dataframe(manifest_csv_location, aws_credentials) do + def fetch_and_build_dataframe(manifest_csv_location, aws_credentials, true = _decode_body) do # TODO: Should we handle errors here? 
- manifest_df = - Explorer.DataFrame.from_csv!(manifest_csv_location, - header: false, - config: aws_credentials - ) - - manifest_df[0] - |> Explorer.Series.to_list() + manifest_csv_location + |> get_from_s3(aws_credentials) + |> String.trim() + |> String.split("\n") |> Enum.map(fn parquet_location -> Explorer.DataFrame.from_parquet!(parquet_location, lazy: true, config: aws_credentials) end) |> Explorer.DataFrame.concat_rows() end + def fetch_and_build_dataframe(manifest_csv_location, aws_credentials, false = _decode_body) do + manifest_csv_location + |> get_from_s3(aws_credentials) + |> String.trim() + |> String.split("\n") + end + + defp get_from_s3(location, aws_credentials) do + req = Req.new() |> ReqS3.attach(aws_sigv4: aws_credentials) + + response = Req.get!(req, url: location) + response.body + end + defp get_query_state(request, response) do response = %{request | body: response.body} @@ -411,58 +423,6 @@ defmodule ReqAthena do Request.halt(request, Req.post!(request, athena: prepared_query)) end - # defp decode_result(request, response) do - # body = Jason.decode!(response.body) - # query = Request.get_private(request, :athena_query) - # query_execution_id = Request.get_private(request, :athena_query_execution_id) - # output_location = Request.get_private(request, :athena_output_location) - - # result = - # case body do - # %{ - # "ResultSet" => %{ - # "ResultSetMetadata" => %{"ColumnInfo" => columns_info}, - # "Rows" => [%{"Data" => column_labels} | rows] - # } - # } -> - # %ReqAthena.Result{ - # query_execution_id: query_execution_id, - # output_location: output_location, - # statement_name: query.statement_name, - # rows: decode_rows(rows, columns_info), - # columns: decode_column_labels(column_labels), - # metadata: columns_info - # } - - # %{"ResultSet" => _} -> - # %ReqAthena.Result{ - # query_execution_id: query_execution_id, - # output_location: output_location, - # statement_name: query.statement_name - # } - - # body -> - # body - # end - - # 
Request.halt(request, %{response | body: result}) - # end - - # defp decode_column_labels(column_labels) do - # Enum.map(column_labels, &Map.fetch!(&1, "VarCharValue")) - # end - - # defp decode_rows(rows, columns_info) do - # column_types = Enum.map(columns_info, &Map.take(&1, ["Type"])) - - # Enum.map(rows, fn %{"Data" => datums} -> - # Enum.zip_with([datums, column_types], fn [datum, column_type] -> - # value = datum["VarCharValue"] || "" - # decode_value(value, column_type) - # end) - # end) - # end - # TODO: Add step `put_aws_sigv4` to Req # See: https://github.com/wojtekmach/req/issues/62 defp sign_request(request, action) when is_binary(action) do @@ -533,35 +493,6 @@ defmodule ReqAthena do defp now, do: NaiveDateTime.utc_now() |> NaiveDateTime.to_erl() - # defp decode_value(nil, _), do: nil - - # @integer_types ~w(bigint smallint integer) - - # defp decode_value(value, %{"Type" => type}) when type in @integer_types, - # do: String.to_integer(value) - - # @float_types ~w(double float decimal) - - # defp decode_value(value, %{"Type" => type}) when type in @float_types, - # do: String.to_float(value) - - # defp decode_value("true", %{"Type" => "boolean"}), do: true - # defp decode_value("false", %{"Type" => "boolean"}), do: false - # defp decode_value(value, %{"Type" => "date"}), do: Date.from_iso8601!(value) - - # defp decode_value(value, %{"Type" => "timestamp"}), do: NaiveDateTime.from_iso8601!(value) - - # defp decode_value(value, %{"Type" => "timestamp with time zone"}) do - # [d, t, tz] = String.split(value, " ", trim: true) - # date = Date.from_iso8601!(d) - # time = Time.from_iso8601!(t) - - # DateTime.new!(date, time, tz) - # |> DateTime.truncate(:millisecond) - # end - - # defp decode_value(value, _), do: value - # TODO: Use Req.Request.get_option/3 when Req 0.4.0 is out. 
defp get_option(request, key, default) when is_atom(key) do Map.get(request.options, key, default) diff --git a/mix.exs b/mix.exs index 4861a0e..b45c275 100644 --- a/mix.exs +++ b/mix.exs @@ -43,7 +43,8 @@ defmodule ReqAthena.MixProject do [ {:req, "~> 0.5.0"}, {:aws_signature, "~> 0.3.0"}, - {:explorer, "~> 0.9.0"}, + {:req_s3, "~> 0.2"}, + {:explorer, "~> 0.9"}, {:aws_credentials, "~> 0.2", optional: true}, {:table, "~> 0.1.1", optional: true}, {:tzdata, "~> 1.1.1", only: :test}, diff --git a/mix.lock b/mix.lock index 6887fcc..65725c0 100644 --- a/mix.lock +++ b/mix.lock @@ -1,12 +1,12 @@ %{ - "aws_credentials": {:hex, :aws_credentials, "0.3.1", "0d4de58e0548ec6cfc663bef4fb0d59e611c182c4e6ee3dda14f35d0ac74fcfd", [:rebar3], [{:eini, "~> 2.2.4", [hex: :eini_beam, repo: "hexpm", optional: false]}, {:iso8601, "~> 1.3.4", [hex: :iso8601, repo: "hexpm", optional: false]}, {:jsx, "~> 3.1.0", [hex: :jsx, repo: "hexpm", optional: false]}], "hexpm", "f212bc8f2f24e73b8faa08cc37f01fb55c7ae36c9d3dc3175ec6248d4c89e8c7"}, + "aws_credentials": {:hex, :aws_credentials, "0.3.2", "ba2ccee4ec6dcb5426cf71830b7afd73795b1f19655f401d4401015b468fec6f", [:rebar3], [{:eini, "~> 2.2.4", [hex: :eini_beam, repo: "hexpm", optional: false]}, {:iso8601, "~> 1.3.4", [hex: :iso8601, repo: "hexpm", optional: false]}, {:jsx, "~> 3.1.0", [hex: :jsx, repo: "hexpm", optional: false]}], "hexpm", "2e748626a935a7a544647fb79d7054f38db8bf378978542c962ccbeab387387b"}, "aws_signature": {:hex, :aws_signature, "0.3.2", "adf33bc4af00b2089b7708bf20e3246f09c639a905a619b3689f0a0a22c3ef8f", [:rebar3], [], "hexpm", "b0daf61feb4250a8ab0adea60db3e336af732ff71dd3fb22e45ae3dcbd071e44"}, "castore": {:hex, :castore, "1.0.8", "dedcf20ea746694647f883590b82d9e96014057aff1d44d03ec90f36a5c0dc6e", [:mix], [], "hexpm", "0b2b66d2ee742cb1d9cb8c8be3b43c3a70ee8651f37b75a8b982e036752983f1"}, "certifi": {:hex, :certifi, "2.9.0", "6f2a475689dd47f19fb74334859d460a2dc4e3252a3324bd2111b8f0429e7e21", [:rebar3], [], "hexpm", 
"266da46bdb06d6c6d35fde799bcb28d36d985d424ad7c08b5bb48f5b5cdd4641"}, "earmark_parser": {:hex, :earmark_parser, "1.4.41", "ab34711c9dc6212dda44fcd20ecb87ac3f3fce6f0ca2f28d4a00e4154f8cd599", [:mix], [], "hexpm", "a81a04c7e34b6617c2792e291b5a2e57ab316365c2644ddc553bb9ed863ebefa"}, "eini": {:hex, :eini_beam, "2.2.4", "02143b1dce4dda4243248e7d9b3d8274b8d9f5a666445e3d868e2cce79e4ff22", [:rebar3], [], "hexpm", "12de479d144b19e09bb92ba202a7ea716739929afdf9dff01ad802e2b1508471"}, "ex_doc": {:hex, :ex_doc, "0.34.2", "13eedf3844ccdce25cfd837b99bea9ad92c4e511233199440488d217c92571e8", [:mix], [{:earmark_parser, "~> 1.4.39", [hex: :earmark_parser, repo: "hexpm", optional: false]}, {:makeup_c, ">= 0.1.0", [hex: :makeup_c, repo: "hexpm", optional: true]}, {:makeup_elixir, "~> 0.14 or ~> 1.0", [hex: :makeup_elixir, repo: "hexpm", optional: false]}, {:makeup_erlang, "~> 0.1 or ~> 1.0", [hex: :makeup_erlang, repo: "hexpm", optional: false]}, {:makeup_html, ">= 0.1.0", [hex: :makeup_html, repo: "hexpm", optional: true]}], "hexpm", "5ce5f16b41208a50106afed3de6a2ed34f4acfd65715b82a0b84b49d995f95c1"}, - "explorer": {:hex, :explorer, "0.9.1", "9c6f175dfd2fa2f432d5fe9a86b81875438a9a1110af5b952c284842bee434e4", [:mix], [{:adbc, "~> 0.1", [hex: :adbc, repo: "hexpm", optional: true]}, {:aws_signature, "~> 0.3", [hex: :aws_signature, repo: "hexpm", optional: false]}, {:castore, "~> 1.0", [hex: :castore, repo: "hexpm", optional: true]}, {:flame, "~> 0.3", [hex: :flame, repo: "hexpm", optional: true]}, {:fss, "~> 0.1", [hex: :fss, repo: "hexpm", optional: false]}, {:nx, "~> 0.4", [hex: :nx, repo: "hexpm", optional: true]}, {:rustler, "~> 0.34.0", [hex: :rustler, repo: "hexpm", optional: true]}, {:rustler_precompiled, "~> 0.7", [hex: :rustler_precompiled, repo: "hexpm", optional: false]}, {:table, "~> 0.1.2", [hex: :table, repo: "hexpm", optional: false]}, {:table_rex, "~> 3.1.1 or ~> 4.0.0", [hex: :table_rex, repo: "hexpm", optional: false]}], "hexpm", 
"d88ec0e78f904c5eaf0b37c4a0ce4632de133515f3740a29fbddd2c0d0a78e77"}, + "explorer": {:hex, :explorer, "0.9.2", "a9598eeff8d36d88f643d14818bea1869ca70c4def61bfba22f040ee315b84b6", [:mix], [{:adbc, "~> 0.1", [hex: :adbc, repo: "hexpm", optional: true]}, {:aws_signature, "~> 0.3", [hex: :aws_signature, repo: "hexpm", optional: false]}, {:castore, "~> 1.0", [hex: :castore, repo: "hexpm", optional: true]}, {:flame, "~> 0.3", [hex: :flame, repo: "hexpm", optional: true]}, {:fss, "~> 0.1", [hex: :fss, repo: "hexpm", optional: false]}, {:nx, "~> 0.4", [hex: :nx, repo: "hexpm", optional: true]}, {:rustler, "~> 0.34.0", [hex: :rustler, repo: "hexpm", optional: true]}, {:rustler_precompiled, "~> 0.7", [hex: :rustler_precompiled, repo: "hexpm", optional: false]}, {:table, "~> 0.1.2", [hex: :table, repo: "hexpm", optional: false]}, {:table_rex, "~> 3.1.1 or ~> 4.0.0", [hex: :table_rex, repo: "hexpm", optional: false]}], "hexpm", "63057e318d613c1819bd8bee2d8ed4f7061c3136edc6832ad18243d28e6344eb"}, "finch": {:hex, :finch, "0.18.0", "944ac7d34d0bd2ac8998f79f7a811b21d87d911e77a786bc5810adb75632ada4", [:mix], [{:castore, "~> 0.1 or ~> 1.0", [hex: :castore, repo: "hexpm", optional: false]}, {:mime, "~> 1.0 or ~> 2.0", [hex: :mime, repo: "hexpm", optional: false]}, {:mint, "~> 1.3", [hex: :mint, repo: "hexpm", optional: false]}, {:nimble_options, "~> 0.4 or ~> 1.0", [hex: :nimble_options, repo: "hexpm", optional: false]}, {:nimble_pool, "~> 0.2.6 or ~> 1.0", [hex: :nimble_pool, repo: "hexpm", optional: false]}, {:telemetry, "~> 0.4 or ~> 1.0", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "69f5045b042e531e53edc2574f15e25e735b522c37e2ddb766e15b979e03aa65"}, "fss": {:hex, :fss, "0.1.1", "9db2344dbbb5d555ce442ac7c2f82dd975b605b50d169314a20f08ed21e08642", [:mix], [], "hexpm", "78ad5955c7919c3764065b21144913df7515d52e228c09427a004afe9c1a16b0"}, "hackney": {:hex, :hackney, "1.18.1", "f48bf88f521f2a229fc7bae88cf4f85adc9cd9bcf23b5dc8eb6a1788c662c4f6", [:rebar3], [{:certifi, "~> 
2.9.0", [hex: :certifi, repo: "hexpm", optional: false]}, {:idna, "~> 6.1.0", [hex: :idna, repo: "hexpm", optional: false]}, {:metrics, "~> 1.0.0", [hex: :metrics, repo: "hexpm", optional: false]}, {:mimerl, "~> 1.1", [hex: :mimerl, repo: "hexpm", optional: false]}, {:parse_trans, "3.3.1", [hex: :parse_trans, repo: "hexpm", optional: false]}, {:ssl_verify_fun, "~> 1.1.0", [hex: :ssl_verify_fun, repo: "hexpm", optional: false]}, {:unicode_util_compat, "~> 0.7.0", [hex: :unicode_util_compat, repo: "hexpm", optional: false]}], "hexpm", "a4ecdaff44297e9b5894ae499e9a070ea1888c84afdd1fd9b7b2bc384950128e"}, @@ -27,11 +27,12 @@ "nimble_pool": {:hex, :nimble_pool, "1.1.0", "bf9c29fbdcba3564a8b800d1eeb5a3c58f36e1e11d7b7fb2e084a643f645f06b", [:mix], [], "hexpm", "af2e4e6b34197db81f7aad230c1118eac993acc0dae6bc83bac0126d4ae0813a"}, "parse_trans": {:hex, :parse_trans, "3.3.1", "16328ab840cc09919bd10dab29e431da3af9e9e7e7e6f0089dd5a2d2820011d8", [:rebar3], [], "hexpm", "07cd9577885f56362d414e8c4c4e6bdf10d43a8767abb92d24cbe8b24c54888b"}, "req": {:hex, :req, "0.5.6", "8fe1eead4a085510fe3d51ad854ca8f20a622aae46e97b302f499dfb84f726ac", [:mix], [{:brotli, "~> 0.3.1", [hex: :brotli, repo: "hexpm", optional: true]}, {:ezstd, "~> 1.0", [hex: :ezstd, repo: "hexpm", optional: true]}, {:finch, "~> 0.17", [hex: :finch, repo: "hexpm", optional: false]}, {:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: false]}, {:mime, "~> 2.0.6 or ~> 2.1", [hex: :mime, repo: "hexpm", optional: false]}, {:nimble_csv, "~> 1.0", [hex: :nimble_csv, repo: "hexpm", optional: true]}, {:plug, "~> 1.0", [hex: :plug, repo: "hexpm", optional: true]}], "hexpm", "cfaa8e720945d46654853de39d368f40362c2641c4b2153c886418914b372185"}, - "rustler_precompiled": {:hex, :rustler_precompiled, "0.7.2", "097f657e401f02e7bc1cab808cfc6abdc1f7b9dc5e5adee46bf2fd8fdcce9ecf", [:mix], [{:castore, "~> 0.1 or ~> 1.0", [hex: :castore, repo: "hexpm", optional: false]}, {:rustler, "~> 0.23", [hex: :rustler, repo: "hexpm", optional: 
true]}], "hexpm", "7663faaeadc9e93e605164dcf9e69168e35f2f8b7f2b9eb4e400d1a8e0fe2999"}, + "req_s3": {:hex, :req_s3, "0.2.3", "ede5f4c792cf39995379307733ff4593032a876f38da29d9d7ea03881b498b51", [:mix], [{:req, "~> 0.5.6", [hex: :req, repo: "hexpm", optional: false]}], "hexpm", "31b5d52490495c8aeea7e3c5cbcec82f49035e11bdaf41f0e58ab716fefe44ca"}, + "rustler_precompiled": {:hex, :rustler_precompiled, "0.8.0", "02d218b575d8175e80138557f46bee7af5598f29e9aff8935a6c369c0e6c47a5", [:mix], [{:castore, "~> 0.1 or ~> 1.0", [hex: :castore, repo: "hexpm", optional: false]}, {:rustler, "~> 0.23", [hex: :rustler, repo: "hexpm", optional: true]}], "hexpm", "00b1711d8d828200fe931e23bb0e72c2672a3a0ef76740e3c50433afda1965fb"}, "ssl_verify_fun": {:hex, :ssl_verify_fun, "1.1.7", "354c321cf377240c7b8716899e182ce4890c5938111a1296add3ec74cf1715df", [:make, :mix, :rebar3], [], "hexpm", "fe4c190e8f37401d30167c8c405eda19469f34577987c76dde613e838bbc67f8"}, "table": {:hex, :table, "0.1.2", "87ad1125f5b70c5dea0307aa633194083eb5182ec537efc94e96af08937e14a8", [:mix], [], "hexpm", "7e99bc7efef806315c7e65640724bf165c3061cdc5d854060f74468367065029"}, "table_rex": {:hex, :table_rex, "4.0.0", "3c613a68ebdc6d4d1e731bc973c233500974ec3993c99fcdabb210407b90959b", [:mix], [], "hexpm", "c35c4d5612ca49ebb0344ea10387da4d2afe278387d4019e4d8111e815df8f55"}, - "telemetry": {:hex, :telemetry, "1.2.1", "68fdfe8d8f05a8428483a97d7aab2f268aaff24b49e0f599faa091f1d4e7f61c", [:rebar3], [], "hexpm", "dad9ce9d8effc621708f99eac538ef1cbe05d6a874dd741de2e689c47feafed5"}, + "telemetry": {:hex, :telemetry, "1.3.0", "fedebbae410d715cf8e7062c96a1ef32ec22e764197f70cda73d82778d61e7a2", [:rebar3], [], "hexpm", "7015fc8919dbe63764f4b4b87a95b7c0996bd539e0d499be6ec9d7f3875b79e6"}, "tzdata": {:hex, :tzdata, "1.1.1", "20c8043476dfda8504952d00adac41c6eda23912278add38edc140ae0c5bcc46", [:mix], [{:hackney, "~> 1.17", [hex: :hackney, repo: "hexpm", optional: false]}], "hexpm", 
"a69cec8352eafcd2e198dea28a34113b60fdc6cb57eb5ad65c10292a6ba89787"}, "unicode_util_compat": {:hex, :unicode_util_compat, "0.7.0", "bc84380c9ab48177092f43ac89e4dfa2c6d62b40b8bd132b1059ecc7232f9a78", [:rebar3], [], "hexpm", "25eee6d67df61960cf6a794239566599b09e17e668d3700247bc498638152521"}, } diff --git a/test/req_athena_test.exs b/test/req_athena_test.exs index 86f594d..66f7d19 100644 --- a/test/req_athena_test.exs +++ b/test/req_athena_test.exs @@ -315,34 +315,14 @@ defmodule ReqAthenaTest do output_location: "s3://foo" ] - # me = self() - response = Req.new(adapter: fake_athena(validations)) |> ReqAthena.attach(opts) - # |> Req.Request.put_private(:athena_dataframe_builder, fn manifest_location, credentials -> - # assert manifest_location == "s3://foo-manifest.csv" - - # assert Enum.sort( - # Keyword.take(opts, [:access_key_id, :secret_access_key, :region, :token]) - # ) == - # Enum.sort(credentials) - - # send(me, :explorer_built) - - # Explorer.DataFrame.new(id: [1, 2], name: ["Ale", "Wojtek"]) - # end) |> Req.post!(athena: "select * from iris") assert response.status == 200 assert is_map(response.body) - # assert Explorer.DataFrame.to_columns(response.body, atom_keys: true) == %{ - # id: [1, 2], - # name: ["Ale", "Wojtek"] - # } - - # assert_received :explorer_built assert_received :token_validation assert_received :token_validation assert_received :token_validation @@ -451,9 +431,12 @@ defmodule ReqAthenaTest do Req.new(adapter: fake_athena(request_validations)) |> Req.Request.put_header("x-auth", "my awesome auth header") |> Req.Request.put_private(:athena_dataframe_builder, fn manifest_location, - credentials -> + credentials, + decode_body -> assert manifest_location == "s3://foo-manifest.csv" + assert decode_body + assert Enum.sort(Keyword.take(opts, [:access_key_id, :secret_access_key, :region])) == Enum.sort(credentials) @@ -476,6 +459,46 @@ defmodule ReqAthenaTest do assert_received {:explorer_built, _output_location} end + test "executes a query 
string with :explorer format and :decode_body as false" do + opts = [ + access_key_id: "some key", + secret_access_key: "dummy", + region: "us-east-1", + database: "my_awesome_database", + output_location: "s3://foo" + ] + + me = self() + + request_validations = %{ + "StartQueryExecution" => &Function.identity/1 + } + + assert response = + Req.new(adapter: fake_athena(request_validations)) + |> Req.Request.put_header("x-auth", "my awesome auth header") + |> Req.Request.put_private(:athena_dataframe_builder, fn manifest_location, + credentials, + decode_body -> + refute decode_body + + assert Enum.sort(Keyword.take(opts, [:access_key_id, :secret_access_key, :region])) == + Enum.sort(credentials) + + send(me, {:explorer_built, manifest_location}) + + ["s3://foo/results/first"] + end) + |> ReqAthena.attach(opts) + |> Req.post!(athena: "select * from iris", format: :explorer, decode_body: false) + + assert response.status == 200 + + assert response.body == ["s3://foo/results/first"] + + assert_received {:explorer_built, _output_location} + end + test "raises the request when neither workgroup and output location are defined" do opts = [ access_key_id: "some key", From 996706daa9c6b0dd91322211c35aa4fd2c3bbca5 Mon Sep 17 00:00:00 2001 From: Philip Sampaio Date: Tue, 3 Sep 2024 19:51:33 -0300 Subject: [PATCH 07/20] Add support for the ":csv" format Using decode_body: false is going to return the output location only. 
--- lib/req_athena.ex | 38 +++++++++++++++++++++++++++++--------- 1 file changed, 29 insertions(+), 9 deletions(-) diff --git a/lib/req_athena.ex b/lib/req_athena.ex index 5becd83..62e5e4f 100644 --- a/lib/req_athena.ex +++ b/lib/req_athena.ex @@ -258,11 +258,17 @@ defmodule ReqAthena do case output_format do :none -> - if Request.get_option(request, :decode_body, true) do - Request.halt(request, %{response | body: Jason.decode!(response.body)}) - else - Request.halt(request, response) - end + response = + if Request.get_option(request, :decode_body, true) do + %{response | body: Jason.decode!(response.body)} + else + response + end + + Request.halt(request, response) + + :explorer -> + get_explorer_result(request, response) :csv -> get_csv_result(request, response) @@ -270,9 +276,6 @@ defmodule ReqAthena do :json -> get_json_result(request, response) - :explorer -> - get_explorer_result(request, response) - other -> raise ArgumentError, ":format - `#{inspect(other)}` is not valid. Only :none, :csv, :json or :explorer are accepted." 
@@ -283,7 +286,24 @@ defmodule ReqAthena do defp handle_athena_result(request_response), do: request_response defp get_csv_result(request, response) do - Request.halt(request, response) + csv_location = Request.get_private(request, :athena_output_location) + + decode_body = Req.Request.get_option(request, :decode_body, true) + + result = + if decode_body do + aws_credentials = + for key <- @credential_keys, + value = request.options[key], + not is_nil(value), + do: {key, value} + + get_from_s3(csv_location, aws_credentials) + else + csv_location + end + + Request.halt(request, %{response | body: result}) end defp get_json_result(request, response) do From c66782d44f25f560496eb030e00cb6ecf37f9cbd Mon Sep 17 00:00:00 2001 From: Philip Sampaio Date: Tue, 3 Sep 2024 20:07:29 -0300 Subject: [PATCH 08/20] Fix supported formats --- lib/req_athena.ex | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/lib/req_athena.ex b/lib/req_athena.ex index 62e5e4f..80a93ed 100644 --- a/lib/req_athena.ex +++ b/lib/req_athena.ex @@ -198,14 +198,26 @@ defmodule ReqAthena do query = if output_format not in [:csv, :none] and is_binary(request.options[:output_location]) do - ReqAthena.Query.with_unload( - query, + format_str = + case output_format do + :explorer -> "PARQUET" + :json -> "JSON" + other -> raise ArgumentError, ":format - not supported #{inspect(other)}" + end + + unload_opts = [ + format: format_str, # We need to add this "subdirectory" because Athena expects the results directory # to be empty for the "UNLOAD" command. 
to: Path.join(request.options[:output_location], "results") + ] + + ReqAthena.Query.with_unload( + query, + unload_opts ) else - if output_format in [:parquet, :orc, :avro, :json, :textfile] do + if output_format in [:explorer, :json] do raise ArgumentError, ":output_location needs to be defined in order to use the #{inspect(output_format)} format" end From 22056ee2a443e78a3fd9e4c2765076ff8e3662e5 Mon Sep 17 00:00:00 2001 From: Philip Sampaio Date: Wed, 4 Sep 2024 17:00:41 -0300 Subject: [PATCH 09/20] Add JSON format support and fix Explorer result handle --- lib/req_athena.ex | 67 +++++++++++++++++++++++++++++------------------ 1 file changed, 42 insertions(+), 25 deletions(-) diff --git a/lib/req_athena.ex b/lib/req_athena.ex index 80a93ed..6632427 100644 --- a/lib/req_athena.ex +++ b/lib/req_athena.ex @@ -304,11 +304,7 @@ defmodule ReqAthena do result = if decode_body do - aws_credentials = - for key <- @credential_keys, - value = request.options[key], - not is_nil(value), - do: {key, value} + aws_credentials = aws_credentials_from_request(request) get_from_s3(csv_location, aws_credentials) else @@ -319,40 +315,61 @@ defmodule ReqAthena do end defp get_json_result(request, response) do - Request.halt(request, response) + manifest_csv_location = + Request.get_private(request, :athena_output_location) <> "-manifest.csv" + + aws_credentials = aws_credentials_from_request(request) + + decode_body = Req.Request.get_option(request, :decode_body, true) + + results_locations = + manifest_csv_location + |> get_from_s3(aws_credentials) + |> String.trim() + |> String.split("\n") + + # OPTIMIZE: use tasks to retrieve files. 
+ results = + if decode_body do + Enum.flat_map(results_locations, fn location -> + contents = get_from_s3(location, aws_credentials) + for line <- String.split(contents, "\n"), line != "", do: Jason.decode!(line) + end) + else + results_locations + end + + Request.halt(request, %{response | body: results}) end defp get_explorer_result(request, response) do - body = Jason.decode!(response.body) + manifest_csv_location = + Request.get_private(request, :athena_output_location) <> "-manifest.csv" - result = - if Map.has_key?(body, "ResultSet") do - manifest_csv_location = - Request.get_private(request, :athena_output_location) <> "-manifest.csv" - - aws_credentials = - for key <- @credential_keys, - value = request.options[key], - not is_nil(value), - do: {key, value} + aws_credentials = aws_credentials_from_request(request) - # This private field is only meant to be used in tests. - fetcher_and_builder = - Request.get_private(request, :athena_dataframe_builder, &fetch_and_build_dataframe/3) + # This private field is only meant to be used in tests. + fetcher_and_builder = + Request.get_private(request, :athena_dataframe_builder, &fetch_and_build_dataframe/3) - decode_body = Req.Request.get_option(request, :decode_body, true) + decode_body = Req.Request.get_option(request, :decode_body, true) - fetcher_and_builder.(manifest_csv_location, aws_credentials, decode_body) - else - body - end + result = fetcher_and_builder.(manifest_csv_location, aws_credentials, decode_body) Request.halt(request, %{response | body: result}) end + defp aws_credentials_from_request(request) do + for key <- @credential_keys, + value = request.options[key], + not is_nil(value), + do: {key, value} + end + @doc false def fetch_and_build_dataframe(manifest_csv_location, aws_credentials, true = _decode_body) do # TODO: Should we handle errors here? + # OPTIMIZE: use tasks for retrieving parquets. 
manifest_csv_location |> get_from_s3(aws_credentials) |> String.trim() From 5853765384738071178cd452d2c995734051008e Mon Sep 17 00:00:00 2001 From: Philip Sampaio Date: Mon, 9 Sep 2024 14:05:22 -0300 Subject: [PATCH 10/20] Fix docs --- lib/req_athena.ex | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/req_athena.ex b/lib/req_athena.ex index 6632427..328945b 100644 --- a/lib/req_athena.ex +++ b/lib/req_athena.ex @@ -54,7 +54,7 @@ defmodule ReqAthena do * `:format` - Optional. It changes the output format. By default this is `:none`, which means that we return the decoded result from the Athena API. - The supported formats are: `:csv`, `:explorer,`, `:json` and `:textfile`. + The supported formats are: `:csv`, `:explorer,`, and `:json`. For `:csv`, the contents of the CSV file are the output instead of the API return. When `:json` is used, the contents of the JSON files are going to be the output. From 059646c50a32bee31a7fb6f4bb1db3f76fe9c0cd Mon Sep 17 00:00:00 2001 From: Philip Sampaio Date: Tue, 10 Sep 2024 09:09:49 -0300 Subject: [PATCH 11/20] Add test cases for JSON and CSV formats - integration tests --- lib/req_athena.ex | 6 +- test/integration_test.exs | 260 ++++++++++++++++++++++++++++++++++++-- 2 files changed, 251 insertions(+), 15 deletions(-) diff --git a/lib/req_athena.ex b/lib/req_athena.ex index 328945b..99228bf 100644 --- a/lib/req_athena.ex +++ b/lib/req_athena.ex @@ -415,7 +415,7 @@ defmodule ReqAthena do count = Request.get_private(request, :athena_wait_count, 1) if count >= 3 do - Logger.info("ReqAthena: query is in QUEUED state, will retry in 1000ms") + Logger.info("ReqAthena: query is in QUEUED state, will retry in #{@wait_delay}ms") end request = Request.put_private(request, :athena_wait_count, count + 1) @@ -446,7 +446,9 @@ defmodule ReqAthena do Request.halt(request, %{response | body: body}) end - _other_state -> + other_state -> + Logger.warning("ReqAthena: query returned an unknown state -> #{other_state}") + if 
Request.get_option(request, :decode_body, true) do Request.halt(request, %{response | body: body}) else diff --git a/test/integration_test.exs b/test/integration_test.exs index e916722..03a8a43 100644 --- a/test/integration_test.exs +++ b/test/integration_test.exs @@ -22,13 +22,15 @@ defmodule IntegrationTest do LOCATION 's3://osm-pds/planet/';\ """ - test "returns the response from AWS Athena's API" do + test "returns the response in an Explorer dataframe" do + now = DateTime.utc_now() |> DateTime.to_iso8601() + opts = [ access_key_id: System.fetch_env!("AWS_ACCESS_KEY_ID"), secret_access_key: System.fetch_env!("AWS_SECRET_ACCESS_KEY"), region: System.fetch_env!("AWS_REGION"), database: "default", - output_location: System.fetch_env!("AWS_ATHENA_OUTPUT_LOCATION") + output_location: Path.join(System.fetch_env!("AWS_ATHENA_OUTPUT_LOCATION"), "test-#{now}") ] # create table @@ -48,7 +50,8 @@ defmodule IntegrationTest do FROM planet WHERE id = 470454 and type = 'relation' - """ + """, + format: :explorer ) assert query_response.status == 200 @@ -56,8 +59,6 @@ defmodule IntegrationTest do assert %Explorer.DataFrame{} = ldf = query_response.body assert Explorer.DataFrame.lazy?(ldf) - df = Explorer.DataFrame.collect(ldf) - names = [ "id", "type", @@ -94,18 +95,255 @@ defmodule IntegrationTest do true ] + df = Explorer.DataFrame.collect(ldf) + assert Explorer.DataFrame.names(df) == names assert Explorer.DataFrame.to_rows(df) == [Map.new(Enum.zip(names, values))] end + test "format as explorer without decoding body returns the list of parquet files" do + now = DateTime.utc_now() |> DateTime.to_iso8601() + + opts = [ + access_key_id: System.fetch_env!("AWS_ACCESS_KEY_ID"), + secret_access_key: System.fetch_env!("AWS_SECRET_ACCESS_KEY"), + region: System.fetch_env!("AWS_REGION"), + database: "default", + output_location: Path.join(System.fetch_env!("AWS_ATHENA_OUTPUT_LOCATION"), "test-#{now}") + ] + + # create table + req = + Req.new(http_errors: :raise) + |> 
ReqAthena.attach(opts) + + response = Req.post!(req, athena: @create_table) + + assert response.status == 200 + + # query single row from planet table + assert query_response = + Req.post!(req, + athena: """ + SELECT id, type, tags, members, timestamp, visible + FROM planet + WHERE id = 470454 + and type = 'relation' + """, + format: :explorer, + decode_body: false + ) + + assert query_response.status == 200 + + assert [first_file | _] = query_response.body + + assert String.starts_with?(first_file, "s3://") + end + + test "returns the response as CSV" do + now = DateTime.utc_now() |> DateTime.to_iso8601() + + opts = [ + access_key_id: System.fetch_env!("AWS_ACCESS_KEY_ID"), + secret_access_key: System.fetch_env!("AWS_SECRET_ACCESS_KEY"), + region: System.fetch_env!("AWS_REGION"), + database: "default", + output_location: Path.join(System.fetch_env!("AWS_ATHENA_OUTPUT_LOCATION"), "test-#{now}") + ] + + # create table + req = + Req.new(http_errors: :raise) + |> ReqAthena.attach(opts) + + response = Req.post!(req, athena: @create_table) + + assert response.status == 200 + + # query single row from planet table + assert query_response = + Req.post!(req, + athena: """ + SELECT id, type, tags, members, timestamp, visible + FROM planet + WHERE id = 470454 + and type = 'relation' + """, + format: :csv + ) + + assert query_response.status == 200 + + assert query_response.body == + ~s|"id","type","tags","members","timestamp","visible" +"470454","relation","{ref=17229A, site=geodesic, name=Mérignac A, source=©IGN 2010 dans le cadre de la cartographie réglementaire, type=site, url=http://geodesie.ign.fr/fiches/index.php?module=e&action=fichepdf&source=carte&sit_no=17229A, network=NTF-5}","[{type=node, ref=670007839, role=}, {type=node, ref=670007840, role=}]","2017-01-21 12:51:34.000","true" +| + end + + test "format as CSV without decoding body returns the CSV file path" do + now = DateTime.utc_now() |> DateTime.to_iso8601() + + opts = [ + access_key_id: 
System.fetch_env!("AWS_ACCESS_KEY_ID"), + secret_access_key: System.fetch_env!("AWS_SECRET_ACCESS_KEY"), + region: System.fetch_env!("AWS_REGION"), + database: "default", + output_location: Path.join(System.fetch_env!("AWS_ATHENA_OUTPUT_LOCATION"), "test-#{now}") + ] + + # create table + req = + Req.new(http_errors: :raise) + |> ReqAthena.attach(opts) + + response = Req.post!(req, athena: @create_table) + + assert response.status == 200 + + # query single row from planet table + assert query_response = + Req.post!(req, + athena: """ + SELECT id, type, tags, members, timestamp, visible + FROM planet + WHERE id = 470454 + and type = 'relation' + """, + format: :csv, + decode_body: false + ) + + assert query_response.status == 200 + assert String.starts_with?(query_response.body, "s3://") + assert String.ends_with?(query_response.body, ".csv") + end + + test "returns the response as a list of JSON objects" do + now = DateTime.utc_now() |> DateTime.to_iso8601() + + opts = [ + access_key_id: System.fetch_env!("AWS_ACCESS_KEY_ID"), + secret_access_key: System.fetch_env!("AWS_SECRET_ACCESS_KEY"), + region: System.fetch_env!("AWS_REGION"), + database: "default", + output_location: Path.join(System.fetch_env!("AWS_ATHENA_OUTPUT_LOCATION"), "test-#{now}") + ] + + # create table + req = + Req.new(http_errors: :raise) + |> ReqAthena.attach(opts) + + response = Req.post!(req, athena: @create_table) + + assert response.status == 200 + + # query single row from planet table + assert query_response = + Req.post!(req, + athena: """ + SELECT id, type, tags, members, timestamp, visible + FROM planet + WHERE (id in (470454, 470455)) + and type = 'relation' + """, + format: :json + ) + + assert query_response.status == 200 + + assert query_response.body == + [ + %{ + "id" => 470_454, + "members" => [ + %{"ref" => 670_007_839, "role" => "", "type" => "node"}, + %{"ref" => 670_007_840, "role" => "", "type" => "node"} + ], + "tags" => %{ + "name" => "Mérignac A", + "network" => "NTF-5", + 
"ref" => "17229A", + "site" => "geodesic", + "source" => "©IGN 2010 dans le cadre de la cartographie réglementaire", + "type" => "site", + "url" => + "http://geodesie.ign.fr/fiches/index.php?module=e&action=fichepdf&source=carte&sit_no=17229A" + }, + "timestamp" => "2017-01-21 12:51:34", + "type" => "relation", + "visible" => true + }, + %{ + "id" => 470_455, + "members" => [%{"ref" => 670_007_841, "role" => "", "type" => "node"}], + "tags" => %{ + "name" => "Meschers-sur-Gironde A", + "network" => "NTF-5", + "ref" => "17230A", + "site" => "geodesic", + "source" => "©IGN 2010 dans le cadre de la cartographie réglementaire", + "type" => "site", + "url" => + "http://geodesie.ign.fr/fiches/index.php?module=e&action=fichepdf&source=carte&sit_no=17230A" + }, + "timestamp" => "2017-01-21 12:51:34", + "type" => "relation", + "visible" => true + } + ] + end + + test "returns the response as a list paths to NDJSON files" do + now = DateTime.utc_now() |> DateTime.to_iso8601() + + opts = [ + access_key_id: System.fetch_env!("AWS_ACCESS_KEY_ID"), + secret_access_key: System.fetch_env!("AWS_SECRET_ACCESS_KEY"), + region: System.fetch_env!("AWS_REGION"), + database: "default", + output_location: Path.join(System.fetch_env!("AWS_ATHENA_OUTPUT_LOCATION"), "test-#{now}") + ] + + # create table + req = + Req.new(http_errors: :raise) + |> ReqAthena.attach(opts) + + response = Req.post!(req, athena: @create_table) + + assert response.status == 200 + + # query single row from planet table + assert query_response = + Req.post!(req, + athena: """ + SELECT id, type, tags, members, timestamp, visible + FROM planet + WHERE id = 470454 + and type = 'relation' + """, + format: :json, + decode_body: false + ) + + assert query_response.status == 200 + + assert [first_file | _] = query_response.body + + assert String.starts_with?(first_file, "s3://") + end + test "returns the response from AWS Athena's API with parameterized query" do + now = DateTime.utc_now() |> DateTime.to_iso8601() + opts = 
[ access_key_id: System.fetch_env!("AWS_ACCESS_KEY_ID"), secret_access_key: System.fetch_env!("AWS_SECRET_ACCESS_KEY"), region: System.fetch_env!("AWS_REGION"), database: "default", - no_explorer: true, - output_location: System.fetch_env!("AWS_ATHENA_OUTPUT_LOCATION") + output_location: Path.join(System.fetch_env!("AWS_ATHENA_OUTPUT_LOCATION"), "test-#{now}") ] # create table @@ -120,18 +358,14 @@ defmodule IntegrationTest do # query single row from planet table assert query_response = Req.post!(req, + format: :json, athena: {"SELECT id, type FROM planet WHERE id = ? and type = ?", [239_970_142, "node"]} ) assert query_response.status == 200 - assert query_response.body.columns == ["id", "type"] - assert query_response.body.statement_name == "query_C71EF77B8B7B92D9846C6D7E70136448" - assert is_binary(query_response.body.query_execution_id) - assert query_response.body.rows == [[239_970_142, "node"]] - assert query_response.body.output_location == - "#{opts[:output_location]}/#{query_response.body.query_execution_id}.csv" + assert query_response.body == [%{"id" => 239_970_142, "type" => "node"}] end test "encodes and decodes types received from AWS Athena's response" do From 762443baebffb14a8bff7d956b29906fcabcac3f Mon Sep 17 00:00:00 2001 From: Philip Sampaio Date: Tue, 10 Sep 2024 10:03:47 -0300 Subject: [PATCH 12/20] Refactor to isolate S3 interactions to a module --- lib/req_athena.ex | 74 +++++++++++++++------------------------- lib/req_athena/s3.ex | 20 +++++++++++ test/req_athena_test.exs | 10 +++--- 3 files changed, 52 insertions(+), 52 deletions(-) create mode 100644 lib/req_athena/s3.ex diff --git a/lib/req_athena.ex b/lib/req_athena.ex index 99228bf..39a3049 100644 --- a/lib/req_athena.ex +++ b/lib/req_athena.ex @@ -300,13 +300,11 @@ defmodule ReqAthena do defp get_csv_result(request, response) do csv_location = Request.get_private(request, :athena_output_location) - decode_body = Req.Request.get_option(request, :decode_body, true) - result = - if 
decode_body do + if Req.Request.get_option(request, :decode_body, true) do aws_credentials = aws_credentials_from_request(request) - - get_from_s3(csv_location, aws_credentials) + req_s3 = ReqAthena.S3.new(aws_credentials) + ReqAthena.S3.get_body(req_s3, csv_location) else csv_location end @@ -315,36 +313,30 @@ defmodule ReqAthena do end defp get_json_result(request, response) do - manifest_csv_location = - Request.get_private(request, :athena_output_location) <> "-manifest.csv" + output_location = Request.get_private(request, :athena_output_location) aws_credentials = aws_credentials_from_request(request) + req_s3 = ReqAthena.S3.new(aws_credentials) - decode_body = Req.Request.get_option(request, :decode_body, true) + locations = ReqAthena.S3.get_locations(req_s3, output_location) - results_locations = - manifest_csv_location - |> get_from_s3(aws_credentials) - |> String.trim() - |> String.split("\n") - - # OPTIMIZE: use tasks to retrieve files. + # OPTIMIZE: use tasks to retrieve locations. 
results = - if decode_body do - Enum.flat_map(results_locations, fn location -> - contents = get_from_s3(location, aws_credentials) + if Req.Request.get_option(request, :decode_body, true) do + Enum.flat_map(locations, fn location -> + contents = ReqAthena.S3.get_body(req_s3, location) + for line <- String.split(contents, "\n"), line != "", do: Jason.decode!(line) end) else - results_locations + locations end Request.halt(request, %{response | body: results}) end defp get_explorer_result(request, response) do - manifest_csv_location = - Request.get_private(request, :athena_output_location) <> "-manifest.csv" + output_location = Request.get_private(request, :athena_output_location) aws_credentials = aws_credentials_from_request(request) @@ -354,7 +346,7 @@ defmodule ReqAthena do decode_body = Req.Request.get_option(request, :decode_body, true) - result = fetcher_and_builder.(manifest_csv_location, aws_credentials, decode_body) + result = fetcher_and_builder.(output_location, aws_credentials, decode_body) Request.halt(request, %{response | body: result}) end @@ -367,31 +359,19 @@ defmodule ReqAthena do end @doc false - def fetch_and_build_dataframe(manifest_csv_location, aws_credentials, true = _decode_body) do - # TODO: Should we handle errors here? - # OPTIMIZE: use tasks for retrieving parquets. 
- manifest_csv_location - |> get_from_s3(aws_credentials) - |> String.trim() - |> String.split("\n") - |> Enum.map(fn parquet_location -> - Explorer.DataFrame.from_parquet!(parquet_location, lazy: true, config: aws_credentials) - end) - |> Explorer.DataFrame.concat_rows() - end - - def fetch_and_build_dataframe(manifest_csv_location, aws_credentials, false = _decode_body) do - manifest_csv_location - |> get_from_s3(aws_credentials) - |> String.trim() - |> String.split("\n") - end - - defp get_from_s3(location, aws_credentials) do - req = Req.new() |> ReqS3.attach(aws_sigv4: aws_credentials) - - response = Req.get!(req, url: location) - response.body + def fetch_and_build_dataframe(output_location, aws_credentials, decode_body) do + req_s3 = ReqAthena.S3.new(aws_credentials) + locations = ReqAthena.S3.get_locations(req_s3, output_location) + + if decode_body do + locations + |> Enum.map(fn parquet_location -> + Explorer.DataFrame.from_parquet!(parquet_location, lazy: true, config: aws_credentials) + end) + |> Explorer.DataFrame.concat_rows() + else + locations + end end defp get_query_state(request, response) do diff --git a/lib/req_athena/s3.ex b/lib/req_athena/s3.ex new file mode 100644 index 0000000..91ecd79 --- /dev/null +++ b/lib/req_athena/s3.ex @@ -0,0 +1,20 @@ +defmodule ReqAthena.S3 do + def new(aws_credentials, options \\ []) do + options |> Req.new() |> ReqS3.attach(aws_sigv4: aws_credentials) + end + + def get_locations(req_s3, output_location) do + manifest_csv_location = output_location <> "-manifest.csv" + + req_s3 + |> get_body(manifest_csv_location) + |> String.trim() + |> String.split("\n") + end + + def get_body(req_s3, location) do + response = Req.get!(req_s3, url: location) + + response.body + end +end diff --git a/test/req_athena_test.exs b/test/req_athena_test.exs index 66f7d19..2759429 100644 --- a/test/req_athena_test.exs +++ b/test/req_athena_test.exs @@ -430,17 +430,17 @@ defmodule ReqAthenaTest do assert response = Req.new(adapter: 
fake_athena(request_validations)) |> Req.Request.put_header("x-auth", "my awesome auth header") - |> Req.Request.put_private(:athena_dataframe_builder, fn manifest_location, + |> Req.Request.put_private(:athena_dataframe_builder, fn output_location, credentials, decode_body -> - assert manifest_location == "s3://foo-manifest.csv" + assert String.starts_with?(output_location, "s3://") assert decode_body assert Enum.sort(Keyword.take(opts, [:access_key_id, :secret_access_key, :region])) == Enum.sort(credentials) - send(me, {:explorer_built, manifest_location}) + send(me, {:explorer_built, output_location}) Explorer.DataFrame.new(id: [1, 2], name: ["Ale", "Wojtek"]) end) @@ -477,7 +477,7 @@ defmodule ReqAthenaTest do assert response = Req.new(adapter: fake_athena(request_validations)) |> Req.Request.put_header("x-auth", "my awesome auth header") - |> Req.Request.put_private(:athena_dataframe_builder, fn manifest_location, + |> Req.Request.put_private(:athena_dataframe_builder, fn output_location, credentials, decode_body -> refute decode_body @@ -485,7 +485,7 @@ defmodule ReqAthenaTest do assert Enum.sort(Keyword.take(opts, [:access_key_id, :secret_access_key, :region])) == Enum.sort(credentials) - send(me, {:explorer_built, manifest_location}) + send(me, {:explorer_built, output_location}) ["s3://foo/results/first"] end) From 5f86feaf905ffe4094ccd01b5f08632e88744373 Mon Sep 17 00:00:00 2001 From: Philip Sampaio Date: Tue, 10 Sep 2024 10:04:25 -0300 Subject: [PATCH 13/20] Fix integration tests with new responses --- test/integration_test.exs | 344 +++++++++++++++++++++++++++----------- 1 file changed, 247 insertions(+), 97 deletions(-) diff --git a/test/integration_test.exs b/test/integration_test.exs index 03a8a43..1c4e2ca 100644 --- a/test/integration_test.exs +++ b/test/integration_test.exs @@ -22,6 +22,237 @@ defmodule IntegrationTest do LOCATION 's3://osm-pds/planet/';\ """ + test "without a given format returns the response as it is from the API" do + now = 
DateTime.utc_now() |> DateTime.to_iso8601() + + opts = [ + access_key_id: System.fetch_env!("AWS_ACCESS_KEY_ID"), + secret_access_key: System.fetch_env!("AWS_SECRET_ACCESS_KEY"), + region: System.fetch_env!("AWS_REGION"), + database: "default", + output_location: Path.join(System.fetch_env!("AWS_ATHENA_OUTPUT_LOCATION"), "test-#{now}") + ] + + # create table + req = + Req.new(http_errors: :raise) + |> ReqAthena.attach(opts) + + response = Req.post!(req, athena: @create_table) + + assert response.status == 200 + + # query single row from planet table + assert query_response = + Req.post!(req, + athena: """ + SELECT id, type, tags, members, timestamp, visible + FROM planet + WHERE id = 470454 + and type = 'relation' + """ + ) + + assert query_response.status == 200 + + assert %{ + "ResultSet" => %{ + "ColumnInfos" => [ + %{ + "CaseSensitive" => false, + "CatalogName" => "hive", + "Label" => "id", + "Name" => "id", + "Nullable" => "UNKNOWN", + "Precision" => 19, + "Scale" => 0, + "SchemaName" => "", + "TableName" => "", + "Type" => "bigint" + }, + %{ + "CaseSensitive" => true, + "CatalogName" => "hive", + "Label" => "type", + "Name" => "type", + "Nullable" => "UNKNOWN", + "Precision" => 2_147_483_647, + "Scale" => 0, + "SchemaName" => "", + "TableName" => "", + "Type" => "varchar" + }, + %{ + "CaseSensitive" => false, + "CatalogName" => "hive", + "Label" => "tags", + "Name" => "tags", + "Nullable" => "UNKNOWN", + "Precision" => 0, + "Scale" => 0, + "SchemaName" => "", + "TableName" => "", + "Type" => "map" + }, + %{ + "CaseSensitive" => false, + "CatalogName" => "hive", + "Label" => "members", + "Name" => "members", + "Nullable" => "UNKNOWN", + "Precision" => 0, + "Scale" => 0, + "SchemaName" => "", + "TableName" => "", + "Type" => "array" + }, + %{ + "CaseSensitive" => false, + "CatalogName" => "hive", + "Label" => "timestamp", + "Name" => "timestamp", + "Nullable" => "UNKNOWN", + "Precision" => 3, + "Scale" => 0, + "SchemaName" => "", + "TableName" => "", + "Type" 
=> "timestamp" + }, + %{ + "CaseSensitive" => false, + "CatalogName" => "hive", + "Label" => "visible", + "Name" => "visible", + "Nullable" => "UNKNOWN", + "Precision" => 0, + "Scale" => 0, + "SchemaName" => "", + "TableName" => "", + "Type" => "boolean" + } + ], + "ResultRows" => [ + %{"Data" => ["id", "type", "tags", "members", "timestamp", "visible"]}, + %{ + "Data" => [ + "470454", + "relation", + "{ref=17229A, site=geodesic, name=Mérignac A, source=©IGN 2010 dans le cadre de la cartographie réglementaire, type=site, url=http://geodesie.ign.fr/fiches/index.php?module=e&action=fichepdf&source=carte&sit_no=17229A, network=NTF-5}", + "[{type=node, ref=670007839, role=}, {type=node, ref=670007840, role=}]", + "2017-01-21 12:51:34.000", + "true" + ] + } + ], + "ResultSetMetadata" => %{ + "ColumnInfo" => [ + %{ + "CaseSensitive" => false, + "CatalogName" => "hive", + "Label" => "id", + "Name" => "id", + "Nullable" => "UNKNOWN", + "Precision" => 19, + "Scale" => 0, + "SchemaName" => "", + "TableName" => "", + "Type" => "bigint" + }, + %{ + "CaseSensitive" => true, + "CatalogName" => "hive", + "Label" => "type", + "Name" => "type", + "Nullable" => "UNKNOWN", + "Precision" => 2_147_483_647, + "Scale" => 0, + "SchemaName" => "", + "TableName" => "", + "Type" => "varchar" + }, + %{ + "CaseSensitive" => false, + "CatalogName" => "hive", + "Label" => "tags", + "Name" => "tags", + "Nullable" => "UNKNOWN", + "Precision" => 0, + "Scale" => 0, + "SchemaName" => "", + "TableName" => "", + "Type" => "map" + }, + %{ + "CaseSensitive" => false, + "CatalogName" => "hive", + "Label" => "members", + "Name" => "members", + "Nullable" => "UNKNOWN", + "Precision" => 0, + "Scale" => 0, + "SchemaName" => "", + "TableName" => "", + "Type" => "array" + }, + %{ + "CaseSensitive" => false, + "CatalogName" => "hive", + "Label" => "timestamp", + "Name" => "timestamp", + "Nullable" => "UNKNOWN", + "Precision" => 3, + "Scale" => 0, + "SchemaName" => "", + "TableName" => "", + "Type" => "timestamp" 
+ }, + %{ + "CaseSensitive" => false, + "CatalogName" => "hive", + "Label" => "visible", + "Name" => "visible", + "Nullable" => "UNKNOWN", + "Precision" => 0, + "Scale" => 0, + "SchemaName" => "", + "TableName" => "", + "Type" => "boolean" + } + ] + }, + "Rows" => [ + %{ + "Data" => [ + %{"VarCharValue" => "id"}, + %{"VarCharValue" => "type"}, + %{"VarCharValue" => "tags"}, + %{"VarCharValue" => "members"}, + %{"VarCharValue" => "timestamp"}, + %{"VarCharValue" => "visible"} + ] + }, + %{ + "Data" => [ + %{"VarCharValue" => "470454"}, + %{"VarCharValue" => "relation"}, + %{ + "VarCharValue" => + "{ref=17229A, site=geodesic, name=Mérignac A, source=©IGN 2010 dans le cadre de la cartographie réglementaire, type=site, url=http://geodesie.ign.fr/fiches/index.php?module=e&action=fichepdf&source=carte&sit_no=17229A, network=NTF-5}" + }, + %{ + "VarCharValue" => + "[{type=node, ref=670007839, role=}, {type=node, ref=670007840, role=}]" + }, + %{"VarCharValue" => "2017-01-21 12:51:34.000"}, + %{"VarCharValue" => "true"} + ] + } + ] + }, + "UpdateCount" => 0 + } == query_response.body + end + test "returns the response in an Explorer dataframe" do now = DateTime.utc_now() |> DateTime.to_iso8601() @@ -368,65 +599,6 @@ defmodule IntegrationTest do assert query_response.body == [%{"id" => 239_970_142, "type" => "node"}] end - test "encodes and decodes types received from AWS Athena's response" do - opts = [ - access_key_id: System.fetch_env!("AWS_ACCESS_KEY_ID"), - secret_access_key: System.fetch_env!("AWS_SECRET_ACCESS_KEY"), - region: System.fetch_env!("AWS_REGION"), - database: "default", - no_explorer: true, - output_location: System.fetch_env!("AWS_ATHENA_OUTPUT_LOCATION") - ] - - req = Req.new(http_errors: :raise) |> ReqAthena.attach(opts) - - value = "req" - assert Req.post!(req, athena: {"SELECT ?", [value]}).body.rows == [[value]] - - value = 1 - assert Req.post!(req, athena: {"SELECT ?", [value]}).body.rows == [[value]] - - value = 1.1 - assert Req.post!(req, athena: 
{"SELECT ?", [value]}).body.rows == [[value]] - - value = -1.1 - assert Req.post!(req, athena: {"SELECT ?", [value]}).body.rows == [[value]] - - value = true - assert Req.post!(req, athena: {"SELECT ?", [value]}).body.rows == [[value]] - - value = 1.175494351e-38 - assert Req.post!(req, athena: {"SELECT ?", [value]}).body.rows == [[value]] - - value = 3.402823466e+38 - assert Req.post!(req, athena: {"SELECT ?", [value]}).body.rows == [[value]] - - value = Date.utc_today() - query = "SELECT CAST(? AS DATE)" - assert Req.post!(req, athena: {query, [value]}).body.rows == [[value]] - - naive_dt = NaiveDateTime.utc_now() - value = NaiveDateTime.truncate(naive_dt, :millisecond) - query = "SELECT CAST(? AS TIMESTAMP)" - assert Req.post!(req, athena: {query, [naive_dt]}).body.rows == [[value]] - - datetime = DateTime.utc_now() - value = DateTime.to_naive(datetime) |> NaiveDateTime.truncate(:millisecond) - assert Req.post!(req, athena: {query, [datetime]}).body.rows == [[value]] - - query = "SELECT timestamp '2012-10-31 01:00:00.000 UTC' AT TIME ZONE 'America/Sao_Paulo'" - value = DateTime.new!(~D[2012-10-30], ~T[23:00:00.000], "America/Sao_Paulo") - assert Req.post!(req, athena: query).body.rows == [[value]] - - value = "{name=aleDsz, id=1}" - query = "SELECT MAP(ARRAY['name', 'id'], ARRAY['aleDsz', '1'])" - assert Req.post!(req, athena: query).body.rows == [[value]] - - value = "{ids=[10, 20]}" - query = "SELECT CAST(ROW(ARRAY[10, 20]) AS ROW(ids ARRAY))" - assert Req.post!(req, athena: query).body.rows == [[value]] - end - test "returns failed AWS Athena's response" do opts = [ access_key_id: System.fetch_env!("AWS_ACCESS_KEY_ID"), @@ -470,26 +642,21 @@ defmodule IntegrationTest do assert response.status == 200 end - test "creates table inside AWS Athena's database with workgroup" do - opts = [ - access_key_id: System.fetch_env!("AWS_ACCESS_KEY_ID"), - secret_access_key: System.fetch_env!("AWS_SECRET_ACCESS_KEY"), - region: System.fetch_env!("AWS_REGION"), - database: 
"default", - workgroup: "primary" - ] + # TODO: check why it's not working only with "workgroup" + # test "creates table inside AWS Athena's database with workgroup" do + # opts = [ + # access_key_id: System.fetch_env!("AWS_ACCESS_KEY_ID"), + # secret_access_key: System.fetch_env!("AWS_SECRET_ACCESS_KEY"), + # region: System.fetch_env!("AWS_REGION"), + # database: "default", + # workgroup: "primary" + # ] - req = Req.new(http_errors: :raise) |> ReqAthena.attach(opts) - response = Req.post!(req, athena: @create_table) - result = response.body + # req = Req.new(http_errors: :raise) |> ReqAthena.attach(opts) + # response = Req.post!(req, athena: @create_table) - assert response.status == 200 - - assert result.columns == [] - refute result.statement_name - assert is_binary(result.query_execution_id) - assert result.output_location =~ "#{result.query_execution_id}.txt" - end + # assert response.status == 200 + # end test "creates table inside AWS Athena's database with workgroup and output location" do opts = [ @@ -503,14 +670,9 @@ defmodule IntegrationTest do req = Req.new(http_errors: :raise) |> ReqAthena.attach(opts) response = Req.post!(req, athena: @create_table) - result = response.body + assert %{} = response.body assert response.status == 200 - - assert result.columns == [] - refute result.statement_name - assert is_binary(result.query_execution_id) - assert result.output_location == "#{opts[:output_location]}/#{result.query_execution_id}.txt" end test "returns the cached result from AWS Athena's response" do @@ -519,7 +681,6 @@ defmodule IntegrationTest do secret_access_key: System.fetch_env!("AWS_SECRET_ACCESS_KEY"), region: System.fetch_env!("AWS_REGION"), database: "default", - no_explorer: true, output_location: System.fetch_env!("AWS_ATHENA_OUTPUT_LOCATION") ] @@ -538,16 +699,11 @@ defmodule IntegrationTest do assert query_response.status == 200 result = query_response.body - query_execution_id = result.query_execution_id - - assert result.columns == ~w(id 
type tags members timestamp visible) - refute result.statement_name - assert is_binary(result.query_execution_id) assert response = Req.post!(req, athena: query) assert response.status == 200 - assert response.body.query_execution_id == query_execution_id + assert result == response.body end test "force new result from AWS Athena's response" do @@ -556,7 +712,6 @@ defmodule IntegrationTest do secret_access_key: System.fetch_env!("AWS_SECRET_ACCESS_KEY"), region: System.fetch_env!("AWS_REGION"), database: "default", - no_explorer: true, output_location: System.fetch_env!("AWS_ATHENA_OUTPUT_LOCATION"), cache_query: false ] @@ -576,16 +731,11 @@ defmodule IntegrationTest do assert query_response.status == 200 result = query_response.body - query_execution_id = result.query_execution_id - - assert result.columns == ~w(id type tags members timestamp visible) - refute result.statement_name - assert is_binary(result.query_execution_id) assert response = Req.post!(req, athena: query) assert response.status == 200 - refute response.body.query_execution_id == query_execution_id + assert result == response.body end describe "with aws_credentials" do From fe5d7b41374998e49fea0ca0a3c58962c2470224 Mon Sep 17 00:00:00 2001 From: Philip Sampaio Date: Tue, 10 Sep 2024 10:22:41 -0300 Subject: [PATCH 14/20] Update docs and make Explorer optional --- lib/req_athena.ex | 84 ++++++++++++++++++++++------------------------- mix.exs | 2 +- 2 files changed, 41 insertions(+), 45 deletions(-) diff --git a/lib/req_athena.ex b/lib/req_athena.ex index 39a3049..7d4bd3d 100644 --- a/lib/req_athena.ex +++ b/lib/req_athena.ex @@ -64,7 +64,10 @@ defmodule ReqAthena do then will lazy load these parquet files into an Explorer dataframe. There are some limitations when using the `:json` and `:explorer` format. - See more about it reading the [`UNLOAD` command docs](https://docs.aws.amazon.com/athena/latest/ug/unload.html#unload-considerations-and-limitations). 
+ First, you need to install Explorer in order to use the `:explorer` format. + Second, when using these format, you always need to provide a different output location. + See the [`UNLOAD` command docs](https://docs.aws.amazon.com/athena/latest/ug/unload.html#unload-considerations-and-limitations) + for more details. * `:output_compression` - Optional. Sets the Parquet compression format and level for the output when using the Explorer output format. This can be a string, like `"gzip"`, @@ -96,18 +99,25 @@ defmodule ReqAthena do ...> ] iex> query = "SELECT id, type, tags, members, timestamp, visible FROM planet WHERE id = 470454 and type = 'relation'" iex> req = Req.new() |> ReqAthena.attach(opts) - iex> Req.post!(req, athena: query).body - %ReqAthena.Result{ - columns: ["id", "type", "tags", "members", "timestamp", "visible"], - output_location: "s3://my-bucket/c594d5df-9879-4bf7-8796-780e0b87a673.csv", - query_execution_id: "c594d5df-9879-4bf7-8796-780e0b87a673", - rows: [ - [470454, "relation", - "{ref=17229A, site=geodesic, name=Mérignac A, source=©IGN 2010 dans le cadre de la cartographie réglementaire, type=site, url=http://geodesie.ign.fr/fiches/index.php?module=e&action=fichepdf&source=carte&sit_no=17229A, network=NTF-5}", - "[{type=node, ref=670007839, role=}, {type=node, ref=670007840, role=}]", - ~N[2017-01-21 12:51:34.000], true] + iex> Req.post!(req, athena: query, format: :json).body + %{ + "id" => 470454, + "members" => [ + %{"ref" => 670007839, "role" => "", "type" => "node"}, + %{"ref" => 670007840, "role" => "", "type" => "node"} ], - statement_name: nil + "tags" => %{ + "name" => "Mérignac A", + "network" => "NTF-5", + "ref" => "17229A", + "site" => "geodesic", + "source" => "©IGN 2010 dans le cadre de la cartographie réglementaire", + "type" => "site", + "url" => "http://geodesie.ign.fr/fiches/index.php?module=e&action=fichepdf&source=carte&sit_no=17229A" + }, + "timestamp" => "2017-01-21 12:51:34", + "type" => "relation", + "visible" => true } 
With parameterized query: @@ -121,14 +131,8 @@ defmodule ReqAthena do ...> ] iex> query = "SELECT id, type FROM planet WHERE id = ? and type = ?" iex> req = Req.new() |> ReqAthena.attach(opts) - iex> Req.post!(req, athena: {query, [239_970_142, "node"]}).body - %ReqAthena.Result{ - columns: ["id", "type"], - output_location: "s3://my-bucket/dda41d66-1eea-4588-850a-945c9def9163.csv", - query_execution_id: "dda41d66-1eea-4588-850a-945c9def9163", - rows: [[239_970_142, "node"]], - statement_name: "query_C71EF77B8B7B92D9846C6D7E70136448" - } + iex> Req.post!(req, athena: {query, [239_970_142, "node"]}, format: :json).body + [%{"id" => 239970142, "type" => "node"}] """ def attach(%Request{} = request, options \\ []) do @@ -143,10 +147,10 @@ defmodule ReqAthena do defp run(request) do if query = request.options[:athena] do - region = fetch_option!(request, :region) + region = Request.fetch_option!(request, :region) url = "https://athena.#{region}.amazonaws.com" - cache_query = get_option(request, :cache_query, true) + cache_query = Request.get_option(request, :cache_query, true) %{request | url: URI.parse(url)} |> put_request_body(query, cache_query) @@ -227,7 +231,7 @@ defmodule ReqAthena do body = Map.merge(output_config, %{ - QueryExecutionContext: %{Database: fetch_option!(request, :database)}, + QueryExecutionContext: %{Database: Request.fetch_option!(request, :database)}, QueryString: ReqAthena.Query.to_query_string(query) }) @@ -364,13 +368,24 @@ defmodule ReqAthena do locations = ReqAthena.S3.get_locations(req_s3, output_location) if decode_body do + build_lazy_frame(locations, aws_credentials) + else locations + end + end + + if Code.ensure_loaded?(Explorer) do + defp build_lazy_frame(parquet_locations, aws_credentials) do + parquet_locations |> Enum.map(fn parquet_location -> Explorer.DataFrame.from_parquet!(parquet_location, lazy: true, config: aws_credentials) end) |> Explorer.DataFrame.concat_rows() - else - locations + end + else + defp 
build_lazy_frame(parquet_locations, aws_credentials) do + raise ArgumentError, + "format: :explorer - you need to install Explorer as a dependency in order to use this format" end end @@ -523,23 +538,4 @@ defmodule ReqAthena do end defp now, do: NaiveDateTime.utc_now() |> NaiveDateTime.to_erl() - - # TODO: Use Req.Request.get_option/3 when Req 0.4.0 is out. - defp get_option(request, key, default) when is_atom(key) do - Map.get(request.options, key, default) - end - - # TODO: Use Req.Request.fetch_option!/2 when Req 0.4.0 is out. - def fetch_option!(request, key) when is_atom(key) do - case Map.fetch(request.options, key) do - {:ok, value} -> - value - - :error -> - raise KeyError, - term: request.options, - key: key, - message: "option #{inspect(key)} is not set" - end - end end diff --git a/mix.exs b/mix.exs index b45c275..c2b6c1b 100644 --- a/mix.exs +++ b/mix.exs @@ -44,7 +44,7 @@ defmodule ReqAthena.MixProject do {:req, "~> 0.5.0"}, {:aws_signature, "~> 0.3.0"}, {:req_s3, "~> 0.2"}, - {:explorer, "~> 0.9"}, + {:explorer, "~> 0.9", optional: true}, {:aws_credentials, "~> 0.2", optional: true}, {:table, "~> 0.1.1", optional: true}, {:tzdata, "~> 1.1.1", only: :test}, From 888f506ac9a51d9ef641343a96b95a2d14ca9de1 Mon Sep 17 00:00:00 2001 From: Philip Sampaio Date: Tue, 10 Sep 2024 10:30:43 -0300 Subject: [PATCH 15/20] Fix readme --- README.md | 56 ++++++++++++++++++++++++++++--------------------------- 1 file changed, 29 insertions(+), 27 deletions(-) diff --git a/README.md b/README.md index 9bb00db..bea26b7 100644 --- a/README.md +++ b/README.md @@ -49,45 +49,47 @@ LOCATION 's3://osm-pds/planet/'; """ Req.post!(req, athena: query).body -#=> -# %ReqAthena.Result{ -# columns: [], -# output_location: "s3://my-bucket/a034610b-daaf-4c8d-aa61-d1a706231062.txt", -# query_execution_id: "a034610b-daaf-4c8d-aa61-d1a706231062", -# rows: [], -# statement_name: nil +# => +# %{ +# "Output" => "", +# "ResultSet" => %{ +# "ColumnInfos" => [], +# "ResultRows" => [], +# 
"ResultSetMetadata" => %{"ColumnInfo" => []}, +# "Rows" => [] +# } # } # With plain string query query = "SELECT id, type, tags, members, timestamp, visible FROM planet WHERE id = 470454 and type = 'relation'" -Req.post!(req, athena: query).body -#=> -# %ReqAthena.Result{ -# columns: ["id", "type", "tags", "members", "timestamp", "visible"], -# output_location: "s3://my-bucket/c594d5df-9879-4bf7-8796-780e0b87a673.csv", -# query_execution_id: "c594d5df-9879-4bf7-8796-780e0b87a673", -# rows: [ -# [470454, "relation", -# "{ref=17229A, site=geodesic, name=Mérignac A, source=©IGN 2010 dans le cadre de la cartographie réglementaire, type=site, url=http://geodesie.ign.fr/fiches/index.php?module=e&action=fichepdf&source=carte&sit_no=17229A, network=NTF-5}", -# "[{type=node, ref=670007839, role=}, {type=node, ref=670007840, role=}]", -# ~N[2017-01-21 12:51:34.000], true] +Req.post!(req, athena: query, format: :json).body +# => +# %{ +# "id" => 470454, +# "members" => [ +# %{"ref" => 670007839, "role" => "", "type" => "node"}, +# %{"ref" => 670007840, "role" => "", "type" => "node"} # ], -# statement_name: nil +# "tags" => %{ +# "name" => "Mérignac A", +# "network" => "NTF-5", +# "ref" => "17229A", +# "site" => "geodesic", +# "source" => "©IGN 2010 dans le cadre de la cartographie réglementaire", +# "type" => "site", +# "url" => "http://geodesie.ign.fr/fiches/index.php?module=e&action=fichepdf&source=carte&sit_no=17229A" +# }, +# "timestamp" => "2017-01-21 12:51:34", +# "type" => "relation", +# "visible" => true # } # With parameterized query query = "SELECT id, type FROM planet WHERE id = ? and type = ?" 
Req.post!(req, athena: {query, [239_970_142, "node"]}).body -#=> -# %ReqAthena.Result{ -# columns: ["id", "type"], -# output_location: "s3://my-bucket/dda41d66-1eea-4588-850a-945c9def9163.csv", -# query_execution_id: "dda41d66-1eea-4588-850a-945c9def9163", -# rows: [[239970142, "node"]], -# statement_name: "query_C71EF77B8B7B92D9846C6D7E70136448" -# } +#=> [%{"id" => 239970142, "type" => "node"}] ``` ## License From 8dba4142046b133b432d2456319d8e447d354b97 Mon Sep 17 00:00:00 2001 From: Philip Sampaio Date: Tue, 10 Sep 2024 10:37:29 -0300 Subject: [PATCH 16/20] Remove unused Result struct --- lib/req_athena/result.ex | 42 ---------------------------------------- 1 file changed, 42 deletions(-) delete mode 100644 lib/req_athena/result.ex diff --git a/lib/req_athena/result.ex b/lib/req_athena/result.ex deleted file mode 100644 index de5a236..0000000 --- a/lib/req_athena/result.ex +++ /dev/null @@ -1,42 +0,0 @@ -defmodule ReqAthena.Result do - @moduledoc """ - Result struct returned from any successful query. - - Its fields are: - - * `columns` - The column names; - * `rows` - The result set. A list of lists, each inner list corresponding to a - row, each element in the inner list corresponds to a column; - * `statement_name` - The statement name from executed query; - * `query_execution_id` - The id from executed query; - * `output_location` - The S3 url location where the result was output. 
- * `metadata` - The columnInfo from https://docs.aws.amazon.com/athena/latest/APIReference/API_GetQueryResults.html - """ - - @type t :: %__MODULE__{ - columns: [String.t()], - rows: [[term()]], - statement_name: binary(), - query_execution_id: binary(), - output_location: binary(), - metadata: [term()] - } - - defstruct [ - :statement_name, - :query_execution_id, - :output_location, - rows: [], - columns: [], - metadata: [] - ] -end - -if Code.ensure_loaded?(Table.Reader) do - defimpl Table.Reader, for: ReqAthena.Result do - def init(result) do - {:rows, %{{:athena, :column_infos} => result.metadata, columns: result.columns}, - result.rows} - end - end -end From cb5308c2e28f4a810df27d7f6e69a04029d58c2d Mon Sep 17 00:00:00 2001 From: Philip Sampaio Date: Tue, 10 Sep 2024 10:42:12 -0300 Subject: [PATCH 17/20] Fix output at README --- README.md | 40 +++++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index bea26b7..a0ed146 100644 --- a/README.md +++ b/README.md @@ -65,25 +65,27 @@ query = "SELECT id, type, tags, members, timestamp, visible FROM planet WHERE id Req.post!(req, athena: query, format: :json).body # => -# %{ -# "id" => 470454, -# "members" => [ -# %{"ref" => 670007839, "role" => "", "type" => "node"}, -# %{"ref" => 670007840, "role" => "", "type" => "node"} -# ], -# "tags" => %{ -# "name" => "Mérignac A", -# "network" => "NTF-5", -# "ref" => "17229A", -# "site" => "geodesic", -# "source" => "©IGN 2010 dans le cadre de la cartographie réglementaire", -# "type" => "site", -# "url" => "http://geodesie.ign.fr/fiches/index.php?module=e&action=fichepdf&source=carte&sit_no=17229A" -# }, -# "timestamp" => "2017-01-21 12:51:34", -# "type" => "relation", -# "visible" => true -# } +# [ +# %{ +# "id" => 470454, +# "members" => [ +# %{"ref" => 670007839, "role" => "", "type" => "node"}, +# %{"ref" => 670007840, "role" => "", "type" => "node"} +# ], +# "tags" => %{ +# "name" => "Mérignac A", +# 
"network" => "NTF-5", +# "ref" => "17229A", +# "site" => "geodesic", +# "source" => "©IGN 2010 dans le cadre de la cartographie réglementaire", +# "type" => "site", +# "url" => "http://geodesie.ign.fr/fiches/index.php?module=e&action=fichepdf&source=carte&sit_no=17229A" +# }, +# "timestamp" => "2017-01-21 12:51:34", +# "type" => "relation", +# "visible" => true +# } +# ] # With parameterized query query = "SELECT id, type FROM planet WHERE id = ? and type = ?" From 1eefe508a056f12425363116570892dbcf3ed561 Mon Sep 17 00:00:00 2001 From: Philip Sampaio Date: Tue, 10 Sep 2024 13:15:42 -0300 Subject: [PATCH 18/20] Hide ReqAthena.S3 --- lib/req_athena/s3.ex | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/req_athena/s3.ex b/lib/req_athena/s3.ex index 91ecd79..c8ffd09 100644 --- a/lib/req_athena/s3.ex +++ b/lib/req_athena/s3.ex @@ -1,4 +1,5 @@ defmodule ReqAthena.S3 do + @moduledoc false def new(aws_credentials, options \\ []) do options |> Req.new() |> ReqS3.attach(aws_sigv4: aws_credentials) end From 10f7ecace495b618fcc0c9a9b0bc969098bfb5b6 Mon Sep 17 00:00:00 2001 From: Philip Sampaio Date: Wed, 11 Sep 2024 12:21:37 -0300 Subject: [PATCH 19/20] Apply suggestions from code review Co-authored-by: Wojtek Mach --- lib/req_athena.ex | 23 +++++++++++++---------- lib/req_athena/s3.ex | 2 +- test/req_athena_test.exs | 2 +- 3 files changed, 15 insertions(+), 12 deletions(-) diff --git a/lib/req_athena.ex b/lib/req_athena.ex index 7d4bd3d..065db04 100644 --- a/lib/req_athena.ex +++ b/lib/req_athena.ex @@ -45,23 +45,26 @@ defmodule ReqAthena do * `:database` - Required. The AWS Athena database name. - * `:output_location` - Conditional. The S3 URL location to output AWS Athena query results. + * `:output_location` - Optional. The S3 URL location to output AWS Athena query results. Results will be saved as Parquet and loaded with Explorer only if this option is given. * `:workgroup` - Conditional. The AWS Athena workgroup. * `:cache_query` - Optional. 
Forces a non-cached result from AWS Athena. - * `:format` - Optional. It changes the output format. By default this is - `:none`, which means that we return the decoded result from the Athena API. - The supported formats are: `:csv`, `:explorer,`, and `:json`. + * `:format` - Optional. The output format. Can be one of: + + * `:none` (default) - return decoded API response from Athena. + + * `:csv` - return contents of the CSV file. + + * `:json` - return contents of the JSON file. + + Note: Req by default automatically decodes the JSON response body ([`decode_body`](Req.Steps.decode_body/1) step) + and to prevent it from doing so, set `decode_body: false`. + + * `:explorer` - return contents in Parquet format, lazily loaded into an Explorer data frame. - For `:csv`, the contents of the CSV file are the output instead of the API return. - When `:json` is used, the contents of the JSON files are going to be the output. - Notice that the body is decoded by default and to prevent that, you need to use - the `:decode_body` option, so you get the "raw" data. - The `:explorer` format will perform the query unloading it to Parquet files, and - then will lazy load these parquet files into an Explorer dataframe. There are some limitations when using the `:json` and `:explorer` format. First, you need to install Explorer in order to use the `:explorer` format. 
diff --git a/lib/req_athena/s3.ex b/lib/req_athena/s3.ex index c8ffd09..daa76f7 100644 --- a/lib/req_athena/s3.ex +++ b/lib/req_athena/s3.ex @@ -14,7 +14,7 @@ defmodule ReqAthena.S3 do end def get_body(req_s3, location) do - response = Req.get!(req_s3, url: location) + %{status: 200} = response = Req.get!(req_s3, url: location) response.body end diff --git a/test/req_athena_test.exs b/test/req_athena_test.exs index 2759429..1e93b32 100644 --- a/test/req_athena_test.exs +++ b/test/req_athena_test.exs @@ -433,7 +433,7 @@ defmodule ReqAthenaTest do |> Req.Request.put_private(:athena_dataframe_builder, fn output_location, credentials, decode_body -> - assert String.starts_with?(output_location, "s3://") + assert "s3://" <> _ = output_location assert decode_body From 30593d5fe03be513a6a130ceab74230a4cc44901 Mon Sep 17 00:00:00 2001 From: Philip Sampaio Date: Wed, 11 Sep 2024 12:22:06 -0300 Subject: [PATCH 20/20] Update test/req_athena_test.exs Co-authored-by: Wojtek Mach --- test/req_athena_test.exs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/test/req_athena_test.exs b/test/req_athena_test.exs index 1e93b32..c8a2396 100644 --- a/test/req_athena_test.exs +++ b/test/req_athena_test.exs @@ -135,8 +135,10 @@ defmodule ReqAthenaTest do } assert response = - Req.new(adapter: fake_athena(request_validations)) - |> Req.Request.put_header("x-auth", "my awesome auth header") + Req.new( + adapter: fake_athena(request_validations), + headers: [x_auth: "my awesome auth header"] + ) |> ReqAthena.attach(opts) |> Req.post!(athena: "select * from iris", decode_body: false)