From f3a8f5cb94f50a4e3c011bc529d5621bd521ac18 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thibaut=20Barr=C3=A8re?= Date: Wed, 10 Jan 2024 15:15:00 +0100 Subject: [PATCH 1/2] =?UTF-8?q?Moteur=20de=20recherche=20avec=20mod=C3=A8l?= =?UTF-8?q?e=20d'indexation=20plus=20efficace=20(#3703)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Create search_engine.exs * Create 20231231135108_add_dataset_search_payload.exs * Update dataset.ex * Update search_engine.exs [skip ci] * Implement title + format search & render something [skip ci] * Add web view * Add bootstrap * Save bogus auto-complete implementation * Fix various bugs * Add modes to instant search Pair-session with Aurélien :-) * Remove TODOs * Update search_engine.exs * Remove left-over * Try to fix the CI --------- Co-authored-by: Antoine Augusti --- apps/transport/lib/db/dataset.ex | 2 + ...31231135108_add_dataset_search_payload.exs | 9 + scripts/search_engine.exs | 292 ++++++++++++++++++ 3 files changed, 303 insertions(+) create mode 100644 apps/transport/priv/repo/migrations/20231231135108_add_dataset_search_payload.exs create mode 100755 scripts/search_engine.exs diff --git a/apps/transport/lib/db/dataset.ex b/apps/transport/lib/db/dataset.ex index 6a8026df50..53c55ef479 100644 --- a/apps/transport/lib/db/dataset.ex +++ b/apps/transport/lib/db/dataset.ex @@ -84,6 +84,8 @@ defmodule DB.Dataset do # we ask in the backoffice for a name to display # (used in the long title of a dataset and to find the associated datasets) field(:associated_territory_name, :string) + + field(:search_payload, :map) end def base_query, do: from(d in DB.Dataset, as: :dataset, where: d.is_active) diff --git a/apps/transport/priv/repo/migrations/20231231135108_add_dataset_search_payload.exs b/apps/transport/priv/repo/migrations/20231231135108_add_dataset_search_payload.exs new file mode 100644 index 0000000000..92486e43a0 --- /dev/null +++ b/apps/transport/priv/repo/migrations/20231231135108_add_dataset_search_payload.exs @@ -0,0 +1,9 @@ +defmodule DB.Repo.Migrations.AddDatasetSearchPayload do + use Ecto.Migration + + def change do + alter table(:dataset) do + add(:search_payload, :map, default: %{}) + end + end +end diff --git a/scripts/search_engine.exs b/scripts/search_engine.exs new file mode 100755 index 0000000000..fb117b07d8 --- /dev/null +++ b/scripts/search_engine.exs @@ -0,0 +1,292 @@ +#! elixir + +my_app_root = Path.join(__DIR__, "..") + +Application.put_env(:search, Search.Endpoint, + http: [ip: {127, 0, 0, 1}, port: 5001], + server: true, + live_view: [signing_salt: "aaaaaaaa"], + secret_key_base: String.duplicate("a", 64) +) + +Mix.install( + [ + {:my_app, path: my_app_root, env: :dev}, + {:io_ansi_table, "~> 1.0"} + ], + config_path: Path.join(my_app_root, "config/config.exs"), + lockfile: Path.join(my_app_root, "mix.lock") +) + +# NOTE: wondering if Flop (https://github.com/woylie/flop) could be a better fit than Scrivener (which is hardly maintained) +# It provides cursor-based pagination as well as regular limit/offset stuff. + +defmodule Search.ErrorView do + def render(template, _), do: Phoenix.Controller.status_message_from_template(template) +end + +defmodule Search.HomeLive do + use Phoenix.LiveView, layout: {__MODULE__, :live} + + use Phoenix.HTML, only: [text_input: 2] + + def mount(_params, _session, socket) do + {:ok, + socket + |> assign(:title, "") + |> assign(:format, "") + |> assign(:mode, "") + |> update_datasets(%{"config" => %{}})} + end + + defp phx_vsn, do: Application.spec(:phoenix, :vsn) + defp lv_vsn, do: Application.spec(:phoenix_live_view, :vsn) + + def render("live.html", assigns) do + ~H""" + + + + + + + <%= @inner_content %> + """ + end + + def render(assigns) do + ~H""" +
+ <.form :let={f} id="search" for={%{}} as={:config} phx-change="change_form" phx-submit="ignore"> +
+ <%= text_input(f, :title, + value: @title, + placeholder: "Title", + autocorrect: "off" + ) %> + <%= text_input(f, :format, + value: @format, + placeholder: "Resource Format", + autocorrect: "off" + ) %> + <%= text_input(f, :mode, + value: @mode, + placeholder: "Resource Mode", + autocorrect: "off" + ) %> +
+ + +

+ <%= @datasets |> length %> datasets found +

+ + + <%= for dataset <- @datasets do %> + + + + + + + <% end %> + +
<%= dataset.id %><%= dataset.title %><%= dataset.formats %><%= dataset.modes %>
+
+ """ + end + + import Ecto.Query + + def nil_if_blank(value) do + value = (value || "") |> String.trim() + if value == "", do: nil, else: value + end + + def update_datasets(socket, params) do + # NOTE: could be improved here to tap directly in the assigns + datasets = + Searcher.search( + title: nil_if_blank(get_in(params, ["config", "title"])), + format: nil_if_blank(get_in(params, ["config", "format"])), + mode: nil_if_blank(get_in(params, ["config", "mode"])) + ) + |> Enum.map(&Searcher.render(&1)) + + assign(socket, :datasets, datasets) + end + + def handle_event("change_form", params, socket) do + {:noreply, update_datasets(socket, params)} + end +end + +defmodule Search.Router do + use Phoenix.Router + import Phoenix.LiveView.Router + + pipeline :browser do + plug(:accepts, ["html"]) + end + + scope "/", Search do + pipe_through(:browser) + + live("/", HomeLive, :index) + end +end + +defmodule Search.Endpoint do + use Phoenix.Endpoint, otp_app: :search + socket("/live", Phoenix.LiveView.Socket) + plug(Search.Router) +end + +defmodule SearchIndexer do + import Ecto.Query + + def fetch_items do + from(d in DB.Dataset, + preload: :resources + # limit: 10 + ) + |> DB.Repo.all() + end + + def find_resource_history(resource_id) do + from(rh in DB.ResourceHistory) + |> where([rh], rh.resource_id == ^resource_id) + |> order_by([rh], {:desc, rh.inserted_at}) + |> limit(1) + |> DB.Repo.one() + end + + def find_resource_metadata(resource_history_id) do + from(rm in DB.ResourceMetadata) + |> where([rm], rm.resource_history_id == ^resource_history_id) + |> order_by([rh], {:desc, rh.inserted_at}) + |> limit(1) + |> DB.Repo.one() + end + + def compute_payload(%DB.Dataset{} = dataset) do + # NOTE: not optimized for N+1 because performance is good enough for now + + modes = + dataset.resources + |> Enum.map(fn r -> + rh_id = if x = find_resource_history(r.id), do: x.id, else: nil + + if rh_id != nil do + rm = find_resource_metadata(rh_id) + + if rm != nil do + rm.metadata["modes"] + else + nil + end + else + nil + end + end) + |> List.flatten() + |> Enum.reject(&(&1 == nil)) + + %{ + id: dataset.id, + datagouv_id: dataset.datagouv_id, + title: dataset.custom_title, + description: dataset.description, + formats: dataset.resources |> Enum.map(& &1.format), + modes: modes + } + end + + def reindex! do + # NOTE: much, much too slow for my taste, should be bulked and/or parallelized + fetch_items() + |> Enum.map(&{&1, compute_payload(&1)}) + |> Enum.each(fn {%DB.Dataset{} = d, %{} = payload} -> + Ecto.Changeset.change(d, %{search_payload: payload}) + |> DB.Repo.update!() + end) + end +end + +defmodule Searcher do + import Ecto.Query + + def maybe_search_title(query, nil), do: query + + def maybe_search_title(query, search_title) do + safe_like_title = "%" <> DB.Contact.safe_like_pattern(search_title) <> "%" + + query + |> where([d], fragment("search_payload->>'title' ilike ?", ^safe_like_title)) + end + + def maybe_search_resources_formats(query, nil), do: query + + def maybe_search_resources_formats(query, search_format) do + query + |> where([d], fragment("search_payload #> Array['formats'] \\? ?", ^search_format)) + end + + # NOTE: could be DRYed with formats + def maybe_search_resources_modes(query, nil), do: query + + def maybe_search_resources_modes(query, mode) do + query + |> where([d], fragment("search_payload #> Array['modes'] \\? ?", ^mode)) + end + + def search(options) do + from(d in DB.Dataset) + |> maybe_search_title(options[:title]) + |> maybe_search_resources_formats(options[:format]) + |> maybe_search_resources_modes(options[:mode]) + |> select([d], [:id, :custom_title, :search_payload]) + |> DB.Repo.all() + end + + def render(%{} = item) do + %{ + id: item.id, + title: item.custom_title, + formats: (item.search_payload["formats"] || []) |> Enum.join(", "), + modes: (item.search_payload["modes"] || []) |> Enum.join(", ") + } + end + + def render(items) do + IO.ANSI.Table.start([:id, :title, :formats]) + IO.ANSI.Table.format(items |> Enum.map(&render(&1))) + end +end + +if System.get_env("REINDEX") == "1" do + SearchIndexer.reindex!() +end + +# Uncomment for fancy ANSI-console rendering + +# Searcher.search(title: "bibus") +# |> Searcher.render() + +# Searcher.search(format: "SIRI") +# |> Searcher.render() + +if System.get_env("RUN_SERVER") == "1" do + {:ok, _} = Supervisor.start_link([Search.Endpoint], strategy: :one_for_one) + Process.sleep(:infinity) +end + +IO.puts("Done") From c471d43bb8d7b888d03c03968544f72984b6c0b2 Mon Sep 17 00:00:00 2001 From: Antoine Augusti Date: Wed, 10 Jan 2024 15:22:18 +0100 Subject: [PATCH 2/2] =?UTF-8?q?Ajuste=20cl=C3=A9s=20=C3=A9trang=C3=A8res?= =?UTF-8?q?=20pour=20resource=5Frelated=20(#3713)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../20230404074406_add_resource_related.exs | 4 ++-- ...0240110085755_resource_related_on_delete.exs | 17 +++++++++++++++++ .../transport/test/db/resource_related_test.exs | 6 +++++- 3 files changed, 24 insertions(+), 3 deletions(-) create mode 100644 apps/transport/priv/repo/migrations/20240110085755_resource_related_on_delete.exs diff --git a/apps/transport/priv/repo/migrations/20230404074406_add_resource_related.exs b/apps/transport/priv/repo/migrations/20230404074406_add_resource_related.exs index 0367a16f12..957977e21e 100644 --- a/apps/transport/priv/repo/migrations/20230404074406_add_resource_related.exs +++ b/apps/transport/priv/repo/migrations/20230404074406_add_resource_related.exs @@ -3,8 +3,8 @@ defmodule DB.Repo.Migrations.AddResourceRelated do def change do create table(:resource_related, primary_key: false) do - add(:resource_src_id, references(:resource), on_delete: :delete_all, null: false) - add(:resource_dst_id, references(:resource), on_delete: :delete_all, null: false) + add(:resource_src_id, references(:resource, on_delete: :nothing), null: false) + add(:resource_dst_id, references(:resource, on_delete: :nothing), null: false) add(:reason, :string, null: false) end diff --git a/apps/transport/priv/repo/migrations/20240110085755_resource_related_on_delete.exs b/apps/transport/priv/repo/migrations/20240110085755_resource_related_on_delete.exs new file mode 100644 index 0000000000..4894933822 --- /dev/null +++ b/apps/transport/priv/repo/migrations/20240110085755_resource_related_on_delete.exs @@ -0,0 +1,17 @@ +defmodule DB.Repo.Migrations.ResourceRelatedOnDelete do + use Ecto.Migration + + def change do + # Review foreign key settings previously set in + # 20230404074406_add_resource_related.exs + alter table(:resource_related) do + modify(:resource_src_id, references(:resource, on_delete: :delete_all), + from: references(:resource, on_delete: :nothing) + ) + + modify(:resource_dst_id, references(:resource, on_delete: :delete_all), + from: references(:resource, on_delete: :nothing) + ) + end + end +end diff --git a/apps/transport/test/db/resource_related_test.exs b/apps/transport/test/db/resource_related_test.exs index de0ac5da14..ae32e0625e 100644 --- a/apps/transport/test/db/resource_related_test.exs +++ b/apps/transport/test/db/resource_related_test.exs @@ -8,7 +8,7 @@ defmodule DB.ResourceRelatedTest do Ecto.Adapters.SQL.Sandbox.checkout(DB.Repo) end - test "can create and load resources_related" do + test "can create, load and delete resources_related" do %DB.Resource{id: r1_id} = r1 = insert(:resource) %DB.Resource{id: r2_id} = insert(:resource) insert(:resource_related, resource_src_id: r1_id, resource_dst_id: r2_id, reason: :gtfs_rt_gtfs) @@ -21,5 +21,9 @@ defmodule DB.ResourceRelatedTest do resource_dst: %DB.Resource{id: ^r2_id} } ] = DB.Repo.preload(r1, resources_related: [:resource_dst]).resources_related + + # When deleting a resource, the `resource_related` row linking to it is deleted as well + DB.Repo.delete!(r1) + assert [] == DB.Repo.all(DB.ResourceRelated) end end