
Add simple fragmentation scheme (#7)
* Remove duplication of docs in database

* Add simple fragmentation scheme

* Update code for better clarity

* Add recombine function

* Add test suite for fragmentation scheme

* Add tests for recombine

* Fix pattern matching in HexClient.get_releases

* Update search.add mix task to work with fragmentation

* Add handling for fragmentation in web view

* Run formatter

* Add sorting for recombining doc fragments

* Use OptionParser instead of homebrew parsing

* Add a new edge case to fragmentation tests

* Overhaul FragmentationScheme.split to only build output binaries at the end of processing and remove Regex.run

* Run formatter

* Remove unnecessary tail recursion

* Overhaul the compute_splits function once more

* Add docs to FragmentationScheme.recombine/1

Co-authored-by: Jonatan Kłosko <[email protected]>

* Change formatting

Co-authored-by: Jonatan Kłosko <[email protected]>

---------

Co-authored-by: Jonatan Kłosko <[email protected]>
karol-t-wilk and jonatanklosko authored Aug 12, 2024
1 parent 6350441 commit 9da417c
Showing 12 changed files with 256 additions and 43 deletions.
49 changes: 35 additions & 14 deletions lib/mix/tasks/search.add.ex
@@ -3,7 +3,7 @@ defmodule Mix.Tasks.Search.Add do
alias Search.HexClient

@moduledoc """
Usage: mix #{Mix.Task.task_name(__MODULE__)} <PACKAGE> [<VERSION>]
Usage: mix #{Mix.Task.task_name(__MODULE__)} <PACKAGE> [--version <VERSION>] [--max-size <MAX_SIZE>]
Fetches the documentation for the given package from Hex. Does not embed it yet.
@@ -17,21 +17,42 @@ defmodule Mix.Tasks.Search.Add do

@impl Mix.Task
def run(args) do
[package | args_tail] = args

package_or_release =
case args_tail do
[version] ->
version = Version.parse!(version)
%HexClient.Release{package_name: package, version: version}
case OptionParser.parse(args, strict: [version: :string, max_size: :integer]) do
{opts, [package_name], []} ->
version = Keyword.get(opts, :version)
fragmentation_opts = Keyword.take(opts, [:max_size])

with {:ok, package_or_release} <- package_or_release(package_name, version),
{:ok, package} <-
Packages.add_package(package_or_release, fragmentation_opts: fragmentation_opts) do
Mix.shell().info("Package #{package.name}@#{package.version} added.")
else
{:error, err} ->
Mix.shell().error("Error: #{err}")
end

{_opts, [], []} ->
Mix.shell().error("Expected a package name as one of the arguments.")

{_opts, _more_than_one, []} ->
Mix.shell().error("Too many arguments.")

{_opts, _, invalid} ->
invalid =
invalid
|> Enum.map(&elem(&1, 0))
|> Enum.join(", ")

Mix.shell().error("Incorrect or unknown options: #{invalid}")
end
end

[] ->
package
end
defp package_or_release(package_name, nil), do: {:ok, package_name}

case Packages.add_package(package_or_release) do
{:ok, package} -> Mix.shell().info("Package #{package.name}@#{package.version} added.")
{:error, err} -> Mix.shell().error("Error: #{err}")
defp package_or_release(package_name, version) do
case Version.parse(version) do
{:ok, version} -> {:ok, %HexClient.Release{package_name: package_name, version: version}}
:error -> {:error, "Could not parse the requested version."}
end
end
end
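
With the move to OptionParser, the version and maximum fragment size are passed as flags rather than positional arguments. A hypothetical invocation (the package name and values here are illustrative) would be:

mix search.add ecto --version 3.11.1 --max-size 2000

Omitting --version falls back to the latest release, as before; omitting --max-size leaves each doc as a single fragment.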
62 changes: 62 additions & 0 deletions lib/search/fragmentation_scheme.ex
@@ -0,0 +1,62 @@
defmodule Search.FragmentationScheme do
@doc """
Splits a binary into multiple binaries that satisfy the limitations specified in `opts`.
If possible, splits the text on whitespace to preserve words. If that is impossible, splits the text between graphemes.
Supported options:
* `:max_size` - maximum byte size of each output binary. Every output binary has a size less than or equal to this
value, which should also guarantee that the sequence length after tokenization is bounded by this value.
"""
def split(text, opts \\ [])
def split("", _opts), do: []

def split(text, opts) when is_binary(text) do
case Keyword.get(opts, :max_size) do
nil ->
[text]

max_size ->
text
|> compute_splits(max_size, 0, nil, [])
|> split_binary(text)
end
end

@doc """
Recreates the original text from a list of chunks.
"""
def recombine(chunks), do: Enum.join(chunks)

defp split_binary([], ""), do: []

defp split_binary([split_size | splits_tail], string) do
<<chunk::binary-size(^split_size), rest::binary>> = string
[chunk | split_binary(splits_tail, rest)]
end

defp compute_splits("", _, size, _, sizes), do: Enum.reverse(sizes, [size])

defp compute_splits(string, max_size, size, size_until_word, sizes) do
{grapheme, string} = String.next_grapheme(string)
grapheme_size = byte_size(grapheme)

if size + grapheme_size > max_size do
if size_until_word do
# Split before the current unfinished word
next = size - size_until_word
compute_splits(string, max_size, next + grapheme_size, nil, [size_until_word | sizes])
else
# The current chunk has a single word, just split it
compute_splits(string, max_size, grapheme_size, nil, [size | sizes])
end
else
new_size = size + grapheme_size
size_until_word = if whitespace?(grapheme), do: new_size, else: size_until_word
compute_splits(string, max_size, new_size, size_until_word, sizes)
end
end

defp whitespace?(grapheme), do: grapheme =~ ~r/\s/
end
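
A minimal sketch of the module's behaviour, assuming the code above; the strings and sizes are illustrative rather than taken from the test suite:

# With :max_size set, the split lands on the whitespace boundary and the
# trailing space stays with the preceding chunk ("hello " is 6 bytes, "world" is 5).
Search.FragmentationScheme.split("hello world", max_size: 8)
#=> ["hello ", "world"]

# Without :max_size the text comes back as a single chunk.
Search.FragmentationScheme.split("hello world")
#=> ["hello world"]

# recombine/1 concatenates the chunks back into the original text.
Search.FragmentationScheme.recombine(["hello ", "world"])
#=> "hello world"

Because every chunk keeps its own whitespace, recombine/1 can be a plain Enum.join/1 with no separator.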
2 changes: 1 addition & 1 deletion lib/search/hex_client.ex
@@ -5,7 +5,7 @@ defmodule Search.HexClient do

def get_releases(package_name) when is_binary(package_name) do
case get("packages/#{package_name}") do
{:ok, %{status: 200, body: releases}} ->
{:ok, %{status: 200, body: %{releases: releases}}} ->
res =
for %{version: version} <- releases do
%HexClient.Release{
53 changes: 41 additions & 12 deletions lib/search/packages.ex
@@ -1,5 +1,6 @@
defmodule Search.Packages do
import Ecto.Query, warn: false
alias Search.FragmentationScheme
alias Search.Repo

alias Search.Packages.{Package, DocItem, DocFragment}
@@ -11,20 +12,27 @@ defmodule Search.Packages do
If given a package name, adds the latest version of the package to the app. If given a `%HexClient.Release{}`, adds
the specified release. Does not embed it yet.
"""
def add_package(package_name) when is_binary(package_name) do
def add_package(name_or_release, opts \\ [])

def add_package(package_name, opts) when is_binary(package_name) do
case HexClient.get_releases(package_name) do
{:ok, releases} ->
latest = HexClient.Release.latest(releases)
add_package(latest)
add_package(latest, opts)

err ->
err
end
end

def add_package(%HexClient.Release{package_name: package_name, version: version} = release) do
def add_package(
%HexClient.Release{package_name: package_name, version: version} = release,
opts
) do
version = Version.to_string(version)

fragmentation_opts = Keyword.get(opts, :fragmentation_opts, [])

with {:ok, docs} <- HexClient.get_docs_tarball(release),
{:ok, search_data} <- ExDocParser.extract_search_data(docs) do
Repo.transaction_with(fn ->
@@ -43,25 +51,46 @@ defmodule Search.Packages do
|> Ecto.Changeset.put_assoc(:doc_items, [])

with {:ok, package} <- Repo.insert_or_update(package),
:ok <- create_items_from_package(package, search_data) do
:ok <- create_items_from_package(package, search_data, fragmentation_opts) do
{:ok, package}
end
end)
end
end

defp create_items_from_package(%Package{} = _package, []), do: :ok
defp create_items_from_package(%Package{} = _package, [], _fragmentation_opts), do: :ok

defp create_items_from_package(%Package{} = package, [search_data_head | search_data_tail]) do
defp create_items_from_package(
%Package{} = package,
[search_data_head | search_data_tail],
fragmentation_opts
) do
%{"doc" => doc, "title" => title, "ref" => ref, "type" => type} = search_data_head

with {:ok, item} <-
create_doc_item(package, %{doc: doc, title: title, ref: ref, type: type}),
{:ok, _fragment} <-
create_doc_fragment(item, %{
text: "# #{title}\n\n#{doc}"
}) do
create_items_from_package(package, search_data_tail)
create_doc_item(package, %{title: title, ref: ref, type: type}),
fragments =
doc
|> FragmentationScheme.split(fragmentation_opts)
|> Enum.with_index(),
{:ok, _fragments} <-
create_doc_fragments_from_binaries(item, fragments, []) do
create_items_from_package(package, search_data_tail, fragmentation_opts)
end
end

defp create_doc_fragments_from_binaries(_doc_item, [], acc), do: {:ok, acc}

defp create_doc_fragments_from_binaries(doc_item, [{text, order} | texts_tail], acc) do
case create_doc_fragment(doc_item, %{
text: text,
order: order
}) do
{:ok, fragment} ->
create_doc_fragments_from_binaries(doc_item, texts_tail, [fragment | acc])

{:error, _} = err ->
err
end
end

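
For code that calls the context directly rather than going through the mix task, a sketch of how the new option threads through (the package name and size are illustrative):

# Fetches the latest release of "ecto" and splits each doc into fragments
# of at most 2000 bytes before creating the DocFragment records.
Search.Packages.add_package("ecto", fragmentation_opts: [max_size: 2000])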
6 changes: 4 additions & 2 deletions lib/search/packages/doc_fragment.ex
@@ -5,6 +5,8 @@ defmodule Search.Packages.DocFragment do

schema "doc_fragments" do
field :text, :string
field :order, :integer

belongs_to :doc_item, Packages.DocItem

timestamps(type: :utc_datetime)
@@ -13,8 +15,8 @@
@doc false
def changeset(doc_fragment, attrs) do
doc_fragment
|> cast(attrs, [:text])
|> cast(attrs, [:text, :order])
|> cast_assoc(:doc_item)
|> validate_required([:text])
|> validate_required([:text, :order])
end
end
3 changes: 1 addition & 2 deletions lib/search/packages/doc_item.ex
@@ -7,7 +7,6 @@ defmodule Search.Packages.DocItem do
field :type, :string
field :title, :string
field :ref, :string
field :doc, :string
belongs_to :package, Packages.Package
has_many :doc_fragments, Packages.DocFragment, on_replace: :delete

@@ -17,7 +16,7 @@
@doc false
def changeset(doc_item, attrs) do
doc_item
|> cast(attrs, [:ref, :type, :title, :doc])
|> cast(attrs, [:ref, :type, :title])
|> cast_assoc(:package)
|> cast_assoc(:doc_fragments)
|> validate_required([:ref, :type, :title])
10 changes: 10 additions & 0 deletions lib/search_web/controllers/page_controller.ex
@@ -38,6 +38,16 @@ defmodule SearchWeb.PageController do
Search.Embeddings.knn_query(embedding_model, query_tensor, k: k)
|> Stream.map(& &1.doc_fragment.doc_item)
|> Enum.uniq_by(& &1.id)
|> Search.Repo.preload(:doc_fragments)
|> Stream.map(fn item ->
doc_content =
item.doc_fragments
|> Enum.sort_by(& &1.order)
|> Enum.map(& &1.text)
|> Search.FragmentationScheme.recombine()

{item, doc_content}
end)

render(conn, :search, items: items)
else
6 changes: 2 additions & 4 deletions lib/search_web/controllers/page_html/search.html.heex
@@ -1,6 +1,4 @@
<div :for={item <- @items} class="bg-gray-100 p-4 m-4 rounded">
<div :for={{item, doc_content} <- @items} class="bg-gray-100 p-4 m-4 rounded">
<p class="text-lg font-bold"><%= item.title %></p>
<%= if item.doc do %>
<%= raw(Earmark.as_html!(item.doc)) %>
<% end %>
<%= raw(Earmark.as_html!(doc_content)) %>
</div>
2 changes: 1 addition & 1 deletion priv/repo/migrations/20240411191321_create_schema.exs
@@ -15,14 +15,14 @@ defmodule Search.Repo.Migrations.CreateSchema do
add :ref, :string, null: false
add :type, :string, null: false
add :title, :string, null: false
add :doc, :text
add :package_id, references("packages", on_delete: :delete_all), null: false

timestamps(type: :utc_datetime)
end

create table(:doc_fragments) do
add :text, :text, null: false
add :order, :integer, null: false
add :doc_item_id, references("doc_items", on_delete: :delete_all), null: false

timestamps(type: :utc_datetime)