From ff379f76a243351fcb49ffa3076f519e522cc105 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jonatan=20M=C3=A4nnchen?= Date: Thu, 5 Sep 2024 12:08:47 +0000 Subject: [PATCH] Introduce PO Parser Metadata Stripping --- lib/expo/po.ex | 12 +++++++- lib/expo/po/parser.ex | 6 ++-- lib/expo/po/tokenizer.ex | 65 +++++++++++++++++++++++---------------- test/expo/parser_test.exs | 32 +++++++++++++++++-- 4 files changed, 83 insertions(+), 32 deletions(-) diff --git a/lib/expo/po.ex b/lib/expo/po.ex index f23f62c..ab50723 100644 --- a/lib/expo/po.ex +++ b/lib/expo/po.ex @@ -6,7 +6,17 @@ defmodule Expo.PO do alias Expo.Messages alias Expo.PO.{DuplicateMessagesError, Parser, SyntaxError} - @type parse_option :: {:file, Path.t()} + @typedoc """ + Parsing option. + + * `:file` (`t:Path.t/0`) - path to use in error messages when using `parse_string/2`. If not present, errors + don't have a path. + + * `:strip_meta` (`t:boolean/0`) - include only messages (no comments or other metadata) from the `.po` file + to reduce memory usage when meta information is not needed. + Defaults to `false`. + """ + @type parse_option :: {:file, Path.t()} | {:strip_meta, boolean()} @doc """ Dumps a `Expo.Messages` struct as iodata. 
diff --git a/lib/expo/po/parser.ex b/lib/expo/po/parser.ex index ece931e..aa5c3dc 100644 --- a/lib/expo/po/parser.ex +++ b/lib/expo/po/parser.ex @@ -12,7 +12,7 @@ defmodule Expo.PO.Parser do def parse(content, opts) do content = prune_bom(content, Keyword.get(opts, :file, "nofile")) - with {:ok, tokens} <- tokenize(content), + with {:ok, tokens} <- tokenize(content, opts), {:ok, po} <- parse_tokens(tokens), {:ok, po} <- check_for_duplicates(po) do {:ok, %Messages{po | file: Keyword.get(opts, :file)}} @@ -22,8 +22,8 @@ defmodule Expo.PO.Parser do end end - defp tokenize(content) do - case Tokenizer.tokenize(content) do + defp tokenize(content, opts) do + case Tokenizer.tokenize(content, opts) do {:ok, tokens} -> {:ok, tokens} {:error, line, message} -> {:error, %SyntaxError{line: line, reason: message}} end diff --git a/lib/expo/po/tokenizer.ex b/lib/expo/po/tokenizer.ex index d32f0ba..b33cd96 100644 --- a/lib/expo/po/tokenizer.ex +++ b/lib/expo/po/tokenizer.ex @@ -52,9 +52,11 @@ defmodule Expo.PO.Tokenizer do * `{:str, 6, "foo"}` """ - @spec tokenize(binary) :: {:ok, [token]} | {:error, pos_integer, binary} - def tokenize(str) do - tokenize_line(str, _line = 1, _tokens_acc = []) + @spec tokenize(binary, [Expo.PO.parse_option()]) :: + {:ok, [token]} | {:error, pos_integer, binary} + def tokenize(str, opts \\ []) do + strip_meta? = Keyword.get(opts, :strip_meta, false) + tokenize_line(str, _line = 1, strip_meta?, _tokens_acc = []) end # Reverse str_lines strings. @@ -86,79 +88,85 @@ defmodule Expo.PO.Tokenizer do end # End of file. - defp tokenize_line(<<>>, line, acc) do + defp tokenize_line(<<>>, line, _strip_meta?, acc) do {:ok, [{:"$end", line} | acc] |> Enum.reverse() |> postprocess_tokens()} end # Go to the next line. - defp tokenize_line(<>, line, acc) do - tokenize_line(rest, line + 1, acc) + defp tokenize_line(<>, line, strip_meta?, acc) do + tokenize_line(rest, line + 1, strip_meta?, acc) end # Skip other whitespace. 
- defp tokenize_line(<>, line, acc) + defp tokenize_line(<>, line, strip_meta?, acc) when char in @whitespace_no_nl do - tokenize_line(rest, line, acc) + tokenize_line(rest, line, strip_meta?, acc) + end + + # Skip Meta Information when strip_meta is enabled + defp tokenize_line(<>, line, true, acc) do + from_next_line = discard_until_nl(rest) + tokenize_line(from_next_line, line, true, acc) end # Obsolete comment. - defp tokenize_line(<<"#~", rest::binary>>, line, acc) do - tokenize_line(rest, line, [{:obsolete, line} | acc]) + defp tokenize_line(<<"#~", rest::binary>>, line, strip_meta?, acc) do + tokenize_line(rest, line, strip_meta?, [{:obsolete, line} | acc]) end # Previous comment. - defp tokenize_line(<<"#|", rest::binary>>, line, acc) do - tokenize_line(rest, line, [{:previous, line} | acc]) + defp tokenize_line(<<"#|", rest::binary>>, line, strip_meta?, acc) do + tokenize_line(rest, line, strip_meta?, [{:previous, line} | acc]) end # Normal comment. - defp tokenize_line(<> = rest, line, acc) do + defp tokenize_line(<> = rest, line, strip_meta?, acc) do {contents, rest} = to_eol_or_eof(rest, "") - tokenize_line(rest, line, [{:comment, line, contents} | acc]) + tokenize_line(rest, line, strip_meta?, [{:comment, line, contents} | acc]) end # Keywords. for kw <- @string_keywords do - defp tokenize_line(unquote(kw) <> <>, line, acc) + defp tokenize_line(unquote(kw) <> <>, line, strip_meta?, acc) when char in @whitespace do acc = [{unquote(String.to_existing_atom(kw)), line} | acc] - tokenize_line(rest, line, acc) + tokenize_line(rest, line, strip_meta?, acc) end - defp tokenize_line(unquote(kw) <> _rest, line, _acc) do + defp tokenize_line(unquote(kw) <> _rest, line, _strip_meta?, _acc) do {:error, line, "no space after '#{unquote(kw)}'"} end end # `msgstr`. 
- defp tokenize_line("msgstr[" <> <>, line, acc) do + defp tokenize_line("msgstr[" <> <>, line, strip_meta?, acc) do case tokenize_plural_form(rest, "") do {:ok, plural_form, rest} -> # The order of the :plural_form and :msgstr tokens is inverted since # the `acc` array of tokens will be reversed at the end. acc = [{:plural_form, line, plural_form}, {:msgstr, line} | acc] - tokenize_line(rest, line, acc) + tokenize_line(rest, line, strip_meta?, acc) {:error, reason} -> {:error, line, reason} end end - defp tokenize_line("msgstr" <> <>, line, acc) + defp tokenize_line("msgstr" <> <>, line, strip_meta?, acc) when char in @whitespace do acc = [{:msgstr, line} | acc] - tokenize_line(rest, line, acc) + tokenize_line(rest, line, strip_meta?, acc) end - defp tokenize_line("msgstr" <> _rest, line, _acc) do + defp tokenize_line("msgstr" <> _rest, line, _strip_meta?, _acc) do {:error, line, "no space after 'msgstr'"} end # String. - defp tokenize_line(<>, line, acc) do + defp tokenize_line(<>, line, strip_meta?, acc) do case tokenize_string(rest, "") do {:ok, string, rest} -> - tokenize_line(rest, line, add_str_lines(line, string, acc)) + tokenize_line(rest, line, strip_meta?, add_str_lines(line, string, acc)) {:error, reason} -> {:error, line, reason} @@ -170,7 +178,7 @@ defmodule Expo.PO.Tokenizer do # a letter (we don't take care of unicode or fancy stuff, just ASCII letters), # we assume there's an unknown keyword. We parse it with a regex # so that the error message is informative. - defp tokenize_line(<> = binary, line, _acc) + defp tokenize_line(<> = binary, line, _strip_meta?, _acc) when letter in ?a..?z or letter in ?A..?Z do next_word = List.first(Regex.run(~r/\w+/u, binary)) {:error, line, "unknown keyword '#{next_word}'"} @@ -180,13 +188,18 @@ defmodule Expo.PO.Tokenizer do # Last resort: this is just a plain unexpected token. We take the first # Unicode char of the given binary and build an informative error message # (with the codepoint of the char). 
- defp tokenize_line(binary, line, _acc) when is_binary(binary) do + defp tokenize_line(binary, line, _strip_meta?, _acc) when is_binary(binary) do # To get the first Unicode char, we convert to char list first. [char | _] = String.to_charlist(binary) msg = :io_lib.format(~c"unexpected token: \"~ts\" (codepoint U+~4.16.0B)", [[char], char]) {:error, line, :unicode.characters_to_binary(msg)} end + defp discard_until_nl(content) + defp discard_until_nl(<> = content), do: content + defp discard_until_nl(<<>>), do: <<>> + defp discard_until_nl(<<_char, rest::binary>>), do: discard_until_nl(rest) + @obsolete_keywords ~w(msgid msgid_plural msgctxt msgstr)a # Collapse the string into the previous str_lines token if there is one *on the same line*. diff --git a/test/expo/parser_test.exs b/test/expo/parser_test.exs index 363abe2..fc155ef 100644 --- a/test/expo/parser_test.exs +++ b/test/expo/parser_test.exs @@ -454,8 +454,36 @@ defmodule Expo.ParserTest do end end - defp parse(string) do - case PO.parse_string(string) do + describe "strip meta" do + test "does not include extra information" do + assert [ + %Message.Plural{ + msgid: ["foo"], + msgid_plural: ["foos"], + msgstr: %{0 => ["bar"], 1 => ["bars"]}, + comments: [], + extracted_comments: [], + references: [] + } + ] = + parse( + """ + # This is a message + #: lib/foo.ex:32 + # Ah, another comment! + #. An extracted comment + msgid "foo" + msgid_plural "foos" + msgstr[0] "bar" + msgstr[1] "bars" + """, + strip_meta: true + ) + end + end + + defp parse(string, options \\ []) do + case PO.parse_string(string, options) do {:ok, %Messages{messages: messages}} -> messages