From ff379f76a243351fcb49ffa3076f519e522cc105 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jonatan=20M=C3=A4nnchen?= Date: Thu, 5 Sep 2024 12:08:47 +0000 Subject: [PATCH] Introduce PO Parser Metadata Stripping --- lib/expo/po.ex | 12 +++++++- lib/expo/po/parser.ex | 6 ++-- lib/expo/po/tokenizer.ex | 65 +++++++++++++++++++++++---------------- test/expo/parser_test.exs | 32 +++++++++++++++++-- 4 files changed, 83 insertions(+), 32 deletions(-) diff --git a/lib/expo/po.ex b/lib/expo/po.ex index f23f62c..ab50723 100644 --- a/lib/expo/po.ex +++ b/lib/expo/po.ex @@ -6,7 +6,17 @@ defmodule Expo.PO do alias Expo.Messages alias Expo.PO.{DuplicateMessagesError, Parser, SyntaxError} - @type parse_option :: {:file, Path.t()} + @typedoc """ + Parsing option. + + * `:file` (`t:Path.t/0`) - path to use in error messages when using `parse_string/2`. If not present, errors + don't have a path. + + * `:strip_meta` (`t:boolean/0`) - include only messages (no comments or other metadata) from the `.po` file + to reduce memory usage when meta information is not needed. + Defaults to `false`. + """ + @type parse_option :: {:file, Path.t()} | {:strip_meta, boolean()} @doc """ Dumps a `Expo.Messages` struct as iodata. 
diff --git a/lib/expo/po/parser.ex b/lib/expo/po/parser.ex index ece931e..aa5c3dc 100644 --- a/lib/expo/po/parser.ex +++ b/lib/expo/po/parser.ex @@ -12,7 +12,7 @@ defmodule Expo.PO.Parser do def parse(content, opts) do content = prune_bom(content, Keyword.get(opts, :file, "nofile")) - with {:ok, tokens} <- tokenize(content), + with {:ok, tokens} <- tokenize(content, opts), {:ok, po} <- parse_tokens(tokens), {:ok, po} <- check_for_duplicates(po) do {:ok, %Messages{po | file: Keyword.get(opts, :file)}} @@ -22,8 +22,8 @@ defmodule Expo.PO.Parser do end end - defp tokenize(content) do - case Tokenizer.tokenize(content) do + defp tokenize(content, opts) do + case Tokenizer.tokenize(content, opts) do {:ok, tokens} -> {:ok, tokens} {:error, line, message} -> {:error, %SyntaxError{line: line, reason: message}} end diff --git a/lib/expo/po/tokenizer.ex b/lib/expo/po/tokenizer.ex index d32f0ba..b33cd96 100644 --- a/lib/expo/po/tokenizer.ex +++ b/lib/expo/po/tokenizer.ex @@ -52,9 +52,11 @@ defmodule Expo.PO.Tokenizer do * `{:str, 6, "foo"}` """ - @spec tokenize(binary) :: {:ok, [token]} | {:error, pos_integer, binary} - def tokenize(str) do - tokenize_line(str, _line = 1, _tokens_acc = []) + @spec tokenize(binary, [Expo.PO.parse_option()]) :: + {:ok, [token]} | {:error, pos_integer, binary} + def tokenize(str, opts \\ []) do + strip_meta? = Keyword.get(opts, :strip_meta, false) + tokenize_line(str, _line = 1, strip_meta?, _tokens_acc = []) end # Reverse str_lines strings. @@ -86,79 +88,85 @@ defmodule Expo.PO.Tokenizer do end # End of file. - defp tokenize_line(<<>>, line, acc) do + defp tokenize_line(<<>>, line, _strip_meta?, acc) do {:ok, [{:"$end", line} | acc] |> Enum.reverse() |> postprocess_tokens()} end # Go to the next line. - defp tokenize_line(<>, line, acc) do - tokenize_line(rest, line + 1, acc) + defp tokenize_line(<>, line, strip_meta?, acc) do + tokenize_line(rest, line + 1, strip_meta?, acc) end # Skip other whitespace. 
- defp tokenize_line(<>, line, acc) + defp tokenize_line(<>, line, strip_meta?, acc) when char in @whitespace_no_nl do - tokenize_line(rest, line, acc) + tokenize_line(rest, line, strip_meta?, acc) + end + + # Skip Meta Information when strip_meta is enabled + defp tokenize_line(<>, line, true, acc) do + from_next_line = discard_until_nl(rest) + tokenize_line(from_next_line, line, true, acc) end # Obsolete comment. - defp tokenize_line(<<"#~", rest::binary>>, line, acc) do - tokenize_line(rest, line, [{:obsolete, line} | acc]) + defp tokenize_line(<<"#~", rest::binary>>, line, strip_meta?, acc) do + tokenize_line(rest, line, strip_meta?, [{:obsolete, line} | acc]) end # Previous comment. - defp tokenize_line(<<"#|", rest::binary>>, line, acc) do - tokenize_line(rest, line, [{:previous, line} | acc]) + defp tokenize_line(<<"#|", rest::binary>>, line, strip_meta?, acc) do + tokenize_line(rest, line, strip_meta?, [{:previous, line} | acc]) end # Normal comment. - defp tokenize_line(<> = rest, line, acc) do + defp tokenize_line(<> = rest, line, strip_meta?, acc) do {contents, rest} = to_eol_or_eof(rest, "") - tokenize_line(rest, line, [{:comment, line, contents} | acc]) + tokenize_line(rest, line, strip_meta?, [{:comment, line, contents} | acc]) end # Keywords. for kw <- @string_keywords do - defp tokenize_line(unquote(kw) <> <>, line, acc) + defp tokenize_line(unquote(kw) <> <>, line, strip_meta?, acc) when char in @whitespace do acc = [{unquote(String.to_existing_atom(kw)), line} | acc] - tokenize_line(rest, line, acc) + tokenize_line(rest, line, strip_meta?, acc) end - defp tokenize_line(unquote(kw) <> _rest, line, _acc) do + defp tokenize_line(unquote(kw) <> _rest, line, _strip_meta?, _acc) do {:error, line, "no space after '#{unquote(kw)}'"} end end # `msgstr`. 
- defp tokenize_line("msgstr[" <> <>, line, acc) do + defp tokenize_line("msgstr[" <> <>, line, strip_meta?, acc) do case tokenize_plural_form(rest, "") do {:ok, plural_form, rest} -> # The order of the :plural_form and :msgstr tokens is inverted since # the `acc` array of tokens will be reversed at the end. acc = [{:plural_form, line, plural_form}, {:msgstr, line} | acc] - tokenize_line(rest, line, acc) + tokenize_line(rest, line, strip_meta?, acc) {:error, reason} -> {:error, line, reason} end end - defp tokenize_line("msgstr" <> <>, line, acc) + defp tokenize_line("msgstr" <> <>, line, strip_meta?, acc) when char in @whitespace do acc = [{:msgstr, line} | acc] - tokenize_line(rest, line, acc) + tokenize_line(rest, line, strip_meta?, acc) end - defp tokenize_line("msgstr" <> _rest, line, _acc) do + defp tokenize_line("msgstr" <> _rest, line, _strip_meta?, _acc) do {:error, line, "no space after 'msgstr'"} end # String. - defp tokenize_line(<>, line, acc) do + defp tokenize_line(<>, line, strip_meta?, acc) do case tokenize_string(rest, "") do {:ok, string, rest} -> - tokenize_line(rest, line, add_str_lines(line, string, acc)) + tokenize_line(rest, line, strip_meta?, add_str_lines(line, string, acc)) {:error, reason} -> {:error, line, reason} @@ -170,7 +178,7 @@ defmodule Expo.PO.Tokenizer do # a letter (we don't take care of unicode or fancy stuff, just ASCII letters), # we assume there's an unknown keyword. We parse it with a regex # so that the error message is informative. - defp tokenize_line(<> = binary, line, _acc) + defp tokenize_line(<> = binary, line, _strip_meta?, _acc) when letter in ?a..?z or letter in ?A..?Z do next_word = List.first(Regex.run(~r/\w+/u, binary)) {:error, line, "unknown keyword '#{next_word}'"} @@ -180,13 +188,18 @@ defmodule Expo.PO.Tokenizer do # Last resort: this is just a plain unexpected token. We take the first # Unicode char of the given binary and build an informative error message # (with the codepoint of the char). 
- defp tokenize_line(binary, line, _acc) when is_binary(binary) do + defp tokenize_line(binary, line, _strip_meta?, _acc) when is_binary(binary) do # To get the first Unicode char, we convert to char list first. [char | _] = String.to_charlist(binary) msg = :io_lib.format(~c"unexpected token: \"~ts\" (codepoint U+~4.16.0B)", [[char], char]) {:error, line, :unicode.characters_to_binary(msg)} end + defp discard_until_nl(content) + defp discard_until_nl(<> = content), do: content + defp discard_until_nl(<<>>), do: <<>> + defp discard_until_nl(<<_char, rest::binary>>), do: discard_until_nl(rest) + @obsolete_keywords ~w(msgid msgid_plural msgctxt msgstr)a # Collapse the string into the previous str_lines token if there is one *on the same line*. diff --git a/test/expo/parser_test.exs b/test/expo/parser_test.exs index 363abe2..fc155ef 100644 --- a/test/expo/parser_test.exs +++ b/test/expo/parser_test.exs @@ -454,8 +454,36 @@ defmodule Expo.ParserTest do end end - defp parse(string) do - case PO.parse_string(string) do + describe "strip meta" do + test "does not include extra information" do + assert [ + %Message.Plural{ + msgid: ["foo"], + msgid_plural: ["foos"], + msgstr: %{0 => ["bar"], 1 => ["bars"]}, + comments: [], + extracted_comments: [], + references: [] + } + ] = + parse( + """ + # This is a message + #: lib/foo.ex:32 + # Ah, another comment! + #. An extracted comment + msgid "foo" + msgid_plural "foos" + msgstr[0] "bar" + msgstr[1] "bars" + """, + strip_meta: true + ) + end + end + + defp parse(string, options \\ []) do + case PO.parse_string(string, options) do {:ok, %Messages{messages: messages}} -> messages