diff --git a/README.md b/README.md
index 494b9da4e4..27d646128d 100644
--- a/README.md
+++ b/README.md
@@ -166,7 +166,7 @@
 mix cmd --app transport mix test --color
 mix cmd --app unlock mix test --color
 # or, for a single file, or single test
-mix cmd --app transport mix test --color test/transport_web/integrations/backoffice_test.exs 
+mix cmd --app transport mix test --color test/transport_web/integrations/backoffice_test.exs
 mix cmd --app transport mix test --color test/transport_web/integrations/backoffice_test.exs:8
 ```
 
diff --git a/scripts/README.md b/scripts/README.md
new file mode 100644
index 0000000000..304f760382
--- /dev/null
+++ b/scripts/README.md
@@ -0,0 +1,12 @@
+# Scripts
+
+An unstructured collection of scripts to explore or patch data. They can be used
+to fix production data, to run one-off analyses, or to try out a piece of code.
+
+## Usage
+
+Assuming you've sourced the required environment variables (mostly the database connection settings), you can launch a given script with `mix run`:
+
+```
+mix run scripts/my-script.exs
+```
diff --git a/scripts/netex_analyzer.exs b/scripts/netex_analyzer.exs
index 3bfcdce63c..5661ec404a 100644
--- a/scripts/netex_analyzer.exs
+++ b/scripts/netex_analyzer.exs
@@ -20,57 +20,53 @@ df =
   end)
   |> Enum.filter(&(&1.unverified_format == "NeTEx"))
 
-netex =
-  df
-  |> Task.async_stream(
-    fn r ->
-      url = r.url
-      file = Path.join("cache-dir", "resource-#{r.id}.dat")
-      status_file = file <> ".status.json"
+download_resource = fn r ->
+  url = r.url
+  file = Path.join("cache-dir", "resource-#{r.id}.dat")
+  status_file = file <> ".status.json"
 
-      unless File.exists?(status_file) do
-        IO.puts("Saving #{url}")
-        url = if String.contains?(url, "|"), do: URI.encode(url), else: url
+  unless File.exists?(status_file) do
+    IO.puts("Saving #{url}")
+    url = if String.contains?(url, "|"), do: URI.encode(url), else: url
 
-        %{status: status} =
-          Transport.HTTPClient.get!(url,
-            decode_body: false,
-            compressed: false,
-            into: File.stream!(file)
-          )
+    %{status: status} =
+      Transport.HTTPClient.get!(url,
+        decode_body: false,
+        compressed: false,
+        into: File.stream!(file)
+      )
 
-        File.write!(status_file, %{status: status} |> Jason.encode!())
-      end
+    File.write!(status_file, %{status: status} |> Jason.encode!())
+  end
 
-      %{"status" => status} = File.read!(status_file) |> Jason.decode!()
+  %{"status" => status} = File.read!(status_file) |> Jason.decode!()
 
-      r
-      |> Map.put(:http_status, status)
-      |> Map.put(:local_path, file)
-    end,
-    max_concurrency: 10,
-    timeout: 120_000
-  )
-  |> Stream.map(fn {:ok, result} -> result end)
-  |> Stream.reject(&is_nil(&1))
-  |> Task.async_stream(
-    fn r ->
-      IO.puts("Processing file #{r.id}")
+  r
+  |> Map.put(:http_status, status)
+  |> Map.put(:local_path, file)
+end
+
+count_relevant_stop_places_per_resource = fn r ->
+  IO.puts("Processing file #{r.id}")
 
-      try do
-        count =
-          Transport.NeTEx.read_all_stop_places(r.local_path)
-          |> Enum.flat_map(fn {_file, stops} -> stops end)
-          # some stop places have no latitude in NeTEx
-          |> Enum.reject(fn p -> is_nil(p[:latitude]) end)
-          |> Enum.count()
+  try do
+    count =
+      Transport.NeTEx.read_all_stop_places(r.local_path)
+      |> Enum.flat_map(fn {_file, stops} -> stops end)
+      # some stop places have no latitude in NeTEx
+      |> Enum.reject(fn p -> is_nil(p[:latitude]) end)
+      |> Enum.count()
 
-        IO.puts("#{count} StopPlaces detected")
-      rescue
-        e -> IO.puts("Som'thing bad happened")
-      end
-    end,
-    max_concurrency: 5,
-    timeout: 60_000 * 5
-  )
+    IO.puts("#{count} StopPlaces detected")
+  rescue
+    _ -> IO.puts("Something bad happened")
+  end
+end
+
+netex =
+  df
+  |> Task.async_stream(download_resource, max_concurrency: 10, timeout: 120_000)
+  |> Stream.map(fn {:ok, result} -> result end)
+  |> Stream.reject(&is_nil(&1))
+  |> Task.async_stream(count_relevant_stop_places_per_resource, max_concurrency: 5, timeout: 60_000 * 5)
   |> Stream.run()
diff --git a/scripts/netex_layout_analyzer.exs b/scripts/netex_layout_analyzer.exs
new file mode 100644
index 0000000000..522f9bd849
--- /dev/null
+++ b/scripts/netex_layout_analyzer.exs
@@ -0,0 +1,82 @@
+resources =
+  DB.Resource
+  |> DB.Repo.all()
+
+# count
+resources
+|> Enum.count()
+|> IO.inspect()
+
+df =
+  resources
+  |> Enum.map(fn r ->
+    %{
+      id: r.id,
+      url: r.url,
+      title: r.title,
+      unverified_format: r.format,
+      description: r.description
+    }
+  end)
+  |> Enum.filter(&(&1.unverified_format == "NeTEx"))
+
+download_resource = fn r ->
+  url = r.url
+  file = Path.join("cache-dir", "resource-#{r.id}.dat")
+  status_file = file <> ".status.json"
+
+  unless File.exists?(status_file) do
+    IO.puts("Saving #{url}")
+    url = if String.contains?(url, "|"), do: URI.encode(url), else: url
+
+    %{status: status} =
+      Transport.HTTPClient.get!(url,
+        decode_body: false,
+        compressed: false,
+        into: File.stream!(file)
+      )
+
+    File.write!(status_file, %{status: status} |> Jason.encode!())
+  end
+
+  %{"status" => status} = File.read!(status_file) |> Jason.decode!()
+
+  r
+  |> Map.put(:http_status, status)
+  |> Map.put(:local_path, file)
+end
+
+hierarchy_level = fn file -> file |> String.split("/") |> Enum.count() end
+
+dump_netex_files = fn r ->
+  IO.puts("Processing file #{r.id}")
+
+  url = "https://transport.data.gouv.fr/resources/#{r.id}"
+
+  result =
+    try do
+      Transport.NeTEx.read_all_stop_places(r.local_path)
+      |> Enum.map(fn {file, _stops} -> file end)
+      |> Enum.reject(fn file -> String.ends_with?(file, "/") end)
+      |> Enum.map(fn file -> [url, r.title, r.url, file, hierarchy_level.(file)] end)
+    rescue
+      _ ->
+        IO.puts("Something bad happened")
+        []
+    end
+
+  NimbleCSV.RFC4180.dump_to_iodata(result)
+end
+
+output_file = "netex_layout_analysis.csv"
+
+File.write!(output_file, NimbleCSV.RFC4180.dump_to_iodata([~w(resource title url file hierarchy)]))
+
+df
+|> Task.async_stream(download_resource, max_concurrency: 10, timeout: 120_000)
+|> Stream.map(fn {:ok, result} -> result end)
+|> Stream.reject(&is_nil(&1))
+|> Task.async_stream(dump_netex_files, max_concurrency: 5, timeout: 60_000 * 5)
+|> Stream.map(fn {:ok, result} -> result end)
+|> Stream.into(File.stream!(output_file, [:append, :utf8]))
+|> Stream.run()
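Side note, not part of the diff: the refactor in `scripts/netex_analyzer.exs` works because `Task.async_stream/3` accepts any one-argument function, which is what lets the two inline closures be pulled out as named anonymous functions. A minimal, self-contained sketch of that pipeline shape, where `slow_double` and the `1..10` input are invented purely for illustration:

```elixir
# Items are processed concurrently (bounded by :max_concurrency), results come
# back in input order as {:ok, value} tuples, and Stream.run/1 forces the
# otherwise lazy pipeline, as in the two scripts above.
slow_double = fn n ->
  Process.sleep(100)
  n * 2
end

1..10
|> Task.async_stream(slow_double, max_concurrency: 5, timeout: 5_000)
|> Stream.map(fn {:ok, result} -> result end)
|> Stream.each(&IO.puts/1)
|> Stream.run()
```

Note also that both scripts download into `Path.join("cache-dir", ...)` via `File.stream!/1`, which does not create directories, so a local `cache-dir` directory needs to exist before running them.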
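Similarly, `scripts/netex_layout_analyzer.exs` writes its CSV in two steps: the header row once with `File.write!/2`, then one `dump_to_iodata/1` chunk per resource streamed into the file in `:append` mode. A toy sketch of that pattern, with an invented file name and rows, assuming the `nimble_csv` dependency the script already relies on:

```elixir
# NimbleCSV.RFC4180 ships predefined with nimble_csv, so no parser definition
# is needed: write the header once, then append data rows as iodata.
alias NimbleCSV.RFC4180, as: CSV

File.write!("toy.csv", CSV.dump_to_iodata([~w(file hierarchy)]))

[["a.xml", 1], ["nested/b.xml", 2]]
|> CSV.dump_to_iodata()
|> then(&File.write!("toy.csv", &1, [:append]))
```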