Add graph of compile time vs schema size (#128)

sourcemeta-research · Jan 29, 2025 · 35285bb · 35285bb
1 parent fd2e5aa
commit 35285bb
Show file tree

Hide file tree

Showing 6 changed files with 154 additions and 70 deletions.
diff --git a/.github/dataset_summary_table.sh b/.github/dataset_summary_table.sh
diff --git a/.github/plot_markdown.py b/.github/plot_markdown.py
@@ -1,11 +1,12 @@
 import glob
 import json
 import os
+import sys
 
 
 if __name__ == '__main__':
     img_urls = json.loads(os.environ['IMG_URLS'])
-    img_files = sorted(glob.glob('dist/results/plots/*.png'))
+    img_files = sorted(glob.glob(os.path.join(sys.argv[1], '*.png')))
     for (url, file) in zip(img_urls, img_files):
         name = file.split('/')[-1].split('.')[0]
         print(f"## {name}")

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -128,6 +128,16 @@ jobs:
           python-version: '3.12.5'
       - name: Install uv
         run: pipx install uv
+      - name: Get jsonschema-strip binary
+        run: |
+          git clone --depth 1 --branch main https://github.com/sourcemeta-research/jsonschema-strip
+          cmake -S jsonschema-strip -B ./jsonschema-strip/build -DCMAKE_BUILD_TYPE:STRING=Release
+          cmake --build ./jsonschema-strip/build --config Release --target strip
+          mv ./jsonschema-strip/build/strip ./jsonschema-strip/build/jsonschema-strip
+          echo "$(pwd)/jsonschema-strip/build" >> $GITHUB_PATH
+      - uses: actions/setup-go@v5
+      - name: Install gron
+        run: go install github.com/tomnomnom/gron@latest
 
       - name: Process CSV
         run: uv run python .github/csv_min.py > dist/report-min.csv
@@ -147,17 +157,29 @@ jobs:
         run: echo "$MARKDOWN_TABLE" >> $GITHUB_STEP_SUMMARY
 
       - name: Generate plots
-        run: mkdir -p dist/results/plots && uv run python plot.py
+        # Overwrite rather than append so dist/summary.csv always holds exactly
+        # one header row, the same way the Makefile rule for that file does
+        run: |
+          ./dataset_summary.sh csv > dist/summary.csv
+          mkdir -p dist/results/plots && uv run python plot.py && uv run python plot_compile.py
       - name: Upload plots
         id: imgur
         uses: devicons/[email protected]
         with:
           path: dist/results/plots/*.png
           client_id: ${{secrets.IMGUR_CLIENT_ID}}
+      - name: Upload compile plot
+        id: imgur_compile
+        uses: devicons/[email protected]
+        with:
+          path: dist/results/compile.png
+          client_id: ${{secrets.IMGUR_CLIENT_ID}}
       - name: Add plots to summary
         env:
           IMG_URLS: ${{ steps.imgur.outputs.imgur_urls }}
-        run: python .github/plot_markdown.py >> $GITHUB_STEP_SUMMARY
+        run: python .github/plot_markdown.py dist/results/plots >> $GITHUB_STEP_SUMMARY
+      # Named differently from the preceding summary step so the two can be
+      # told apart in the job log
+      - name: Add compile plot to summary
+        env:
+          IMG_URLS: ${{ steps.imgur_compile.outputs.imgur_urls }}
+        run: python .github/plot_markdown.py dist/results >> $GITHUB_STEP_SUMMARY
       - uses: actions/upload-artifact@v4
         if: always()
         with:
@@ -177,12 +199,8 @@ jobs:
           cmake --build ./jsonschema-strip/build --config Release --target strip
           mv ./jsonschema-strip/build/strip ./jsonschema-strip/build/jsonschema-strip
           echo "$(pwd)/jsonschema-strip/build" >> $GITHUB_PATH
-      - name: Install moreutils
-        run: sudo apt-get install moreutils
-      - name: Strip schemas
-        run: |
-          for f in schemas/*/schema.json; do
-            ./jsonschema-strip/build/jsonschema-strip "$f" | sponge "$f"
-          done
+      - uses: actions/setup-go@v5
+      - name: Install gron
+        run: go install github.com/tomnomnom/gron@latest
       - name: Create summary
-        run: ./.github/dataset_summary_table.sh >> $GITHUB_STEP_SUMMARY
+        run: ./dataset_summary.sh md >> $GITHUB_STEP_SUMMARY
diff --git a/Makefile b/Makefile
@@ -21,8 +21,20 @@ dist/temp/$1: | dist/temp ; mkdir $$@
 ALL_TARGETS += $$(addprefix dist/results/$1/,$(SCHEMAS))
 endef
 ALL_PLOTS := $(foreach schema,$(SCHEMAS),dist/results/plots/$(schema).png)
+ALL_SCHEMAS := $(foreach schema,$(SCHEMAS),schemas/$(schema)/schema-noformat.json)
+ALL_INSTANCES := $(foreach schema,$(SCHEMAS),schemas/$(schema)/instances.jsonl)
 $(foreach implementation,$(IMPLEMENTATIONS),$(eval $(call PREPARE_IMPLEMENTATION,$(implementation))))
 dist/report.csv: report.sh $(ALL_TARGETS) | dist ; ./$< $(ALL_TARGETS) > $@
+dist/summary.csv: \
+	$(ALL_SCHEMAS) \
+	$(ALL_INSTANCES) \
+	dataset_summary.sh
+	./dataset_summary.sh csv > $@
+dist/results/compile.svg dist/results/compile.png: \
+	dist/report.csv \
+	dist/summary.csv \
+	plot_compile.py
+	uv run python plot_compile.py
 dist/results/plots/%.png: \
 	dist/results/plots \
 	dist/report.csv \

diff --git a/dataset_summary.sh b/dataset_summary.sh
@@ -0,0 +1,80 @@
+#!/bin/bash
+
+# Require exactly one argument, and require it to be a supported format
+if [ $# -ne 1 ] || { [ "$1" != "csv" ] && [ "$1" != "md" ]; }; then
+  # Usage goes to stderr with a failing status: callers redirect stdout into
+  # data files, so the message must not end up there, and misuse must not
+  # look like success
+  echo "Usage: $0 csv|md" >&2
+  exit 1
+fi
+
+FORMAT=$1
+SCHEMAS=$(ls schemas/)
+
+# Start every row accumulator empty so that a same-named variable inherited
+# from the environment cannot leak into the generated table
+CSV_ROWS=""
+LATEX_ROWS=""
+MARKDOWN_ROWS=""
+
+# Build one table row per entry listed under schemas/
+for schema in $SCHEMAS; do
+  # Have make produce the schema-noformat.json file before it is measured
+  make "schemas/$schema/schema-noformat.json" > /dev/null
+  # Number of lines in the instances file
+  docs=$(wc -l < "schemas/$schema/instances.jsonl")
+  # Byte count of whatever jsonschema-strip prints for the schema, then the
+  # same figure divided by 1024 and kept to one decimal place
+  size=$(jsonschema-strip "schemas/$schema/schema-noformat.json" 2> /dev/null | wc -c)
+  size_kb=$(bc <<<"scale=1; $size / 1024")
+  # Mean size in bytes of one instance line, newline included. IFS= and -r
+  # make read hand each line over untouched; without them backslashes and
+  # surrounding whitespace are dropped before the bytes are counted.
+  avg_doc_size=$(while IFS= read -r line; do printf '%s\n' "$line" | wc -c; done < "schemas/$schema/instances.jsonl" | awk '{ sum += $1; n++ } END { if (n > 0) print sum / n; }')
+
+  # Rows are terminated with a literal \n that is expanded when they are
+  # finally printed with echo -e
+  if [ "$FORMAT" = "csv" ]; then
+    CSV_ROWS=$(printf "%s%s,%d,%.1f,%.0f%s" "$CSV_ROWS" "$schema" "$docs" "$size_kb" "$avg_doc_size" '\n')
+  elif [ "$FORMAT" = "md" ]; then
+    LATEX_ROWS=$(printf "%s        %s & %d & %.1f & %.0f %s" "$LATEX_ROWS" "$schema" "$docs" "$size_kb" "$avg_doc_size" '\\\\\n')
+    MARKDOWN_ROWS=$(printf "%s| %s | %d | %.1f | %.0f |%s" "$MARKDOWN_ROWS" "$schema" "$docs" "$size_kb" "$avg_doc_size" '\n')
+  fi
+done
+
+# CSV output: the column header line followed by the accumulated rows
+if [ "$FORMAT" = "csv" ]; then
+  echo "name,docs,size_kb,avg_doc_size"
+
+  # -e expands the literal \n terminators stored in CSV_ROWS; -n keeps echo
+  # from adding a further newline after the last row
+  echo -ne "$CSV_ROWS"
+fi
+
+# Markdown output: a table, followed by a collapsed section that carries the
+# same rows as a LaTeX table inside a fenced code block
+if [ "$FORMAT" = "md" ]; then
+  # Print the table header
+  cat << EOF
+|Dataset name|# Docs|Schema Size (KB)|Avg. Doc. Size (B)|
+|---|---|---|---|
+EOF
+
+  # Quoted so the rows reach echo as a single argument, free of word
+  # splitting and pathname expansion; -e expands the literal \n terminators
+  echo -e "$MARKDOWN_ROWS"
+
+  cat << EOF
+<details>
+
+<summary>LaTeX table</summary>
+
+EOF
+
+  echo '```'
+
+  # The here-document is unquoted, so each \\\\ below is printed as \\
+  cat << EOF
+\begin{table}[h]
+    {\small
+    \centering
+    \begin{tabular}{l r r r}
+        \hline
+        Name & \# Docs & Schema Size (KB) & Avg. Doc. Size (B) \\\\
+        \hline
+EOF
+
+  echo -ne "$LATEX_ROWS"
+
+  # Print the table footer
+  cat << EOF
+    \end{tabular}
+    }
+    \caption{Dataset used for validator evaluation}\label{tab:datasets}
+\end{table}
+EOF
+
+  echo '```'
+
+  cat << EOF
+</details>
+EOF
+fi
diff --git a/plot_compile.py b/plot_compile.py
@@ -0,0 +1,32 @@
+import matplotlib.pyplot as plt
+import pandas as pd
+import seaborn as sns
+
+
+if __name__ == "__main__":
+    # Average out runtime across runs
+    runtime = (
+        pd.read_csv("dist/report.csv")
+        .groupby(["implementation", "version", "name"])
+        .mean()
+        .astype("int")
+    )
+    runtime.reset_index(inplace=True)
+    runtime = runtime[runtime['implementation'] == 'blaze']
+    runtime.set_index(["name"], inplace=True)
+
+    data = pd.read_csv('dist/summary.csv')
+    data.set_index(['name'], inplace=True)
+    joined = runtime.join(data, on='name')
+
+    fig, ax = plt.subplots()
+    ax.set(xscale='log', yscale='log')
+    ax.set_xlabel('Schema size (KB)')
+    ax.set_ylabel('Compile time (ns)')
+    plot = sns.scatterplot(data=joined, x='size_kb', y='compile_ns')
+    plot.get_figure().savefig(
+        f"dist/results/compile.png", dpi=96, bbox_inches="tight"
+    )
+    plot.get_figure().savefig(
+        f"dist/results/compile.svg", dpi=96, bbox_inches="tight"
+    )