From 35285bbc8a2de545c82d045baa92f5a736ddd3fb Mon Sep 17 00:00:00 2001 From: Michael Mior Date: Wed, 29 Jan 2025 11:18:46 -0500 Subject: [PATCH] Add graph of compile time vs schema size (#128) --- .github/dataset_summary_table.sh | 59 ----------------------- .github/plot_markdown.py | 3 +- .github/workflows/ci.yml | 38 +++++++++++---- Makefile | 12 +++++ dataset_summary.sh | 80 ++++++++++++++++++++++++++++++++ plot_compile.py | 32 +++++++++++++ 6 files changed, 154 insertions(+), 70 deletions(-) delete mode 100755 .github/dataset_summary_table.sh create mode 100755 dataset_summary.sh create mode 100644 plot_compile.py diff --git a/.github/dataset_summary_table.sh b/.github/dataset_summary_table.sh deleted file mode 100755 index f76a0f9..0000000 --- a/.github/dataset_summary_table.sh +++ /dev/null @@ -1,59 +0,0 @@ -#!/bin/bash - -SCHEMAS=$(ls schemas/) - -LATEX_ROWS="" - -# Output each table row -for schema in $SCHEMAS; do - docs=$(wc -l < "schemas/$schema/instances.jsonl") - size=$(wc -c < "schemas/$schema/schema.json") - size_kb=$(bc <<<"scale=1; $size / 1024") - avg_doc_size=$(cat "schemas/$schema/instances.jsonl" | while read l; do echo "$l" | wc -c; done | awk '{ sum += $1; n++ } END { if (n > 0) print sum / n; }') - - LATEX_ROWS=$(printf "%s %s & %d & %.1f & %.0f %s" "$LATEX_ROWS" "$schema" "$docs" "$size_kb" "$avg_doc_size" '\\\\\n') - MARKDOWN_ROWS=$(printf "%s| %s | %d | %.1f | %.0f |%s" "$MARKDOWN_ROWS" "$schema" "$docs" "$size_kb" "$avg_doc_size" '\n') -done - -# Print the table header -cat << EOF -|Dataset name|# Docs|Schema Size (KB)|Avg. Doc. Size (B)| -|---|---|---|---| -EOF - -echo -e $MARKDOWN_ROWS - -cat << EOF -
-<details>
- -LaTeX table - -EOF - -echo '```' - -cat << EOF -\begin{table}[h] - {\small - \centering - \begin{tabular}{l r r r} - \hline - Name & \# Docs & Schema Size (KB) & Avg. Doc. Size (B) \\\\ - \hline -EOF - -echo -ne "$LATEX_ROWS" - -# Print the table footer -cat << EOF - \end{tabular} - } - \caption{Dataset used for validator evaluation}\label{tab:datasets} -\end{table} -EOF - -echo '```' - -cat << EOF -
-</details>
-EOF diff --git a/.github/plot_markdown.py b/.github/plot_markdown.py index ac28c84..892e2a2 100644 --- a/.github/plot_markdown.py +++ b/.github/plot_markdown.py @@ -1,11 +1,12 @@ import glob import json import os +import sys if __name__ == '__main__': img_urls = json.loads(os.environ['IMG_URLS']) - img_files = sorted(glob.glob('dist/results/plots/*.png')) + img_files = sorted(glob.glob(os.path.join(sys.argv[1], '*.png'))) for (url, file) in zip(img_urls, img_files): name = file.split('/')[-1].split('.')[0] print(f"## {name}") diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 7171f5b..bf53e69 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -128,6 +128,16 @@ jobs: python-version: '3.12.5' - name: Install uv run: pipx install uv + - name: Get jsonschema-strip binary + run: | + git clone --depth 1 --branch main https://github.com/sourcemeta-research/jsonschema-strip + cmake -S jsonschema-strip -B ./jsonschema-strip/build -DCMAKE_BUILD_TYPE:STRING=Release + cmake --build ./jsonschema-strip/build --config Release --target strip + mv ./jsonschema-strip/build/strip ./jsonschema-strip/build/jsonschema-strip + echo "$(pwd)/jsonschema-strip/build" >> $GITHUB_PATH + - uses: actions/setup-go@v5 + - name: Install gron + run: go install github.com/tomnomnom/gron@latest - name: Process CSV run: uv run python .github/csv_min.py > dist/report-min.csv @@ -147,17 +157,29 @@ jobs: run: echo "$MARKDOWN_TABLE" >> $GITHUB_STEP_SUMMARY - name: Generate plots - run: mkdir -p dist/results/plots && uv run python plot.py + run: | + ./dataset_summary.sh csv >> dist/summary.csv + mkdir -p dist/results/plots && uv run python plot.py && uv run python plot_compile.py - name: Upload plots id: imgur uses: devicons/public-upload-to-imgur@v2.2.2 with: path: dist/results/plots/*.png client_id: ${{secrets.IMGUR_CLIENT_ID}} + - name: Upload compile plot + id: imgur_compile + uses: devicons/public-upload-to-imgur@v2.2.2 + with: + path: dist/results/compile.png + client_id: ${{secrets.IMGUR_CLIENT_ID}} - name: Add plots to summary env: IMG_URLS: ${{ steps.imgur.outputs.imgur_urls }} - run: python .github/plot_markdown.py >> $GITHUB_STEP_SUMMARY + run: python .github/plot_markdown.py dist/results/plots >> $GITHUB_STEP_SUMMARY + - name: Add plots to summary + env: + IMG_URLS: ${{ steps.imgur_compile.outputs.imgur_urls }} + run: python .github/plot_markdown.py dist/results >> $GITHUB_STEP_SUMMARY - uses: actions/upload-artifact@v4 if: always() with: @@ -177,12 +199,8 @@ jobs: cmake --build ./jsonschema-strip/build --config Release --target strip mv ./jsonschema-strip/build/strip ./jsonschema-strip/build/jsonschema-strip echo "$(pwd)/jsonschema-strip/build" >> $GITHUB_PATH - - name: Install moreutils - run: sudo apt-get install moreutils - - name: Strip schemas - run: | - for f in schemas/*/schema.json; do - ./jsonschema-strip/build/jsonschema-strip "$f" | sponge "$f" - done + - uses: actions/setup-go@v5 + - name: Install gron + run: go install github.com/tomnomnom/gron@latest - name: Create summary - run: ./.github/dataset_summary_table.sh >> $GITHUB_STEP_SUMMARY + run: ./dataset_summary.sh md >> $GITHUB_STEP_SUMMARY diff --git a/Makefile b/Makefile index b5d4f5a..159b37f 100644 --- a/Makefile +++ b/Makefile @@ -21,8 +21,20 @@ dist/temp/$1: | dist/temp ; mkdir $$@ ALL_TARGETS += $$(addprefix dist/results/$1/,$(SCHEMAS)) endef ALL_PLOTS := $(foreach schema,$(SCHEMAS),dist/results/plots/$(schema).png) +ALL_SCHEMAS := $(foreach schema,$(SCHEMAS),schemas/$(schema)/schema-noformat.json) 
+ALL_INSTANCES := $(foreach schema,$(SCHEMAS),schemas/$(schema)/instances.jsonl) $(foreach implementation,$(IMPLEMENTATIONS),$(eval $(call PREPARE_IMPLEMENTATION,$(implementation)))) dist/report.csv: report.sh $(ALL_TARGETS) | dist ; ./$< $(ALL_TARGETS) > $@ +dist/summary.csv: \ + $(ALL_SCHEMAS) \ + $(ALL_INSTANCES) \ + dataset_summary.sh + ./dataset_summary.sh csv > $@ +dist/results/compile.svg dist/results/compile.png: \ + dist/report.csv \ + dist/summary.csv \ + plot_compile.py + uv run python plot_compile.py dist/results/plots/%.png: \ dist/results/plots \ dist/report.csv \ diff --git a/dataset_summary.sh b/dataset_summary.sh new file mode 100755 index 0000000..214a32c --- /dev/null +++ b/dataset_summary.sh @@ -0,0 +1,80 @@ +#!/bin/bash + +if [ $# -ne 1 ]; then + echo "Usage: $0 csv|md" + exit +fi + +FORMAT=$1 +SCHEMAS=$(ls schemas/) + +LATEX_ROWS="" + +# Output each table row +for schema in $SCHEMAS; do + make "schemas/$schema/schema-noformat.json" > /dev/null + docs=$(wc -l < "schemas/$schema/instances.jsonl") + size=$(jsonschema-strip "schemas/$schema/schema-noformat.json" 2> /dev/null | wc -c) + size_kb=$(bc <<<"scale=1; $size / 1024") + avg_doc_size=$(cat "schemas/$schema/instances.jsonl" | while read l; do echo "$l" | wc -c; done | awk '{ sum += $1; n++ } END { if (n > 0) print sum / n; }') + + if [ "$FORMAT" = "csv" ]; then + CSV_ROWS=$(printf "%s%s,%d,%.1f,%.0f%s" "$CSV_ROWS" "$schema" "$docs" "$size_kb" "$avg_doc_size" '\n') + elif [ "$FORMAT" = "md" ]; then + LATEX_ROWS=$(printf "%s %s & %d & %.1f & %.0f %s" "$LATEX_ROWS" "$schema" "$docs" "$size_kb" "$avg_doc_size" '\\\\\n') + MARKDOWN_ROWS=$(printf "%s| %s | %d | %.1f | %.0f |%s" "$MARKDOWN_ROWS" "$schema" "$docs" "$size_kb" "$avg_doc_size" '\n') + fi +done + +if [ "$FORMAT" = "csv" ]; then + cat << EOF +name,docs,size_kb,avg_doc_size +EOF + + echo -ne "$CSV_ROWS" +fi + +if [ "$FORMAT" = "md" ]; then + # Print the table header + cat << EOF +|Dataset name|# Docs|Schema Size (KB)|Avg. Doc. Size (B)| +|---|---|---|---| +EOF + + echo -e $MARKDOWN_ROWS + + cat << EOF +
+<details>
+ +LaTeX table + +EOF + + echo '```' + + cat << EOF +\begin{table}[h] + {\small + \centering + \begin{tabular}{l r r r} + \hline + Name & \# Docs & Schema Size (KB) & Avg. Doc. Size (B) \\\\ + \hline +EOF + + echo -ne "$LATEX_ROWS" + + # Print the table footer + cat << EOF + \end{tabular} + } + \caption{Dataset used for validator evaluation}\label{tab:datasets} +\end{table} +EOF + + echo '```' + + cat << EOF +
+</details>
+EOF +fi diff --git a/plot_compile.py b/plot_compile.py new file mode 100644 index 0000000..71e0e8d --- /dev/null +++ b/plot_compile.py @@ -0,0 +1,32 @@ +import matplotlib.pyplot as plt +import pandas as pd +import seaborn as sns + + +if __name__ == "__main__": + # Average out runtime across runs + runtime = ( + pd.read_csv("dist/report.csv") + .groupby(["implementation", "version", "name"]) + .mean() + .astype("int") + ) + runtime.reset_index(inplace=True) + runtime = runtime[runtime['implementation'] == 'blaze'] + runtime.set_index(["name"], inplace=True) + + data = pd.read_csv('dist/summary.csv') + data.set_index(['name'], inplace=True) + joined = runtime.join(data, on='name') + + fig, ax = plt.subplots() + ax.set(xscale='log', yscale='log') + ax.set_xlabel('Schema size (KB)') + ax.set_ylabel('Compile time (ns)') + plot = sns.scatterplot(data=joined, x='size_kb', y='compile_ns') + plot.get_figure().savefig( + f"dist/results/compile.png", dpi=96, bbox_inches="tight" + ) + plot.get_figure().savefig( + f"dist/results/compile.svg", dpi=96, bbox_inches="tight" + )
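-- 
A rough local-usage sketch of the pieces this patch adds (assumptions not spelled out in the diff: jsonschema-strip and gron are on PATH, and dist/report.csv already exists from a benchmark run, since plot_compile.py joins its compile_ns column against the new summary):

    # Build the per-dataset summary (name,docs,size_kb,avg_doc_size), then plot compile time vs. schema size
    ./dataset_summary.sh csv > dist/summary.csv
    mkdir -p dist/results
    uv run python plot_compile.py   # writes dist/results/compile.png and compile.svg

    # Or, equivalently, via the new Makefile targets:
    make dist/summary.csv dist/results/compile.png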