Skip to content

Commit

Permalink
Add graph of compile time vs schema size (#128)
Browse files Browse the repository at this point in the history
  • Loading branch information
michaelmior authored Jan 29, 2025
1 parent fd2e5aa commit 35285bb
Show file tree
Hide file tree
Showing 6 changed files with 154 additions and 70 deletions.
59 changes: 0 additions & 59 deletions .github/dataset_summary_table.sh

This file was deleted.

3 changes: 2 additions & 1 deletion .github/plot_markdown.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
import glob
import json
import os
import sys


if __name__ == '__main__':
img_urls = json.loads(os.environ['IMG_URLS'])
img_files = sorted(glob.glob('dist/results/plots/*.png'))
img_files = sorted(glob.glob(os.path.join(sys.argv[1], '*.png')))
for (url, file) in zip(img_urls, img_files):
name = file.split('/')[-1].split('.')[0]
print(f"## {name}")
Expand Down
38 changes: 28 additions & 10 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,16 @@ jobs:
python-version: '3.12.5'
- name: Install uv
run: pipx install uv
- name: Get jsonschema-strip binary
run: |
git clone --depth 1 --branch main https://github.com/sourcemeta-research/jsonschema-strip
cmake -S jsonschema-strip -B ./jsonschema-strip/build -DCMAKE_BUILD_TYPE:STRING=Release
cmake --build ./jsonschema-strip/build --config Release --target strip
mv ./jsonschema-strip/build/strip ./jsonschema-strip/build/jsonschema-strip
echo "$(pwd)/jsonschema-strip/build" >> $GITHUB_PATH
- uses: actions/setup-go@v5
- name: Install gron
run: go install github.com/tomnomnom/gron@latest

- name: Process CSV
run: uv run python .github/csv_min.py > dist/report-min.csv
Expand All @@ -147,17 +157,29 @@ jobs:
run: echo "$MARKDOWN_TABLE" >> $GITHUB_STEP_SUMMARY

- name: Generate plots
run: mkdir -p dist/results/plots && uv run python plot.py
run: |
./dataset_summary.sh csv >> dist/summary.csv
mkdir -p dist/results/plots && uv run python plot.py && uv run python plot_compile.py
- name: Upload plots
id: imgur
uses: devicons/[email protected]
with:
path: dist/results/plots/*.png
client_id: ${{secrets.IMGUR_CLIENT_ID}}
- name: Upload compile plot
id: imgur_compile
uses: devicons/[email protected]
with:
path: dist/results/compile.png
client_id: ${{secrets.IMGUR_CLIENT_ID}}
- name: Add plots to summary
env:
IMG_URLS: ${{ steps.imgur.outputs.imgur_urls }}
run: python .github/plot_markdown.py >> $GITHUB_STEP_SUMMARY
run: python .github/plot_markdown.py dist/results/plots >> $GITHUB_STEP_SUMMARY
# Publishes the compile-time plot uploaded by the imgur_compile step; named
# distinctly so it can be told apart from the per-schema plots step in the log.
- name: Add compile plot to summary
  env:
    IMG_URLS: ${{ steps.imgur_compile.outputs.imgur_urls }}
  run: python .github/plot_markdown.py dist/results >> $GITHUB_STEP_SUMMARY
- uses: actions/upload-artifact@v4
if: always()
with:
Expand All @@ -177,12 +199,8 @@ jobs:
cmake --build ./jsonschema-strip/build --config Release --target strip
mv ./jsonschema-strip/build/strip ./jsonschema-strip/build/jsonschema-strip
echo "$(pwd)/jsonschema-strip/build" >> $GITHUB_PATH
- name: Install moreutils
run: sudo apt-get install moreutils
- name: Strip schemas
run: |
for f in schemas/*/schema.json; do
./jsonschema-strip/build/jsonschema-strip "$f" | sponge "$f"
done
- uses: actions/setup-go@v5
- name: Install gron
run: go install github.com/tomnomnom/gron@latest
- name: Create summary
run: ./.github/dataset_summary_table.sh >> $GITHUB_STEP_SUMMARY
run: ./dataset_summary.sh md >> $GITHUB_STEP_SUMMARY
12 changes: 12 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,20 @@ dist/temp/$1: | dist/temp ; mkdir $$@
ALL_TARGETS += $$(addprefix dist/results/$1/,$(SCHEMAS))
endef
ALL_PLOTS := $(foreach schema,$(SCHEMAS),dist/results/plots/$(schema).png)
ALL_SCHEMAS := $(foreach schema,$(SCHEMAS),schemas/$(schema)/schema-noformat.json)
ALL_INSTANCES := $(foreach schema,$(SCHEMAS),schemas/$(schema)/instances.jsonl)
$(foreach implementation,$(IMPLEMENTATIONS),$(eval $(call PREPARE_IMPLEMENTATION,$(implementation))))
dist/report.csv: report.sh $(ALL_TARGETS) | dist ; ./$< $(ALL_TARGETS) > $@
# Dataset summary in CSV form, produced by running dataset_summary.sh in csv
# mode. Rebuilt whenever any schema-noformat.json, any instances.jsonl, or the
# script itself changes.
dist/summary.csv: \
$(ALL_SCHEMAS) \
$(ALL_INSTANCES) \
dataset_summary.sh
./dataset_summary.sh csv > $@
# Compile-time plot in both SVG and PNG form. A single run of plot_compile.py
# is listed as the recipe for both targets; it is re-run when the benchmark
# report, the dataset summary, or the plotting script changes.
dist/results/compile.svg dist/results/compile.png: \
dist/report.csv \
dist/summary.csv \
plot_compile.py
uv run python plot_compile.py
dist/results/plots/%.png: \
dist/results/plots \
dist/report.csv \
Expand Down
80 changes: 80 additions & 0 deletions dataset_summary.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
#!/bin/bash
#
# Print a per-dataset summary of every directory under schemas/.
#
# Usage: dataset_summary.sh csv|md
#   csv  a header line plus one "name,docs,size_kb,avg_doc_size" row per dataset
#   md   a Markdown table followed by a collapsible LaTeX rendering of it
#
# For each dataset the script reports the number of lines in instances.jsonl,
# the size in KB of schema-noformat.json after jsonschema-strip, and the mean
# size in bytes of one instances.jsonl line (newline included).
#
# Calls make, jsonschema-strip, bc and awk, which must be on PATH.

if [ $# -ne 1 ] || { [ "$1" != "csv" ] && [ "$1" != "md" ]; }; then
  # Report misuse on stderr with a failing status: callers redirect stdout
  # into report files, so a usage line on stdout with status 0 would be
  # recorded as if it were data.
  echo "Usage: $0 csv|md" >&2
  exit 1
fi

FORMAT=$1

CSV_ROWS=""
MARKDOWN_ROWS=""
LATEX_ROWS=""

# Collect one row per dataset directory. Globbing avoids parsing `ls` output
# and skips stray regular files that may live next to the dataset folders.
for dir in schemas/*/; do
  [ -d "$dir" ] || continue
  schema=$(basename "$dir")
  schema_file="schemas/$schema/schema-noformat.json"
  instances_file="schemas/$schema/instances.jsonl"

  # Make sure the format-free schema exists before measuring it.
  make "$schema_file" > /dev/null

  docs=$(wc -l < "$instances_file")
  size=$(jsonschema-strip "$schema_file" 2> /dev/null | wc -c)
  size_kb=$(bc <<<"scale=1; $size / 1024")

  # Mean bytes per line, counting the trailing newline. The C locale makes
  # length() count bytes rather than characters, and reading the file
  # directly keeps backslashes and surrounding whitespace intact (a plain
  # `read` loop would drop both and under-report JSON containing escapes).
  avg_doc_size=$(LC_ALL=C awk '
    { sum += length($0) + 1; n++ }
    END { if (n > 0) printf "%.2f\n", sum / n; else print 0 }
  ' "$instances_file")

  # printf -v keeps the trailing newline that $(...) would strip, so rows
  # can be accumulated verbatim and emitted without `echo -e`.
  case "$FORMAT" in
    csv)
      printf -v row '%s,%d,%.1f,%.0f\n' \
        "$schema" "$docs" "$size_kb" "$avg_doc_size"
      CSV_ROWS+=$row
      ;;
    md)
      printf -v row ' %s & %d & %.1f & %.0f \\\\\n' \
        "$schema" "$docs" "$size_kb" "$avg_doc_size"
      LATEX_ROWS+=$row
      printf -v row '| %s | %d | %.1f | %.0f |\n' \
        "$schema" "$docs" "$size_kb" "$avg_doc_size"
      MARKDOWN_ROWS+=$row
      ;;
  esac
done

if [ "$FORMAT" = "csv" ]; then
  echo "name,docs,size_kb,avg_doc_size"
  printf '%s' "$CSV_ROWS"
fi

if [ "$FORMAT" = "md" ]; then
  # Markdown table header
  cat << 'EOF'
|Dataset name|# Docs|Schema Size (KB)|Avg. Doc. Size (B)|
|---|---|---|---|
EOF

  # The extra newline leaves a blank line between the table and <details>.
  printf '%s\n' "$MARKDOWN_ROWS"

  cat << 'EOF'
<details>
<summary>LaTeX table</summary>
```
\begin{table}[h]
{\small
\centering
\begin{tabular}{l r r r}
\hline
Name & \# Docs & Schema Size (KB) & Avg. Doc. Size (B) \\
\hline
EOF

  printf '%s' "$LATEX_ROWS"

  # LaTeX table footer
  cat << 'EOF'
\end{tabular}
}
\caption{Dataset used for validator evaluation}\label{tab:datasets}
\end{table}
```
</details>
EOF
fi
32 changes: 32 additions & 0 deletions plot_compile.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns


def load_compile_times(
    report_csv="dist/report.csv", summary_csv="dist/summary.csv"
) -> pd.DataFrame:
    """Return blaze's mean compile time per schema joined with dataset stats.

    Args:
        report_csv: Path or file-like object holding the benchmark report.
            Must provide the ``implementation``, ``version``, ``name`` and
            ``compile_ns`` columns.
        summary_csv: Path or file-like object holding the dataset summary,
            keyed by its ``name`` column.

    Returns:
        A frame indexed by schema name with ``implementation``, ``version``,
        the integer-truncated mean ``compile_ns`` across runs, and every
        column of the summary.
    """
    report = pd.read_csv(report_csv)

    # Narrow to blaze and to the one measured column *before* aggregating:
    # a missing or non-numeric value in a row that is never plotted must not
    # be able to abort the mean / integer cast.
    blaze = report[report["implementation"] == "blaze"]
    compile_times = (
        blaze.groupby(["implementation", "version", "name"])["compile_ns"]
        .mean()
        .astype("int")
        .reset_index()
        .set_index("name")
    )

    summary = pd.read_csv(summary_csv).set_index("name")
    return compile_times.join(summary)


def main():
    """Plot compile time against schema size as PNG and SVG in dist/results."""
    joined = load_compile_times()

    fig, ax = plt.subplots()
    # Scale and labels are set before plotting so the scatter is drawn on
    # log-log axes and the descriptive labels are already in place.
    ax.set(xscale="log", yscale="log")
    ax.set_xlabel("Schema size (KB)")
    ax.set_ylabel("Compile time (ns)")
    sns.scatterplot(data=joined, x="size_kb", y="compile_ns", ax=ax)

    for extension in ("png", "svg"):
        fig.savefig(
            f"dist/results/compile.{extension}", dpi=96, bbox_inches="tight"
        )


if __name__ == "__main__":
    main()

0 comments on commit 35285bb

Please sign in to comment.