From 35285bbc8a2de545c82d045baa92f5a736ddd3fb Mon Sep 17 00:00:00 2001 From: Michael Mior Date: Wed, 29 Jan 2025 11:18:46 -0500 Subject: [PATCH] Add graph of compile time vs schema size (#128) --- .github/dataset_summary_table.sh | 59 ----------------------- .github/plot_markdown.py | 3 +- .github/workflows/ci.yml | 38 +++++++++++---- Makefile | 12 +++++ dataset_summary.sh | 80 ++++++++++++++++++++++++++++++++ plot_compile.py | 32 +++++++++++++ 6 files changed, 154 insertions(+), 70 deletions(-) delete mode 100755 .github/dataset_summary_table.sh create mode 100755 dataset_summary.sh create mode 100644 plot_compile.py diff --git a/.github/dataset_summary_table.sh b/.github/dataset_summary_table.sh deleted file mode 100755 index f76a0f9..0000000 --- a/.github/dataset_summary_table.sh +++ /dev/null @@ -1,59 +0,0 @@ -#!/bin/bash - -SCHEMAS=$(ls schemas/) - -LATEX_ROWS="" - -# Output each table row -for schema in $SCHEMAS; do - docs=$(wc -l < "schemas/$schema/instances.jsonl") - size=$(wc -c < "schemas/$schema/schema.json") - size_kb=$(bc <<<"scale=1; $size / 1024") - avg_doc_size=$(cat "schemas/$schema/instances.jsonl" | while read l; do echo "$l" | wc -c; done | awk '{ sum += $1; n++ } END { if (n > 0) print sum / n; }') - - LATEX_ROWS=$(printf "%s %s & %d & %.1f & %.0f %s" "$LATEX_ROWS" "$schema" "$docs" "$size_kb" "$avg_doc_size" '\\\\\n') - MARKDOWN_ROWS=$(printf "%s| %s | %d | %.1f | %.0f |%s" "$MARKDOWN_ROWS" "$schema" "$docs" "$size_kb" "$avg_doc_size" '\n') -done - -# Print the table header -cat << EOF -|Dataset name|# Docs|Schema Size (KB)|Avg. Doc. Size (B)| -|---|---|---|---| -EOF - -echo -e $MARKDOWN_ROWS - -cat << EOF -
-<details>
- -LaTeX table - -EOF - -echo '```' - -cat << EOF -\begin{table}[h] - {\small - \centering - \begin{tabular}{l r r r} - \hline - Name & \# Docs & Schema Size (KB) & Avg. Doc. Size (B) \\\\ - \hline -EOF - -echo -ne "$LATEX_ROWS" - -# Print the table footer -cat << EOF - \end{tabular} - } - \caption{Dataset used for validator evaluation}\label{tab:datasets} -\end{table} -EOF - -echo '```' - -cat << EOF -
-</details>
-EOF diff --git a/.github/plot_markdown.py b/.github/plot_markdown.py index ac28c84..892e2a2 100644 --- a/.github/plot_markdown.py +++ b/.github/plot_markdown.py @@ -1,11 +1,12 @@ import glob import json import os +import sys if __name__ == '__main__': img_urls = json.loads(os.environ['IMG_URLS']) - img_files = sorted(glob.glob('dist/results/plots/*.png')) + img_files = sorted(glob.glob(os.path.join(sys.argv[1], '*.png'))) for (url, file) in zip(img_urls, img_files): name = file.split('/')[-1].split('.')[0] print(f"## {name}") diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 7171f5b..bf53e69 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -128,6 +128,16 @@ jobs: python-version: '3.12.5' - name: Install uv run: pipx install uv + - name: Get jsonschema-strip binary + run: | + git clone --depth 1 --branch main https://github.com/sourcemeta-research/jsonschema-strip + cmake -S jsonschema-strip -B ./jsonschema-strip/build -DCMAKE_BUILD_TYPE:STRING=Release + cmake --build ./jsonschema-strip/build --config Release --target strip + mv ./jsonschema-strip/build/strip ./jsonschema-strip/build/jsonschema-strip + echo "$(pwd)/jsonschema-strip/build" >> $GITHUB_PATH + - uses: actions/setup-go@v5 + - name: Install gron + run: go install github.com/tomnomnom/gron@latest - name: Process CSV run: uv run python .github/csv_min.py > dist/report-min.csv @@ -147,17 +157,29 @@ jobs: run: echo "$MARKDOWN_TABLE" >> $GITHUB_STEP_SUMMARY - name: Generate plots - run: mkdir -p dist/results/plots && uv run python plot.py + run: | + ./dataset_summary.sh csv >> dist/summary.csv + mkdir -p dist/results/plots && uv run python plot.py && uv run python plot_compile.py - name: Upload plots id: imgur uses: devicons/public-upload-to-imgur@v2.2.2 with: path: dist/results/plots/*.png client_id: ${{secrets.IMGUR_CLIENT_ID}} + - name: Upload compile plot + id: imgur_compile + uses: devicons/public-upload-to-imgur@v2.2.2 + with: + path: dist/results/compile.png + client_id: ${{secrets.IMGUR_CLIENT_ID}} - name: Add plots to summary env: IMG_URLS: ${{ steps.imgur.outputs.imgur_urls }} - run: python .github/plot_markdown.py >> $GITHUB_STEP_SUMMARY + run: python .github/plot_markdown.py dist/results/plots >> $GITHUB_STEP_SUMMARY + - name: Add plots to summary + env: + IMG_URLS: ${{ steps.imgur_compile.outputs.imgur_urls }} + run: python .github/plot_markdown.py dist/results >> $GITHUB_STEP_SUMMARY - uses: actions/upload-artifact@v4 if: always() with: @@ -177,12 +199,8 @@ jobs: cmake --build ./jsonschema-strip/build --config Release --target strip mv ./jsonschema-strip/build/strip ./jsonschema-strip/build/jsonschema-strip echo "$(pwd)/jsonschema-strip/build" >> $GITHUB_PATH - - name: Install moreutils - run: sudo apt-get install moreutils - - name: Strip schemas - run: | - for f in schemas/*/schema.json; do - ./jsonschema-strip/build/jsonschema-strip "$f" | sponge "$f" - done + - uses: actions/setup-go@v5 + - name: Install gron + run: go install github.com/tomnomnom/gron@latest - name: Create summary - run: ./.github/dataset_summary_table.sh >> $GITHUB_STEP_SUMMARY + run: ./dataset_summary.sh md >> $GITHUB_STEP_SUMMARY diff --git a/Makefile b/Makefile index b5d4f5a..159b37f 100644 --- a/Makefile +++ b/Makefile @@ -21,8 +21,20 @@ dist/temp/$1: | dist/temp ; mkdir $$@ ALL_TARGETS += $$(addprefix dist/results/$1/,$(SCHEMAS)) endef ALL_PLOTS := $(foreach schema,$(SCHEMAS),dist/results/plots/$(schema).png) +ALL_SCHEMAS := $(foreach schema,$(SCHEMAS),schemas/$(schema)/schema-noformat.json) 
+ALL_INSTANCES := $(foreach schema,$(SCHEMAS),schemas/$(schema)/instances.jsonl) $(foreach implementation,$(IMPLEMENTATIONS),$(eval $(call PREPARE_IMPLEMENTATION,$(implementation)))) dist/report.csv: report.sh $(ALL_TARGETS) | dist ; ./$< $(ALL_TARGETS) > $@ +dist/summary.csv: \ + $(ALL_SCHEMAS) \ + $(ALL_INSTANCES) \ + dataset_summary.sh + ./dataset_summary.sh csv > $@ +dist/results/compile.svg dist/results/compile.png: \ + dist/report.csv \ + dist/summary.csv \ + plot_compile.py + uv run python plot_compile.py dist/results/plots/%.png: \ dist/results/plots \ dist/report.csv \ diff --git a/dataset_summary.sh b/dataset_summary.sh new file mode 100755 index 0000000..214a32c --- /dev/null +++ b/dataset_summary.sh @@ -0,0 +1,80 @@ +#!/bin/bash + +if [ $# -ne 1 ]; then + echo "Usage: $0 csv|md" + exit +fi + +FORMAT=$1 +SCHEMAS=$(ls schemas/) + +LATEX_ROWS="" + +# Output each table row +for schema in $SCHEMAS; do + make "schemas/$schema/schema-noformat.json" > /dev/null + docs=$(wc -l < "schemas/$schema/instances.jsonl") + size=$(jsonschema-strip "schemas/$schema/schema-noformat.json" 2> /dev/null | wc -c) + size_kb=$(bc <<<"scale=1; $size / 1024") + avg_doc_size=$(cat "schemas/$schema/instances.jsonl" | while read l; do echo "$l" | wc -c; done | awk '{ sum += $1; n++ } END { if (n > 0) print sum / n; }') + + if [ "$FORMAT" = "csv" ]; then + CSV_ROWS=$(printf "%s%s,%d,%.1f,%.0f%s" "$CSV_ROWS" "$schema" "$docs" "$size_kb" "$avg_doc_size" '\n') + elif [ "$FORMAT" = "md" ]; then + LATEX_ROWS=$(printf "%s %s & %d & %.1f & %.0f %s" "$LATEX_ROWS" "$schema" "$docs" "$size_kb" "$avg_doc_size" '\\\\\n') + MARKDOWN_ROWS=$(printf "%s| %s | %d | %.1f | %.0f |%s" "$MARKDOWN_ROWS" "$schema" "$docs" "$size_kb" "$avg_doc_size" '\n') + fi +done + +if [ "$FORMAT" = "csv" ]; then + cat << EOF +name,docs,size_kb,avg_doc_size +EOF + + echo -ne "$CSV_ROWS" +fi + +if [ "$FORMAT" = "md" ]; then + # Print the table header + cat << EOF +|Dataset name|# Docs|Schema Size (KB)|Avg. Doc. Size (B)| +|---|---|---|---| +EOF + + echo -e $MARKDOWN_ROWS + + cat << EOF +
+<details>
+ +LaTeX table + +EOF + + echo '```' + + cat << EOF +\begin{table}[h] + {\small + \centering + \begin{tabular}{l r r r} + \hline + Name & \# Docs & Schema Size (KB) & Avg. Doc. Size (B) \\\\ + \hline +EOF + + echo -ne "$LATEX_ROWS" + + # Print the table footer + cat << EOF + \end{tabular} + } + \caption{Dataset used for validator evaluation}\label{tab:datasets} +\end{table} +EOF + + echo '```' + + cat << EOF +
+</details>
+EOF +fi diff --git a/plot_compile.py b/plot_compile.py new file mode 100644 index 0000000..71e0e8d --- /dev/null +++ b/plot_compile.py @@ -0,0 +1,32 @@ +import matplotlib.pyplot as plt +import pandas as pd +import seaborn as sns + + +if __name__ == "__main__": + # Average out runtime across runs + runtime = ( + pd.read_csv("dist/report.csv") + .groupby(["implementation", "version", "name"]) + .mean() + .astype("int") + ) + runtime.reset_index(inplace=True) + runtime = runtime[runtime['implementation'] == 'blaze'] + runtime.set_index(["name"], inplace=True) + + data = pd.read_csv('dist/summary.csv') + data.set_index(['name'], inplace=True) + joined = runtime.join(data, on='name') + + fig, ax = plt.subplots() + ax.set(xscale='log', yscale='log') + ax.set_xlabel('Schema size (KB)') + ax.set_ylabel('Compile time (ns)') + plot = sns.scatterplot(data=joined, x='size_kb', y='compile_ns') + plot.get_figure().savefig( + f"dist/results/compile.png", dpi=96, bbox_inches="tight" + ) + plot.get_figure().savefig( + f"dist/results/compile.svg", dpi=96, bbox_inches="tight" + )
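-- 
A rough local-usage sketch of the pieces this patch adds (assumptions not spelled out in the diff: jsonschema-strip and gron are on PATH, and dist/report.csv already exists from a benchmark run, since plot_compile.py joins its compile_ns column against the new summary):

    # Build the per-dataset summary (name,docs,size_kb,avg_doc_size), then plot compile time vs. schema size
    ./dataset_summary.sh csv > dist/summary.csv
    mkdir -p dist/results
    uv run python plot_compile.py   # writes dist/results/compile.png and compile.svg

    # Or, equivalently, via the new Makefile targets:
    make dist/summary.csv dist/results/compile.png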