Merge pull request #5 from oramasearch/feat/quality_check
chore: adds quality checks
micheleriva authored Nov 26, 2024
2 parents e226462 + 93e6b8b commit 69439dd
Showing 7 changed files with 359 additions and 1 deletion.
4 changes: 3 additions & 1 deletion .gitignore
@@ -1,2 +1,4 @@
.idea
target
target
benchmark_results.csv
benchmark_results.png
92 changes: 92 additions & 0 deletions Cargo.lock

Some generated files (Cargo.lock) are not rendered by default.

6 changes: 6 additions & 0 deletions Cargo.toml
@@ -20,6 +20,8 @@ rand_distr = "0.4.3"
rayon = "1.10.0"
log = "0.4.22"
thiserror = "2.0.3"
env_logger = "0.11.5"
serde = { version = "1.0.215", features = ["derive"] }

[dev-dependencies]
criterion = { version = "0.5", features = ["html_reports"] }
@@ -35,3 +37,7 @@ path = "src/bin/example.rs"
[[bin]]
name = "readme_example"
path = "src/bin/readme_example.rs"

[[bin]]
name = "quality_check"
path = "src/bin/quality_check.rs"
14 changes: 14 additions & 0 deletions Makefile
@@ -0,0 +1,14 @@
.PHONY: quality_check clean

RUST_LOG := info
PLOTS_DIR := plots
BENCHMARK_RESULTS := benchmark_results.png

quality_check:
RUST_LOG=$(RUST_LOG) cargo run --release --bin quality_check
cd $(PLOTS_DIR) && python3 main.py
mv $(PLOTS_DIR)/$(BENCHMARK_RESULTS) ./$(BENCHMARK_RESULTS)

clean:
rm -f $(BENCHMARK_RESULTS)
rm -f benchmark_results.csv
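
Taken together, the make quality_check target runs the Rust binary from the repository root (so benchmark_results.csv lands there), then runs the plotting script from inside plots/ where it reads ../benchmark_results.csv, and finally moves the generated benchmark_results.png back to the root; make clean removes both generated files, which are also covered by the new .gitignore entries.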
52 changes: 52 additions & 0 deletions plots/main.py
@@ -0,0 +1,52 @@
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the benchmark results
df = pd.read_csv('../benchmark_results.csv')

# Set up the plotting style
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Create a figure with multiple subplots
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 12))

# Plot 1: Timing metrics
ax1.plot(df['n_samples'], df['fit_time_ms'], marker='o', label='Fit Time')
ax1.plot(df['n_samples'], df['compression_time_ms'], marker='o', label='Compression Time')
ax1.set_xlabel('Number of Samples')
ax1.set_ylabel('Time (ms)')
ax1.set_title('Processing Time vs Dataset Size')
ax1.legend()
ax1.set_xscale('log')
ax1.set_yscale('log')

# Plot 2: Quality metrics
ax2.plot(df['n_samples'], df['reconstruction_error'], marker='o', label='Reconstruction Error')
ax2.plot(df['n_samples'], df['recall'], marker='o', label='Recall@10')
ax2.set_xlabel('Number of Samples')
ax2.set_ylabel('Score')
ax2.set_title('Quality Metrics vs Dataset Size')
ax2.legend()
ax2.set_xscale('log')

# Plot 3: Memory reduction
ax3.plot(df['n_samples'], (1 - df['memory_reduction_ratio']) * 100, marker='o')
ax3.set_xlabel('Number of Samples')
ax3.set_ylabel('Memory Reduction (%)')
ax3.set_title('Memory Reduction vs Dataset Size')
ax3.set_xscale('log')

# Plot 4: Time per sample
df['time_per_sample'] = (df['compression_time_ms']) / df['n_samples']
ax4.plot(df['n_samples'], df['time_per_sample'], marker='o')
ax4.set_xlabel('Number of Samples')
ax4.set_ylabel('Compression Time per Sample (ms)')
ax4.set_title('Scaling Efficiency')
ax4.set_xscale('log')
ax4.set_yscale('log')

plt.tight_layout()
plt.savefig('benchmark_results.png', dpi=300, bbox_inches='tight')
plt.close()
3 changes: 3 additions & 0 deletions plots/requirements.txt
@@ -0,0 +1,3 @@
pandas
matplotlib
seaborn