From cdf61a84a0201c2e1e0e7eba0e69c48bd632a84b Mon Sep 17 00:00:00 2001
From: Chandra Sanapala <chandra@sundeck.io>
Date: Fri, 17 Jan 2025 19:10:04 +0530
Subject: [PATCH 1/3] feat: add the test coverage baseline as a json file

---
 tests/baseline.json                      | 16 ++++
 tests/baseline.py                        | 68 +++++++++++++++++
 tests/cases/comparison/is_false.test     |  1 +
 tests/cases/comparison/is_not_false.test |  1 +
 tests/cases/comparison/is_not_true       |  1 +
 tests/cases/comparison/is_true.test      |  1 +
 tests/test_extensions.py                 | 97 ++++++++++++++++++++----
 7 files changed, 170 insertions(+), 15 deletions(-)
 create mode 100644 tests/baseline.json
 create mode 100644 tests/baseline.py

diff --git a/tests/baseline.json b/tests/baseline.json
new file mode 100644
index 000000000..c30f17ac5
--- /dev/null
+++ b/tests/baseline.json
@@ -0,0 +1,16 @@
+{
+   "registry": {
+     "dependency_count": 13,
+     "extension_count": 13,
+     "function_count": 165,
+     "num_aggregate_functions": 29,
+     "num_scalar_functions": 169,
+     "num_window_functions": 0,
+     "num_function_overloads": 517
+   },
+   "coverage": {
+     "total_test_count": 1086,
+     "num_function_variants": 517,
+     "num_covered_function_variants": 229
+   }
+}
diff --git a/tests/baseline.py b/tests/baseline.py
new file mode 100644
index 000000000..c666a3d2e
--- /dev/null
+++ b/tests/baseline.py
@@ -0,0 +1,68 @@
+import json
+from dataclasses import dataclass
+from typing import List, Dict
+
+from tests.coverage.coverage import TestCoverage
+from tests.coverage.extensions import FunctionRegistry
+
+
+@dataclass
+class Registry:
+    extension_count: int
+    dependency_count: int
+    function_count: int
+    num_aggregate_functions: int
+    num_scalar_functions: int
+    num_window_functions: int
+    num_function_overloads: int
+
+
+@dataclass
+class Coverage:
+    total_test_count: int
+    num_function_variants: int
+    num_covered_function_variants: int
+
+    def num_function_variants_without_coverage(self):
+        return self.num_function_variants - self.num_covered_function_variants
+
+
+@dataclass
+class Baseline:
+    registry: Registry
+    coverage: Coverage
+
+    @classmethod
+    def from_dict(cls, data: Dict):
+        registry_data = data["registry"]
+        test_coverage_data = data["coverage"]
+        registry = Registry(**registry_data)
+        coverage = Coverage(**test_coverage_data)
+        return cls(registry, coverage)
+
+    def num_function_variants_without_coverage(self):
+        return self.coverage.num_function_variants_without_coverage()
+
+
+def read_baseline_file(file_path: str) -> Baseline:
+    with open(file_path, "r") as file:
+        data = json.load(file)
+    return Baseline.from_dict(data)
+
+
+def generate_baseline(registry: FunctionRegistry, coverage: TestCoverage):
+    registry_data = Registry(
+        extension_count=len(registry.extensions),
+        dependency_count=len(registry.dependencies),
+        function_count=len(registry.registry),
+        num_aggregate_functions=len(registry.aggregate_functions),
+        num_scalar_functions=len(registry.scalar_functions),
+        num_window_functions=len(registry.window_functions),
+        num_function_overloads=sum([len(f) for f in registry.registry.values()]),
+    )
+    test_coverage_data = Coverage(
+        total_test_count=coverage.test_count,
+        num_function_variants=coverage.total_function_variants,
+        num_covered_function_variants=coverage.num_covered_function_variants,
+    )
+    return Baseline(registry=registry_data, coverage=test_coverage_data)
diff --git a/tests/cases/comparison/is_false.test b/tests/cases/comparison/is_false.test
index c11295ac3..ea2d78b2d 100644
--- a/tests/cases/comparison/is_false.test
+++ b/tests/cases/comparison/is_false.test
@@ -1,6 +1,7 @@
 ### SUBSTRAIT_SCALAR_TEST: v1.0
 ### SUBSTRAIT_INCLUDE: '/extensions/functions_comparison.yaml'
 
+# basic is_false
 is_false(true::bool) = false::bool
 is_false(false::bool) = true::bool
 is_false(null::bool) = false::bool
diff --git a/tests/cases/comparison/is_not_false.test b/tests/cases/comparison/is_not_false.test
index 4301e041f..a19bb53de 100644
--- a/tests/cases/comparison/is_not_false.test
+++ b/tests/cases/comparison/is_not_false.test
@@ -1,6 +1,7 @@
 ### SUBSTRAIT_SCALAR_TEST: v1.0
 ### SUBSTRAIT_INCLUDE: '/extensions/functions_comparison.yaml'
 
+# basic is_not_false
 is_not_false(true::bool) = true::bool
 is_not_false(false::bool) = false::bool
 is_not_false(null::bool) = true::bool
diff --git a/tests/cases/comparison/is_not_true b/tests/cases/comparison/is_not_true
index 3dafdaee1..997735bb1 100644
--- a/tests/cases/comparison/is_not_true
+++ b/tests/cases/comparison/is_not_true
@@ -1,6 +1,7 @@
 ### SUBSTRAIT_SCALAR_TEST: v1.0
 ### SUBSTRAIT_INCLUDE: '/extensions/functions_comparison.yaml'
 
+# basic is_not_true
 is_not_true(true::bool) = false::bool
 is_not_true(false::bool) = true::bool
 is_not_true(null::bool) = true::bool
diff --git a/tests/cases/comparison/is_true.test b/tests/cases/comparison/is_true.test
index f3bb7ff0e..cb1f30170 100644
--- a/tests/cases/comparison/is_true.test
+++ b/tests/cases/comparison/is_true.test
@@ -1,6 +1,7 @@
 ### SUBSTRAIT_SCALAR_TEST: v1.0
 ### SUBSTRAIT_INCLUDE: '/extensions/functions_comparison.yaml'
 
+# basic is_true
 is_true(true::bool) = true::bool
 is_true(false::bool) = false::bool
 is_true(null::bool) = false::bool
diff --git a/tests/test_extensions.py b/tests/test_extensions.py
index 44335f19e..95fff93e1 100644
--- a/tests/test_extensions.py
+++ b/tests/test_extensions.py
@@ -1,42 +1,109 @@
 # SPDX-License-Identifier: Apache-2.0
+import json
 import os
+from dataclasses import asdict
 
+from tests.baseline import read_baseline_file, generate_baseline
 from tests.coverage.case_file_parser import load_all_testcases
 from tests.coverage.coverage import get_test_coverage
 from tests.coverage.extensions import build_type_to_short_type
 from tests.coverage.extensions import Extension
 
 
+def compare_baselines(expected, actual):
+    errors = []
+
+    if actual.registry.extension_count < expected.registry.extension_count:
+        errors.append(
+            f"Extension count mismatch: expected {expected.registry.extension_count}, got {actual.registry.extension_count}"
+        )
+    if actual.registry.dependency_count < expected.registry.dependency_count:
+        errors.append(
+            f"Dependency count mismatch: expected {expected.registry.dependency_count}, got {actual.registry.dependency_count}"
+        )
+    if actual.registry.function_count < expected.registry.function_count:
+        errors.append(
+            f"Function count mismatch: expected {expected.registry.function_count}, got {actual.registry.function_count}"
+        )
+    if (
+        actual.registry.num_aggregate_functions
+        < expected.registry.num_aggregate_functions
+    ):
+        errors.append(
+            f"Aggregate function count mismatch: expected {expected.registry.num_aggregate_functions}, got {actual.registry.num_aggregate_functions}"
+        )
+    if actual.registry.num_scalar_functions < expected.registry.num_scalar_functions:
+        errors.append(
+            f"Scalar function count mismatch: expected {expected.registry.num_scalar_functions}, got {actual.registry.num_scalar_functions}"
+        )
+    if actual.registry.num_window_functions < expected.registry.num_window_functions:
+        errors.append(
+            f"Window function count mismatch: expected {expected.registry.num_window_functions}, got {actual.registry.num_window_functions}"
+        )
+    if (
+        actual.registry.num_function_overloads
+        < expected.registry.num_function_overloads
+    ):
+        errors.append(
+            f"Function overload count mismatch: expected {expected.registry.num_function_overloads}, got {actual.registry.num_function_overloads}"
+        )
+
+    if actual.coverage.total_test_count < expected.coverage.total_test_count:
+        errors.append(
+            f"Total test count mismatch: expected {expected.coverage.total_test_count}, got {actual.coverage.total_test_count}"
+        )
+    if actual.coverage.num_function_variants < expected.coverage.num_function_variants:
+        errors.append(
+            f"Total function variants mismatch: expected {expected.coverage.num_function_variants}, got {actual.coverage.num_function_variants}"
+        )
+    if (
+        actual.coverage.num_covered_function_variants
+        < expected.coverage.num_covered_function_variants
+    ):
+        errors.append(
+            f"Covered function variants mismatch: expected {expected.coverage.num_covered_function_variants}, got {actual.coverage.num_covered_function_variants}"
+        )
+
+    expected_coverage_gap = expected.num_function_variants_without_coverage()
+    actual_coverage_gap = actual.num_function_variants_without_coverage()
+    if actual_coverage_gap > expected_coverage_gap:
+        errors.append(
+            f"Coverage gap too large: {actual_coverage_gap} function variants with no tests, "
+            f"out of {actual.coverage.num_function_variants} total function variants. "
+            f"New functions should be added along with test cases that illustrate their behavior."
+        )
+
+    return errors
+
+
 # NOTE: this test is run as part of pre-commit hook
 def test_substrait_extension_coverage():
     script_dir = os.path.dirname(os.path.abspath(__file__))
+    baseline = read_baseline_file(os.path.join(script_dir, "baseline.json"))
     extensions_path = os.path.join(script_dir, "../extensions")
     registry = Extension.read_substrait_extensions(extensions_path)
-    assert len(registry.registry) >= 161
-    num_overloads = sum([len(f) for f in registry.registry.values()])
-    assert num_overloads >= 510
-    assert len(registry.dependencies) >= 13
-    assert len(registry.scalar_functions) >= 162
-    assert len(registry.aggregate_functions) >= 29
-    assert len(registry.window_functions) >= 0
 
     test_case_dir = os.path.join(script_dir, "./cases")
     all_test_files = load_all_testcases(test_case_dir)
     coverage = get_test_coverage(all_test_files, registry)
 
-    assert coverage.test_count >= 1077
     assert (
         coverage.num_tests_with_no_matching_function == 0
     ), f"{coverage.num_tests_with_no_matching_function} tests with no matching function"
-    assert coverage.num_covered_function_variants >= 226
-    assert coverage.total_function_variants >= 513
-    assert (
-        coverage.total_function_variants - coverage.num_covered_function_variants
-    ) <= 287, (
-        f"Coverage gap too large: {coverage.total_function_variants - coverage.num_covered_function_variants} "
-        f"function variants with no tests, out of {coverage.total_function_variants} total function variants."
+
+    actual_baseline = generate_baseline(registry, coverage)
+    errors = compare_baselines(baseline, actual_baseline)
+    assert not errors, (
+        "\n".join(errors)
+        + f"The baseline file does not match the current test coverage. "
+        f"Please update the file at tests/baseline.json to align with the current baseline"
+        f"{json.dumps(asdict(actual_baseline), indent=2)}"
     )
 
+    if baseline != actual_baseline:
+        print("\nBaseline has changed, updating tests/baseline.json")
+        print(json.dumps(asdict(actual_baseline), indent=2))
+
 
 def test_build_type_to_short_type():
     long_to_short = build_type_to_short_type()

From fa23172e24976a10df0c6a4a9941d54c3865be42 Mon Sep 17 00:00:00 2001
From: Chandra Sanapala <chandra@sundeck.io>
Date: Fri, 17 Jan 2025 19:14:44 +0530
Subject: [PATCH 2/3] add license header

---
 tests/baseline.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/baseline.py b/tests/baseline.py
index c666a3d2e..a60dcfc60 100644
--- a/tests/baseline.py
+++ b/tests/baseline.py
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 import json
 from dataclasses import dataclass
 from typing import List, Dict

From b694f60bab778204827bf812d4d379c030d6f39a Mon Sep 17 00:00:00 2001
From: Chandra Sanapala <chandra@sundeck.io>
Date: Sat, 18 Jan 2025 04:05:29 +0530
Subject: [PATCH 3/3] address review comments

---
 tests/baseline.py                             | 68 +++++++++++++++++++
 tests/cases/comparison/is_false.test          |  2 +-
 tests/cases/comparison/is_not_false.test      |  2 +-
 .../{is_not_true => is_not_true.test}         |  2 +-
 tests/cases/comparison/is_true.test           |  2 +-
 tests/test_extensions.py                      | 68 +------------------
 6 files changed, 73 insertions(+), 71 deletions(-)
 rename tests/cases/comparison/{is_not_true => is_not_true.test} (80%)

diff --git a/tests/baseline.py b/tests/baseline.py
index a60dcfc60..135069e7f 100644
--- a/tests/baseline.py
+++ b/tests/baseline.py
@@ -44,6 +44,74 @@ def from_dict(cls, data: Dict):
     def num_function_variants_without_coverage(self):
         return self.coverage.num_function_variants_without_coverage()
 
+    def validate_against(self, expected):
+        errors = []
+
+        if self.registry.extension_count < expected.registry.extension_count:
+            errors.append(
+                f"Extension count mismatch: expected {expected.registry.extension_count}, got {self.registry.extension_count}"
+            )
+        if self.registry.dependency_count < expected.registry.dependency_count:
+            errors.append(
+                f"Dependency count mismatch: expected {expected.registry.dependency_count}, got {self.registry.dependency_count}"
+            )
+        if self.registry.function_count < expected.registry.function_count:
+            errors.append(
+                f"Function count mismatch: expected {expected.registry.function_count}, got {self.registry.function_count}"
+            )
+        if (
+            self.registry.num_aggregate_functions
+            < expected.registry.num_aggregate_functions
+        ):
+            errors.append(
+                f"Aggregate function count mismatch: expected {expected.registry.num_aggregate_functions}, got {self.registry.num_aggregate_functions}"
+            )
+        if self.registry.num_scalar_functions < expected.registry.num_scalar_functions:
+            errors.append(
+                f"Scalar function count mismatch: expected {expected.registry.num_scalar_functions}, got {self.registry.num_scalar_functions}"
+            )
+        if self.registry.num_window_functions < expected.registry.num_window_functions:
+            errors.append(
+                f"Window function count mismatch: expected {expected.registry.num_window_functions}, got {self.registry.num_window_functions}"
+            )
+        if (
+            self.registry.num_function_overloads
+            < expected.registry.num_function_overloads
+        ):
+            errors.append(
+                f"Function overload count mismatch: expected {expected.registry.num_function_overloads}, got {self.registry.num_function_overloads}"
+            )
+
+        if self.coverage.total_test_count < expected.coverage.total_test_count:
+            errors.append(
+                f"Total test count mismatch: expected {expected.coverage.total_test_count}, got {self.coverage.total_test_count}"
+            )
+        if (
+            self.coverage.num_function_variants
+            < expected.coverage.num_function_variants
+        ):
+            errors.append(
+                f"Total function variants mismatch: expected {expected.coverage.num_function_variants}, got {self.coverage.num_function_variants}"
+            )
+        if (
+            self.coverage.num_covered_function_variants
+            < expected.coverage.num_covered_function_variants
+        ):
+            errors.append(
+                f"Covered function variants mismatch: expected {expected.coverage.num_covered_function_variants}, got {self.coverage.num_covered_function_variants}"
+            )
+
+        expected_coverage_gap = expected.num_function_variants_without_coverage()
+        actual_coverage_gap = self.num_function_variants_without_coverage()
+        if actual_coverage_gap > expected_coverage_gap:
+            errors.append(
+                f"Coverage gap too large: {actual_coverage_gap} function variants with no tests, "
+                f"out of {self.coverage.num_function_variants} total function variants. "
+                f"New functions should be added along with test cases that illustrate their behavior."
+            )
+
+        return errors
+
 
 def read_baseline_file(file_path: str) -> Baseline:
     with open(file_path, "r") as file:
diff --git a/tests/cases/comparison/is_false.test b/tests/cases/comparison/is_false.test
index ea2d78b2d..6cfd337d6 100644
--- a/tests/cases/comparison/is_false.test
+++ b/tests/cases/comparison/is_false.test
@@ -1,7 +1,7 @@
 ### SUBSTRAIT_SCALAR_TEST: v1.0
 ### SUBSTRAIT_INCLUDE: '/extensions/functions_comparison.yaml'
 
-# basic is_false
+# basic: Basic examples without any special cases
 is_false(true::bool) = false::bool
 is_false(false::bool) = true::bool
 is_false(null::bool) = false::bool
diff --git a/tests/cases/comparison/is_not_false.test b/tests/cases/comparison/is_not_false.test
index a19bb53de..202b84318 100644
--- a/tests/cases/comparison/is_not_false.test
+++ b/tests/cases/comparison/is_not_false.test
@@ -1,7 +1,7 @@
 ### SUBSTRAIT_SCALAR_TEST: v1.0
 ### SUBSTRAIT_INCLUDE: '/extensions/functions_comparison.yaml'
 
-# basic is_not_false
+# basic: Basic examples without any special cases
 is_not_false(true::bool) = true::bool
 is_not_false(false::bool) = false::bool
 is_not_false(null::bool) = true::bool
diff --git a/tests/cases/comparison/is_not_true b/tests/cases/comparison/is_not_true.test
similarity index 80%
rename from tests/cases/comparison/is_not_true
rename to tests/cases/comparison/is_not_true.test
index 997735bb1..58b42781a 100644
--- a/tests/cases/comparison/is_not_true
+++ b/tests/cases/comparison/is_not_true.test
@@ -1,7 +1,7 @@
 ### SUBSTRAIT_SCALAR_TEST: v1.0
 ### SUBSTRAIT_INCLUDE: '/extensions/functions_comparison.yaml'
 
-# basic is_not_true
+# basic: Basic examples without any special cases
 is_not_true(true::bool) = false::bool
 is_not_true(false::bool) = true::bool
 is_not_true(null::bool) = true::bool
diff --git a/tests/cases/comparison/is_true.test b/tests/cases/comparison/is_true.test
index cb1f30170..4bddaf2d8 100644
--- a/tests/cases/comparison/is_true.test
+++ b/tests/cases/comparison/is_true.test
@@ -1,7 +1,7 @@
 ### SUBSTRAIT_SCALAR_TEST: v1.0
 ### SUBSTRAIT_INCLUDE: '/extensions/functions_comparison.yaml'
 
-# basic is_true
+# basic: Basic examples without any special cases
 is_true(true::bool) = true::bool
 is_true(false::bool) = false::bool
 is_true(null::bool) = false::bool
diff --git a/tests/test_extensions.py b/tests/test_extensions.py
index 95fff93e1..064a67eaa 100644
--- a/tests/test_extensions.py
+++ b/tests/test_extensions.py
@@ -10,72 +10,6 @@
 from tests.coverage.extensions import Extension
 
 
-def compare_baselines(expected, actual):
-    errors = []
-
-    if actual.registry.extension_count < expected.registry.extension_count:
-        errors.append(
-            f"Extension count mismatch: expected {expected.registry.extension_count}, got {actual.registry.extension_count}"
-        )
-    if actual.registry.dependency_count < expected.registry.dependency_count:
-        errors.append(
-            f"Dependency count mismatch: expected {expected.registry.dependency_count}, got {actual.registry.dependency_count}"
-        )
-    if actual.registry.function_count < expected.registry.function_count:
-        errors.append(
-            f"Function count mismatch: expected {expected.registry.function_count}, got {actual.registry.function_count}"
-        )
-    if (
-        actual.registry.num_aggregate_functions
-        < expected.registry.num_aggregate_functions
-    ):
-        errors.append(
-            f"Aggregate function count mismatch: expected {expected.registry.num_aggregate_functions}, got {actual.registry.num_aggregate_functions}"
-        )
-    if actual.registry.num_scalar_functions < expected.registry.num_scalar_functions:
-        errors.append(
-            f"Scalar function count mismatch: expected {expected.registry.num_scalar_functions}, got {actual.registry.num_scalar_functions}"
-        )
-    if actual.registry.num_window_functions < expected.registry.num_window_functions:
-        errors.append(
-            f"Window function count mismatch: expected {expected.registry.num_window_functions}, got {actual.registry.num_window_functions}"
-        )
-    if (
-        actual.registry.num_function_overloads
-        < expected.registry.num_function_overloads
-    ):
-        errors.append(
-            f"Function overload count mismatch: expected {expected.registry.num_function_overloads}, got {actual.registry.num_function_overloads}"
-        )
-
-    if actual.coverage.total_test_count < expected.coverage.total_test_count:
-        errors.append(
-            f"Total test count mismatch: expected {expected.coverage.total_test_count}, got {actual.coverage.total_test_count}"
-        )
-    if actual.coverage.num_function_variants < expected.coverage.num_function_variants:
-        errors.append(
-            f"Total function variants mismatch: expected {expected.coverage.num_function_variants}, got {actual.coverage.num_function_variants}"
-        )
-    if (
-        actual.coverage.num_covered_function_variants
-        < expected.coverage.num_covered_function_variants
-    ):
-        errors.append(
-            f"Covered function variants mismatch: expected {expected.coverage.num_covered_function_variants}, got {actual.coverage.num_covered_function_variants}"
-        )
-
-    expected_coverage_gap = expected.num_function_variants_without_coverage()
-    actual_coverage_gap = actual.num_function_variants_without_coverage()
-    if actual_coverage_gap > expected_coverage_gap:
-        errors.append(
-            f"Coverage gap too large: {actual_coverage_gap} function variants with no tests, "
-            f"out of {actual.coverage.num_function_variants} total function variants. "
-            f"New functions should be added along with test cases that illustrate their behavior."
-        )
-
-    return errors
-
-
 # NOTE: this test is run as part of pre-commit hook
 def test_substrait_extension_coverage():
     script_dir = os.path.dirname(os.path.abspath(__file__))
@@ -92,7 +26,7 @@ def test_substrait_extension_coverage():
     ), f"{coverage.num_tests_with_no_matching_function} tests with no matching function"
 
     actual_baseline = generate_baseline(registry, coverage)
-    errors = compare_baselines(baseline, actual_baseline)
+    errors = actual_baseline.validate_against(baseline)
     assert not errors, (
         "\n".join(errors)
         + f"The baseline file does not match the current test coverage. "