Rescale runtimes on LCBench early stopping problems (#3367)

Summary: Pull Request resolved: #3367 On some of the problems, the typical epoch takes 40+ virtual seconds. This means that the simulator checks whether there is new data 40 * (50 epochs) * (30-50 trials) times. This makes these problems very slow to run. This diff rescales the runtimes so that the median epoch for each LCBench problem takes one virtual time step. This does have a substantive impact on performance because it means that for trials that run faster than the median, multiple epochs can elapse before we check for early stopping. However, this is realistic. Reviewed By: ltiao Differential Revision: D69616468 fbshipit-source-id: 69d3ce1c433b4f5a4a76b53ce365d8b6d2686db3
facebook · Feb 19, 2025 · f32943f · f32943f
1 parent 9ebd579
commit f32943f
Show file tree

Hide file tree

Showing 3 changed files with 88 additions and 6 deletions.
diff --git a/ax/benchmark/problems/surrogate/lcbench/early_stopping.py b/ax/benchmark/problems/surrogate/lcbench/early_stopping.py
@@ -85,6 +85,46 @@
     "volkert": 64.02699279785156,
 }
 
+# Chosen so that for the median parameterization, one step takes one virtual
+# second.
+RUNTIME_MULTIPLIERS = {
+    "APSFailure": 0.2498361853616576,
+    "Amazon_employee_access": 0.5596248586092226,
+    "Australian": 1.0748285031033658,
+    "Fashion-MNIST": 0.057873893027107395,
+    "KDDCup09_appetency": 0.2714829300383819,
+    "MiniBooNE": 0.20530997463252754,
+    "adult": 0.4423476551967684,
+    "airlines": 0.06473793535537586,
+    "albert": 0.06832237841522835,
+    "bank-marketing": 0.46517394252532845,
+    "blood-transfusion-service-center": 1.1101296769694071,
+    "car": 1.0536256049024968,
+    "christine": 0.025742718302424954,
+    "cnae-9": 0.08811760797353926,
+    "connect-4": 0.33489219890695243,
+    "covertype": 0.05049155246078877,
+    "credit-g": 1.0400726314123157,
+    "dionis": 0.0231601276801126,
+    "fabert": 0.08971358669025394,
+    "helena": 0.25008050673472376,
+    "higgs": 0.26990484596881176,
+    "jannis": 0.2828372999943685,
+    "jasmine": 0.7655180467265444,
+    "jungle_chess_2pcs_raw_endgame_complete": 0.47160243094434906,
+    "kc1": 1.0143178289557349,
+    "kr-vs-kp": 0.9390239320512418,
+    "mfeat-factors": 0.595676967891612,
+    "nomao": 0.4420599860962263,
+    "numerai28.6": 0.28377545234818863,
+    "phoneme": 0.9689051773179346,
+    "segment": 1.000676324600838,
+    "shuttle": 0.39362569776573014,
+    "sylvine": 0.9179851039769921,
+    "vehicle": 1.048848701347826,
+    "volkert": 0.28538440509808005,
+}
+
 
 class RegressorProtocol(Protocol):
     """
@@ -227,7 +267,7 @@ def evaluate_true(self, params: Mapping[str, TParamValue]) -> torch.Tensor:
     def step_runtime(self, params: Mapping[str, TParamValue]) -> float:
         X = pd.DataFrame.from_records(data=[params])
         Y = self.runtime_surrogate.predict(X=X)  # shape: (1,)
-        return Y.item()
+        return Y.item() * RUNTIME_MULTIPLIERS[self.dataset_name]
 
 
 def get_lcbench_early_stopping_benchmark_problem(

diff --git a/ax/benchmark/tests/problems/surrogate/lcbench/test_early_stopping.py b/ax/benchmark/tests/problems/surrogate/lcbench/test_early_stopping.py
@@ -7,17 +7,30 @@
 
 from unittest.mock import patch
 
+import numpy as np
+
 from ax.benchmark.problems.surrogate.lcbench.early_stopping import (
-    BASELINE_VALUES,
     get_lcbench_early_stopping_benchmark_problem,
+    LearningCurveBenchmarkTestFunction,
     OPTIMAL_VALUES,
+    RUNTIME_MULTIPLIERS,
+)
+from ax.benchmark.problems.surrogate.lcbench.utils import (
+    BASELINE_VALUES,
+    DEFAULT_METRIC_NAME,
 )
-from ax.benchmark.problems.surrogate.lcbench.utils import DEFAULT_METRIC_NAME
 from ax.utils.common.testutils import TestCase
 from ax.utils.testing.benchmark_stubs import get_mock_lcbench_data
+from pyre_extensions import assert_is_instance, none_throws
 
 
 class TestEarlyStoppingProblem(TestCase):
+    def setUp(self) -> None:
+        super().setUp()
+        self.early_stopping_path = (
+            get_lcbench_early_stopping_benchmark_problem.__module__
+        )
+
     def test_get_lcbench_early_stopping_problem(self) -> None:
         # Just test one problem for speed. We are mocking out the data load
         # anyway, so there is nothing to distinguish these problems from each
@@ -29,13 +42,12 @@ def test_get_lcbench_early_stopping_problem(self) -> None:
         seed = 27
         dataset_name = "credit-g"
 
-        early_stopping_path = get_lcbench_early_stopping_benchmark_problem.__module__
         with patch(
-            f"{early_stopping_path}.load_lcbench_data",
+            f"{self.early_stopping_path}.load_lcbench_data",
             return_value=get_mock_lcbench_data(),
         ) as mock_load_lcbench_data, patch(
             # Fitting a surrogate won't work with this small synthetic data
-            f"{early_stopping_path}._create_surrogate_regressor"
+            f"{self.early_stopping_path}._create_surrogate_regressor"
         ) as mock_create_surrogate_regressor:
             problem = get_lcbench_early_stopping_benchmark_problem(
                 dataset_name=dataset_name,
@@ -61,3 +73,32 @@ def test_get_lcbench_early_stopping_problem(self) -> None:
         self.assertIsNone(problem.step_runtime_function)
         self.assertEqual(problem.optimal_value, OPTIMAL_VALUES[dataset_name])
         self.assertEqual(problem.baseline_value, BASELINE_VALUES[dataset_name])
+
+    def test_step_scaling(self) -> None:
+        dataset_name = "car"
+        with (
+            patch(
+                f"{self.early_stopping_path}.load_lcbench_data",
+                return_value=get_mock_lcbench_data(),
+            ),
+            patch(
+                # Fitting a surrogate won't work with this small synthetic data
+                f"{self.early_stopping_path}._create_surrogate_regressor"
+            ),
+        ):
+            problem = get_lcbench_early_stopping_benchmark_problem(
+                dataset_name=dataset_name,
+            )
+
+        predicted_runtime = 1234.5
+        test_function = assert_is_instance(
+            problem.test_function, LearningCurveBenchmarkTestFunction
+        )
+        # pyre-fixme[8]: Incompatible attribute type -- not a bound method
+        test_function.runtime_surrogate.predict = lambda X: np.array(
+            [predicted_runtime]
+        )
+        self.assertEqual(
+            none_throws(problem.step_runtime_function)(params={"param": 0}),
+            predicted_runtime * RUNTIME_MULTIPLIERS[dataset_name],
+        )
diff --git a/ax/benchmark/tests/problems/test_lcbench_benchmark.py b/ax/benchmark/tests/problems/test_lcbench_benchmark.py
@@ -17,6 +17,7 @@
 
 
 class TestLCBenchBenchmark(TestCase):
+    @TestCase.ax_long_test(reason="Training random forest regressor")
     def test_lcbench_predictions(self) -> None:
         self.assertEqual(len(DEFAULT_AND_OPTIMAL_VALUES), 22)
         # NOTE: lots of tasks, so testing only one here o/w this is very slow