From 558a618af2f06ccc741c1c0ac3f97ce9ed5cb286 Mon Sep 17 00:00:00 2001
From: Elizabeth Santorella
Date: Thu, 13 Feb 2025 15:31:23 -0800
Subject: [PATCH] Rescale runtimes on LCBench early stopping problems (#3367)

Summary:
On some of the problems, the typical epoch takes 40+ virtual seconds. This
means that the simulator checks whether there is new data roughly
40 * (50 epochs) * (30-50 trials) times, i.e. 60,000-100,000 times, which
makes these problems very slow to run.

This diff rescales the runtimes so that the median epoch for each LCBench
problem takes one virtual time step. This does have a substantive impact on
performance: for trials that run faster than the median, multiple epochs can
elapse before we check for early stopping. However, this is realistic.
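
For intuition, each multiplier is the reciprocal of the median per-epoch
runtime for its dataset, so that the median epoch takes one virtual second.
A minimal sketch of how such a value could be derived (hypothetical:
`compute_runtime_multiplier` and `candidate_params` are illustrative names,
and the exact derivation procedure is not part of this diff):

    import numpy as np
    import pandas as pd

    def compute_runtime_multiplier(runtime_surrogate, candidate_params) -> float:
        # Hypothetical sketch: predict per-epoch runtimes (in virtual
        # seconds) for a sample of parameterizations, then return the factor
        # that makes the median epoch take one virtual second.
        X = pd.DataFrame.from_records(data=candidate_params)
        runtimes = runtime_surrogate.predict(X=X)  # shape: (n,)
        return 1.0 / float(np.median(runtimes))
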
Differential Revision: D69616468
---
 .../surrogate/lcbench/early_stopping.py      | 42 +++++++++++++++-
 .../surrogate/lcbench/test_early_stopping.py | 48 +++++++++++++++++--
 .../tests/problems/test_lcbench_benchmark.py |  1 +
 3 files changed, 85 insertions(+), 6 deletions(-)

diff --git a/ax/benchmark/problems/surrogate/lcbench/early_stopping.py b/ax/benchmark/problems/surrogate/lcbench/early_stopping.py
index e0d54677d19..51acf3cd593 100644
--- a/ax/benchmark/problems/surrogate/lcbench/early_stopping.py
+++ b/ax/benchmark/problems/surrogate/lcbench/early_stopping.py
@@ -85,6 +85,46 @@
     "volkert": 64.02699279785156,
 }
 
+# Chosen so that for the median parameterization, one step takes one virtual
+# second.
+RUNTIME_MULTIPLIERS = {
+    "APSFailure": 0.2498361853616576,
+    "Amazon_employee_access": 0.5596248586092226,
+    "Australian": 1.0748285031033658,
+    "Fashion-MNIST": 0.057873893027107395,
+    "KDDCup09_appetency": 0.2714829300383819,
+    "MiniBooNE": 0.20530997463252754,
+    "adult": 0.4423476551967684,
+    "airlines": 0.06473793535537586,
+    "albert": 0.06832237841522835,
+    "bank-marketing": 0.46517394252532845,
+    "blood-transfusion-service-center": 1.1101296769694071,
+    "car": 1.0536256049024968,
+    "christine": 0.025742718302424954,
+    "cnae-9": 0.08811760797353926,
+    "connect-4": 0.33489219890695243,
+    "covertype": 0.05049155246078877,
+    "credit-g": 1.0400726314123157,
+    "dionis": 0.0231601276801126,
+    "fabert": 0.08971358669025394,
+    "helena": 0.25008050673472376,
+    "higgs": 0.26990484596881176,
+    "jannis": 0.2828372999943685,
+    "jasmine": 0.7655180467265444,
+    "jungle_chess_2pcs_raw_endgame_complete": 0.47160243094434906,
+    "kc1": 1.0143178289557349,
+    "kr-vs-kp": 0.9390239320512418,
+    "mfeat-factors": 0.595676967891612,
+    "nomao": 0.4420599860962263,
+    "numerai28.6": 0.28377545234818863,
+    "phoneme": 0.9689051773179346,
+    "segment": 1.000676324600838,
+    "shuttle": 0.39362569776573014,
+    "sylvine": 0.9179851039769921,
+    "vehicle": 1.048848701347826,
+    "volkert": 0.28538440509808005,
+}
+
 
 class RegressorProtocol(Protocol):
     """
@@ -227,7 +267,7 @@ def evaluate_true(self, params: Mapping[str, TParamValue]) -> torch.Tensor:
     def step_runtime(self, params: Mapping[str, TParamValue]) -> float:
         X = pd.DataFrame.from_records(data=[params])
         Y = self.runtime_surrogate.predict(X=X)  # shape: (1,)
-        return Y.item()
+        return Y.item() * RUNTIME_MULTIPLIERS[self.dataset_name]
 
 
 def get_lcbench_early_stopping_benchmark_problem(
diff --git a/ax/benchmark/tests/problems/surrogate/lcbench/test_early_stopping.py b/ax/benchmark/tests/problems/surrogate/lcbench/test_early_stopping.py
index 769e0c674da..e2d595c32fc 100644
--- a/ax/benchmark/tests/problems/surrogate/lcbench/test_early_stopping.py
+++ b/ax/benchmark/tests/problems/surrogate/lcbench/test_early_stopping.py
@@ -7,17 +7,30 @@
 
 from unittest.mock import patch
 
+import numpy as np
+
 from ax.benchmark.problems.surrogate.lcbench.early_stopping import (
-    BASELINE_VALUES,
     get_lcbench_early_stopping_benchmark_problem,
+    LearningCurveBenchmarkTestFunction,
     OPTIMAL_VALUES,
+    RUNTIME_MULTIPLIERS,
+)
+from ax.benchmark.problems.surrogate.lcbench.utils import (
+    BASELINE_VALUES,
+    DEFAULT_METRIC_NAME,
 )
-from ax.benchmark.problems.surrogate.lcbench.utils import DEFAULT_METRIC_NAME
 from ax.utils.common.testutils import TestCase
 from ax.utils.testing.benchmark_stubs import get_mock_lcbench_data
+from pyre_extensions import assert_is_instance, none_throws
 
 
 class TestEarlyStoppingProblem(TestCase):
+    def setUp(self) -> None:
+        super().setUp()
+        self.early_stopping_path = (
+            get_lcbench_early_stopping_benchmark_problem.__module__
+        )
+
     def test_get_lcbench_early_stopping_problem(self) -> None:
         # Just test one problem for speed. We are mocking out the data load
         # anyway, so there is nothing to distinguish these problems from each
@@ -29,13 +42,12 @@ def test_get_lcbench_early_stopping_problem(self) -> None:
 
         seed = 27
         dataset_name = "credit-g"
-        early_stopping_path = get_lcbench_early_stopping_benchmark_problem.__module__
         with patch(
-            f"{early_stopping_path}.load_lcbench_data",
+            f"{self.early_stopping_path}.load_lcbench_data",
             return_value=get_mock_lcbench_data(),
         ) as mock_load_lcbench_data, patch(
             # Fitting a surrogate won't work with this small synthetic data
-            f"{early_stopping_path}._create_surrogate_regressor"
+            f"{self.early_stopping_path}._create_surrogate_regressor"
         ) as mock_create_surrogate_regressor:
             problem = get_lcbench_early_stopping_benchmark_problem(
                 dataset_name=dataset_name,
@@ -61,3 +73,29 @@ def test_get_lcbench_early_stopping_problem(self) -> None:
         self.assertIsNone(problem.step_runtime_function)
         self.assertEqual(problem.optimal_value, OPTIMAL_VALUES[dataset_name])
         self.assertEqual(problem.baseline_value, BASELINE_VALUES[dataset_name])
+
+    def test_step_scaling(self) -> None:
+        dataset_name = "car"
+        with patch(
+            f"{self.early_stopping_path}.load_lcbench_data",
+            return_value=get_mock_lcbench_data(),
+        ), patch(
+            # Fitting a surrogate won't work with this small synthetic data
+            f"{self.early_stopping_path}._create_surrogate_regressor"
+        ):
+            problem = get_lcbench_early_stopping_benchmark_problem(
+                dataset_name=dataset_name,
+            )
+
+        predicted_runtime = 1234.5
+        test_function = assert_is_instance(
+            problem.test_function, LearningCurveBenchmarkTestFunction
+        )
+        # pyre-fixme[8]: Incompatible attribute type -- not a bound method
+        test_function.runtime_surrogate.predict = lambda X: np.array(
+            [predicted_runtime]
+        )
+        self.assertEqual(
+            none_throws(problem.step_runtime_function)(params={"param": 0}),
+            predicted_runtime * RUNTIME_MULTIPLIERS[dataset_name],
+        )
diff --git a/ax/benchmark/tests/problems/test_lcbench_benchmark.py b/ax/benchmark/tests/problems/test_lcbench_benchmark.py
index f2e94011bfa..4377b8bfdf3 100644
--- a/ax/benchmark/tests/problems/test_lcbench_benchmark.py
+++ b/ax/benchmark/tests/problems/test_lcbench_benchmark.py
@@ -17,6 +17,7 @@
 
 
 class TestLCBenchBenchmark(TestCase):
+    @TestCase.ax_long_test(reason="Training random forest regressor")
     def test_lcbench_predictions(self) -> None:
         self.assertEqual(len(DEFAULT_AND_OPTIMAL_VALUES), 22)
         # NOTE: lots of tasks, so testing only one here o/w this is very slow