Skip to content

Commit

Permalink
Create google_research/batch_science.
Browse files Browse the repository at this point in the history
PiperOrigin-RevId: 238674255
  • Loading branch information
cshallue authored and copybara-github committed Mar 15, 2019
1 parent 55247eb commit fc7319c
Show file tree
Hide file tree
Showing 7 changed files with 2,631 additions and 0 deletions.
669 changes: 669 additions & 0 deletions batch_science/README.md

Large diffs are not rendered by default.

15 changes: 15 additions & 0 deletions batch_science/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# coding=utf-8
# Copyright 2019 The Google Research Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

183 changes: 183 additions & 0 deletions batch_science/data_loader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,183 @@
# coding=utf-8
# Copyright 2019 The Google Research Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Functions for data loading."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import json
import os

import pandas as pd


def _list_subdirs(base_dir):
"""Lists all subdirectories in base_dir, raising ValueError if none exist."""
subdirs = []
for dir_entry in os.listdir(base_dir):
if os.path.isdir(os.path.join(base_dir, dir_entry)):
subdirs.append(dir_entry)

if not subdirs:
raise ValueError("No subdirectories found in {}".format(base_dir))

return subdirs


def load_study(study_dir,
               num_trials=None,
               load_complete_trials=True,
               load_incomplete_trials=False,
               load_infeasible_trials=False):
  """Loads measurements for all trials in a study.

  A study is a metaparameter search for a given workload and batch size. A trial
  is a training run of a particular metaparameter configuration within the
  study.

  Args:
    study_dir: Directory containing 'study.json' and subdirectories
      corresponding to individual trials.
    num_trials: The number of trials to load. Default is to load all trials.
    load_complete_trials: Whether to load complete trials.
    load_incomplete_trials: Whether to load incomplete trials.
    load_infeasible_trials: Whether to load infeasible trials.

  Returns:
    study_metadata: A dict of study metadata.
    study_measurements: A Pandas DataFrame indexed by (trial_id, step).

  Raises:
    ValueError: If none of load_complete_trials, load_incomplete_trials, or
      load_infeasible_trials is True, if no trials with an allowed status
      exist, or if fewer than num_trials matching trials are found.
  """
  # Determine which trial statuses to load.
  status_whitelist = set()
  if load_complete_trials:
    status_whitelist.add("COMPLETE")
  if load_incomplete_trials:
    status_whitelist.add("INCOMPLETE")
  if load_infeasible_trials:
    status_whitelist.add("INFEASIBLE")
  if not status_whitelist:
    raise ValueError(
        "At least one of load_complete_trials, load_incomplete_trials, or "
        "load_infeasible_trials must be True.")

  trial_ids = []
  measurements_tables = []

  # Load the study metadata.
  with open(os.path.join(study_dir, "study.json")) as study_file:
    study_metadata = json.load(study_file)
  study_metadata["trials"] = collections.OrderedDict()

  # Find all trial directories.
  trial_dirs = _list_subdirs(study_dir)
  trial_dirs.sort(key=int)  # Trial directory names are integers.

  for trial_dir in trial_dirs:
    # Load trial metadata.
    trial_dir = os.path.join(study_dir, trial_dir)
    with open(os.path.join(trial_dir, "metadata.json")) as metadata_file:
      trial_metadata = json.load(metadata_file)

    # Ignore trials with the wrong status. status_whitelist is guaranteed
    # non-empty here (we raised ValueError above otherwise), so no extra
    # truthiness check is needed.
    status = trial_metadata["status"]
    if status not in status_whitelist:
      continue

    # Add trial metadata to the study metadata.
    trial_id = trial_metadata["trial_id"]
    trial_ids.append(trial_id)
    study_metadata["trials"][trial_id] = trial_metadata

    # Read the measurements.
    measurements_file = os.path.join(trial_dir, "measurements.csv")
    measurements_tables.append(pd.read_csv(measurements_file, index_col="step"))

    if num_trials and len(trial_ids) >= num_trials:
      break  # Already loaded the required number of trials.

  # Validate the number of trials.
  if not trial_ids:
    raise ValueError("No trials with status {} found in {}".format(
        list(status_whitelist), study_dir))

  if num_trials and len(trial_ids) != num_trials:
    raise ValueError(
        "Requested {} trials with status {}, but found only {} trials in {}"
        .format(num_trials, list(status_whitelist), len(trial_ids), study_dir))

  study_measurements = pd.concat(
      measurements_tables, keys=trial_ids, names=["trial_id"])
  return study_metadata, study_measurements


def load_workload(workload_dir,
                  num_trials=None,
                  load_complete_trials=True,
                  load_incomplete_trials=False,
                  load_infeasible_trials=False):
  """Loads all studies within a given workload.

  A workload is a triplet of (dataset, model, optimizer). A study is a
  metaparameter search for a given workload and batch size. A study is
  comprised of trials. A trial is a training run of a particular metaparameter
  configuration.

  Args:
    workload_dir: Directory containing subdirectories corresponding to
      individual studies.
    num_trials: The number of trials to load per study. Default is to load all
      trials.
    load_complete_trials: Whether to load complete trials.
    load_incomplete_trials: Whether to load incomplete trials.
    load_infeasible_trials: Whether to load infeasible trials.

  Returns:
    workload_metadata: A dict containing the metadata for each study.
    workload_table: A Pandas DataFrame indexed by (batch_size, trial_id, step).
  """
  workload_metadata = collections.OrderedDict()
  per_study_tables = []
  per_study_batch_sizes = []

  # Study directory names are integers, so order them numerically.
  for study_subdir in sorted(_list_subdirs(workload_dir), key=int):
    study_metadata, study_measurements = load_study(
        os.path.join(workload_dir, study_subdir),
        num_trials=num_trials,
        load_complete_trials=load_complete_trials,
        load_incomplete_trials=load_incomplete_trials,
        load_infeasible_trials=load_infeasible_trials)

    batch_size = int(study_metadata["batch_size"])
    per_study_batch_sizes.append(batch_size)
    per_study_tables.append(study_measurements)
    workload_metadata[batch_size] = study_metadata

  # study_metadata still refers to the last study loaded; model/dataset/
  # optimizer are the same across all studies of a workload.
  print("Loaded {} batch sizes for {} on {} with optimizer {}".format(
      len(per_study_batch_sizes), study_metadata["model"],
      study_metadata["dataset"], study_metadata["optimizer"]))

  workload_table = pd.concat(
      per_study_tables, keys=per_study_batch_sizes, names=["batch_size"])
  return workload_metadata, workload_table
129 changes: 129 additions & 0 deletions batch_science/measurement_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
# coding=utf-8
# Copyright 2019 The Google Research Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Helper functions for manipulating DataFrames of trial measurements."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import operator

import pandas as pd


def get_index_values(df, level):
  """Gets index values from a DataFrame MultiIndex.

  Args:
    df: A DataFrame.
    level: The integer position of the level in the MultiIndex, or the name of
      the level.

  Returns:
    Vector of index values.
  """
  level_index = df.index.get_level_values(level)
  return level_index.values


def apply_step_budget(measurements, step_budget):
  """Filters measurements to those satisfying a budget in terms of steps."""
  steps = measurements.index.get_level_values("step").values
  return measurements[steps <= step_budget]


def apply_example_budget(measurements, example_budget):
  """Filters measurements to those satisfying a budget in terms of examples."""
  # Examples processed by a given step is batch_size * step.
  batch_sizes = measurements.index.get_level_values("batch_size").values
  steps = measurements.index.get_level_values("step").values
  within_budget = batch_sizes * steps <= example_budget
  return measurements[within_budget]


def compute_steps_to_result(measurements,
                            objective_col_name,
                            threshold,
                            maximize=False,
                            group_by="batch_size"):
  """Returns the measurements that reached the threshold in the fewest steps.

  Args:
    measurements: DataFrame of measurements indexed by at least (trial_id,
      step).
    objective_col_name: Column name of the objective metric.
    threshold: Target value of the objective metric.
    maximize: Whether the goal is to maximize (as opposed to minimize) the
      objective metric.
    group_by: Any valid first argument to DataFrame.groupby, for example a
      column name or list of column names. If empty or None, the operation is
      performed over the entire measurements table.

  Returns:
    A DataFrame with either 0 or 1 row per group, which is the measurement that
    reached the threshold in the fewest steps for that group (if any).
  """
  if group_by:
    # Handle each group independently and stack the per-group results.
    per_group_results = [
        compute_steps_to_result(group, objective_col_name, threshold, maximize,
                                None)
        for _, group in measurements.groupby(group_by)
    ]
    return pd.concat(per_group_results)

  # Order rows by step, breaking ties by trial id, so the earliest qualifying
  # measurement comes first.
  ordered = measurements.sort_index(level=["step", "trial_id"])

  # Keep only rows whose objective value beats the threshold.
  objective = ordered[objective_col_name]
  if maximize:
    reached = ordered[objective > threshold]
  else:
    reached = ordered[objective < threshold]

  if reached.empty:
    return reached  # No measurement satisfied the threshold.

  # The first qualifying row is the one reached in the fewest steps.
  return reached.iloc[[0]]


def get_best_measurement(measurements,
                         objective_col_name,
                         maximize=False,
                         group_by="batch_size"):
  """Returns the measurement corresponding to the best objective value.

  Args:
    measurements: DataFrame of measurements.
    objective_col_name: Column name of the objective metric.
    maximize: Whether the goal is to maximize (as opposed to minimize) the
      objective metric.
    group_by: Any valid first argument to DataFrame.groupby, for example a
      column name or list of column names. If empty or None, the operation is
      performed over the entire measurements table.

  Returns:
    A DataFrame with 1 row per group, which is the measurement corresponding to
    the best objective value for that group.
  """
  if group_by:
    # Handle each group independently and stack the per-group results.
    per_group_results = [
        get_best_measurement(group, objective_col_name, maximize, None)
        for _, group in measurements.groupby(group_by)
    ]
    return pd.concat(per_group_results)

  # The best row is first after ranking by the objective (descending when
  # maximizing).
  ranked = measurements.sort_values(objective_col_name, ascending=not maximize)
  return ranked.iloc[[0]]
Loading

0 comments on commit fc7319c

Please sign in to comment.