Skip to content

Commit

Permalink
Create google_research/batch_science.
Browse files Browse the repository at this point in the history
PiperOrigin-RevId: 238674255
  • Loading branch information
cshallue authored and copybara-github committed Mar 15, 2019
1 parent 55247eb commit fc7319c
Show file tree
Hide file tree
Showing 7 changed files with 2,631 additions and 0 deletions.
669 changes: 669 additions & 0 deletions batch_science/README.md

Large diffs are not rendered by default.

15 changes: 15 additions & 0 deletions batch_science/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# coding=utf-8
# Copyright 2019 The Google Research Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

183 changes: 183 additions & 0 deletions batch_science/data_loader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,183 @@
# coding=utf-8
# Copyright 2019 The Google Research Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Functions for data loading."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import json
import os

import pandas as pd


def _list_subdirs(base_dir):
"""Lists all subdirectories in base_dir, raising ValueError if none exist."""
subdirs = []
for dir_entry in os.listdir(base_dir):
if os.path.isdir(os.path.join(base_dir, dir_entry)):
subdirs.append(dir_entry)

if not subdirs:
raise ValueError("No subdirectories found in {}".format(base_dir))

return subdirs


def load_study(study_dir,
               num_trials=None,
               load_complete_trials=True,
               load_incomplete_trials=False,
               load_infeasible_trials=False):
  """Loads measurements for all trials in a study.

  A study is a metaparameter search for a given workload and batch size. A trial
  is a training run of a particular metaparameter configuration within the
  study.

  Args:
    study_dir: Directory containing 'study.json' and subdirectories
      corresponding to individual trials.
    num_trials: The number of trials to load. Default is to load all trials.
    load_complete_trials: Whether to load complete trials.
    load_incomplete_trials: Whether to load incomplete trials.
    load_infeasible_trials: Whether to load infeasible trials.

  Returns:
    study_metadata: A dict of study metadata.
    study_measurements: A Pandas DataFrame indexed by (trial_id, step).

  Raises:
    ValueError: If none of load_complete_trials, load_incomplete_trials, or
      load_infeasible_trials is True, if no trials with an allowed status
      exist, or if fewer than num_trials matching trials are found.
  """
  # Determine which trial statuses to load.
  status_whitelist = set()
  if load_complete_trials:
    status_whitelist.add("COMPLETE")
  if load_incomplete_trials:
    status_whitelist.add("INCOMPLETE")
  if load_infeasible_trials:
    status_whitelist.add("INFEASIBLE")
  if not status_whitelist:
    raise ValueError(
        "At least one of load_complete_trials, load_incomplete_trials, or "
        "load_infeasible_trials must be True.")

  trial_ids = []
  measurements_tables = []

  # Load the study metadata.
  with open(os.path.join(study_dir, "study.json")) as study_file:
    study_metadata = json.load(study_file)
  study_metadata["trials"] = collections.OrderedDict()

  # Find all trial directories.
  trial_dirs = _list_subdirs(study_dir)
  trial_dirs.sort(key=int)  # Trial directory names are integers.

  for trial_dir in trial_dirs:
    # Load trial metadata.
    trial_dir = os.path.join(study_dir, trial_dir)
    with open(os.path.join(trial_dir, "metadata.json")) as metadata_file:
      trial_metadata = json.load(metadata_file)

    # Ignore trials with the wrong status. status_whitelist is guaranteed
    # non-empty here (we raised ValueError above otherwise), so no extra
    # truthiness check is needed.
    status = trial_metadata["status"]
    if status not in status_whitelist:
      continue

    # Add trial metadata to the study metadata.
    trial_id = trial_metadata["trial_id"]
    trial_ids.append(trial_id)
    study_metadata["trials"][trial_id] = trial_metadata

    # Read the measurements.
    measurements_file = os.path.join(trial_dir, "measurements.csv")
    measurements_tables.append(pd.read_csv(measurements_file, index_col="step"))

    if num_trials and len(trial_ids) >= num_trials:
      break  # Already loaded the required number of trials.

  # Validate the number of trials.
  if not trial_ids:
    raise ValueError("No trials with status {} found in {}".format(
        list(status_whitelist), study_dir))

  if num_trials and len(trial_ids) != num_trials:
    raise ValueError(
        "Requested {} trials with status {}, but found only {} trials in {}"
        .format(num_trials, list(status_whitelist), len(trial_ids), study_dir))

  study_measurements = pd.concat(
      measurements_tables, keys=trial_ids, names=["trial_id"])
  return study_metadata, study_measurements


def load_workload(workload_dir,
                  num_trials=None,
                  load_complete_trials=True,
                  load_incomplete_trials=False,
                  load_infeasible_trials=False):
  """Loads all studies within a given workload.

  A workload is a triplet of (dataset, model, optimizer). A study is a
  metaparameter search for a given workload and batch size. A study is
  comprised of trials. A trial is a training run of a particular metaparameter
  configuration.

  Args:
    workload_dir: Directory containing subdirectories corresponding to
      individual studies.
    num_trials: The number of trials to load per study. Default is to load all
      trials.
    load_complete_trials: Whether to load complete trials.
    load_incomplete_trials: Whether to load incomplete trials.
    load_infeasible_trials: Whether to load infeasible trials.

  Returns:
    workload_metadata: A dict containing the metadata for each study.
    workload_table: A Pandas DataFrame indexed by (batch_size, trial_id, step).
  """
  workload_metadata = collections.OrderedDict()
  per_study_tables = []
  per_study_batch_sizes = []

  # Study directory names are integers, so order them numerically.
  for study_subdir in sorted(_list_subdirs(workload_dir), key=int):
    study_metadata, study_measurements = load_study(
        os.path.join(workload_dir, study_subdir),
        num_trials=num_trials,
        load_complete_trials=load_complete_trials,
        load_incomplete_trials=load_incomplete_trials,
        load_infeasible_trials=load_infeasible_trials)

    batch_size = int(study_metadata["batch_size"])
    per_study_batch_sizes.append(batch_size)
    per_study_tables.append(study_measurements)
    workload_metadata[batch_size] = study_metadata

  # study_metadata still refers to the last study loaded; model/dataset/
  # optimizer are the same across all studies of a workload.
  print("Loaded {} batch sizes for {} on {} with optimizer {}".format(
      len(per_study_batch_sizes), study_metadata["model"],
      study_metadata["dataset"], study_metadata["optimizer"]))

  workload_table = pd.concat(
      per_study_tables, keys=per_study_batch_sizes, names=["batch_size"])
  return workload_metadata, workload_table
129 changes: 129 additions & 0 deletions batch_science/measurement_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
# coding=utf-8
# Copyright 2019 The Google Research Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Helper functions for manipulating DataFrames of trial measurements."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import operator

import pandas as pd


def get_index_values(df, level):
  """Gets index values from a DataFrame MultiIndex.

  Args:
    df: A DataFrame.
    level: The integer position of the level in the MultiIndex, or the name of
      the level.

  Returns:
    Vector of index values.
  """
  level_index = df.index.get_level_values(level)
  return level_index.values


def apply_step_budget(measurements, step_budget):
  """Filters measurements to those satisfying a budget in terms of steps."""
  steps = measurements.index.get_level_values("step").values
  return measurements[steps <= step_budget]


def apply_example_budget(measurements, example_budget):
  """Filters measurements to those satisfying a budget in terms of examples."""
  # Examples processed by a given step is batch_size * step.
  batch_sizes = measurements.index.get_level_values("batch_size").values
  steps = measurements.index.get_level_values("step").values
  within_budget = batch_sizes * steps <= example_budget
  return measurements[within_budget]


def compute_steps_to_result(measurements,
                            objective_col_name,
                            threshold,
                            maximize=False,
                            group_by="batch_size"):
  """Returns the measurements that reached the threshold in the fewest steps.

  Args:
    measurements: DataFrame of measurements indexed by at least (trial_id,
      step).
    objective_col_name: Column name of the objective metric.
    threshold: Target value of the objective metric.
    maximize: Whether the goal is to maximize (as opposed to minimize) the
      objective metric.
    group_by: Any valid first argument to DataFrame.groupby, for example a
      column name or list of column names. If empty or None, the operation is
      performed over the entire measurements table.

  Returns:
    A DataFrame with either 0 or 1 row per group, which is the measurement that
    reached the threshold in the fewest steps for that group (if any).
  """
  if group_by:
    # Handle each group independently and stack the per-group results.
    per_group_results = [
        compute_steps_to_result(group, objective_col_name, threshold, maximize,
                                None)
        for _, group in measurements.groupby(group_by)
    ]
    return pd.concat(per_group_results)

  # Order rows by step, breaking ties by trial id, so the earliest qualifying
  # measurement comes first.
  ordered = measurements.sort_index(level=["step", "trial_id"])

  # Keep only rows whose objective value beats the threshold.
  objective = ordered[objective_col_name]
  if maximize:
    reached = ordered[objective > threshold]
  else:
    reached = ordered[objective < threshold]

  if reached.empty:
    return reached  # No measurement satisfied the threshold.

  # The first qualifying row is the one reached in the fewest steps.
  return reached.iloc[[0]]


def get_best_measurement(measurements,
                         objective_col_name,
                         maximize=False,
                         group_by="batch_size"):
  """Returns the measurement corresponding to the best objective value.

  Args:
    measurements: DataFrame of measurements.
    objective_col_name: Column name of the objective metric.
    maximize: Whether the goal is to maximize (as opposed to minimize) the
      objective metric.
    group_by: Any valid first argument to DataFrame.groupby, for example a
      column name or list of column names. If empty or None, the operation is
      performed over the entire measurements table.

  Returns:
    A DataFrame with 1 row per group, which is the measurement corresponding to
    the best objective value for that group.
  """
  if group_by:
    # Handle each group independently and stack the per-group results.
    per_group_results = [
        get_best_measurement(group, objective_col_name, maximize, None)
        for _, group in measurements.groupby(group_by)
    ]
    return pd.concat(per_group_results)

  # The best row is first after ranking by the objective (descending when
  # maximizing).
  ranked = measurements.sort_values(objective_col_name, ascending=not maximize)
  return ranked.iloc[[0]]
Loading

0 comments on commit fc7319c

Please sign in to comment.