From 7489905370768ba489b141ae962fb8bc58469d6e Mon Sep 17 00:00:00 2001 From: Adam Gleave Date: Mon, 3 Feb 2020 19:07:04 -0800 Subject: [PATCH] Model comparison: NNLS initialization and alternating minimization (#13) * Correct notation: alternating minimization not expectation maximization * Prototype of alternating-minimization for affine parameters * Relocate L2-error minimizing affine shaping * Bugfix: include fit_kind in output directory * Affine transform: clarify parameter names, expose public setters * Alternating minimization: start cleaning up, reduce code duplication * Use non-negative least squares to find affine transformation * Change dataset generator to not assume batch size * Make model_comparison config more flexible * Fix up runner, config * Use least_l2_affine for pretrain * Make Regress*Model logging consistent; add input validation * Add unit test for least_l2_affine * Fix int vs float problem * Bugfix: handle total_timesteps==0 * Add large-form plot support * Increase Hopper running control cost by 10x (to make comparable % of reward as backflip control) * Always use random transitions for PointMass, hyperparameter sweep over epoch timesteps * runners: make output directory configurable * visualize bugfix: handle one column/one row figures * Restore ground truth ctrl coef to match Gym reward; downscale backflip reward instead * Make non-alternating maximization default again (alternating does not help much in tests) * Add test for both standard and alternating model comparison * Hardcoded comparison runner: remove epoch sweep * Use EVAL_OUTPUT_ROOT instead of ${HOME}/output * Fix typo in launch_docker.sh * Reformat visualize.py * Improve docstring --- runners/common.sh | 8 +- runners/comparison/hardcoded.sh | 12 +- runners/comparison/learnt.sh | 4 +- runners/eval/greedy_pm_hardcoded.sh | 2 +- runners/eval/greedy_pm_irl.sh | 4 +- runners/eval/learnt.sh | 8 +- runners/eval/static.sh | 2 +- runners/irl/train_irl.sh | 4 +- runners/preferences/hyper_sweep.sh | 4 +- runners/preferences/train_preferences.sh | 4 +- runners/regress/train_regress.sh | 4 +- runners/rl/expert_demos.sh | 2 +- runners/rl/transfer.sh | 4 +- runners/transfer_point_maze.sh | 6 +- runners/visualize/visualize_pm_reward.sh | 6 +- .../analysis/plot_divergence_heatmap.py | 19 +- .../analysis/plot_gridworld_divergence.py | 2 +- src/evaluating_rewards/analysis/visualize.py | 5 +- src/evaluating_rewards/comparisons.py | 162 ++++++++++++++-- src/evaluating_rewards/datasets.py | 60 +++--- src/evaluating_rewards/envs/mujoco.py | 2 +- .../experiments/comparisons.py | 2 +- .../experiments/point_mass_analysis.py | 4 +- .../experiments/synthetic.py | 19 +- src/evaluating_rewards/rewards.py | 178 ++++++++++++------ .../scripts/model_comparison.py | 93 +++++---- .../scripts/train_regress.py | 8 +- src/evaluating_rewards/tabular.py | 31 +-- tests/test_comparisons.py | 4 +- tests/test_rewards.py | 45 ++++- tests/test_scripts.py | 30 +-- tests/test_synthetic.py | 13 +- 32 files changed, 496 insertions(+), 255 deletions(-) diff --git a/runners/common.sh b/runners/common.sh index a814cdd..d93586c 100755 --- a/runners/common.sh +++ b/runners/common.sh @@ -23,12 +23,12 @@ function call_script { function learnt_model { if [[ $# -ne 1 ]]; then echo "usage: $0 " - echo "model prefix must be relative to ${OUTPUT_ROOT}" + echo "model prefix must be relative to ${EVAL_OUTPUT_ROOT}" exit 1 fi model_prefix=$1 - learnt_model_dir=${OUTPUT_ROOT}/${model_prefix} + learnt_model_dir=${EVAL_OUTPUT_ROOT}/${model_prefix} case ${model_prefix} 
in train_adversarial) @@ -53,4 +53,6 @@ eval "$(${ENV_REWARD_CMD} 2>/dev/null)" ENVS="${!REWARDS_BY_ENV[@]}" echo "Loaded mappings for environments ${ENVS}" -OUTPUT_ROOT=/mnt/eval_reward/data \ No newline at end of file +if [[ "${EVAL_OUTPUT_ROOT}" == "" ]]; then + EVAL_OUTPUT_ROOT=$HOME/output +fi diff --git a/runners/comparison/hardcoded.sh b/runners/comparison/hardcoded.sh index 1deacb2..00d37db 100755 --- a/runners/comparison/hardcoded.sh +++ b/runners/comparison/hardcoded.sh @@ -26,12 +26,18 @@ for env_name in "${!REWARDS_BY_ENV[@]}"; do types=${REWARDS_BY_ENV[$env_name]} env_name_sanitized=$(echo ${env_name} | sed -e 's/\//_/g') types_sanitized=$(echo ${types} | sed -e 's/\//_/g') - parallel --header : --results $HOME/output/parallel/comparison/hardcoded_mujoco \ - ${TRAIN_CMD} env_name=${env_name} \ + + named_configs="" + if [[ ${env_name} == "evaluating_rewards/PointMassLine-v0" ]]; then + named_configs="dataset_random_transition" + fi + + parallel --header : --results ${EVAL_OUTPUT_ROOT}/parallel/comparison/hardcoded_mujoco \ + ${TRAIN_CMD} env_name=${env_name} ${named_configs} \ seed={seed} \ source_reward_type={source_reward_type} \ target_reward_type={target_reward_type} \ - log_dir=${HOME}/output/comparison/hardcoded/${env_name_sanitized}/{source_reward_type_sanitized}_vs_{target_reward_type_sanitized}_seed{seed} \ + log_dir=${EVAL_OUTPUT_ROOT}/comparison/hardcoded/${env_name_sanitized}/{source_reward_type_sanitized}_vs_{target_reward_type_sanitized}_seed{seed} \ ::: source_reward_type ${types} \ :::+ source_reward_type_sanitized ${types_sanitized} \ ::: target_reward_type ${types} \ diff --git a/runners/comparison/learnt.sh b/runners/comparison/learnt.sh index 474398e..dc8b31b 100755 --- a/runners/comparison/learnt.sh +++ b/runners/comparison/learnt.sh @@ -35,12 +35,12 @@ for env_name in ${ENVS}; do echo "Models: ${MODELS}" echo "Hardcoded rewards: ${types}" - parallel --header : --results ${OUTPUT_ROOT}/parallel/comparison/learnt/${env_name_sanitized} \ + parallel --header : --results ${EVAL_OUTPUT_ROOT}/parallel/comparison/learnt/${env_name_sanitized} \ ${TRAIN_CMD} env_name=${env_name} seed={seed} \ source_reward_type=${source_reward_type} \ source_reward_path=${learnt_model_dir}/${env_name_sanitized}/{source_reward}/${model_name} \ target_reward_type={target_reward} {named_config} \ - log_dir=${OUTPUT_ROOT}/comparison/${model_prefix}/${env_name_sanitized}/{source_reward}/match_{named_config}_to_{target_reward_sanitized}_seed{seed} \ + log_dir=${EVAL_OUTPUT_ROOT}/comparison/${model_prefix}/${env_name_sanitized}/{source_reward}/match_{named_config}_to_{target_reward_sanitized}_seed{seed} \ ::: source_reward ${MODELS} \ ::: target_reward ${types} \ :::+ target_reward_sanitized ${types_sanitized} \ diff --git a/runners/eval/greedy_pm_hardcoded.sh b/runners/eval/greedy_pm_hardcoded.sh index de9f06d..ffc0c00 100755 --- a/runners/eval/greedy_pm_hardcoded.sh +++ b/runners/eval/greedy_pm_hardcoded.sh @@ -20,7 +20,7 @@ GREEDY_REWARD_MODELS="PointMassGroundTruth-v0:None \ PointMassSparse-v0:None \ PointMassDense-v0:None" -parallel --header : --results $HOME/output/parallel/greedy_pm_hardcoded \ +parallel --header : --results ${EVAL_OUTPUT_ROOT}/parallel/greedy_pm_hardcoded \ ${EVAL_POLICY_CMD} policy_type=evaluating_rewards/MCGreedy-v0 \ env_name={env} policy_path={policy_path} \ ::: env ${PM_ENVS} \ diff --git a/runners/eval/greedy_pm_irl.sh b/runners/eval/greedy_pm_irl.sh index d5d021c..58099b8 100755 --- a/runners/eval/greedy_pm_irl.sh +++ b/runners/eval/greedy_pm_irl.sh @@ 
-18,13 +18,13 @@ DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" for env in ${ENVS}; do env_sanitized=$(echo ${env} | sed -e 's/\//_/g') - reward_paths=$HOME/output/train_adversarial/${env_sanitized}/*/final/discrim/reward_net + reward_paths=${EVAL_OUTPUT_ROOT}/train_adversarial/${env_sanitized}/*/final/discrim/reward_net policy_paths="" for rew_path in ${reward_paths}; do policy_paths="${policy_paths} BasicShapedRewardNet_shaped:${rew_path}" policy_paths="${policy_paths} BasicShapedRewardNet_unshaped:${rew_path}" done - parallel --header : --results $HOME/output/parallel/greedy_pm_irl \ + parallel --header : --results ${EVAL_OUTPUT_ROOT}/parallel/greedy_pm_irl \ ${EVAL_POLICY_CMD} env_name=${env} policy_type=evaluating_rewards/MCGreedy-v0 \ policy_path={policy_path} \ ::: policy_path ${policy_paths} diff --git a/runners/eval/learnt.sh b/runners/eval/learnt.sh index 68c9659..5ed9314 100755 --- a/runners/eval/learnt.sh +++ b/runners/eval/learnt.sh @@ -18,12 +18,12 @@ DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" if [[ $# -ne 1 ]]; then echo "usage: $0 " - echo "policy prefix must be relative to ${OUTPUT_ROOT}" + echo "policy prefix must be relative to ${EVAL_OUTPUT_ROOT}" exit 1 fi policy_prefix=$1 -policy_dir=${OUTPUT_ROOT}/${policy_prefix} +policy_dir=${EVAL_OUTPUT_ROOT}/${policy_prefix} model_name="policies/final" for env_name in ${ENVS}; do @@ -38,11 +38,11 @@ for env_name in ${ENVS}; do echo "Policies: ${policies}" echo "Hardcoded rewards: ${types}" - parallel --header : --results $HOME/output/parallel/learnt \ + parallel --header : --results ${EVAL_OUTPUT_ROOT}/parallel/learnt \ ${EVAL_POLICY_CMD} env_name=${env_name} policy_type=ppo2 \ reward_type={reward_type} \ policy_path=${policy_dir}/${env_name_sanitized}/{policy_path}/${model_name} \ - log_dir=${OUTPUT_ROOT}/eval/${policy_prefix}/${env_name_sanitized}/{policy_path}/eval_under_{reward_type_sanitized} \ + log_dir=${EVAL_OUTPUT_ROOT}/eval/${policy_prefix}/${env_name_sanitized}/{policy_path}/eval_under_{reward_type_sanitized} \ ::: reward_type ${types} \ :::+ reward_type_sanitized ${types_sanitized} \ ::: policy_path ${policies} diff --git a/runners/eval/static.sh b/runners/eval/static.sh index 7d18cb3..ad02838 100755 --- a/runners/eval/static.sh +++ b/runners/eval/static.sh @@ -18,7 +18,7 @@ DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" POLICY_TYPES="random zero" -parallel --header : --results $HOME/output/parallel/static \ +parallel --header : --results ${EVAL_OUTPUT_ROOT}/parallel/static \ ${EVAL_POLICY_CMD} env_name={env} policy_type={policy_type} \ ::: env ${ENVS} \ ::: policy_type ${POLICY_TYPES} diff --git a/runners/irl/train_irl.sh b/runners/irl/train_irl.sh index 711f765..13a6d93 100755 --- a/runners/irl/train_irl.sh +++ b/runners/irl/train_irl.sh @@ -20,11 +20,11 @@ TRAIN_CMD=$(call_script "train_adversarial" "with") for env in ${ENVS}; do env_sanitized=$(echo ${env} | sed -e 's/\//_/g') - parallel --header : --results $HOME/output/parallel/train_irl \ + parallel --header : --results ${EVAL_OUTPUT_ROOT}/parallel/train_irl \ ${TRAIN_CMD} env_name=${env} seed={seed} \ init_trainer_kwargs.reward_kwargs.state_only={state_only} \ rollout_path={data_path}/rollouts/final.pkl \ - ::: data_path $HOME/output/expert_demos/${env_sanitized}/* \ + ::: data_path ${EVAL_OUTPUT_ROOT}/expert_demos/${env_sanitized}/* \ ::: state_only True False \ ::: seed 0 1 2 done diff --git a/runners/preferences/hyper_sweep.sh b/runners/preferences/hyper_sweep.sh index 
230c860..d4ca7f8 100755 --- a/runners/preferences/hyper_sweep.sh +++ b/runners/preferences/hyper_sweep.sh @@ -23,12 +23,12 @@ PointMassDense-v0 PointMassSparse-v0 " -parallel --header : --results $HOME/output/parallel/train_preferences_hyper \ +parallel --header : --results ${EVAL_OUTPUT_ROOT}/parallel/train_preferences_hyper \ ${TRAIN_CMD} env_name=evaluating_rewards/PointMassLine-v0 \ seed={seed} target_reward_type=evaluating_rewards/{target_reward} \ batch_timesteps={batch_timesteps} trajectory_length={trajectory_length} \ learning_rate={lr} total_timesteps=5e6 \ - log_dir=${HOME}/output/train_preferences_hyper/{target_reward}/batch{batch_timesteps}_of_{trajectory_length}_lr{lr}/{seed} \ + log_dir=${EVAL_OUTPUT_ROOT}/train_preferences_hyper/{target_reward}/batch{batch_timesteps}_of_{trajectory_length}_lr{lr}/{seed} \ ::: target_reward ${TARGET_REWARDS} \ ::: batch_timesteps 500 2500 10000 50000 250000 \ ::: trajectory_length 1 5 25 100 \ diff --git a/runners/preferences/train_preferences.sh b/runners/preferences/train_preferences.sh index 6a50444..c2c709a 100755 --- a/runners/preferences/train_preferences.sh +++ b/runners/preferences/train_preferences.sh @@ -25,10 +25,10 @@ for env_name in "${!REWARDS_BY_ENV[@]}"; do env_name_sanitized=$(echo ${env_name} | sed -e 's/\//_/g') types_sanitized=$(echo ${types} | sed -e 's/\//_/g') - parallel --header : --results $HOME/output/parallel/train_preferences/${env_name} \ + parallel --header : --results ${EVAL_OUTPUT_ROOT}/parallel/train_preferences/${env_name} \ ${TRAIN_CMD} env_name=${env_name} \ seed={seed} target_reward_type={target_reward} \ - log_dir=${HOME}/output/train_preferences/${env_name_sanitized}/{target_reward_sanitized}/{seed} \ + log_dir=${EVAL_OUTPUT_ROOT}/train_preferences/${env_name_sanitized}/{target_reward_sanitized}/{seed} \ ::: target_reward ${types} \ :::+ target_reward_sanitized ${types_sanitized} \ ::: seed 0 1 2 diff --git a/runners/regress/train_regress.sh b/runners/regress/train_regress.sh index f97eb77..2386c7e 100755 --- a/runners/regress/train_regress.sh +++ b/runners/regress/train_regress.sh @@ -25,10 +25,10 @@ for env_name in "${!REWARDS_BY_ENV[@]}"; do env_name_sanitized=$(echo ${env_name} | sed -e 's/\//_/g') types_sanitized=$(echo ${types} | sed -e 's/\//_/g') - parallel --header : --results $HOME/output/parallel/train_regress/${env_name_sanitized} \ + parallel --header : --results ${EVAL_OUTPUT_ROOT}/parallel/train_regress/${env_name_sanitized} \ ${TRAIN_CMD} env_name=${env_name} \ seed={seed} target_reward_type={target_reward} \ - log_dir=${HOME}/output/train_regress/${env_name_sanitized}/{target_reward_sanitized}/{seed} \ + log_dir=${EVAL_OUTPUT_ROOT}/train_regress/${env_name_sanitized}/{target_reward_sanitized}/{seed} \ ::: target_reward ${types} \ :::+ target_reward_sanitized ${types_sanitized} \ ::: seed 0 1 2 diff --git a/runners/rl/expert_demos.sh b/runners/rl/expert_demos.sh index 0bd3d09..2032a5a 100755 --- a/runners/rl/expert_demos.sh +++ b/runners/rl/expert_demos.sh @@ -20,7 +20,7 @@ EXPERT_DEMOS_CMD=$(call_script "expert_demos" "with") for env_name in ${ENVS}; do types=${REWARDS_BY_ENV[$env_name]} - parallel --header : --results $HOME/output/parallel/expert_demos \ + parallel --header : --results ${EVAL_OUTPUT_ROOT}/parallel/expert_demos \ ${EXPERT_DEMOS_CMD} env_name=${env_name} \ reward_type={type} seed={seed} \ ::: type ${types} \ diff --git a/runners/rl/transfer.sh b/runners/rl/transfer.sh index cf9ad78..6f61b20 100644 --- a/runners/rl/transfer.sh +++ b/runners/rl/transfer.sh @@ -38,11 +38,11 
@@ for env_name in "${!TRANSFER_ENVS[@]}"; do transfer_envs="${env_name} ${TRANSFER_ENVS[$env_name]}" transfer_envs_sanitized=$(echo ${transfer_envs} | sed -e 's/\//_/g') - parallel --header : --results $HOME/output/parallel/expert_demos \ + parallel --header : --results ${EVAL_OUTPUT_ROOT}/parallel/expert_demos \ ${EXPERT_DEMOS_CMD} env_name={env_name} \ reward_type=${source_reward_type} seed={seed} \ reward_path=${learnt_model_dir}/${env_name_sanitized}/{reward_path}/${model_name} \ - log_dir=${OUTPUT_ROOT}/expert_transfer/${model_prefix}/${env_name_sanitized}/{env_name_sanitized}/{reward_path}/{seed} \ + log_dir=${EVAL_OUTPUT_ROOT}/expert_transfer/${model_prefix}/${env_name_sanitized}/{env_name_sanitized}/{reward_path}/{seed} \ ::: env_name ${transfer_envs} \ :::+ env_name_sanitized ${transfer_envs_sanitized} \ ::: reward_path ${MODELS} \ diff --git a/runners/transfer_point_maze.sh b/runners/transfer_point_maze.sh index 9ef4b6f..c910fd0 100755 --- a/runners/transfer_point_maze.sh +++ b/runners/transfer_point_maze.sh @@ -36,7 +36,7 @@ if [[ ${fast} == "true" ]]; then REGRESS_TIMESTEPS="fast" COMPARISON_TIMESTEPS="fast" EVAL_TIMESTEPS=4096 - PM_OUTPUT=${OUTPUT_ROOT}/transfer_point_maze_fast + PM_OUTPUT=${EVAL_OUTPUT_ROOT}/transfer_point_maze_fast else RL_TIMESTEPS="" IRL_EPOCHS="" @@ -44,7 +44,7 @@ else REGRESS_TIMESTEPS="" COMPARISON_TIMESTEPS="" EVAL_TIMESTEPS=100000 - PM_OUTPUT=${OUTPUT_ROOT}/transfer_point_maze + PM_OUTPUT=${EVAL_OUTPUT_ROOT}/transfer_point_maze fi @@ -142,7 +142,7 @@ wait for env in ${ENVS}; do env_sanitized=$(echo ${env} | sed -e 's/\//_/g') - parallel --header : --results $HOME/output/parallel/learnt \ + parallel --header : --results ${EVAL_OUTPUT_ROOT}/parallel/learnt \ $(call_script "eval_policy" "with") render=False num_vec=8 \ eval_n_timesteps=${EVAL_TIMESTEPS} policy_type=ppo2 env_name=${env} \ reward_type=${TARGET_REWARD_TYPE} \ diff --git a/runners/visualize/visualize_pm_reward.sh b/runners/visualize/visualize_pm_reward.sh index 0804b79..4a2f6a6 100755 --- a/runners/visualize/visualize_pm_reward.sh +++ b/runners/visualize/visualize_pm_reward.sh @@ -23,19 +23,19 @@ VISUALIZE_CMD=$(call_script "visualize_pm_reward" "with") if [[ $# -ne 1 ]]; then echo "usage: $0 " - echo "model prefix must be relative to ${OUTPUT_ROOT}" + echo "model prefix must be relative to ${EVAL_OUTPUT_ROOT}" exit 1 fi MODEL_PREFIX=$1 -LEARNT_MODEL_DIR=${OUTPUT_ROOT}/${MODEL_PREFIX} +LEARNT_MODEL_DIR=${EVAL_OUTPUT_ROOT}/${MODEL_PREFIX} MODELS=$(find ${LEARNT_MODEL_DIR} -name model -printf "%P\n" | xargs dirname) echo "Visualizing models:" echo ${MODELS} -parallel --header : --results ${OUTPUT_ROOT}/parallel/visualize_pm_reward/ \ +parallel --header : --results ${EVAL_OUTPUT_ROOT}/parallel/visualize_pm_reward/ \ ${VISUALIZE_CMD} env_name=evaluating_rewards/PointMassLine-v0 \ reward_type=evaluating_rewards/RewardModel-v0 \ reward_path=${LEARNT_MODEL_DIR}/{reward_path}/model \ diff --git a/src/evaluating_rewards/analysis/plot_divergence_heatmap.py b/src/evaluating_rewards/analysis/plot_divergence_heatmap.py index 44d76e9..d64906c 100644 --- a/src/evaluating_rewards/analysis/plot_divergence_heatmap.py +++ b/src/evaluating_rewards/analysis/plot_divergence_heatmap.py @@ -43,7 +43,6 @@ def default_config(): data_subdir = "hardcoded" # optional, if omitted searches all data (slow) search = { # parameters to filter by in datasets "env_name": "evaluating_rewards/Hopper-v3", - "model_wrapper_kwargs": {}, } # Figure parameters @@ -85,15 +84,14 @@ def test(): @plot_divergence_heatmap_ex.named_config 
-def dataset_transition(): - """Searches for comparisons using `random_transition_generator`.""" - search = { # noqa: F841 pylint:disable=unused-variable - "dataset_factory": { - "escape/py/function": ( - "evaluating_rewards.experiments.datasets.random_transition_generator" - ), - }, +def large(): + """Large output size, high precision.""" + styles = ["paper", "heatmap", "heatmap-2col", "tex"] + heatmap_kwargs = { + "fmt": visualize.short_e, } + _ = locals() + del _ def _norm(args: Iterable[str]) -> bool: @@ -105,9 +103,6 @@ def point_mass(): """Heatmaps for evaluating_rewards/PointMass* environments.""" search = { # noqa: F841 pylint:disable=unused-variable "env_name": "evaluating_rewards/PointMassLine-v0", - "dataset_factory": { - "escape/py/function": "evaluating_rewards.experiments.datasets.random_policy_generator", - }, } heatmap_kwargs = {} heatmap_kwargs["masks"] = { diff --git a/src/evaluating_rewards/analysis/plot_gridworld_divergence.py b/src/evaluating_rewards/analysis/plot_gridworld_divergence.py index 0c34cda..3711ffd 100644 --- a/src/evaluating_rewards/analysis/plot_gridworld_divergence.py +++ b/src/evaluating_rewards/analysis/plot_gridworld_divergence.py @@ -153,7 +153,7 @@ def compute_divergence(reward_cfg: Dict[str, Any], discount: float) -> pd.Series for target_name, target_reward in rewards.items(): if target_name == "all_zero": continue - closest_reward = tabular.closest_reward_em( + closest_reward = tabular.closest_reward_am( src_reward, target_reward, n_iter=1000, discount=discount ) xlen, ylen = reward_cfg[src_name]["state_reward"].shape diff --git a/src/evaluating_rewards/analysis/visualize.py b/src/evaluating_rewards/analysis/visualize.py index 81f842f..a370b12 100644 --- a/src/evaluating_rewards/analysis/visualize.py +++ b/src/evaluating_rewards/analysis/visualize.py @@ -62,6 +62,9 @@ "target_reward_path": "Target", } +WHITELISTED_LEVELS = ["source_reward_type", "target_reward_type"] # never remove these levels + + # Saving figures @@ -123,7 +126,7 @@ def remove_constant_levels(index: pd.MultiIndex) -> pd.MultiIndex: index = index.copy() levels = index.names for level in levels: - if len(index.get_level_values(level).unique()) == 1: + if len(index.get_level_values(level).unique()) == 1 and level not in WHITELISTED_LEVELS: index = index.droplevel(level=level) return index diff --git a/src/evaluating_rewards/comparisons.py b/src/evaluating_rewards/comparisons.py index 73e9c11..7c58f27 100644 --- a/src/evaluating_rewards/comparisons.py +++ b/src/evaluating_rewards/comparisons.py @@ -13,13 +13,16 @@ # limitations under the License. """Methods to compare reward models.""" + +import collections +import functools import logging -from typing import Any, Callable, Dict, Iterator, List, Mapping, Optional, Tuple, Type, TypeVar +from typing import Any, Callable, Dict, List, Mapping, Optional, Tuple, Type, TypeVar import numpy as np import tensorflow as tf -from evaluating_rewards import rewards +from evaluating_rewards import datasets, rewards FitStats = Mapping[str, List[Mapping[str, Any]]] @@ -82,17 +85,31 @@ def build_feed_dict(self, batch: rewards.Batch): models = [self.model, self.target] return rewards.make_feed_dict(models, batch) - def fit(self, dataset: Iterator[rewards.Batch], log_interval: int = 10) -> FitStats: + def fit( + self, + dataset: datasets.BatchCallable, + total_timesteps: int = int(1e6), + batch_size: int = 4096, + log_interval: int = 10, + ) -> FitStats: """Fits shaping to target. Args: - dataset: iterator of batches of data to fit to. 
+ dataset: a callable returning batches of the specified size. + total_timesteps: the total number of timesteps to train for. + batch_size: the number of timesteps in each training batch. log_interval: reports statistics every log_interval batches. Returns: Training statistics. """ - return fit_models({"singleton": self}, dataset=dataset, log_interval=log_interval) + return fit_models( + {"singleton": self}, + dataset=dataset, + total_timesteps=total_timesteps, + batch_size=batch_size, + log_interval=log_interval, + ) ModelWrapperRet = Tuple[rewards.RewardModel, Any, Mapping[str, tf.Tensor]] @@ -131,30 +148,120 @@ def __init__( self.metrics["unwrapped_loss"] = loss_fn(self.target.reward, self.unwrapped_source.reward) self.metrics.update(metrics) - def pretrain(self, batch: rewards.Batch): + def fit_affine(self, batch: rewards.Batch): + """Fits affine parameters only (not e.g. potential).""" affine_model = self.model_extra["affine"] - return affine_model.pretrain(batch, target=self.target, original=self.unwrapped_source) + return affine_model.fit_lstsq(batch, target=self.target, shaping=None) def fit( - self, dataset: Iterator[rewards.Batch], pretrain: Optional[rewards.Batch], **kwargs + self, dataset: datasets.BatchCallable, affine_size: Optional[int] = 4096, **kwargs, ) -> FitStats: """Fits shaping to target. + If `affine_size` is specified, initializes affine parameters using `self.fit_affine`. + Args: - dataset: iterator of batches of data to fit to. - pretrain: if provided, warm-start affine parameters from estimates - computed from this batch. (Requires that model_wrapper adds - affine parameters.) - **kwargs: passed through to super().fit. + dataset: a callable returning batches of the specified size. + affine_size: the size of the batch to pretrain affine parameters. Returns: Training statistics. """ - if pretrain: - self.pretrain(pretrain) + if affine_size: + affine_batch = dataset(affine_size) + self.fit_affine(affine_batch) return super().fit(dataset, **kwargs) +class RegressEquivalentLeastSqModel(RegressWrappedModel): + """Least-squares regression from source model wrapped with affine and potential shaping. + + Positive affine transformations and potential shaping are optimal policy preserving + transformations, and so the rewards are considered equivalent (in the sense of Ng et al, 1999). + + The regression is solved via alternating minimization. Since the regression is least-squares, + the affine parameters can be computed analytically. The potential shaping must be computed + with gradient descent. + + Does not change the source model: only the wrapper. + """ + + def __init__(self, model: rewards.RewardModel, target: rewards.RewardModel, **kwargs): + """Constructs RegressEquivalentLeastSqModel. + + Args: + model: The original model to wrap. + target: The model we want to match. + **kwargs: Passed through to super-class. + """ + model_wrapper = functools.partial(equivalence_model_wrapper, affine_stopgrad=True) + super().__init__( + model=model, + target=target, + model_wrapper=model_wrapper, + loss_fn=tf.losses.mean_squared_error, + **kwargs, + ) + + def fit_affine(self, batch: rewards.Batch) -> rewards.AffineParameters: + """ + Set affine transformation parameters to analytic least-squares solution. + + Does not update potential parameters. + + Args: + batch: The batch to compute the affine parameters over. + + Returns: + The optimal affine parameters (also updates as side-effect). 
+ """ + affine_model = self.model_extra["affine"] + shaping_model = self.model_extra["shaping"].models["shaping"][0] + return affine_model.fit_lstsq(batch, target=self.target, shaping=shaping_model) + + def fit( + self, + dataset: datasets.BatchCallable, + total_timesteps: int = int(1e6), + epoch_timesteps: int = 16384, + affine_size: int = 4096, + **kwargs, + ) -> FitStats: + """Fits shaping to target. + + Args: + dataset: a callable returning batches of the specified size. + total_timesteps: the total number of timesteps to train for. + epoch_timesteps: the number of timesteps to train shaping for; the optimal affine + parameters are set analytically at the start of each epoch. + affine_size: the size of the batch to pretrain affine parameters. + + Returns: + Training statistics. + + Raises: + ValueError if total_timesteps < epoch_timesteps. + """ + if total_timesteps < epoch_timesteps: + raise ValueError("total_timesteps must be at least as large as epoch_timesteps.") + + stats = collections.defaultdict(list) + nepochs = int(total_timesteps) // int(epoch_timesteps) + for epoch in range(nepochs): + affine_batch = dataset(affine_size) + affine_stats = self.fit_affine(affine_batch) + logging.info(f"Epoch {epoch}: {affine_stats}") + + epoch_stats = super().fit( + dataset, total_timesteps=epoch_timesteps, affine_size=None, **kwargs + ) + + for k, v in epoch_stats.items(): + stats[k] += v + + return stats + + def _scaled_norm(x): """l2 norm, normalized to be invariant to length of vectors.""" return np.linalg.norm(x) / np.sqrt(len(x)) @@ -204,6 +311,7 @@ def equivalence_model_wrapper( wrapped: rewards.RewardModel, potential: bool = True, affine: bool = True, + affine_stopgrad: bool = False, affine_kwargs: Optional[Dict[str, Any]] = None, **kwargs, ) -> ModelWrapperRet: @@ -215,6 +323,7 @@ def equivalence_model_wrapper( wrapped: The model to wrap. potential: If true, add potential shaping. affine: If true, add affine transformation. + affine_stopgrad: If true, do not propagate gradients to affine. affine_kwargs: Passed through to AffineTransform. **kwargs: Passed through to PotentialShapingWrapper. @@ -230,8 +339,10 @@ def equivalence_model_wrapper( affine_kwargs = affine_kwargs or {} model = rewards.AffineTransform(model, **affine_kwargs) models["affine"] = model - metrics["constant"] = model.constant + metrics["constant"] = model.shift metrics["scale"] = model.scale + if affine_stopgrad: + model = rewards.StopGradientsModelWrapper(model) if potential: model = rewards.PotentialShapingWrapper(model, **kwargs) @@ -244,7 +355,11 @@ def equivalence_model_wrapper( def fit_models( - potentials: Mapping[K, RegressModel], dataset: Iterator[rewards.Batch], log_interval: int = 10 + potentials: Mapping[K, RegressModel], + dataset: datasets.BatchCallable, + total_timesteps: int, + batch_size: int, + log_interval: int = 10, ) -> Mapping[str, List[Mapping[K, Any]]]: """Regresses model(s). @@ -254,16 +369,27 @@ def fit_models( Args: potentials: A mapping from strings to a potential-shaped reward model. dataset: An iterator returning batches of old obs-act-next obs tuples. + total_timesteps: the total number of timesteps to train for. + batch_size: the number of timesteps in each training batch. log_interval: The frequency with which to print. Returns: Metrics from training. + + Raises: + ValueError if total_timesteps < batch_size. 
""" + if total_timesteps < batch_size: + raise ValueError("total_timesteps must be at least as larger as batch_size.") + sess = tf.get_default_session() ops = {k: [p.opt_op, p.loss, p.metrics] for k, p in potentials.items()} losses = [] metrics = [] - for i, batch in enumerate(dataset): + + nbatches = int(total_timesteps) // int(batch_size) + for i in range(nbatches): + batch = dataset(batch_size) feed_dict = {} for potential in potentials.values(): feed_dict.update(potential.build_feed_dict(batch)) diff --git a/src/evaluating_rewards/datasets.py b/src/evaluating_rewards/datasets.py index 2e46fb9..719a0a4 100644 --- a/src/evaluating_rewards/datasets.py +++ b/src/evaluating_rewards/datasets.py @@ -20,7 +20,6 @@ """ import contextlib -import math from typing import Callable, ContextManager, Iterator, Union import gym @@ -31,7 +30,7 @@ from evaluating_rewards import rewards -BatchCallable = Callable[[int, int], Iterator[rewards.Batch]] +BatchCallable = Callable[[int], rewards.Batch] # Expect DatasetFactory to accept a str specifying env_name as first argument, # int specifying seed as second argument and factory-specific keyword arguments # after this. There is no way to specify this in Python type annotations yet :( @@ -45,14 +44,13 @@ def rollout_policy_generator( ) -> Iterator[BatchCallable]: """Generator returning rollouts from a policy in a given environment.""" - def f(total_timesteps: int, batch_size: int) -> Iterator[rewards.Batch]: - nbatch = math.ceil(total_timesteps / batch_size) - for _ in range(nbatch): - transitions = rollout.generate_transitions(policy, venv, n_timesteps=batch_size) - # TODO(): can we switch to rollout.Transition? - yield rewards.Batch( - obs=transitions.obs, actions=transitions.acts, next_obs=transitions.next_obs - ) + def f(total_timesteps: int) -> rewards.Batch: + # TODO(adam): inefficient -- discards partial trajectories and resets environment + transitions = rollout.generate_transitions(policy, venv, n_timesteps=total_timesteps) + # TODO(): can we switch to rollout.Transition? + return rewards.Batch( + obs=transitions.obs, actions=transitions.acts, next_obs=transitions.next_obs + ) yield f @@ -83,33 +81,29 @@ def random_transition_generator(env_name: str, seed: int = 0) -> Iterator[BatchC seed: Used to seed the dynamics. Yields: - A function that, when called with timesteps and batch size, will perform - the sampling process described above. + A function that will perform the sampling process described above for a + number of timesteps specified in the argument. """ env = gym.make(env_name) env.seed(seed) - # TODO(): why is total_timesteps specified here? - # Could instead make it endless, or specify nbatch directly. 
- def f(total_timesteps: int, batch_size: int) -> Iterator[rewards.Batch]: + def f(total_timesteps: int) -> rewards.Batch: """Helper function.""" - nbatch = math.ceil(total_timesteps / batch_size) - for _ in range(nbatch): - obses = [] - acts = [] - next_obses = [] - for _ in range(batch_size): - old_state = env.state_space.sample() - obs = env.obs_from_state(old_state) - act = env.action_space.sample() - new_state = env.transition(old_state, act) # may be non-deterministic - next_obs = env.obs_from_state(new_state) - - obses.append(obs) - acts.append(act) - next_obses.append(next_obs) - yield rewards.Batch( - obs=np.array(obses), actions=np.array(acts), next_obs=np.array(next_obses) - ) + obses = [] + acts = [] + next_obses = [] + for _ in range(total_timesteps): + old_state = env.state_space.sample() + obs = env.obs_from_state(old_state) + act = env.action_space.sample() + new_state = env.transition(old_state, act) # may be non-deterministic + next_obs = env.obs_from_state(new_state) + + obses.append(obs) + acts.append(act) + next_obses.append(next_obs) + return rewards.Batch( + obs=np.array(obses), actions=np.array(acts), next_obs=np.array(next_obses) + ) yield f diff --git a/src/evaluating_rewards/envs/mujoco.py b/src/evaluating_rewards/envs/mujoco.py index 167970f..637c036 100644 --- a/src/evaluating_rewards/envs/mujoco.py +++ b/src/evaluating_rewards/envs/mujoco.py @@ -202,7 +202,7 @@ def __init__( observation_space: gym.Space, action_space: gym.Space, forward: bool = True, - ctrl_coef: float = 0.1, + ctrl_coef: float = 1e-2, ): """Constructs the reward model. diff --git a/src/evaluating_rewards/experiments/comparisons.py b/src/evaluating_rewards/experiments/comparisons.py index 0aef890..93acc0c 100644 --- a/src/evaluating_rewards/experiments/comparisons.py +++ b/src/evaluating_rewards/experiments/comparisons.py @@ -63,7 +63,7 @@ def constant_baseline( Returns: A dictionary containing summary statistics. """ - test_set = next(dataset(test_size, test_size)) + test_set = dataset(test_size) models = {"matched": match.model, "target": target} preds = rewards.evaluate_models(models, test_set) diff --git a/src/evaluating_rewards/experiments/point_mass_analysis.py b/src/evaluating_rewards/experiments/point_mass_analysis.py index 4f4d923..945b694 100644 --- a/src/evaluating_rewards/experiments/point_mass_analysis.py +++ b/src/evaluating_rewards/experiments/point_mass_analysis.py @@ -177,8 +177,8 @@ def plot_state_density( nsamples: The number of points to sample. **kwargs: Passed through to `sns.jointplot`. 
""" - batch = next(dataset_generator(nsamples, nsamples)) - obs, _, _ = batch + batch = dataset_generator(nsamples) + obs = batch.obs sns.jointplot(y=obs[:, 0], x=obs[:, 1], kind="hex", xlim=(-1, 1), ylim=(-1, 1), **kwargs) plt.xlabel("Velocity") plt.ylabel("Position") diff --git a/src/evaluating_rewards/experiments/synthetic.py b/src/evaluating_rewards/experiments/synthetic.py index a24ba09..7c25e3c 100644 --- a/src/evaluating_rewards/experiments/synthetic.py +++ b/src/evaluating_rewards/experiments/synthetic.py @@ -178,8 +178,8 @@ def _compare_synthetic_eval( if model_affine: final = matched.model_extra["affine"].get_weights() else: - final = rewards.AffineParameters(constant=0, scale=1.0) - final_constants[(rew_nm, pot_nm)] = final.constant + final = rewards.AffineParameters(shift=0, scale=1.0) + final_constants[(rew_nm, pot_nm)] = final.shift final_scales[(rew_nm, pot_nm)] = final.scale res = { @@ -326,24 +326,25 @@ def compare_synthetic( sess.run(tf.global_variables_initializer()) # Datasets - training_generator = dataset_generator(total_timesteps, batch_size) - test_set = next(dataset_generator(test_size, test_size)) + test_set = dataset_generator(test_size) # Pre-train to initialize affine parameters initial_constants = {} initial_scales = {} - pretrain_set = next(dataset_generator(pretrain_size, pretrain_size)) + pretrain_set = dataset_generator(pretrain_size) for key, matched in matchings.items(): if model_affine and pretrain: logging.info(f"Pretraining {key}") - initial = matched.pretrain(pretrain_set) + initial = matched.fit_affine(pretrain_set) else: - initial = rewards.AffineParameters(constant=0, scale=1) - initial_constants[key] = initial.constant + initial = rewards.AffineParameters(shift=0, scale=1) + initial_constants[key] = initial.shift initial_scales[key] = initial.scale # Train potential shaping and other parameters - metrics = comparisons.fit_models(matchings, training_generator) + metrics = None + if total_timesteps > 0: + metrics = comparisons.fit_models(matchings, dataset_generator, total_timesteps, batch_size) return _compare_synthetic_eval( metrics=metrics, diff --git a/src/evaluating_rewards/rewards.py b/src/evaluating_rewards/rewards.py index 698c96a..90608ed 100644 --- a/src/evaluating_rewards/rewards.py +++ b/src/evaluating_rewards/rewards.py @@ -16,7 +16,6 @@ import abc import itertools -import logging import os import pickle from typing import Dict, Iterable, Mapping, NamedTuple, Optional, Sequence, Tuple, Type, TypeVar @@ -25,6 +24,7 @@ from imitation.rewards import reward_net from imitation.util import rollout, serialize import numpy as np +import scipy.optimize from stable_baselines.common import input as env_in # avoid name clash import tensorflow as tf @@ -35,11 +35,11 @@ class AffineParameters(NamedTuple): """Parameters of an affine transformation. Attributes: - constant: The additive shift. + shift: The additive shift. scale: The multiplicative dilation factor. 
""" - constant: float + shift: float scale: float @@ -535,68 +535,62 @@ def __init__(self, wrapped: RewardModel, scale: bool = True, shift: bool = True) models = {"wrapped": (wrapped, scale)} + self._shift = None if shift: - constant = ConstantReward(wrapped.observation_space, wrapped.action_space) + self._shift = shift = ConstantReward(wrapped.observation_space, wrapped.action_space) else: - constant = ZeroReward(wrapped.observation_space, wrapped.action_space) - models["constant"] = (constant, tf.constant(1.0)) + shift = ZeroReward(wrapped.observation_space, wrapped.action_space) + models["constant"] = (shift, tf.constant(1.0)) super().__init__(models) - def pretrain( - self, batch: Batch, target: RewardModel, original: Optional[RewardModel] = None, eps=1e-8 + def fit_lstsq( + self, batch: Batch, target: RewardModel, shaping: Optional[RewardModel] ) -> AffineParameters: - """Initializes the shift and scale parameter to try to match target. + """Sets the shift and scale parameters to try and match target, given shaping. - Computes the mean and standard deviation of the wrapped reward model - and target on batch, and sets the shift and scale parameters so that the - output of this model has the same mean and standard deviation as target. + Uses `least_l2_affine` to find the least-squares affine parameters between + `scale * source + shift + shaping` and `target`. - If the wrapped model is just an affine transformation of target, this - should get the correct values (given adequate data). However, if they differ - -- even if just by potential shaping -- it can deviate substantially. It's - generally still better than just the identity initialization. + If one of `scale` or `shift` is not present in this `AffineParameter`, it will be skipped, + and the value corresponding to the identify transformation will be returned. Args: - batch: Data to evaluate the reward models on. - target: A RewardModel to match the mean and standard deviation of. - original: If specified, a RewardModel to rescale to match target. - Defaults to using the reward model this object wraps, `self.wrapped`. - This can be undesirable if `self.wrapped` includes some randomly - initialized model elements, such as potential shaping, that would - be better to treat as mean-zero. - eps: Minimum standard deviation (for numerical stability). + batch: A batch of data to estimate the affine parameters on. + target: The reward model to try and match. + shaping: Optionally, potential shaping to add to the source. If omitted, will default + to all-zero. Returns: - The initial shift and scale parameters. + The least-squares affine parameters. They will also be set as a side-effect. 
""" - if original is None: - original = self.models["wrapped"][0] - - feed_dict = make_feed_dict([original, target], batch) sess = tf.get_default_session() - preds = sess.run([original.reward, target.reward], feed_dict=feed_dict) - original_mean, target_mean = np.mean(preds, axis=-1) - original_std, target_std = np.clip(np.std(preds, axis=-1), eps, None) - - log_scale = 0.0 - if self._log_scale_layer is not None: - log_scale = np.log(target_std) - np.log(original_std) - logging.info("Assigning log scale: %f", log_scale) - self._log_scale_layer.set_constant(log_scale) - scale = np.exp(log_scale) - - constant = 0.0 - constant_model = self.models["constant"][0] - if isinstance(constant_model, ConstantReward): - constant = -original_mean * target_std / original_std + target_mean - logging.info("Assigning shift: %f", constant) - constant_model.constant.set_constant(constant) - - return AffineParameters(constant=constant, scale=scale) - - @property - def constant(self) -> tf.Tensor: + source = self.models["wrapped"][0] + target_tensor = target.reward + models = [source, target] + if shaping is not None: + target_tensor -= shaping.reward + models.append(shaping) + + reward_tensors = [source.reward, target_tensor] + feed_dict = make_feed_dict(models, batch) + source_reward, target_reward = sess.run(reward_tensors, feed_dict=feed_dict) + + has_shift = self._shift is not None + has_scale = self._log_scale_layer is not None + # Find affine parameters minimizing L2 distance of `scale * source + shift + shaping` + # to `target`. Note if `shaping` is present, have subtracted it from `target` above. + params = least_l2_affine(source_reward, target_reward, shift=has_shift, scale=has_scale) + scale = max(params.scale, np.finfo(params.scale).eps) # ensure strictly positive + if has_shift: + self.set_shift(params.shift) + if has_scale: + self.set_log_scale(np.log(scale)) + + return params + + @property + def shift(self) -> tf.Tensor: """The additive shift.""" return self.models["constant"][0].constant.constant @@ -605,7 +599,35 @@ def scale(self) -> tf.Tensor: """The multiplicative dilation.""" return self.models["wrapped"][1] - def get_weights(self): + def set_shift(self, constant: float) -> None: + """Sets the constant shift. + + Args: + constant: The shift factor. + + Raises: + TypeError if the AffineTransform does not have a shift parameter. + """ + if self._shift is not None: + return self._shift.constant.set_constant(constant) + else: + raise TypeError("Calling `set_shift` on AffineTransform with `shift=False`.") + + def set_log_scale(self, log_scale: float) -> None: + """Sets the log of the scale factor. + + Args: + log_scale: The log scale factor. + + Raises: + TypeError if the AffineTransform does not have a scale parameter. + """ + if self._log_scale_layer is not None: + return self._log_scale_layer.set_constant(log_scale) + else: + raise TypeError("Calling `set_log_scale` on AffineTransform with `scale=False`.") + + def get_weights(self) -> AffineParameters: """Extract affine parameters from a model. Returns: @@ -615,8 +637,8 @@ def get_weights(self): comparison with results returned by other methods.) 
""" sess = tf.get_default_session() - const, scale = sess.run([self.constant, self.scale]) - return AffineParameters(constant=const, scale=scale) + shift, scale = sess.run([self.shift, self.scale]) + return AffineParameters(shift=shift, scale=scale) @classmethod def _load(cls, directory: str) -> "AffineTransform": @@ -720,3 +742,51 @@ def evaluate_potentials(potentials: Iterable[PotentialShaping], batch: Batch) -> new_pots = [p.new_potential for p in potentials] feed_dict = make_feed_dict(potentials, batch) return tf.get_default_session().run([old_pots, new_pots], feed_dict=feed_dict) + + +def least_l2_affine( + source: np.ndarray, target: np.ndarray, shift: bool = True, scale: bool = True +) -> AffineParameters: + """Finds the squared-error minimizing affine transform. + + Args: + source: a 1D array consisting of the reward to transform. + target: a 1D array consisting of the target to match. + shift: affine includes constant shift. + scale: affine includes rescale. + + Returns: + (shift, scale) such that (scale * reward + shift) has minimal squared-error from target. + + Raises: + ValueError if source or target are not 1D arrays, or if neither shift or scale are True. + """ + if source.ndim != 1: + raise ValueError("source must be vector.") + if target.ndim != 1: + raise ValueError("target must be vector.") + if not (shift or scale): + raise ValueError("At least one of shift and scale must be True.") + + a_vals = [] + if shift: + # Positive and negative constant. + # The shift will be the sum of the coefficients of these terms. + a_vals += [np.ones_like(source), -np.ones_like(source)] + if scale: + a_vals += [source] + a_vals = np.stack(a_vals, axis=1) + # Find x such that a_vals.dot(x) has least-squared error from target, where x >= 0. + coefs, _ = scipy.optimize.nnls(a_vals, target) + + shift_param = 0.0 + scale_idx = 0 + if shift: + shift_param = coefs[0] - coefs[1] + scale_idx = 2 + + scale_param = 1.0 + if scale: + scale_param = coefs[scale_idx] + + return AffineParameters(shift=shift_param, scale=scale_param) diff --git a/src/evaluating_rewards/scripts/model_comparison.py b/src/evaluating_rewards/scripts/model_comparison.py index ae6ca5a..f7e6e6f 100644 --- a/src/evaluating_rewards/scripts/model_comparison.py +++ b/src/evaluating_rewards/scripts/model_comparison.py @@ -16,10 +16,9 @@ import functools import os -from typing import Any, Dict, Mapping +from typing import Any, Dict, Mapping, Type import sacred -import tensorflow as tf from evaluating_rewards import comparisons, datasets, serialize from evaluating_rewards.scripts import regress_utils, script_utils @@ -39,14 +38,14 @@ def default_config(): source_reward_path = "dummy" # Model to train and hyperparameters - model_wrapper_fn = comparisons.equivalence_model_wrapper # equivalence class - model_wrapper_kwargs = dict() - loss_fn = tf.losses.mean_squared_error - pretrain = True # set initial scale and constant to match target - pretrain_size = 16386 # number of timesteps to use in pretraining - total_timesteps = 1e6 + comparison_class = comparisons.RegressWrappedModel + comparison_kwargs = { + "learning_rate": 1e-2, + } + affine_size = 16386 # number of timesteps to use in pretraining; set to None to disable + total_timesteps = int(1e6) batch_size = 4096 - learning_rate = 1e-2 + fit_kwargs = {} # Logging log_root = os.path.join("output", "train_regress") # output directory @@ -55,7 +54,7 @@ def default_config(): @model_comparison_ex.config -def default_kwargs(dataset_factory, dataset_factory_kwargs): +def 
default_kwargs(dataset_factory, dataset_factory_kwargs, comparison_class, comparison_kwargs): """Sets dataset_factory_kwargs to defaults when dataset_factory not overridden.""" # TODO(): remove this function when Sacred issue #238 is fixed if ( # pylint:disable=comparison-with-callable @@ -63,44 +62,64 @@ def default_kwargs(dataset_factory, dataset_factory_kwargs): and not dataset_factory_kwargs ): dataset_factory_kwargs = dict(policy_type="random", policy_path="dummy") + if ( + comparison_class == comparisons.RegressWrappedModel + and "model_wrapper" not in comparison_kwargs + ): + comparison_kwargs["model_wrapper"] = comparisons.equivalence_model_wrapper _ = locals() # quieten flake8 unused variable warning del _ @model_comparison_ex.named_config -def affine_only(): - """Equivalence class consists of just affine transformations.""" - model_wrapper_fn = comparisons.equivalence_model_wrapper - model_wrapper_kwargs = dict(potential=False) +def alternating_maximization(): + """Use less flexible (but sometimes more accurate) RegressEquivalentLeastSq. + + Uses least-squares loss and affine + potential shaping wrapping. + """ + comparison_class = comparisons.RegressEquivalentLeastSqModel _ = locals() # quieten flake8 unused variable warning del _ +@model_comparison_ex.named_config +def affine_only(): + """Equivalence class consists of just affine transformations.""" + comparison_kwargs = { # noqa: F841 pylint:disable=unused-variable + "model_wrapper": functools.partial(comparisons.equivalence_model_wrapper, potential=False), + } + + @model_comparison_ex.named_config def no_rescale(): """Equivalence class are shifts plus potential shaping (no scaling).""" - model_wrapper_fn = comparisons.equivalence_model_wrapper - model_wrapper_kwargs = dict(affine_kwargs=dict(scale=False)) - _ = locals() # quieten flake8 unused variable warning - del _ + comparison_kwargs = { # noqa: F841 pylint:disable=unused-variable + "model_wrapper": functools.partial( + comparisons.equivalence_model_wrapper, affine_kwargs=dict(scale=False) + ), + } @model_comparison_ex.named_config def shaping_only(): """Equivalence class consists of just potential shaping.""" - model_wrapper_fn = comparisons.equivalence_model_wrapper - model_wrapper_kwargs = dict(affine=False) - pretrain = False + comparison_kwargs = { + "model_wrapper": functools.partial(comparisons.equivalence_model_wrapper, affine=False), + } + affine_size = None _ = locals() # quieten flake8 unused variable warning del _ @model_comparison_ex.named_config def ellp_loss(): + """Use mean (x-y)^p loss, default to p=0.5 (sparsity inducing)""" p = 0.5 # Note if p specified at CLI, it will take priority over p above here # (Sacred configuration scope magic). 
- loss_fn = functools.partial(comparisons.ellp_norm_loss, p=p) + comparison_kwargs = { + "loss_fn": functools.partial(comparisons.ellp_norm_loss, p=p), + } _ = locals() # quieten flake8 unused variable warning del _ @@ -113,7 +132,7 @@ def ellp_loss(): @model_comparison_ex.named_config def test(): """Small number of epochs, finish quickly, intended for tests / debugging.""" - pretrain_size = 512 + affine_size = 512 total_timesteps = 8192 _ = locals() # quieten flake8 unused variable warning del _ @@ -145,36 +164,34 @@ def model_comparison( target_reward_type: str, target_reward_path: str, # Model parameters - model_wrapper_fn: comparisons.ModelWrapperFn, - model_wrapper_kwargs: Dict[str, Any], - pretrain: bool, - pretrain_size: int, + comparison_class: Type[comparisons.RegressModel], + comparison_kwargs: Dict[str, Any], + affine_size: int, total_timesteps: int, batch_size: int, - learning_rate: float, + fit_kwargs: Dict[str, Any], # Logging log_dir: str, ) -> Mapping[str, Any]: """Entry-point into script to regress source onto target reward model.""" - with dataset_factory(env_name, seed=_seed, **dataset_factory_kwargs) as dataset_callable: - dataset = dataset_callable(total_timesteps, batch_size) + with dataset_factory(env_name, seed=_seed, **dataset_factory_kwargs) as dataset_generator: def make_source(venv): return serialize.load_reward(source_reward_type, source_reward_path, venv) def make_trainer(model, model_scope, target): del model_scope - model_wrapper = functools.partial(model_wrapper_fn, **model_wrapper_kwargs) - return comparisons.RegressWrappedModel( - model, target, model_wrapper=model_wrapper, learning_rate=learning_rate - ) + return comparison_class(model, target, **comparison_kwargs) def do_training(target, trainer): del target - pretrain_set = None - if pretrain: - pretrain_set = next(dataset_callable(pretrain_size, pretrain_size)) - return trainer.fit(dataset, pretrain=pretrain_set) + return trainer.fit( + dataset_generator, + total_timesteps=total_timesteps, + batch_size=batch_size, + affine_size=affine_size, + **fit_kwargs, + ) return regress_utils.regress( seed=_seed, diff --git a/src/evaluating_rewards/scripts/train_regress.py b/src/evaluating_rewards/scripts/train_regress.py index 1201617..c9f380b 100644 --- a/src/evaluating_rewards/scripts/train_regress.py +++ b/src/evaluating_rewards/scripts/train_regress.py @@ -90,9 +90,7 @@ def train_regress( log_dir: str, ) -> Mapping[str, Any]: """Entry-point into script to regress source onto target reward model.""" - with dataset_factory(env_name, seed=_seed, **dataset_factory_kwargs) as dataset_callable: - dataset = dataset_callable(total_timesteps, batch_size) - + with dataset_factory(env_name, seed=_seed, **dataset_factory_kwargs) as dataset_generator: make_source = functools.partial(regress_utils.make_model, model_reward_type) def make_trainer(model, model_scope, target): @@ -101,7 +99,9 @@ def make_trainer(model, model_scope, target): def do_training(target, trainer): del target - return trainer.fit(dataset) + return trainer.fit( + dataset_generator, total_timesteps=total_timesteps, batch_size=batch_size + ) return regress_utils.regress( seed=_seed, diff --git a/src/evaluating_rewards/tabular.py b/src/evaluating_rewards/tabular.py index 63bab16..bbe191a 100644 --- a/src/evaluating_rewards/tabular.py +++ b/src/evaluating_rewards/tabular.py @@ -19,6 +19,8 @@ import numpy as np import pandas as pd +from evaluating_rewards import rewards + def random_state_only_reward( n_states: int, n_actions: int, rng: 
np.random.RandomState = np.random @@ -120,33 +122,12 @@ def closest_potential(reward: np.ndarray, target: np.ndarray, discount: float) - return potential -def closest_affine(reward: np.ndarray, target: np.ndarray) -> Tuple[float, float]: - """Finds the squared-error minimizing affine transform. - - Args: - reward: the reward to transform. - target: the target to match. - - Returns: - (shift, scale) such that (scale * reward + shift) has minimal squared-error from target. - """ - reward = reward.flatten() - target = target.flatten() - # Find x such that [1; reward].dot(x) has least-squared error from target - # x corresponds to a shift and scaling parameter. - a_vals = np.stack([np.ones_like(reward), reward], axis=1) - coefs, _, _, _ = np.linalg.lstsq(a_vals, target, rcond=None) - assert coefs.shape == (2,) - - return coefs - - -def closest_reward_em( +def closest_reward_am( source: np.ndarray, target: np.ndarray, n_iter: int = 100, discount: float = 0.99 ) -> np.ndarray: """Finds the least squared-error reward to target that is equivalent to reward. - Alternates calls to `closest_potential` and `closest_affine`, in an EM-like approach. + Alternating minimization over `closest_potential` and `rewards.least_l2_affine`. Args: - source: the source reward. @@ -161,8 +142,8 @@ for _ in range(n_iter): potential = closest_potential(closest_reward, target, discount) closest_reward = shape(closest_reward, potential, discount) - shift, scale = closest_affine(closest_reward, target) - closest_reward = closest_reward * scale + shift + params = rewards.least_l2_affine(closest_reward.flatten(), target.flatten()) + closest_reward = closest_reward * params.scale + params.shift return closest_reward diff --git a/tests/test_comparisons.py b/tests/test_comparisons.py index ac901bc..4f84082 100644 --- a/tests/test_comparisons.py +++ b/tests/test_comparisons.py @@ -63,8 +63,6 @@ def test_regress( venv = vec_env.DummyVecEnv([lambda: gym.make(env_name)]) with datasets.random_transition_generator(env_name) as dataset_generator: - dataset = dataset_generator(1e5, 512) - with graph.as_default(): with session.as_default(): with tf.variable_scope("source") as source_scope: @@ -79,7 +77,7 @@ def test_regress( init_vars = source_scope.global_variables() + match_scope.global_variables() session.run(tf.initializers.variables(init_vars)) - stats = match.fit(dataset) + stats = match.fit(dataset_generator, total_timesteps=1e5, batch_size=512) loss = pd.DataFrame(stats["loss"])["singleton"] logging.info(f"Loss: {loss.iloc[::10]}") diff --git a/tests/test_rewards.py b/tests/test_rewards.py index e8de055..ed36f68 100644 --- a/tests/test_rewards.py +++ b/tests/test_rewards.py @@ -104,7 +104,7 @@ def fixture_serialize_identity(graph, session, venv): def f(make_model): policy = base.RandomPolicy(venv.observation_space, venv.action_space) with datasets.rollout_policy_generator(venv, policy) as dataset_callable: - batch = next(dataset_callable(1024, 1024)) + batch = dataset_callable(1024) with graph.as_default(), session.as_default(): original = make_model(venv) @@ -199,3 +199,46 @@ def test_ground_truth_similar_to_gym(graph, session, venv, reward_id): # Are the predictions close to true Gym reward?
np.testing.assert_allclose(gym_reward, pred_reward, rtol=0, atol=5e-5) + + +REWARD_LEN = 10000 +NUM_SAMPLES = 10 + + +def test_least_l2_affine_random(): + """Check least_l2_affine recovers random affine transformations.""" + source = np.random.randn(REWARD_LEN) + + shifts = np.random.randn(NUM_SAMPLES) + scales = np.exp(np.random.randn(NUM_SAMPLES)) + + for shift, scale in zip(shifts, scales): + target = source * scale + shift + params = rewards.least_l2_affine(source, target) + assert np.allclose([shift, scale], [params.shift, params.scale]) + assert params.scale >= 0 + + for has_shift in [False, True]: + target = source * scale + params = rewards.least_l2_affine(source, target, shift=has_shift) + assert np.allclose([0.0, scale], [params.shift, params.scale]) + + for has_scale in [False, True]: + target = source + shift + params = rewards.least_l2_affine(source, target, scale=has_scale) + assert np.allclose([shift, 1.0], [params.shift, params.scale], atol=0.1) + + +def test_least_l2_affine_zero(): + """Check least_l2_affine finds zero scale and shift for negative and zero target.""" + for _ in range(NUM_SAMPLES): + source = np.random.randn(REWARD_LEN) + + params = rewards.least_l2_affine(source, -source) + assert np.allclose([0.0], [params.scale]) + assert params.scale >= 0 + assert np.allclose([0.0], [params.shift], atol=0.1) + + params = rewards.least_l2_affine(source, np.zeros_like(source)) + assert np.allclose([0.0, 0.0], [params.shift, params.scale]) + assert params.scale >= 0 diff --git a/tests/test_scripts.py b/tests/test_scripts.py index 8a58ca4..5001d97 100644 --- a/tests/test_scripts.py +++ b/tests/test_scripts.py @@ -30,20 +30,28 @@ from tests import common EXPERIMENTS = { - # experiment, expected_type - "plot_divergence": (plot_divergence_heatmap_ex, dict), - "plot_pm": (plot_pm_reward_ex, xr.DataArray), - "plot_gridworld_divergence": (plot_gridworld_divergence_ex, dict), - "plot_gridworld_reward": (plot_gridworld_reward_ex, plt.Figure), - "comparison": (model_comparison_ex, dict), - "regress": (train_regress_ex, dict), - "preferences": (train_preferences_ex, pd.DataFrame), + # experiment, expected_type, extra_named_configs, config_updates + "plot_divergence": (plot_divergence_heatmap_ex, dict, [], {}), + "plot_pm": (plot_pm_reward_ex, xr.DataArray, [], {}), + "plot_gridworld_divergence": (plot_gridworld_divergence_ex, dict, [], {}), + "plot_gridworld_reward": (plot_gridworld_reward_ex, plt.Figure, [], {}), + "comparison": (model_comparison_ex, dict, [], {}), + "comparison_alternating": ( + model_comparison_ex, + dict, + ["alternating_maximization"], + {"fit_kwargs": {"epoch_timesteps": 4096}}, + ), + "regress": (train_regress_ex, dict, [], {}), + "preferences": (train_preferences_ex, pd.DataFrame, [], {}), } -@common.mark_parametrize_dict("experiment,expected_type", EXPERIMENTS) -def test_experiment(experiment, expected_type): +@common.mark_parametrize_dict("experiment,expected_type,named_configs,config_updates", EXPERIMENTS) +def test_experiment(experiment, expected_type, named_configs, config_updates): + named_configs = ["test"] + named_configs with tempfile.TemporaryDirectory(prefix="eval-rewards-exp") as tmpdir: - run = experiment.run(named_configs=["test"], config_updates=dict(log_root=tmpdir)) + config_updates["log_root"] = tmpdir + run = experiment.run(named_configs=named_configs, config_updates=config_updates) assert run.status == "COMPLETED" assert isinstance(run.result, expected_type) diff --git a/tests/test_synthetic.py b/tests/test_synthetic.py index 
a9ab310..d05c6d1 100644 --- a/tests/test_synthetic.py +++ b/tests/test_synthetic.py @@ -19,7 +19,6 @@ """ import logging -import math import gym from imitation import util @@ -39,13 +38,11 @@ def dummy_env_and_dataset(dims: int = 5): obs_space = gym.spaces.Box(low=np.repeat(0.0, dims), high=np.repeat(1.0, dims)) act_space = gym.spaces.Box(low=np.repeat(0.0, dims), high=np.repeat(1.0, dims)) - def dataset_generator(total_timesteps, batch_size): - nbatches = math.ceil(total_timesteps / batch_size) - for _ in range(nbatches): - obs = np.array([obs_space.sample() for _ in range(batch_size)]) - actions = np.array([act_space.sample() for _ in range(batch_size)]) - next_obs = (obs + actions).clip(0.0, 1.0) - yield rewards.Batch(obs=obs, actions=actions, next_obs=next_obs) + def dataset_generator(total_timesteps): + obs = np.array([obs_space.sample() for _ in range(total_timesteps)]) + actions = np.array([act_space.sample() for _ in range(total_timesteps)]) + next_obs = (obs + actions).clip(0.0, 1.0) + return rewards.Batch(obs=obs, actions=actions, next_obs=next_obs) return { "observation_space": obs_space,
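
For reference, below is a minimal standalone sketch of the non-negative least-squares trick this patch introduces in `least_l2_affine` (src/evaluating_rewards/rewards.py): splitting the constant term into +1 and -1 columns lets the shift take either sign, while scipy.optimize.nnls keeps every coefficient, and hence the scale, non-negative. The helper name and sample values are illustrative only and not part of the patch.

import numpy as np
import scipy.optimize


def nnls_affine_demo(source: np.ndarray, target: np.ndarray):
    """Returns (shift, scale) minimizing ||scale * source + shift - target||^2 with scale >= 0."""
    # Columns: +1 constant, -1 constant (their difference is the signed shift), and source (scale).
    a_vals = np.stack([np.ones_like(source), -np.ones_like(source), source], axis=1)
    coefs, _ = scipy.optimize.nnls(a_vals, target)
    return coefs[0] - coefs[1], coefs[2]


source = np.random.RandomState(0).randn(10000)
shift, scale = nnls_affine_demo(source, 3.0 * source - 2.0)
print(shift, scale)  # approximately -2.0 and 3.0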