Model comparison: NNLS initialization and alternating minimization (#13)
* Correct notation: alternating minimization not expectation maximization

* Prototype of alternating-minimization for affine parameters

* Relocate L2-error minimizing affine shaping

* Bugfix: include fit_kind in output directory

* Affine transform: clarify parameter names, expose public setters

* Alternating minimization: start cleaning up, reduce code duplication

* Use non-negative least squares to find affine transformation

* Change dataset generator to not assume batch size

* Make model_comparison config more flexible

* Fix up runner, config

* Use least_l2_affine for pretrain

* Make Regress*Model logging consistent; add input validation

* Add unit test for least_l2_affine

* Fix int vs float problem

* Bugfix: handle total_timesteps==0

* Add large-form plot support

* Increase Hopper running control cost by 10x (to make it a comparable % of reward to the backflip control cost)

* Always use random transitions for PointMass, hyperparameter sweep over epoch timesteps

* runners: make output directory configurable

* visualize bugfix: handle one column/one row figures

* Restore ground truth ctrl coef to match Gym reward; downscale backflip reward instead

* Make non-alternating minimization default again (alternating does not help much in tests)

* Add test for both standard and alternating model comparison

* Hardcoded comparison runner: remove epoch sweep

* Use EVAL_OUTPUT_ROOT instead of ${HOME}/output

* Fix typo in launch_docker.sh

* Reformat visualize.py

* Improve docstring
AdamGleave authored Feb 4, 2020
1 parent 89dcc75 commit 7489905
Showing 32 changed files with 496 additions and 255 deletions.
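Several of the commit messages above ("Use non-negative least squares to find affine transformation", "Prototype of alternating-minimization for affine parameters") describe fitting an affine rescaling of one reward model to another. The sketch below is illustrative only: the helper name least_l2_affine_sketch, its signature, and the assumption that only the scale is constrained non-negative are mine, not the repository's API. It shows the NNLS step; in the alternating variant this solve would simply be interleaved with optimizing the remaining model parameters.

# Illustrative sketch (assumed helper, not the repository's API): fit
# target ≈ scale * source + shift with scale >= 0 via non-negative least squares.
import numpy as np
from scipy.optimize import nnls

def least_l2_affine_sketch(source: np.ndarray, target: np.ndarray):
    """Return (scale, shift) minimizing ||scale * source + shift - target||_2 with scale >= 0.

    The unconstrained shift is split into shift_pos - shift_neg so that every
    coefficient handed to NNLS is non-negative.
    """
    ones = np.ones_like(source)
    design = np.stack([source, ones, -ones], axis=1)  # columns: scale, shift_pos, shift_neg
    coefs, _residual = nnls(design, target)
    scale, shift_pos, shift_neg = coefs
    return scale, shift_pos - shift_neg

# Hypothetical usage: recover scale=2.0, shift=-3.0 from noiseless synthetic rewards.
rng = np.random.default_rng(0)
source = rng.normal(size=100)
print(least_l2_affine_sketch(source, 2.0 * source - 3.0))  # approximately (2.0, -3.0)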
8 changes: 5 additions & 3 deletions runners/common.sh
@@ -23,12 +23,12 @@ function call_script {
function learnt_model {
if [[ $# -ne 1 ]]; then
echo "usage: $0 <model prefix>"
echo "model prefix must be relative to ${OUTPUT_ROOT}"
echo "model prefix must be relative to ${EVAL_OUTPUT_ROOT}"
exit 1
fi

model_prefix=$1
learnt_model_dir=${OUTPUT_ROOT}/${model_prefix}
learnt_model_dir=${EVAL_OUTPUT_ROOT}/${model_prefix}

case ${model_prefix} in
train_adversarial)
@@ -53,4 +53,6 @@ eval "$(${ENV_REWARD_CMD} 2>/dev/null)"
ENVS="${!REWARDS_BY_ENV[@]}"
echo "Loaded mappings for environments ${ENVS}"

OUTPUT_ROOT=/mnt/eval_reward/data
if [[ "${EVAL_OUTPUT_ROOT}" == "" ]]; then
EVAL_OUTPUT_ROOT=$HOME/output
fi
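(With this change the runner scripts resolve output paths against ${EVAL_OUTPUT_ROOT}, falling back to ${HOME}/output when the variable is unset, instead of the previously hardcoded /mnt/eval_reward/data; exporting EVAL_OUTPUT_ROOT before invoking a runner redirects all of its output.)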
12 changes: 9 additions & 3 deletions runners/comparison/hardcoded.sh
@@ -26,12 +26,18 @@ for env_name in "${!REWARDS_BY_ENV[@]}"; do
types=${REWARDS_BY_ENV[$env_name]}
env_name_sanitized=$(echo ${env_name} | sed -e 's/\//_/g')
types_sanitized=$(echo ${types} | sed -e 's/\//_/g')
parallel --header : --results $HOME/output/parallel/comparison/hardcoded_mujoco \
${TRAIN_CMD} env_name=${env_name} \

named_configs=""
if [[ ${env_name} == "evaluating_rewards/PointMassLine-v0" ]]; then
named_configs="dataset_random_transition"
fi

parallel --header : --results ${EVAL_OUTPUT_ROOT}/parallel/comparison/hardcoded_mujoco \
${TRAIN_CMD} env_name=${env_name} ${named_configs} \
seed={seed} \
source_reward_type={source_reward_type} \
target_reward_type={target_reward_type} \
log_dir=${HOME}/output/comparison/hardcoded/${env_name_sanitized}/{source_reward_type_sanitized}_vs_{target_reward_type_sanitized}_seed{seed} \
log_dir=${EVAL_OUTPUT_ROOT}/comparison/hardcoded/${env_name_sanitized}/{source_reward_type_sanitized}_vs_{target_reward_type_sanitized}_seed{seed} \
::: source_reward_type ${types} \
:::+ source_reward_type_sanitized ${types_sanitized} \
::: target_reward_type ${types} \
4 changes: 2 additions & 2 deletions runners/comparison/learnt.sh
@@ -35,12 +35,12 @@ for env_name in ${ENVS}; do
echo "Models: ${MODELS}"
echo "Hardcoded rewards: ${types}"

parallel --header : --results ${OUTPUT_ROOT}/parallel/comparison/learnt/${env_name_sanitized} \
parallel --header : --results ${EVAL_OUTPUT_ROOT}/parallel/comparison/learnt/${env_name_sanitized} \
${TRAIN_CMD} env_name=${env_name} seed={seed} \
source_reward_type=${source_reward_type} \
source_reward_path=${learnt_model_dir}/${env_name_sanitized}/{source_reward}/${model_name} \
target_reward_type={target_reward} {named_config} \
log_dir=${OUTPUT_ROOT}/comparison/${model_prefix}/${env_name_sanitized}/{source_reward}/match_{named_config}_to_{target_reward_sanitized}_seed{seed} \
log_dir=${EVAL_OUTPUT_ROOT}/comparison/${model_prefix}/${env_name_sanitized}/{source_reward}/match_{named_config}_to_{target_reward_sanitized}_seed{seed} \
::: source_reward ${MODELS} \
::: target_reward ${types} \
:::+ target_reward_sanitized ${types_sanitized} \
2 changes: 1 addition & 1 deletion runners/eval/greedy_pm_hardcoded.sh
@@ -20,7 +20,7 @@ GREEDY_REWARD_MODELS="PointMassGroundTruth-v0:None \
PointMassSparse-v0:None \
PointMassDense-v0:None"

parallel --header : --results $HOME/output/parallel/greedy_pm_hardcoded \
parallel --header : --results ${EVAL_OUTPUT_ROOT}/parallel/greedy_pm_hardcoded \
${EVAL_POLICY_CMD} policy_type=evaluating_rewards/MCGreedy-v0 \
env_name={env} policy_path={policy_path} \
::: env ${PM_ENVS} \
4 changes: 2 additions & 2 deletions runners/eval/greedy_pm_irl.sh
@@ -18,13 +18,13 @@ DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"

for env in ${ENVS}; do
env_sanitized=$(echo ${env} | sed -e 's/\//_/g')
reward_paths=$HOME/output/train_adversarial/${env_sanitized}/*/final/discrim/reward_net
reward_paths=${EVAL_OUTPUT_ROOT}/train_adversarial/${env_sanitized}/*/final/discrim/reward_net
policy_paths=""
for rew_path in ${reward_paths}; do
policy_paths="${policy_paths} BasicShapedRewardNet_shaped:${rew_path}"
policy_paths="${policy_paths} BasicShapedRewardNet_unshaped:${rew_path}"
done
parallel --header : --results $HOME/output/parallel/greedy_pm_irl \
parallel --header : --results ${EVAL_OUTPUT_ROOT}/parallel/greedy_pm_irl \
${EVAL_POLICY_CMD} env_name=${env} policy_type=evaluating_rewards/MCGreedy-v0 \
policy_path={policy_path} \
::: policy_path ${policy_paths}
8 changes: 4 additions & 4 deletions runners/eval/learnt.sh
@@ -18,12 +18,12 @@ DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"

if [[ $# -ne 1 ]]; then
echo "usage: $0 <policy prefix>"
echo "policy prefix must be relative to ${OUTPUT_ROOT}"
echo "policy prefix must be relative to ${EVAL_OUTPUT_ROOT}"
exit 1
fi

policy_prefix=$1
policy_dir=${OUTPUT_ROOT}/${policy_prefix}
policy_dir=${EVAL_OUTPUT_ROOT}/${policy_prefix}
model_name="policies/final"

for env_name in ${ENVS}; do
@@ -38,11 +38,11 @@ for env_name in ${ENVS}; do
echo "Policies: ${policies}"
echo "Hardcoded rewards: ${types}"

parallel --header : --results $HOME/output/parallel/learnt \
parallel --header : --results ${EVAL_OUTPUT_ROOT}/parallel/learnt \
${EVAL_POLICY_CMD} env_name=${env_name} policy_type=ppo2 \
reward_type={reward_type} \
policy_path=${policy_dir}/${env_name_sanitized}/{policy_path}/${model_name} \
log_dir=${OUTPUT_ROOT}/eval/${policy_prefix}/${env_name_sanitized}/{policy_path}/eval_under_{reward_type_sanitized} \
log_dir=${EVAL_OUTPUT_ROOT}/eval/${policy_prefix}/${env_name_sanitized}/{policy_path}/eval_under_{reward_type_sanitized} \
::: reward_type ${types} \
:::+ reward_type_sanitized ${types_sanitized} \
::: policy_path ${policies}
2 changes: 1 addition & 1 deletion runners/eval/static.sh
@@ -18,7 +18,7 @@ DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"

POLICY_TYPES="random zero"

parallel --header : --results $HOME/output/parallel/static \
parallel --header : --results ${EVAL_OUTPUT_ROOT}/parallel/static \
${EVAL_POLICY_CMD} env_name={env} policy_type={policy_type} \
::: env ${ENVS} \
::: policy_type ${POLICY_TYPES}
4 changes: 2 additions & 2 deletions runners/irl/train_irl.sh
@@ -20,11 +20,11 @@ TRAIN_CMD=$(call_script "train_adversarial" "with")

for env in ${ENVS}; do
env_sanitized=$(echo ${env} | sed -e 's/\//_/g')
parallel --header : --results $HOME/output/parallel/train_irl \
parallel --header : --results ${EVAL_OUTPUT_ROOT}/parallel/train_irl \
${TRAIN_CMD} env_name=${env} seed={seed} \
init_trainer_kwargs.reward_kwargs.state_only={state_only} \
rollout_path={data_path}/rollouts/final.pkl \
::: data_path $HOME/output/expert_demos/${env_sanitized}/* \
::: data_path ${EVAL_OUTPUT_ROOT}/expert_demos/${env_sanitized}/* \
::: state_only True False \
::: seed 0 1 2
done
4 changes: 2 additions & 2 deletions runners/preferences/hyper_sweep.sh
@@ -23,12 +23,12 @@ PointMassDense-v0
PointMassSparse-v0
"

parallel --header : --results $HOME/output/parallel/train_preferences_hyper \
parallel --header : --results ${EVAL_OUTPUT_ROOT}/parallel/train_preferences_hyper \
${TRAIN_CMD} env_name=evaluating_rewards/PointMassLine-v0 \
seed={seed} target_reward_type=evaluating_rewards/{target_reward} \
batch_timesteps={batch_timesteps} trajectory_length={trajectory_length} \
learning_rate={lr} total_timesteps=5e6 \
log_dir=${HOME}/output/train_preferences_hyper/{target_reward}/batch{batch_timesteps}_of_{trajectory_length}_lr{lr}/{seed} \
log_dir=${EVAL_OUTPUT_ROOT}/train_preferences_hyper/{target_reward}/batch{batch_timesteps}_of_{trajectory_length}_lr{lr}/{seed} \
::: target_reward ${TARGET_REWARDS} \
::: batch_timesteps 500 2500 10000 50000 250000 \
::: trajectory_length 1 5 25 100 \
4 changes: 2 additions & 2 deletions runners/preferences/train_preferences.sh
@@ -25,10 +25,10 @@ for env_name in "${!REWARDS_BY_ENV[@]}"; do
env_name_sanitized=$(echo ${env_name} | sed -e 's/\//_/g')
types_sanitized=$(echo ${types} | sed -e 's/\//_/g')

parallel --header : --results $HOME/output/parallel/train_preferences/${env_name} \
parallel --header : --results ${EVAL_OUTPUT_ROOT}/parallel/train_preferences/${env_name} \
${TRAIN_CMD} env_name=${env_name} \
seed={seed} target_reward_type={target_reward} \
log_dir=${HOME}/output/train_preferences/${env_name_sanitized}/{target_reward_sanitized}/{seed} \
log_dir=${EVAL_OUTPUT_ROOT}/train_preferences/${env_name_sanitized}/{target_reward_sanitized}/{seed} \
::: target_reward ${types} \
:::+ target_reward_sanitized ${types_sanitized} \
::: seed 0 1 2
4 changes: 2 additions & 2 deletions runners/regress/train_regress.sh
@@ -25,10 +25,10 @@ for env_name in "${!REWARDS_BY_ENV[@]}"; do
env_name_sanitized=$(echo ${env_name} | sed -e 's/\//_/g')
types_sanitized=$(echo ${types} | sed -e 's/\//_/g')

parallel --header : --results $HOME/output/parallel/train_regress/${env_name_sanitized} \
parallel --header : --results ${EVAL_OUTPUT_ROOT}/parallel/train_regress/${env_name_sanitized} \
${TRAIN_CMD} env_name=${env_name} \
seed={seed} target_reward_type={target_reward} \
log_dir=${HOME}/output/train_regress/${env_name_sanitized}/{target_reward_sanitized}/{seed} \
log_dir=${EVAL_OUTPUT_ROOT}/train_regress/${env_name_sanitized}/{target_reward_sanitized}/{seed} \
::: target_reward ${types} \
:::+ target_reward_sanitized ${types_sanitized} \
::: seed 0 1 2
2 changes: 1 addition & 1 deletion runners/rl/expert_demos.sh
@@ -20,7 +20,7 @@ EXPERT_DEMOS_CMD=$(call_script "expert_demos" "with")

for env_name in ${ENVS}; do
types=${REWARDS_BY_ENV[$env_name]}
parallel --header : --results $HOME/output/parallel/expert_demos \
parallel --header : --results ${EVAL_OUTPUT_ROOT}/parallel/expert_demos \
${EXPERT_DEMOS_CMD} env_name=${env_name} \
reward_type={type} seed={seed} \
::: type ${types} \
4 changes: 2 additions & 2 deletions runners/rl/transfer.sh
@@ -38,11 +38,11 @@ for env_name in "${!TRANSFER_ENVS[@]}"; do
transfer_envs="${env_name} ${TRANSFER_ENVS[$env_name]}"
transfer_envs_sanitized=$(echo ${transfer_envs} | sed -e 's/\//_/g')

parallel --header : --results $HOME/output/parallel/expert_demos \
parallel --header : --results ${EVAL_OUTPUT_ROOT}/parallel/expert_demos \
${EXPERT_DEMOS_CMD} env_name={env_name} \
reward_type=${source_reward_type} seed={seed} \
reward_path=${learnt_model_dir}/${env_name_sanitized}/{reward_path}/${model_name} \
log_dir=${OUTPUT_ROOT}/expert_transfer/${model_prefix}/${env_name_sanitized}/{env_name_sanitized}/{reward_path}/{seed} \
log_dir=${EVAL_OUTPUT_ROOT}/expert_transfer/${model_prefix}/${env_name_sanitized}/{env_name_sanitized}/{reward_path}/{seed} \
::: env_name ${transfer_envs} \
:::+ env_name_sanitized ${transfer_envs_sanitized} \
::: reward_path ${MODELS} \
6 changes: 3 additions & 3 deletions runners/transfer_point_maze.sh
@@ -36,15 +36,15 @@ if [[ ${fast} == "true" ]]; then
REGRESS_TIMESTEPS="fast"
COMPARISON_TIMESTEPS="fast"
EVAL_TIMESTEPS=4096
PM_OUTPUT=${OUTPUT_ROOT}/transfer_point_maze_fast
PM_OUTPUT=${EVAL_OUTPUT_ROOT}/transfer_point_maze_fast
else
RL_TIMESTEPS=""
IRL_EPOCHS=""
PREFERENCES_TIMESTEPS=""
REGRESS_TIMESTEPS=""
COMPARISON_TIMESTEPS=""
EVAL_TIMESTEPS=100000
PM_OUTPUT=${OUTPUT_ROOT}/transfer_point_maze
PM_OUTPUT=${EVAL_OUTPUT_ROOT}/transfer_point_maze
fi


@@ -142,7 +142,7 @@ wait

for env in ${ENVS}; do
env_sanitized=$(echo ${env} | sed -e 's/\//_/g')
parallel --header : --results $HOME/output/parallel/learnt \
parallel --header : --results ${EVAL_OUTPUT_ROOT}/parallel/learnt \
$(call_script "eval_policy" "with") render=False num_vec=8 \
eval_n_timesteps=${EVAL_TIMESTEPS} policy_type=ppo2 env_name=${env} \
reward_type=${TARGET_REWARD_TYPE} \
6 changes: 3 additions & 3 deletions runners/visualize/visualize_pm_reward.sh
@@ -23,19 +23,19 @@ VISUALIZE_CMD=$(call_script "visualize_pm_reward" "with")

if [[ $# -ne 1 ]]; then
echo "usage: $0 <model prefix>"
echo "model prefix must be relative to ${OUTPUT_ROOT}"
echo "model prefix must be relative to ${EVAL_OUTPUT_ROOT}"
exit 1
fi

MODEL_PREFIX=$1
LEARNT_MODEL_DIR=${OUTPUT_ROOT}/${MODEL_PREFIX}
LEARNT_MODEL_DIR=${EVAL_OUTPUT_ROOT}/${MODEL_PREFIX}

MODELS=$(find ${LEARNT_MODEL_DIR} -name model -printf "%P\n" | xargs dirname)

echo "Visualizing models:"
echo ${MODELS}

parallel --header : --results ${OUTPUT_ROOT}/parallel/visualize_pm_reward/ \
parallel --header : --results ${EVAL_OUTPUT_ROOT}/parallel/visualize_pm_reward/ \
${VISUALIZE_CMD} env_name=evaluating_rewards/PointMassLine-v0 \
reward_type=evaluating_rewards/RewardModel-v0 \
reward_path=${LEARNT_MODEL_DIR}/{reward_path}/model \
19 changes: 7 additions & 12 deletions src/evaluating_rewards/analysis/plot_divergence_heatmap.py
@@ -43,7 +43,6 @@ def default_config():
data_subdir = "hardcoded" # optional, if omitted searches all data (slow)
search = { # parameters to filter by in datasets
"env_name": "evaluating_rewards/Hopper-v3",
"model_wrapper_kwargs": {},
}

# Figure parameters
@@ -85,15 +84,14 @@ def test():


@plot_divergence_heatmap_ex.named_config
def dataset_transition():
"""Searches for comparisons using `random_transition_generator`."""
search = { # noqa: F841 pylint:disable=unused-variable
"dataset_factory": {
"escape/py/function": (
"evaluating_rewards.experiments.datasets.random_transition_generator"
),
},
def large():
"""Large output size, high precision."""
styles = ["paper", "heatmap", "heatmap-2col", "tex"]
heatmap_kwargs = {
"fmt": visualize.short_e,
}
_ = locals()
del _


def _norm(args: Iterable[str]) -> bool:
@@ -105,9 +103,6 @@ def point_mass():
"""Heatmaps for evaluating_rewards/PointMass* environments."""
search = { # noqa: F841 pylint:disable=unused-variable
"env_name": "evaluating_rewards/PointMassLine-v0",
"dataset_factory": {
"escape/py/function": "evaluating_rewards.experiments.datasets.random_policy_generator",
},
}
heatmap_kwargs = {}
heatmap_kwargs["masks"] = {
@@ -153,7 +153,7 @@ def compute_divergence(reward_cfg: Dict[str, Any], discount: float) -> pd.Series
for target_name, target_reward in rewards.items():
if target_name == "all_zero":
continue
closest_reward = tabular.closest_reward_em(
closest_reward = tabular.closest_reward_am(
src_reward, target_reward, n_iter=1000, discount=discount
)
xlen, ylen = reward_cfg[src_name]["state_reward"].shape
5 changes: 4 additions & 1 deletion src/evaluating_rewards/analysis/visualize.py
@@ -62,6 +62,9 @@
"target_reward_path": "Target",
}

WHITELISTED_LEVELS = ["source_reward_type", "target_reward_type"] # never remove these levels


# Saving figures


@@ -123,7 +126,7 @@ def remove_constant_levels(index: pd.MultiIndex) -> pd.MultiIndex:
index = index.copy()
levels = index.names
for level in levels:
if len(index.get_level_values(level).unique()) == 1:
if len(index.get_level_values(level).unique()) == 1 and level not in WHITELISTED_LEVELS:
index = index.droplevel(level=level)
return index
