F/eval x play #157

Open: wants to merge 3 commits into main
10 changes: 1 addition & 9 deletions pax/agents/mfos_ppo/ppo_gru.py
@@ -61,9 +61,7 @@ def __init__(
random_key: jnp.ndarray,
gru_dim: int,
obs_spec: Tuple,
batch_size: int = 2000,
num_envs: int = 4,
num_steps: int = 500,
num_minibatches: int = 16,
num_epochs: int = 4,
clip_value: bool = True,
@@ -479,15 +477,12 @@ def prepare_batch(

# Other useful hyperparameters
self._num_envs = num_envs # number of environments
self._num_steps = num_steps # number of steps per environment
self._batch_size = int(num_envs * num_steps) # number in one batch
self._num_minibatches = num_minibatches # number of minibatches
self._num_epochs = num_epochs # number of epochs to use sample
self._gru_dim = gru_dim

def reset_memory(self, memory, eval=False) -> TrainingState:
num_envs = 1 if eval else self._num_envs

memory = memory._replace(
extras={
"values": jnp.zeros(num_envs),
@@ -573,8 +568,7 @@ def make_mfos_agent(

# Optimizer
transition_steps = (
- num_iterations,
- *agent_args.num_epochs * agent_args.num_minibatches,
+ num_iterations * agent_args.num_epochs * agent_args.num_minibatches,
)

if agent_args.lr_scheduling:
@@ -607,9 +601,7 @@ def make_mfos_agent(
random_key=random_key,
gru_dim=gru_dim,
obs_spec=obs_spec,
batch_size=None,
num_envs=args.num_envs,
num_steps=args.num_steps,
num_minibatches=agent_args.num_minibatches,
num_epochs=agent_args.num_epochs,
clip_value=agent_args.clip_value,
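Side note on the transition_steps change above: when agent_args.lr_scheduling is enabled, this value sizes the learning-rate decay, so it needs to be the total number of optimizer updates (num_iterations * num_epochs * num_minibatches) rather than the tuple the old comma-and-unpacking form produced. A minimal sketch of how such a scalar step count typically drives an optax schedule; the concrete numbers and the optimizer chain are illustrative assumptions, not the PR's exact code:

```python
# Sketch only: a scalar transition_steps feeding an optax learning-rate
# schedule (hyperparameter values below are assumed for illustration).
import optax

num_iterations = 1000
num_epochs = 4
num_minibatches = 16

# Total number of gradient updates performed over training.
transition_steps = num_iterations * num_epochs * num_minibatches  # 64_000

schedule = optax.linear_schedule(
    init_value=2.5e-4,   # assumed initial learning rate
    end_value=0.0,
    transition_steps=transition_steps,
)
optimizer = optax.chain(
    optax.clip_by_global_norm(0.5),
    optax.scale_by_adam(eps=1e-5),
    optax.scale_by_schedule(schedule),
    optax.scale(-1.0),   # gradient descent step direction
)
```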
2 changes: 1 addition & 1 deletion pax/conf/experiment/ipd/shaper_v_tabular.yaml
@@ -17,7 +17,7 @@ runner: evo
top_k: 5
popsize: 1000
num_envs: 2
- num_opps: 1
+ num_opps: 10
num_outer_steps: 100
num_inner_steps: 100
num_iters: 5000
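The num_opps bump above interacts with the batch sizing noted in the new eval configs below (env_batch_size = num_envs * num_opponents). A quick illustrative calculation, with names taken from that comment:

```python
# Effective rollout batch for shaper_v_tabular.yaml after this change.
num_envs = 2
num_opps = 10                        # was 1
env_batch_size = num_envs * num_opps
print(env_batch_size)                # 20 parallel environments
```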
87 changes: 87 additions & 0 deletions pax/conf/experiment/ipditm/eval_gs_gs.yaml
@@ -0,0 +1,87 @@
# @package _global_

# Agents
agent1: 'PPO'
agent2: 'PPO'

# Environment
env_id: InTheMatrix
env_type: meta
env_discount: 0.96
payoff: [[[3, 0], [5, 1]], [[3, 5], [0, 1]]]
runner: ipditm_eval
freeze: 5
fixed_coins: False

# Training hyperparameters

# env_batch_size = num_envs * num_opponents
num_envs: 50
num_opps: 1
num_outer_steps: 100
num_inner_steps: 152
save_interval: 100
num_iters: 1
save_gif: False
# Evaluation
# Shaper
run_path1: ucl-dark/ipditm/2wjr55mr
model_path1: exp/shaping-PPO-vs-PPO_memory/run-seed-0/2023-01-05_09.59.53.063797/generation_1000
# GS
run_path2: ucl-dark/ipditm/2wjr55mr
model_path2: exp/shaping-PPO-vs-PPO_memory/run-seed-0/2023-01-05_09.59.53.063797/generation_1000

# PPO agent parameters
ppo1:
  num_minibatches: 8
  num_epochs: 2
  gamma: 0.96
  gae_lambda: 0.95
  ppo_clipping_epsilon: 0.2
  value_coeff: 0.5
  clip_value: True
  max_gradient_norm: 0.5
  anneal_entropy: False
  entropy_coeff_start: 0.1
  entropy_coeff_horizon: 0.6e8
  entropy_coeff_end: 0.005
  lr_scheduling: False
  learning_rate: 0.005
  adam_epsilon: 1e-5
  with_memory: True
  with_cnn: True
  output_channels: 16
  kernel_shape: [3, 3]
  separate: False # only works with CNN
  hidden_size: 8

ppo2:
  num_minibatches: 8
  num_epochs: 2
  gamma: 0.96
  gae_lambda: 0.95
  ppo_clipping_epsilon: 0.2
  value_coeff: 0.5
  clip_value: True
  max_gradient_norm: 0.5
  anneal_entropy: False
  entropy_coeff_start: 0.1
  entropy_coeff_horizon: 0.6e8
  entropy_coeff_end: 0.005
  lr_scheduling: False
  learning_rate: 0.005
  adam_epsilon: 1e-5
  with_memory: True
  with_cnn: True
  output_channels: 16
  kernel_shape: [3, 3]
  separate: False # only works with CNN
  hidden_size: 8

# Logging setup
wandb:
  entity: "ucl-dark"
  project: ipditm
  group: 'xplay-eval-${agent1}-vs-${agent2}'
  name: run-seed-${seed}
  log: True
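These experiment files are Hydra configs, as the `# @package _global_` header indicates. Below is a sketch of composing and inspecting one of them with Hydra's compose API; the root config name, config path, and override group are assumptions about the repo layout, while the experiment file name comes from this PR:

```python
# Sketch: resolve eval_gs_gs.yaml with Hydra's compose API and check the
# effective environment batch. Paths and the root config name are assumptions.
from hydra import compose, initialize

with initialize(version_base=None, config_path="pax/conf"):
    cfg = compose(
        config_name="config",  # assumed root config
        overrides=["+experiment/ipditm=eval_gs_gs"],
    )

print(cfg.agent1, "vs", cfg.agent2)   # PPO vs PPO
print(cfg.num_envs * cfg.num_opps)    # env_batch_size = 50
```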
85 changes: 85 additions & 0 deletions pax/conf/experiment/ipditm/eval_gs_mfos.yaml
@@ -0,0 +1,85 @@
# @package _global_

# Agents
agent1: 'PPO'
agent2: 'MFOS'

# Environment
env_id: InTheMatrix
env_type: meta
env_discount: 0.96
payoff: [[[3, 0], [5, 1]], [[3, 5], [0, 1]]]
runner: ipditm_eval
freeze: 5
fixed_coins: False

# Training hyperparameters

# env_batch_size = num_envs * num_opponents
num_envs: 50
num_opps: 1
num_outer_steps: 100
num_inner_steps: 152
save_interval: 100
num_iters: 1
save_gif: False
# Evaluation
# GS
run_path1: ucl-dark/ipditm/2wjr55mr
model_path1: exp/shaping-PPO-vs-PPO_memory/run-seed-0/2023-01-05_09.59.53.063797/generation_1000
run_path2: ucl-dark/ipditm/226zwu1v
model_path2: exp/shaping-MFOS-vs-PPO_memory/run-seed-0/2023-01-09_10.27.04.619601/generation_1000
# PPO agent parameters
ppo1:
  num_minibatches: 8
  num_epochs: 2
  gamma: 0.96
  gae_lambda: 0.95
  ppo_clipping_epsilon: 0.2
  value_coeff: 0.5
  clip_value: True
  max_gradient_norm: 0.5
  anneal_entropy: False
  entropy_coeff_start: 0.1
  entropy_coeff_horizon: 0.6e8
  entropy_coeff_end: 0.005
  lr_scheduling: False
  learning_rate: 0.005
  adam_epsilon: 1e-5
  with_memory: True
  with_cnn: True
  output_channels: 16
  kernel_shape: [3, 3]
  separate: False # only works with CNN
  hidden_size: 8

ppo2:
  num_minibatches: 8
  num_epochs: 2
  gamma: 0.96
  gae_lambda: 0.95
  ppo_clipping_epsilon: 0.2
  value_coeff: 0.5
  clip_value: True
  max_gradient_norm: 0.5
  anneal_entropy: False
  entropy_coeff_start: 0.1
  entropy_coeff_horizon: 0.6e8
  entropy_coeff_end: 0.05
  lr_scheduling: False
  learning_rate: 0.005
  adam_epsilon: 1e-5
  with_memory: True
  with_cnn: True
  output_channels: 16
  kernel_shape: [3, 3]
  separate: False # only works with CNN
  hidden_size: 16

# Logging setup
wandb:
  entity: "ucl-dark"
  project: ipditm
  group: 'xplay-eval-${agent1}-vs-${agent2}'
  name: run-seed-${seed}
  log: True
87 changes: 87 additions & 0 deletions pax/conf/experiment/ipditm/eval_gs_shaper.yaml
@@ -0,0 +1,87 @@
# @package _global_

# Agents
agent2: 'PPO_memory'
agent1: 'PPO'

# Environment
env_id: InTheMatrix
env_type: meta
env_discount: 0.96
payoff: [[[3, 0], [5, 1]], [[3, 5], [0, 1]]]
runner: ipditm_eval
freeze: 5
fixed_coins: False

# Training hyperparameters

# env_batch_size = num_envs * num_opponents
num_envs: 50
num_opps: 1
num_outer_steps: 100
num_inner_steps: 152
save_interval: 100
num_iters: 1
save_gif: False
# Evaluation
# Shaper
run_path2: ucl-dark/ipditm/1vpl5161
model_path2: exp/shaping-PPO_memory-vs-PPO_memory/run-seed-0/2023-01-05_14.13.25.169599/generation_1000
# GS
run_path1: ucl-dark/ipditm/2wjr55mr
model_path1: exp/shaping-PPO-vs-PPO_memory/run-seed-0/2023-01-05_09.59.53.063797/generation_1000

# PPO agent parameters
ppo2:
  num_minibatches: 8
  num_epochs: 2
  gamma: 0.96
  gae_lambda: 0.95
  ppo_clipping_epsilon: 0.2
  value_coeff: 0.5
  clip_value: True
  max_gradient_norm: 0.5
  anneal_entropy: False
  entropy_coeff_start: 0.1
  entropy_coeff_horizon: 0.6e8
  entropy_coeff_end: 0.005
  lr_scheduling: False
  learning_rate: 0.005
  adam_epsilon: 1e-5
  with_memory: True
  with_cnn: True
  output_channels: 16
  kernel_shape: [3, 3]
  separate: False # only works with CNN
  hidden_size: 32

ppo1:
  num_minibatches: 8
  num_epochs: 2
  gamma: 0.96
  gae_lambda: 0.95
  ppo_clipping_epsilon: 0.2
  value_coeff: 0.5
  clip_value: True
  max_gradient_norm: 0.5
  anneal_entropy: False
  entropy_coeff_start: 0.1
  entropy_coeff_horizon: 0.6e8
  entropy_coeff_end: 0.005
  lr_scheduling: False
  learning_rate: 0.005
  adam_epsilon: 1e-5
  with_memory: True
  with_cnn: True
  output_channels: 16
  kernel_shape: [3, 3]
  separate: False # only works with CNN
  hidden_size: 8

# Logging setup
wandb:
  entity: "ucl-dark"
  project: ipditm
  group: 'xplay-eval-${agent1}-vs-${agent2}'
  name: run-seed-${seed}
  log: True
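For reference, the run_path/model_path pairs in these configs point at Weights & Biases runs and checkpoint files stored with them. How the ipditm_eval runner loads them is not shown in this diff; the following is only a hedged sketch of pulling such files with the public wandb API, with the run and path copied from eval_gs_shaper.yaml above:

```python
# Sketch only (an assumption about how these fields could be consumed):
# download the shaper checkpoint files referenced by run_path2/model_path2.
import wandb

api = wandb.Api()
run = api.run("ucl-dark/ipditm/1vpl5161")  # run_path2
prefix = (
    "exp/shaping-PPO_memory-vs-PPO_memory/run-seed-0/"
    "2023-01-05_14.13.25.169599/generation_1000"
)                                          # model_path2
for f in run.files():
    if f.name.startswith(prefix):
        f.download(replace=True)           # saved under the same relative path
```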