From a376a86a94c075e62db8e32039b439d6e984c04b Mon Sep 17 00:00:00 2001
From: Fabian Konstantinidis
Date: Fri, 2 Feb 2024 14:10:52 +0100
Subject: [PATCH 1/6] added typehints

---
 notebooks/mdp_policy_gradient.ipynb                  | 3 ++-
 src/behavior_generation_lecture_python/mdp/policy.py | 6 +++++-
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/notebooks/mdp_policy_gradient.ipynb b/notebooks/mdp_policy_gradient.ipynb
index 137a844..0759ff9 100644
--- a/notebooks/mdp_policy_gradient.ipynb
+++ b/notebooks/mdp_policy_gradient.ipynb
@@ -72,7 +72,8 @@
    "outputs": [],
    "source": [
     "policy_array = [\n",
-    "    derive_deterministic_policy(mdp=grid_mdp, policy=model) for model in model_checkpoints\n",
+    "    derive_deterministic_policy(mdp=grid_mdp, policy=model)\n",
+    "    for model in model_checkpoints\n",
     "]"
    ]
   },
diff --git a/src/behavior_generation_lecture_python/mdp/policy.py b/src/behavior_generation_lecture_python/mdp/policy.py
index d85ecde..19b712e 100644
--- a/src/behavior_generation_lecture_python/mdp/policy.py
+++ b/src/behavior_generation_lecture_python/mdp/policy.py
@@ -7,7 +7,11 @@
 from torch.distributions.categorical import Categorical
 
 
-def multi_layer_perceptron(sizes, activation=nn.ReLU, output_activation=nn.Identity):
+def multi_layer_perceptron(
+    sizes: List[int],
+    activation: torch.nn.Module = nn.ReLU,
+    output_activation: torch.nn.Module = nn.Identity,
+):
     """Returns a multi-layer perceptron"""
     mlp = nn.Sequential()
     for i in range(len(sizes) - 1):

From 43db4952ffd993345095c0aebac67e9b2ead3ecc Mon Sep 17 00:00:00 2001
From: Fabian Konstantinidis
Date: Fri, 2 Feb 2024 14:27:34 +0100
Subject: [PATCH 2/6] try to fix pylint error

---
 src/behavior_generation_lecture_python/mdp/mdp.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/behavior_generation_lecture_python/mdp/mdp.py b/src/behavior_generation_lecture_python/mdp/mdp.py
index 2fe4e15..81d15ab 100644
--- a/src/behavior_generation_lecture_python/mdp/mdp.py
+++ b/src/behavior_generation_lecture_python/mdp/mdp.py
@@ -632,11 +632,11 @@ def policy_gradient(
 
         # compute the loss
         logp = policy.get_log_prob(
-            states=torch.as_tensor(buffer.states, dtype=torch.float32),
-            actions=torch.as_tensor(buffer.actions, dtype=torch.int32),
+            states=torch.tensor(buffer.states, dtype=torch.float),
+            actions=torch.tensor(buffer.actions, dtype=torch.long),
         )
         batch_loss = -(
-            logp * torch.as_tensor(buffer.weights, dtype=torch.float32)
+            logp * torch.tensor(buffer.weights, dtype=torch.float)
         ).mean()
 
         # take a single policy gradient update step

From 4d4a95f01a8b96a3a345639b513c13870c0df037 Mon Sep 17 00:00:00 2001
From: Fabian Konstantinidis
Date: Fri, 2 Feb 2024 14:30:08 +0100
Subject: [PATCH 3/6] ran black

---
 src/behavior_generation_lecture_python/mdp/mdp.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/behavior_generation_lecture_python/mdp/mdp.py b/src/behavior_generation_lecture_python/mdp/mdp.py
index 81d15ab..9960e35 100644
--- a/src/behavior_generation_lecture_python/mdp/mdp.py
+++ b/src/behavior_generation_lecture_python/mdp/mdp.py
@@ -635,9 +635,7 @@ def policy_gradient(
             states=torch.tensor(buffer.states, dtype=torch.float),
             actions=torch.tensor(buffer.actions, dtype=torch.long),
         )
-        batch_loss = -(
-            logp * torch.tensor(buffer.weights, dtype=torch.float)
-        ).mean()
+        batch_loss = -(logp * torch.tensor(buffer.weights, dtype=torch.float)).mean()
 
         # take a single policy gradient update step
         optimizer.zero_grad()

From 7fb8ddcfea2c5a2a6e87d9d2736e59b7d93ee855 Mon Sep 17 00:00:00 2001
From: Fabian Konstantinidis
Date: Fri, 2 Feb 2024 14:38:48 +0100
Subject: [PATCH 4/6] unified torch calls

---
 src/behavior_generation_lecture_python/mdp/mdp.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/behavior_generation_lecture_python/mdp/mdp.py b/src/behavior_generation_lecture_python/mdp/mdp.py
index 9960e35..4f78749 100644
--- a/src/behavior_generation_lecture_python/mdp/mdp.py
+++ b/src/behavior_generation_lecture_python/mdp/mdp.py
@@ -598,7 +598,7 @@ def policy_gradient(
 
             # call model to get next action
             action = policy.get_action(
-                state=torch.as_tensor(state, dtype=torch.float32)
+                state=torch.tensor(state, dtype=torch.float32)
             )
 
             # execute action in the environment

From 90f0bf01a6afb15bf3ac44ae010ef8529f8704d3 Mon Sep 17 00:00:00 2001
From: Fabian Konstantinidis
Date: Fri, 2 Feb 2024 14:42:18 +0100
Subject: [PATCH 5/6] unified torch calls

---
 src/behavior_generation_lecture_python/mdp/policy.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/behavior_generation_lecture_python/mdp/policy.py b/src/behavior_generation_lecture_python/mdp/policy.py
index 19b712e..d0d1f64 100644
--- a/src/behavior_generation_lecture_python/mdp/policy.py
+++ b/src/behavior_generation_lecture_python/mdp/policy.py
@@ -29,7 +29,7 @@ def __init__(self, sizes: List[int], actions: List):
         torch.manual_seed(1337)
         self.net = multi_layer_perceptron(sizes=sizes)
         self.actions = actions
-        self._actions_tensor = torch.as_tensor(actions, dtype=torch.float32).view(
+        self._actions_tensor = torch.tensor(actions, dtype=torch.long).view(
             len(actions), -1
         )
 

From e0bb5582f0c12452d41c56bf430af9ba157a7a01 Mon Sep 17 00:00:00 2001
From: Fabian Konstantinidis
Date: Fri, 2 Feb 2024 14:57:42 +0100
Subject: [PATCH 6/6] ran black

---
 src/behavior_generation_lecture_python/mdp/mdp.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/behavior_generation_lecture_python/mdp/mdp.py b/src/behavior_generation_lecture_python/mdp/mdp.py
index 4f78749..cc495c3 100644
--- a/src/behavior_generation_lecture_python/mdp/mdp.py
+++ b/src/behavior_generation_lecture_python/mdp/mdp.py
@@ -597,9 +597,7 @@ def policy_gradient(
             buffer.states.append(deepcopy(state))
 
             # call model to get next action
-            action = policy.get_action(
-                state=torch.tensor(state, dtype=torch.float32)
-            )
+            action = policy.get_action(state=torch.tensor(state, dtype=torch.float32))
 
             # execute action in the environment
             state, reward, done = mdp.execute_action(state=state, action=action)