Vanilla policy gradient: no more optional batch size
+ make names consistent with the solution
+ remove trailing whitespace in the solution (apparently)


Former-commit-id: 9dc4aea
ddorn committed Aug 28, 2023
1 parent 186bfd9 commit 201be50
Showing 2 changed files with 9 additions and 11 deletions.
days/w1d5/1_simple_pg_correction.py (17 changes: 8 additions & 9 deletions)
@@ -58,25 +58,24 @@ def train(env_name='CartPole-v0', hidden_sizes=[32], lr=1e-2,
# make function to compute action distribution
# What is the shape of obs?
@typechecked
- def get_policy(obs: TensorType[..., obs_dim]):
-     # Warning: obs does not always have the same shape.
+ def get_policy(obs: TensorType["b", obs_dim]):
logits = logits_net(obs)
return Categorical(logits=logits)

# make action selection function (outputs int actions, sampled from policy)
# What is the shape of obs?
@typechecked
def get_action(obs: TensorType[obs_dim]) -> int:
- return get_policy(obs).sample().item()
+ return get_policy(obs.unsqueeze(0)).sample().item()

# make loss function whose gradient, for the right data, is policy gradient
# What does the weights parameter represent here?
# What is the shape of obs?
# Answer: b here is the sum of the lengths of all the episodes.
@typechecked
def compute_loss(obs: TensorType["b", obs_dim], act: TensorType["b"], weights: TensorType["b"]):
logp : TensorType["b"] = get_policy(obs).log_prob(act)
return -(logp * weights).mean()
def compute_loss(obs: TensorType["b", obs_dim], acts: TensorType["b"], rewards: TensorType["b"]):
logp : TensorType["b"] = get_policy(obs).log_prob(acts)
return -(logp * rewards).mean()

# make optimizer
optimizer = Adam(logits_net.parameters(), lr=lr)
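
For reference, here is a minimal self-contained sketch of the pattern in the hunk above, with assumed CartPole-sized dimensions and a plain linear policy network (not the repo's exact code): get_policy now always expects a leading batch dimension, get_action unsqueezes a single observation before sampling, and compute_loss builds the pseudo-loss whose gradient is the policy-gradient estimate.

import torch
from torch.distributions import Categorical

obs_dim, n_acts = 4, 2                        # CartPole-v0 sizes, assumed for this sketch
logits_net = torch.nn.Linear(obs_dim, n_acts)

def get_policy(obs):                          # obs: (b, obs_dim); the batch dim is mandatory
    return Categorical(logits=logits_net(obs))

def get_action(obs):                          # obs: (obs_dim,), a single observation
    return get_policy(obs.unsqueeze(0)).sample().item()

def compute_loss(obs, acts, rewards):         # each batched over b = sum of episode lengths
    logp = get_policy(obs).log_prob(acts)     # shape (b,)
    return -(logp * rewards).mean()           # gradient of this is the policy-gradient estimate

b = 6                                         # made-up batch of 6 timesteps
loss = compute_loss(torch.randn(b, obs_dim), torch.randint(n_acts, (b,)), torch.randn(b))
loss.backward()                               # populates grads on logits_net's parameters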
@@ -91,7 +90,7 @@ def train_one_epoch():
batch_lens = [] # for measuring episode lengths

# reset episode-specific variables
- obs = env.reset() # first obs comes from starting distribution
+ obs = env.reset()       # first obs comes from starting distribution
done = False # signal from environment that episode is over
ep_rews = [] # list for rewards accrued throughout ep

@@ -139,8 +138,8 @@ def train_one_epoch():
# take a single policy gradient update step
optimizer.zero_grad()
batch_loss = compute_loss(obs=torch.as_tensor(batch_obs, dtype=torch.float32),
- act=torch.as_tensor(batch_acts, dtype=torch.int32),
- weights=torch.as_tensor(batch_weights, dtype=torch.float32)
+ acts=torch.as_tensor(batch_acts, dtype=torch.int32),
+ rewards=torch.as_tensor(batch_weights, dtype=torch.float32)
)
batch_loss.backward()
optimizer.step()
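
As a side note, here is a tiny hypothetical illustration of the conversion in the hunk above: the per-timestep Python lists collected during the epoch become the batched tensors that compute_loss expects, with b equal to the total number of timesteps across all episodes.

import torch

batch_obs = [[0.1, 0.2, 0.3, 0.4], [0.5, 0.6, 0.7, 0.8]]      # made-up observations (b=2)
batch_acts = [0, 1]                                            # made-up actions
batch_weights = [1.0, 1.0]                                     # placeholder weights

obs = torch.as_tensor(batch_obs, dtype=torch.float32)          # shape (b, obs_dim)
acts = torch.as_tensor(batch_acts, dtype=torch.int32)          # shape (b,)
rewards = torch.as_tensor(batch_weights, dtype=torch.float32)  # shape (b,)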
days/w1d5/vanilla_policy_gradient.ipynb (3 changes: 1 addition & 2 deletions)
@@ -88,7 +88,6 @@
" # What is the shape of obs?\n",
" @typechecked # To be typed\n",
" def get_policy(obs):\n",
" # Warning: obs sometimes has a batch dimension, sometimes there is no such dimension\n",
" logits = logits_net(obs)\n",
" # Tip: Categorical is a convenient pytorch object which enable register logits (or a batch of logits)\n",
" # and then being able to sample from this pseudo-probability distribution with the \".sample()\" method.\n",
@@ -98,7 +97,7 @@
" # What is the shape of obs?\n",
" @typechecked # To be typed\n",
" def get_action(obs):\n",
" return get_policy(obs).sample().item()\n",
" return get_policy(obs.unsquezze(0)).sample().item()\n",
"\n",
" # make loss function whose gradient, for the right data, is policy gradient\n",
" # What is the shape of obs?\n",
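
To illustrate the Categorical tip quoted above, here is a small standalone example with made-up logits (not from the notebook):

import torch
from torch.distributions import Categorical

logits = torch.tensor([[2.0, 0.5], [0.1, 1.0]])   # a batch of two sets of action logits
dist = Categorical(logits=logits)                  # softmax over the last dim gives the probabilities
actions = dist.sample()                            # shape (2,), one sampled action per row
log_probs = dist.log_prob(actions)                 # shape (2,), log-probability of each sampled action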
