Pr allow curriculum learning in grocery ground goal task #82

Merged: 8 commits, Oct 11, 2019

Changes from 4 commits
36 changes: 34 additions & 2 deletions python/social_bot/envs/grocery_ground.py
@@ -75,6 +75,11 @@ def __init__(self,
fail_distance_thresh=3,
random_range=10.0,
random_goal=False,
start_range=0,
increase_range_by_percent=50.,
reward_thresh_to_increase_range=0.4,
percent_full_range_in_curriculum=0.1,
max_reward_q_length=100,
reward_weight=1.0):
"""
Args:
@@ -85,6 +90,17 @@ def __init__(self,
it's considered a failure and is given reward -1
random_range (float): the goal's random position range
random_goal (bool): if true, the teacher will randomly select a goal from the object list each episode
start_range (float): for curriculum learning, the starting random_range to set the goal
Enables curriculum learning if start_range > 1.2 * success_distance_thresh.
NOTE: Because curriculum learning is implemented by the teacher in the environment,
the teacher's state is currently not stored in model checkpoints, so resuming is not supported.
Contributor:
As the curriculum range is increased automatically according to the parameter reward_thresh_to_increase_range, is it supposed to be kind of supporting resuming?

Contributor:
This is described in Issue #79.

increase_range_by_percent (float): for curriculum learning, how much to increase random range
every time the agent reaches the specified amount of reward.
reward_thresh_to_increase_range (float): for curriculum learning, how much reward to reach
before the teacher increases random range.
percent_full_range_in_curriculum (float): if above 0, randomly throw in x% of training examples
where random_range is the full range instead of the easier ones in the curriculum.
max_reward_q_length (int): how many recent rewards to consider when estimating agent accuracy.
"""
assert goal_name is not None, "Goal name needs to be set, not None."
GoalTask.__init__(
@@ -93,7 +109,12 @@ def __init__(self,
goal_name=goal_name,
success_distance_thresh=success_distance_thresh,
fail_distance_thresh=fail_distance_thresh,
random_range=random_range)
random_range=random_range,
start_range=start_range,
increase_range_by_percent=increase_range_by_percent,
reward_thresh_to_increase_range=reward_thresh_to_increase_range,
percent_full_range_in_curriculum=percent_full_range_in_curriculum,
max_reward_q_length=max_reward_q_length)
GroceryGroundTaskBase.__init__(self)
self._random_goal = random_goal
self._objects_in_world = [
@@ -104,8 +125,11 @@ def __init__(self,
'coke_can', 'table', 'bookshelf', 'car_wheel', 'plastic_cup',
'beer', 'hammer'
]
logging.info("goal_name %s, random_goal %d, fail_distance_thresh %f",
logging.info("goal_name %s, random_goal %d, fail_distance_thresh %f,",
self._goal_name, self._random_goal, fail_distance_thresh)
if GoalTask.should_use_curriculum_training(self):
logging.info("start_range %f, reward_thresh_to_increase_range %f",
self._start_range, self._reward_thresh_to_increase_range)
self._pos_list = list(itertools.product(range(-5, 5), range(-5, 5)))
self._pos_list.remove((0, 0))
self.reward_weight = reward_weight
@@ -509,6 +533,9 @@ def __init__(self,
agent_type='pioneer2dx_noplugin',
world_time_precision=None,
step_time=0.1,
random_goal=None,
Contributor:
It seems that random_goal, fail_distance_thresh and max_steps are not used in this class. These parameters are configured by gin files.

Contributor (Author):
Good point. Forgot to remove them.

fail_distance_thresh=3,
max_steps=200,
port=None,
action_cost=0.0,
resized_image_size=(64, 64),
@@ -534,6 +561,11 @@ def __init__(self,
step_time (float): the period of one step of the environment.
step_time / world_time_precision is how many simulator substeps during one
environment step. For some complex agents, e.g., icub, using a step_time of 0.05 is better
random_goal (bool): Optional flag to control whether goal is randomly picked
or just the ball.
fail_distance_thresh (float): end the episode if the agent is too far away from the target.
max_steps (int): maximum number of simulation steps in an episode.
(Unless a smaller value is specified in REPO/__init__.py)
port: Gazebo port; needs to be specified when running multiple environments in parallel
action_cost (float): Add an extra action cost to the reward, which helps to train
an energy/force-efficient policy or reduce unnecessary movements
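For context, here is a minimal sketch of how the new curriculum parameters fit together when constructing the goal task. The parameter names and default values are taken from the diff above; the class name GroceryGroundGoalTask, the direct construction, and the concrete values are illustrative assumptions (in practice these parameters are bound through gin config files, as noted in the review above).

task = GroceryGroundGoalTask(
    goal_name='ball',
    success_distance_thresh=0.5,
    fail_distance_thresh=3,
    random_range=10.0,                     # full range the curriculum grows toward
    start_range=2.0,                       # > 1.2 * success_distance_thresh, so curriculum training is enabled
    increase_range_by_percent=50.,         # how much the range grows each time the reward threshold is met
    reward_thresh_to_increase_range=0.4,   # required average of recent rewards before the range grows
    percent_full_range_in_curriculum=0.1,  # ~10% of episodes sample the full range regardless
    max_reward_q_length=100)               # number of recent episode rewards considered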
64 changes: 61 additions & 3 deletions python/social_bot/teacher_tasks.py
@@ -15,6 +15,7 @@
A variety of teacher tasks.
"""

from collections import deque
import math
import numpy as np
import os
@@ -41,7 +42,12 @@ def __init__(self,
goal_name="goal",
success_distance_thresh=0.5,
fail_distance_thresh=0.5,
random_range=2.0):
random_range=2.0,
start_range=0,
increase_range_by_percent=50.,
reward_thresh_to_increase_range=0.4,
percent_full_range_in_curriculum=0.1,
max_reward_q_length=100):
"""
Args:
max_steps (int): episode will end if the goal is not reached in so many steps
@@ -50,15 +56,57 @@
fail_distance_thresh (float): if the agent moves away from the goal more than this distance,
it's considered a failure and is given reward -1
random_range (float): the goal's random position range
start_range (float): for curriculum learning, the starting random_range to set the goal
increase_range_by_percent (float): for curriculum learning, how much to increase random range
every time the agent reaches the specified amount of reward.
reward_thresh_to_increase_range (float): for curriculum learning, how much reward to reach
before the teacher increases random range.
percent_full_range_in_curriculum (float): if above 0, randomly throw in x% of training examples
where random_range is the full range instead of the easier ones in the curriculum.
max_reward_q_length (int): how many recent rewards to consider when estimating agent accuracy.
"""
super().__init__()
self._q = deque()
self._goal_name = goal_name
self._success_distance_thresh = success_distance_thresh
self._fail_distance_thresh = fail_distance_thresh
self._max_steps = max_steps
self._random_range = random_range
self._start_range = start_range
self._is_full_range_in_curriculum = False
if self.should_use_curriculum_training():
logging.info("Setting random_range to %f", self._start_range)
self._orig_random_range = random_range
self._random_range = start_range
self._max_reward_q_length = max_reward_q_length
self._reward_thresh_to_increase_range = reward_thresh_to_increase_range
self._increase_range_by_percent = increase_range_by_percent
self._percent_full_range_in_curriculum = percent_full_range_in_curriculum
else:
self._random_range = random_range
self.task_vocab = ['hello', 'goal', 'well', 'done', 'failed', 'to']

def should_use_curriculum_training(self):
return self._start_range >= self._success_distance_thresh * 1.2

def _push_reward_queue(self, value):
if (not self.should_use_curriculum_training() or
self._is_full_range_in_curriculum):
return
while len(self._q) >= self._max_reward_q_length:
Contributor:
deque has an argument maxlen. You don't need to pop it if maxlen is provided.

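For reference, a minimal standalone sketch of the maxlen behavior being suggested (standard collections.deque usage, not code from this PR):

from collections import deque

# With maxlen set, the deque drops its oldest entry automatically once it is full,
# so the explicit popleft() loop in the diff is no longer needed.
q = deque(maxlen=3)
for reward in [1, 0, 1, 1]:
    q.append(reward)
print(list(q))  # [0, 1, 1] -- the first reward was evicted automatically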
Contributor (Author):
Nice!

self._q.popleft()
self._q.append(value)
if (value > 0 and len(self._q) == self._max_reward_q_length and
Contributor:
I think "len(self._q) == self._max_reward_q_length" can be removed. It's unlikely to exceed the reward_thresh without the queue being full.

Contributor (Author):
Sounds good.

Contributor (Author):
Oh, actually, after the curriculum advances we clear the deque, and it's very likely the agent can get a few episodes right away (because it can already pass the earlier level of the curriculum) and pass the next level of the curriculum by accident.

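To illustrate the point above, a small sketch with assumed numbers (using the defaults max_reward_q_length=100 and reward_thresh_to_increase_range=0.4): right after the queue is cleared, a short streak of successes can satisfy the sum threshold while the full-queue check still makes the teacher wait.

from collections import deque

# Hypothetical situation right after the curriculum advances and q.clear() runs:
# an agent that already mastered the previous level racks up 40 straight successes.
q = deque(maxlen=100)
q.extend([1] * 40)
print(sum(q) >= 100 * 0.4)  # True  -- the sum threshold alone would advance the curriculum again
print(len(q) == 100)        # False -- the full-queue check makes the teacher wait for 100 episodes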
sum(self._q) >= self._max_reward_q_length *
self._reward_thresh_to_increase_range):
self._random_range *= 1. + self._increase_range_by_percent
if self._random_range > self._orig_random_range:
self._random_range = self._orig_random_range
logging.info("Raising random_range to %f", self._random_range)
self._q.clear()
Contributor:
I suggest a Polyak average could be used here, which would make the code logic simpler and less computational, and can have a similar effect:

alpha = 0.001
self.polyak_reward = value * alpha + self.polyak_reward * (1 - alpha)
if self.polyak_reward > reward_thresh_to_increase_range:
    self._random_range += self._random_range * self._increase_range_by_percent
    self.polyak_reward = 0

alpha has a similar effect to 'max_reward_q_length' here.

Contributor:
I think using a queue is OK; it's easier to reason about the effect, plus the code isn't that much more complex if maxlen is used.

Contributor (Author):
Interesting, Jiangtao, I'll keep this in mind for the future. For this one, I'll just use success rate?

Contributor:
Yes, of course, just a simple suggestion :-)


def get_random_range(self):
return self._random_range

def run(self, agent, world):
"""
Start a teaching episode for this task.
@@ -86,6 +134,7 @@ def run(self, agent, world):
goal_dir = (goal_loc[0:2] - loc[0:2]) / dist
dot = sum(dir * goal_dir)
if dot > 0.707:
self._push_reward_queue(1)
# within 45 degrees of the agent direction
logging.debug("loc: " + str(loc) + " goal: " +
str(goal_loc) + "dist: " + str(dist))
@@ -96,17 +145,26 @@
else:
agent_sentence = yield TeacherAction()
elif dist > self._initial_dist + self._fail_distance_thresh:
self._push_reward_queue(0)
logging.debug("loc: " + str(loc) + " goal: " + str(goal_loc) +
"dist: " + str(dist))
yield TeacherAction(reward=-1.0, sentence="failed", done=True)
else:
agent_sentence = yield TeacherAction(sentence=self._goal_name)
logging.debug("loc: " + str(loc) + " goal: " + str(goal_loc) +
"dist: " + str(dist))
self._push_reward_queue(0)
yield TeacherAction(reward=-1.0, sentence="failed", done=True)

def _move_goal(self, goal, agent_loc):
range = self._random_range
if (self.should_use_curriculum_training() and
self._percent_full_range_in_curriculum > 0 and
random.random() < self._percent_full_range_in_curriculum):
range = self._orig_random_range
self._is_full_range_in_curriculum = True
else:
range = self._random_range
self._is_full_range_in_curriculum = False
while True:
loc = (random.random() * range - range / 2,
random.random() * range - range / 2, 0)
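To summarize how the new reward queue drives the curriculum in GoalTask, here is a minimal sketch mirroring the core check in _push_reward_queue, using the defaults from the diff; the episode outcomes are made-up numbers for illustration.

# Hypothetical recent episode outcomes (1 = success, 0 = failure).
recent_rewards = [1] * 45 + [0] * 55

max_reward_q_length = 100
reward_thresh_to_increase_range = 0.4
increase_range_by_percent = 50.
random_range = 2.0        # current curriculum range
orig_random_range = 10.0  # full range configured for the task

# The queue must be full and the summed reward must reach
# max_reward_q_length * reward_thresh_to_increase_range (40 here) before the range grows.
if (len(recent_rewards) == max_reward_q_length and
        sum(recent_rewards) >= max_reward_q_length * reward_thresh_to_increase_range):
    random_range *= 1. + increase_range_by_percent
    random_range = min(random_range, orig_random_range)
print(random_range)  # 10.0 -- capped at the original full range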