Pr allow curriculum learning in grocery ground goal task #82

Merged: 8 commits, Oct 11, 2019

Changes from 4 commits
36 changes: 34 additions & 2 deletions python/social_bot/envs/grocery_ground.py
@@ -75,6 +75,11 @@ def __init__(self,
fail_distance_thresh=3,
random_range=10.0,
random_goal=False,
start_range=0,
increase_range_by_percent=50.,
reward_thresh_to_increase_range=0.4,
percent_full_range_in_curriculum=0.1,
max_reward_q_length=100,
reward_weight=1.0):
"""
Args:
@@ -85,6 +90,17 @@ def __init__(self,
it's considered a failure and is given reward -1
random_range (float): the goal's random position range
random_goal (bool): if true, the teacher will randomly select a goal from the object list each episode
start_range (float): for curriculum learning, the starting random_range to set the goal
Enables curriculum learning if start_range > 1.2 * success_distance_thresh.
NOTE: Because curriculum learning is implemented by the teacher in the environment,
the teacher's state is currently not stored in model checkpoints, so resuming is not supported.
Contributor:
As the curriculum range is increased automatically according to the parameter reward_thresh_to_increase_range, is it supposed to be kind of supporting resuming?

Contributor:
This is described in Issue #79.

increase_range_by_percent (float): for curriculum learning, how much to increase random range
every time the agent reaches the specified amount of reward.
reward_thresh_to_increase_range (float): for curriculum learning, how much reward to reach
before the teacher increases random range.
percent_full_range_in_curriculum (float): if above 0, randomly throw in x% of training examples
where random_range is the full range instead of the easier ones in the curriculum.
max_reward_q_length (int): how many recent rewards to consider when estimating agent accuracy.
"""
assert goal_name is not None, "Goal name needs to be set, not None."
GoalTask.__init__(
@@ -93,7 +109,12 @@ def __init__(self,
goal_name=goal_name,
success_distance_thresh=success_distance_thresh,
fail_distance_thresh=fail_distance_thresh,
random_range=random_range)
random_range=random_range,
start_range=start_range,
increase_range_by_percent=increase_range_by_percent,
reward_thresh_to_increase_range=reward_thresh_to_increase_range,
percent_full_range_in_curriculum=percent_full_range_in_curriculum,
max_reward_q_length=max_reward_q_length)
GroceryGroundTaskBase.__init__(self)
self._random_goal = random_goal
self._objects_in_world = [
@@ -104,8 +125,11 @@ def __init__(self,
'coke_can', 'table', 'bookshelf', 'car_wheel', 'plastic_cup',
'beer', 'hammer'
]
logging.info("goal_name %s, random_goal %d, fail_distance_thresh %f",
logging.info("goal_name %s, random_goal %d, fail_distance_thresh %f,",
self._goal_name, self._random_goal, fail_distance_thresh)
if GoalTask.should_use_curriculum_training(self):
logging.info("start_range %f, reward_thresh_to_increase_range %f",
self._start_range, self._reward_thresh_to_increase_range)
self._pos_list = list(itertools.product(range(-5, 5), range(-5, 5)))
self._pos_list.remove((0, 0))
self.reward_weight = reward_weight
@@ -509,6 +533,9 @@ def __init__(self,
agent_type='pioneer2dx_noplugin',
world_time_precision=None,
step_time=0.1,
random_goal=None,
Contributor:
It seems that random_goal, fail_distance_thresh and max_steps are not used in this class. These parameters are configured by gin files.

Contributor (Author):
Good point. Forgot to remove them.

fail_distance_thresh=3,
max_steps=200,
port=None,
action_cost=0.0,
resized_image_size=(64, 64),
@@ -534,6 +561,11 @@ def __init__(self,
step_time (float): the period of one step of the environment.
step_time / world_time_precision is how many simulator substeps during one
environment step. For some complex agents, e.g., icub, using a step_time of 0.05 is better
random_goal (bool): Optional flag to control whether goal is randomly picked
or just the ball.
fail_distance_thresh (float): end the episode if the agent is too far away from the target.
max_steps (int): maximum number of simulation steps in an episode.
(Unless a smaller value is specified in REPO/__init__.py)
port: Gazebo port; needs to be specified when running multiple environments in parallel
action_cost (float): Add an extra action cost to the reward, which helps to train
an energy/force-efficient policy or reduce unnecessary movements
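For context, here is a minimal sketch of how the new curriculum parameters fit together when constructing the goal task. The parameter names and default values are taken from the diff above; the class name GroceryGroundGoalTask, the direct construction, and the concrete values are illustrative assumptions (in practice these parameters are bound through gin config files, as noted in the review above).

task = GroceryGroundGoalTask(
    goal_name='ball',
    success_distance_thresh=0.5,
    fail_distance_thresh=3,
    random_range=10.0,                     # full range the curriculum grows toward
    start_range=2.0,                       # > 1.2 * success_distance_thresh, so curriculum training is enabled
    increase_range_by_percent=50.,         # how much the range grows each time the reward threshold is met
    reward_thresh_to_increase_range=0.4,   # required average of recent rewards before the range grows
    percent_full_range_in_curriculum=0.1,  # ~10% of episodes sample the full range regardless
    max_reward_q_length=100)               # number of recent episode rewards considered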
64 changes: 61 additions & 3 deletions python/social_bot/teacher_tasks.py
@@ -15,6 +15,7 @@
A variety of teacher tasks.
"""

from collections import deque
import math
import numpy as np
import os
@@ -41,7 +42,12 @@ def __init__(self,
goal_name="goal",
success_distance_thresh=0.5,
fail_distance_thresh=0.5,
random_range=2.0):
random_range=2.0,
start_range=0,
increase_range_by_percent=50.,
reward_thresh_to_increase_range=0.4,
percent_full_range_in_curriculum=0.1,
max_reward_q_length=100):
"""
Args:
max_steps (int): episode will end if the goal is not reached in so many steps
@@ -50,15 +56,57 @@
fail_distance_thresh (float): if the agent moves away from the goal more than this distance,
it's considered a failure and is given reward -1
random_range (float): the goal's random position range
start_range (float): for curriculum learning, the starting random_range to set the goal
increase_range_by_percent (float): for curriculum learning, how much to increase random range
every time the agent reaches the specified amount of reward.
reward_thresh_to_increase_range (float): for curriculum learning, how much reward to reach
before the teacher increases random range.
percent_full_range_in_curriculum (float): if above 0, randomly throw in x% of training examples
where random_range is the full range instead of the easier ones in the curriculum.
max_reward_q_length (int): how many recent rewards to consider when estimating agent accuracy.
"""
super().__init__()
self._q = deque()
self._goal_name = goal_name
self._success_distance_thresh = success_distance_thresh
self._fail_distance_thresh = fail_distance_thresh
self._max_steps = max_steps
self._random_range = random_range
self._start_range = start_range
self._is_full_range_in_curriculum = False
if self.should_use_curriculum_training():
logging.info("Setting random_range to %f", self._start_range)
self._orig_random_range = random_range
self._random_range = start_range
self._max_reward_q_length = max_reward_q_length
self._reward_thresh_to_increase_range = reward_thresh_to_increase_range
self._increase_range_by_percent = increase_range_by_percent
self._percent_full_range_in_curriculum = percent_full_range_in_curriculum
else:
self._random_range = random_range
self.task_vocab = ['hello', 'goal', 'well', 'done', 'failed', 'to']

def should_use_curriculum_training(self):
return self._start_range >= self._success_distance_thresh * 1.2

def _push_reward_queue(self, value):
if (not self.should_use_curriculum_training() or
self._is_full_range_in_curriculum):
return
while len(self._q) >= self._max_reward_q_length:
Contributor:
deque has an argument maxlen. You don't need to pop it if maxlen is provided.

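For reference, a minimal standalone sketch of the maxlen behavior being suggested (standard collections.deque usage, not code from this PR):

from collections import deque

# With maxlen set, the deque drops its oldest entry automatically once it is full,
# so the explicit popleft() loop in the diff is no longer needed.
q = deque(maxlen=3)
for reward in [1, 0, 1, 1]:
    q.append(reward)
print(list(q))  # [0, 1, 1] -- the first reward was evicted automatically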
Contributor (Author):
Nice!

self._q.popleft()
self._q.append(value)
if (value > 0 and len(self._q) == self._max_reward_q_length and
Contributor:
I think "len(self._q) == self._max_reward_q_length" can be removed. It's unlikely to exceed the reward_thresh without the queue being full.

Contributor (Author):
Sounds good.

Contributor (Author):
Oh, actually, after the curriculum advances we clear the deque, and it's very likely the agent can get a few episodes right away (because it can already pass the earlier level of the curriculum) and pass the next level of the curriculum by accident.

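To illustrate the point above, a small sketch with assumed numbers (using the defaults max_reward_q_length=100 and reward_thresh_to_increase_range=0.4): right after the queue is cleared, a short streak of successes can satisfy the sum threshold while the full-queue check still makes the teacher wait.

from collections import deque

# Hypothetical situation right after the curriculum advances and q.clear() runs:
# an agent that already mastered the previous level racks up 40 straight successes.
q = deque(maxlen=100)
q.extend([1] * 40)
print(sum(q) >= 100 * 0.4)  # True  -- the sum threshold alone would advance the curriculum again
print(len(q) == 100)        # False -- the full-queue check makes the teacher wait for 100 episodes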
sum(self._q) >= self._max_reward_q_length *
self._reward_thresh_to_increase_range):
self._random_range *= 1. + self._increase_range_by_percent
if self._random_range > self._orig_random_range:
self._random_range = self._orig_random_range
logging.info("Raising random_range to %f", self._random_range)
self._q.clear()
Contributor:
I suggest a Polyak average could be used here, which would make the code logic simpler and less computational, and can have a similar effect:

alpha = 0.001
self.polyak_reward = value * alpha + self.polyak_reward * (1 - alpha)
if self.polyak_reward > reward_thresh_to_increase_range:
    self._random_range += self._random_range * self._increase_range_by_percent
    self.polyak_reward = 0

alpha has a similar effect to 'max_reward_q_length' here.

Contributor:
I think using a queue is OK; it's easier to reason about the effect, plus the code isn't that much more complex if maxlen is used.

Contributor (Author):
Interesting, Jiangtao, I'll keep this in mind for the future. For this one, I'll just use success rate?

Contributor:
Yes, of course, just a simple suggestion :-)


def get_random_range(self):
return self._random_range

def run(self, agent, world):
"""
Start a teaching episode for this task.
@@ -86,6 +134,7 @@ def run(self, agent, world):
goal_dir = (goal_loc[0:2] - loc[0:2]) / dist
dot = sum(dir * goal_dir)
if dot > 0.707:
self._push_reward_queue(1)
# within 45 degrees of the agent direction
logging.debug("loc: " + str(loc) + " goal: " +
str(goal_loc) + "dist: " + str(dist))
@@ -96,17 +145,26 @@
else:
agent_sentence = yield TeacherAction()
elif dist > self._initial_dist + self._fail_distance_thresh:
self._push_reward_queue(0)
logging.debug("loc: " + str(loc) + " goal: " + str(goal_loc) +
"dist: " + str(dist))
yield TeacherAction(reward=-1.0, sentence="failed", done=True)
else:
agent_sentence = yield TeacherAction(sentence=self._goal_name)
logging.debug("loc: " + str(loc) + " goal: " + str(goal_loc) +
"dist: " + str(dist))
self._push_reward_queue(0)
yield TeacherAction(reward=-1.0, sentence="failed", done=True)

def _move_goal(self, goal, agent_loc):
range = self._random_range
if (self.should_use_curriculum_training() and
self._percent_full_range_in_curriculum > 0 and
random.random() < self._percent_full_range_in_curriculum):
range = self._orig_random_range
self._is_full_range_in_curriculum = True
else:
range = self._random_range
self._is_full_range_in_curriculum = False
while True:
loc = (random.random() * range - range / 2,
random.random() * range - range / 2, 0)
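To summarize how the new reward queue drives the curriculum in GoalTask, here is a minimal sketch mirroring the core check in _push_reward_queue, using the defaults from the diff; the episode outcomes are made-up numbers for illustration.

# Hypothetical recent episode outcomes (1 = success, 0 = failure).
recent_rewards = [1] * 45 + [0] * 55

max_reward_q_length = 100
reward_thresh_to_increase_range = 0.4
increase_range_by_percent = 50.
random_range = 2.0        # current curriculum range
orig_random_range = 10.0  # full range configured for the task

# The queue must be full and the summed reward must reach
# max_reward_q_length * reward_thresh_to_increase_range (40 here) before the range grows.
if (len(recent_rewards) == max_reward_q_length and
        sum(recent_rewards) >= max_reward_q_length * reward_thresh_to_increase_range):
    random_range *= 1. + increase_range_by_percent
    random_range = min(random_range, orig_random_range)
print(random_range)  # 10.0 -- capped at the original full range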