diff --git a/README.md b/README.md index 37787ca..7fadc5f 100644 --- a/README.md +++ b/README.md @@ -16,6 +16,14 @@ This README tells you how to set up the tutorials, decide which content is best > 3. If you are attending a live tutorial event, **please** follow the setup instructions provided in advance. It will take too long to do these instructions during the event. > 4. For troubleshooting help, see the [Troubleshooting, Tips, and Tricks](reference/Troubleshooting-Tips-Tricks.ipynb) notebook. +## Dependencies + +For a few of the examples here, you may need to add some additional dependencies: + + * `nodejs` + * `holoviews` + * `bokeh` + ## Join Us at Ray Summit 2020! Join us for the [_free_ Ray Summit 2020 virtual conference](https://events.linuxfoundation.org/ray-summit/?utm_source=dean&utm_medium=embed&utm_campaign=ray_summit&utm_content=anyscale_academy), September 30 - October 1, 2020. We have an amazing lineup of luminar keynote speakers and breakout sessions on the Ray ecosystem, third-party Ray libraries, and applications of Ray in the real world. diff --git a/TODO b/TODO new file mode 100644 index 0000000..37f7dd5 --- /dev/null +++ b/TODO @@ -0,0 +1,60 @@ +advanced-ray/01-Ray-Tasks-Revisited.ipynb: "from bokeh.layouts import gridplot\n", +advanced-ray/01-Ray-Tasks-Revisited.ipynb: "from bokeh.plotting import figure, output_file, show\n", +advanced-ray/01-Ray-Tasks-Revisited.ipynb: "import bokeh.io\n", +advanced-ray/01-Ray-Tasks-Revisited.ipynb: "bokeh.io.reset_output()\n", +advanced-ray/01-Ray-Tasks-Revisited.ipynb: "bokeh.io.output_notebook()\n", + +ray-crash-course/actor_lesson_util.py:import holoviews as hv +ray-crash-course/actor_lesson_util.py:from holoviews import dim, opts +ray-crash-course/actor_lesson_util.py:from holoviews.streams import Pipe, Buffer +ray-crash-course/actor_lesson_util.py:from holoviews.plotting.util import process_cmap +ray-crash-course/task_lesson_util.py:import holoviews as hv +ray-crash-course/task_lesson_util.py:from holoviews import opts +ray-crash-course/task_lesson_util.py:from holoviews.streams import Counter, Tap + +ray-crash-course/bokeh_util.py:from bokeh.layouts import gridplot +ray-crash-course/bokeh_util.py:from bokeh.plotting import figure, output_file, show +ray-crash-course/bokeh_util.py:import bokeh.io +ray-crash-course/bokeh_util.py:bokeh.io.reset_output() +ray-crash-course/bokeh_util.py:bokeh.io.output_notebook() +ray-crash-course/bokeh_util.py: from bokeh.plotting import show +ray-crash-course/solutions/Ray-Crash-Course-Solutions.ipynb: "from bokeh_util import two_lines_plot, means_stddevs_plot # Some plotting utilities in `./bokeh_util.py`.\n", +ray-crash-course/solutions/Ray-Crash-Course-Solutions.ipynb: "from bokeh.plotting import show, figure\n", +ray-crash-course/solutions/Ray-Crash-Course-Solutions.ipynb: "from bokeh.layouts import gridplot" + +ray-crash-course/actor_lesson_util.py:hv.extension('bokeh') +ray-crash-course/task_lesson_util.py:from bokeh_util import square_circle_plot, two_lines_plot, means_stddevs_plot +ray-crash-course/task_lesson_util.py:hv.extension('bokeh') +ray-crash-course/task_lesson_util.py:from bokeh.layouts import gridplot, layout +ray-crash-course/task_lesson_util.py:from bokeh.models import Slider, Button +ray-crash-course/task_lesson_util.py:from bokeh.plotting import figure, output_file, show +ray-crash-course/02-Ray-Actors.ipynb: "from bokeh_util import two_lines_plot # utility we used in the previous lesson\n", +ray-crash-course/02-Ray-Actors.ipynb: "from bokeh.plotting import show, figure\n", 
+ray-crash-course/02-Ray-Actors.ipynb: "from bokeh.layouts import gridplot" +ray-crash-course/01-Ray-Tasks.ipynb: "from bokeh_util import two_lines_plot, means_stddevs_plot # Some plotting utilities in `./bokeh_util.py`.\n", +ray-crash-course/01-Ray-Tasks.ipynb: "from bokeh.plotting import show, figure\n", +ray-crash-course/01-Ray-Tasks.ipynb: "from bokeh.layouts import gridplot" + +ray-project/requirements.txt:bokeh==2.1.1 + +ray-rllib/explore-rllib/solutions/Explore-RLlib-Solutions.ipynb: "import bokeh.io\n", +ray-rllib/explore-rllib/solutions/Explore-RLlib-Solutions.ipynb: "bokeh.io.reset_output()\n", +ray-rllib/explore-rllib/solutions/Explore-RLlib-Solutions.ipynb: "bokeh.io.output_notebook()" +ray-rllib/explore-rllib/01-Application-Cart-Pole.ipynb: "import bokeh.io\n", +ray-rllib/explore-rllib/01-Application-Cart-Pole.ipynb: "bokeh.io.reset_output()\n", +ray-rllib/explore-rllib/01-Application-Cart-Pole.ipynb: "bokeh.io.output_notebook()" +ray-rllib/explore-rllib/02-Bipedal-Walker.ipynb: "import bokeh.io\n", +ray-rllib/explore-rllib/02-Bipedal-Walker.ipynb: "bokeh.io.reset_output()\n", +ray-rllib/explore-rllib/02-Bipedal-Walker.ipynb: "bokeh.io.output_notebook()" +ray-rllib/solutions/Ray-RLlib-Solutions.ipynb: "import bokeh.io\n", +ray-rllib/solutions/Ray-RLlib-Solutions.ipynb: "bokeh.io.reset_output()\n", +ray-rllib/solutions/Ray-RLlib-Solutions.ipynb: "bokeh.io.output_notebook()" + +ray-rllib/01-Introduction-to-Reinforcement-Learning.ipynb: "import bokeh.io\n", +ray-rllib/01-Introduction-to-Reinforcement-Learning.ipynb: "bokeh.io.reset_output()\n", +ray-rllib/01-Introduction-to-Reinforcement-Learning.ipynb: "bokeh.io.output_notebook()" + +util/line_plots.py:from bokeh.plotting import figure, show, output_file +util/line_plots.py:from bokeh.models import Band, ColumnDataSource, Range1d +util/line_plots.py:from bokeh.models.tools import HoverTool +util/line_plots.py:import bokeh.io \ No newline at end of file diff --git a/ray-crash-course/exer01.ipynb b/ray-crash-course/exer01.ipynb new file mode 100644 index 0000000..e2daf45 --- /dev/null +++ b/ray-crash-course/exer01.ipynb @@ -0,0 +1,244 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Exercise 1 - Simple Data Parallel Example\n", + "\n", + "**GOAL:** The goal of this exercise is to show how to run simple tasks in parallel.\n", + "\n", + "The Python script used in this exercise runs too slowly, although its computation is embarrassingly parallel. Use Ray to execute the functions in parallel to speed up the script." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Concept for this Exercise - Remote Functions\n", + "\n", + "The standard way to turn a Python function into a *remote function* is to add the `@ray.remote` decorator. Here is an example.\n", + "\n", + "```python\n", + "# A regular Python function.\n", + "def regular_function():\n", + " return 1\n", + "\n", + "# A Ray remote function.\n", + "@ray.remote\n", + "def remote_function():\n", + " return 1\n", + "```\n", + "\n", + "The differences are the following:\n", + "\n", + "1. **Invocation:** The regular version is called with `regular_function()`, whereas the remote version is called with `remote_function.remote()`.\n", + "2. **Return values:** `regular_function` immediately executes and returns `1`, whereas `remote_function` immediately returns an *object ref* (a future) and then creates a task that will be executed on a worker process. 
The result can be obtained with `ray.get`.\n", + " ```python\n", + " >>> regular_function()\n", + " 1\n", + " \n", + " >>> remote_function.remote()\n", + " ObjectID(1c80d6937802cd7786ad25e50caf2f023c95e350)\n", + " \n", + " >>> ray.get(remote_function.remote())\n", + " 1\n", + " ```\n", + "3. **Parallelism:** Invocations of `regular_function` happen **serially**, for example\n", + " ```python\n", + " # These happen serially.\n", + " for _ in range(4):\n", + " regular_function()\n", + " ```\n", + " whereas invocations of `remote_function` happen in **parallel**, for example\n", + " ```python\n", + " # These happen in parallel.\n", + " for _ in range(4):\n", + " remote_function.remote()\n", + " ```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This example needs to run four tasks concurrently.\n", + "To illustrate how these tasks run in parallel, we'll tell Ray explicitly that there are four CPUs.\n", + "\n", + "Usually this is not done.\n", + "By default, Ray does not schedule more tasks concurrently than there are CPUs, \n", + "which it determines using `psutil.cpu_count()`:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import psutil\n", + "\n", + "psutil.cpu_count()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First, let's start Ray using `ray.init` -- with `num_cpus` set explicitly.\n", + "This starts a number of processes. \n", + "The other argument `ignore_reinit_error=True` simply ignores errors if this cell is re-run multiple times." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import ray\n", + "import time\n", + "\n", + "ray.init(num_cpus=4, ignore_reinit_error=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**EXERCISE:** The function below runs too slowly. Turn it into a remote function using the `@ray.remote` decorator." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# This function is a proxy for a more interesting and computationally\n", + "# intensive function.\n", + "def slow_function(i):\n", + " time.sleep(1)\n", + " return i" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**EXERCISE:** The loop below takes too long. The four function calls could be executed in parallel.\n", + "Instead of four seconds, the total run time should be only about one second.\n", + "\n", + "Once `slow_function` has been made a remote function, execute these four tasks in parallel by calling `slow_function.remote()`.\n", + "Then obtain the results by calling `ray.get` on a list of the resulting object refs." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Sleep a little to improve the accuracy of the timing measurements below.\n", + "# We do this because workers may still be starting up in the background.\n", + "time.sleep(2.0)\n", + "start_time = time.time()\n", + "\n", + "results = [slow_function(i) for i in range(4)]\n", + " \n", + "end_time = time.time()\n", + "duration = end_time - start_time\n", + "\n", + "print('The results are {}. This took {} seconds. 
Run the next cell to see '\n", + " 'if the exercise was done correctly.'.format(results, duration))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**VERIFY:** Run some checks to verify that the changes you made to the code were correct. Some of the checks should fail when you initially run the cells. After completing the exercises, the checks should pass." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "assert results == [0, 1, 2, 3], 'Did you remember to call ray.get?'\n", + "assert duration < 1.1, ('The loop took {} seconds. This is too slow.'\n", + " .format(duration))\n", + "assert duration > 1, ('The loop took {} seconds. This is too fast.'\n", + " .format(duration))\n", + "\n", + "print('Success! The example took {} seconds.'.format(duration))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**EXERCISE:** Use the UI to view the task timeline and to verify that the four tasks were executed in parallel. You can do this as follows.\n", + "\n", + "1. Run the following cell to generate a JSON file containing the profiling data.\n", + "2. Download the timeline file by right clicking on `timeline01.json` in the navigator to the left and choosing **\"Download\"**.\n", + "3. Open [`chrome://tracing/`](chrome://tracing/) in the Chrome web browser, click on the **\"Load\"** button and load the downloaded JSON file.\n", + "\n", + "To navigate within the timeline, do the following.\n", + "- Move around by clicking and dragging.\n", + "- Zoom in and out by holding **alt** (or **option**) and scrolling.\n", + "\n", + "**NOTE:** The timeline visualization will only work in **Chrome**." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ray.timeline(filename=\"timeline01.json\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.4" + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + "number_sections": false, + "sideBar": true, + "skip_h1_title": false, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": { + "height": "calc(100% - 180px)", + "left": "10px", + "top": "150px", + "width": "382.391px" + }, + "toc_section_display": true, + "toc_window_display": true + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/ray-rllib/00-Ray-RLlib-Overview.ipynb b/ray-rllib/00-Ray-RLlib-Overview.ipynb index 4a54dd0..849713d 100644 --- a/ray-rllib/00-Ray-RLlib-Overview.ipynb +++ b/ray-rllib/00-Ray-RLlib-Overview.ipynb @@ -171,7 +171,6 @@ "## Getting Help\n", "\n", "* The [#tutorial channel](https://ray-distributed.slack.com/archives/C011ML23W5B) on the [Ray Slack](https://ray-distributed.slack.com). [Click here](https://forms.gle/9TSdDYUgxYs8SA9e8) to join.\n", - "* [Email](mailto:academy@anyscale.com)\n", "\n", "Find an issue? 
Please report it!\n", "\n", @@ -206,7 +205,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.7" + "version": "3.7.4" } }, "nbformat": 4, diff --git a/ray-rllib/01-Introduction-to-Reinforcement-Learning.ipynb b/ray-rllib/01-Introduction-to-Reinforcement-Learning.ipynb index a938ca9..7dfbe4f 100644 --- a/ray-rllib/01-Introduction-to-Reinforcement-Learning.ipynb +++ b/ray-rllib/01-Introduction-to-Reinforcement-Learning.ipynb @@ -267,8 +267,8 @@ }, "outputs": [], "source": [ - "env = gym.make('CartPole-v1')\n", - "print('Created env:', env)" + "env = gym.make(\"CartPole-v1\")\n", + "print(\"Created env:\", env)" ] }, { @@ -298,7 +298,7 @@ "outputs": [], "source": [ "state = env.reset()\n", - "print('The starting state is:', state)" + "print(\"The starting state is:\", state)" ] }, { @@ -396,7 +396,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Try rerunning the following cell a few times. How much do the answers change? Note that the maximum possible reward for `CartPole` is 500. You'll probably get numbers under 200." + "Try rerunning the following cell a few times. How much do the answers change? Note that the maximum possible reward for `CartPole-v1` is 500. You'll probably get numbers well under 500." ] }, { @@ -415,6 +415,7 @@ "source": [ "reward = random_rollout(env)\n", "print(reward)\n", + "\n", "reward = random_rollout(env)\n", "print(reward)" ] @@ -527,7 +528,6 @@ }, "outputs": [], "source": [ - "# import gym # imported above already, but listed here for completeness\n", "import ray\n", "from ray.rllib.agents.ppo import PPOTrainer, DEFAULT_CONFIG\n", "from ray.tune.logger import pretty_print" @@ -550,7 +550,7 @@ }, "outputs": [], "source": [ - "ray.init(ignore_reinit_error=True, log_to_driver=False)" + "info = ray.init(ignore_reinit_error=True, log_to_driver=False)" ] }, { @@ -575,7 +575,7 @@ "metadata": {}, "outputs": [], "source": [ - "print(f'Dashboard URL: http://{ray.get_webui_url()}')" + "print(\"Dashboard URL: http://{}\".format(info[\"webui_url\"]))" ] }, { @@ -641,20 +641,24 @@ "metadata": {}, "outputs": [], "source": [ - "N=10\n", + "N = 10\n", "results = []\n", "episode_data = []\n", "episode_json = []\n", + "\n", "for n in range(N):\n", " result = agent.train()\n", " results.append(result)\n", + " \n", " episode = {'n': n, \n", " 'episode_reward_min': result['episode_reward_min'], \n", " 'episode_reward_mean': result['episode_reward_mean'], \n", " 'episode_reward_max': result['episode_reward_max'], \n", - " 'episode_len_mean': result['episode_len_mean']} \n", + " 'episode_len_mean': result['episode_len_mean']} \n", + " \n", " episode_data.append(episode)\n", " episode_json.append(json.dumps(episode))\n", + " \n", " print(f'{n:3d}: Min/Mean/Max reward: {result[\"episode_reward_min\"]:8.4f}/{result[\"episode_reward_mean\"]:8.4f}/{result[\"episode_reward_max\"]:8.4f}')" ] }, @@ -688,37 +692,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Let's plot the data." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import sys\n", - "sys.path.append(\"..\")\n", - "from util.line_plots import plot_line, plot_line_with_min_max, plot_line_with_stddev" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import bokeh.io\n", - "# The next two lines prevent Bokeh from opening the graph in a new window.\n", - "bokeh.io.reset_output()\n", - "bokeh.io.output_notebook()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Since the length and reward means are equal, we'll only plot one line:" + "Let's plot the data. Since the length and reward means are equal, we'll only plot one line:" ] }, { @@ -727,17 +701,14 @@ "metadata": {}, "outputs": [], "source": [ - "plot_line_with_min_max(df, x_col='n', y_col='episode_reward_mean', min_col='episode_reward_min', max_col='episode_reward_max',\n", - " title='Episode Rewards', x_axis_label='n', y_axis_label='reward')" + "df.plot(x=\"n\", y=[\"episode_reward_mean\", \"episode_reward_min\", \"episode_reward_max\"], secondary_y=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "([image](../images/rllib/Cart-Pole-Episode-Rewards.png))\n", - "\n", - "The model is quickly able to hit the maximum value of 500, but the mean is what's most valueable. After 10 steps, we're more than half way there." + "The model is quickly able to hit the maximum value of 500, but the mean is what's most valuable. After 10 steps, we're more than half way there." ] }, { @@ -798,15 +769,14 @@ "source": [ "episode_rewards = results[-1]['hist_stats']['episode_reward']\n", "df_episode_rewards = pd.DataFrame(data={'episode':range(len(episode_rewards)), 'reward':episode_rewards})\n", - "plot_line(df_episode_rewards, x_col='episode', y_col='reward', title='Episode Rewards', x_axis_label='episode', y_axis_label='reward')" + "\n", + "df_episode_rewards.plot(x=\"episode\", y=\"reward\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "([image](../images/rllib/Cart-Pole-Episode-Rewards2.png))\n", - "\n", "For a well-trained model, most runs do very well while occasional runs do poorly. Try plotting other results episodes by changing the array index in `results[-1]` to another number between `0` and `9`. (The length of `results` is `10`.)" ] }, @@ -850,7 +820,7 @@ "id": "64FmVP7kNlh_" }, "source": [ - "Train the agent and try to get a reward of 200. If it's training too slowly you may need to modify the config above to use fewer hidden units, a larger `sgd_minibatch_size`, a smaller `num_sgd_iter`, or a larger `num_workers`.\n", + "Train the agent and try to get a reward of 500. If it's training too slowly you may need to modify the config above to use fewer hidden units, a larger `sgd_minibatch_size`, a smaller `num_sgd_iter`, or a larger `num_workers`.\n", "\n", "This should take around `N` = 20 or 30 training iterations." 
] @@ -866,19 +836,23 @@ }, "outputs": [], "source": [ - "N=5\n", + "N = 5\n", "results = []\n", "episode_data = []\n", "episode_json = []\n", + "\n", "for n in range(N):\n", " result = agent.train()\n", " results.append(result)\n", + " \n", " episode = {'n': n, \n", " 'episode_reward_mean': result['episode_reward_mean'], \n", " 'episode_reward_max': result['episode_reward_max'], \n", - " 'episode_len_mean': result['episode_len_mean']} \n", + " 'episode_len_mean': result['episode_len_mean']} \n", + " \n", " episode_data.append(episode)\n", " episode_json.append(json.dumps(episode))\n", + " \n", " print(f'Max reward: {episode[\"episode_reward_max\"]}')" ] }, @@ -929,7 +903,7 @@ "outputs": [], "source": [ "trained_config = config.copy()\n", - "test_agent = PPOTrainer(trained_config, 'CartPole-v1')\n", + "test_agent = PPOTrainer(trained_config, \"CartPole-v1\")\n", "test_agent.restore(checkpoint_path)" ] }, @@ -942,7 +916,7 @@ "source": [ "Use the previously-trained policy to act in an environment. The key line is the call to `test_agent.compute_action(state)` which uses the trained policy to choose an action. This is an example of _rollout_, which we'll study in a subsequent lesson.\n", "\n", - "Verify that the cumulative reward received roughly matches up with the reward printed above. It will be at or near 200." + "Verify that the cumulative reward received roughly matches up with the reward printed above. It will be at or near 500." ] }, { @@ -955,7 +929,7 @@ }, "outputs": [], "source": [ - "env = gym.make('CartPole-v1')\n", + "env = gym.make(\"CartPole-v1\")\n", "state = env.reset()\n", "done = False\n", "cumulative_reward = 0\n", @@ -974,7 +948,7 @@ "metadata": {}, "outputs": [], "source": [ - "ray.shutdown() # \"Undo ray.init()\"." + "ray.shutdown()" ] }, { @@ -1006,7 +980,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.7" + "version": "3.7.4" } }, "nbformat": 4, diff --git a/ray-rllib/References-Reinforcement-Learning.ipynb b/ray-rllib/References-Reinforcement-Learning.ipynb index c2f7cbe..e914a92 100644 --- a/ray-rllib/References-Reinforcement-Learning.ipynb +++ b/ray-rllib/References-Reinforcement-Learning.ipynb @@ -36,6 +36,7 @@ "\n", "Several blog posts and series provide concise introductions to RL:\n", "\n", + "* [Intro to RLlib: Example Environments](https://medium.com/distributed-computing-with-ray/intro-to-rllib-example-environments-3a113f532c70).\n", "* [Anatomy of a custom environment for RLlib](https://medium.com/distributed-computing-with-ray/anatomy-of-a-custom-environment-for-rllib-327157f269e5).\n", "* [A Reinforcement Learning Cheat Sheet](https://towardsdatascience.com/reinforcement-learning-cheat-sheet-2f9453df7651).\n", "* [Reinforcement Learning Explained](https://www.oreilly.com/radar/reinforcement-learning-explained/), Junling Hu, 2016. A gentle introduction to the ideas of RL.\n", @@ -161,7 +162,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.7" + "version": "3.7.4" } }, "nbformat": 4, diff --git a/ray-rllib/explore-rllib/00-Explore-RLlib-Overview.ipynb b/ray-rllib/explore-rllib/00-Explore-RLlib-Overview.ipynb index ffbe36b..0961e9d 100644 --- a/ray-rllib/explore-rllib/00-Explore-RLlib-Overview.ipynb +++ b/ray-rllib/explore-rllib/00-Explore-RLlib-Overview.ipynb @@ -47,19 +47,11 @@ "## Getting Help\n", "\n", "* The [#tutorial channel](https://ray-distributed.slack.com/archives/C011ML23W5B) on the [Ray Slack](https://ray-distributed.slack.com). 
[Click here](https://forms.gle/9TSdDYUgxYs8SA9e8) to join.\n", - "* [Email](mailto:academy@anyscale.com)\n", "\n", "Find an issue? Please report it!\n", "\n", "* [GitHub issues](https://github.com/anyscale/academy/issues)" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { @@ -78,7 +70,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.6" + "version": "3.7.4" } }, "nbformat": 4, diff --git a/ray-rllib/explore-rllib/01-Application-Cart-Pole.ipynb b/ray-rllib/explore-rllib/01-Application-Cart-Pole.ipynb index 206d02b..64162b7 100644 --- a/ray-rllib/explore-rllib/01-Application-Cart-Pole.ipynb +++ b/ray-rllib/explore-rllib/01-Application-Cart-Pole.ipynb @@ -50,17 +50,10 @@ "outputs": [], "source": [ "import pandas as pd\n", - "import json, os, shutil, sys" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sys.path.append('../..') # so we can import from \"util\"\n", - "from util.line_plots import plot_line, plot_line_with_min_max, plot_line_with_stddev" + "import json\n", + "import os\n", + "import shutil\n", + "import sys" ] }, { @@ -79,7 +72,7 @@ "metadata": {}, "outputs": [], "source": [ - "checkpoint_root = 'tmp/ppo/cart'" + "checkpoint_root = \"tmp/ppo/cart\"" ] }, { @@ -116,7 +109,7 @@ "metadata": {}, "outputs": [], "source": [ - "ray.init(ignore_reinit_error=True)" + "info = ray.init(ignore_reinit_error=True)" ] }, { @@ -132,7 +125,7 @@ "metadata": {}, "outputs": [], "source": [ - "print(f'Dashboard URL: http://{ray.get_webui_url()}')" + "print(\"Dashboard URL: http://{}\".format(info[\"webui_url\"]))" ] }, { @@ -166,13 +159,14 @@ "\n", "config = ppo.DEFAULT_CONFIG.copy() # PPO's default configuration. See the next code cell.\n", "config[\"log_level\"] = \"WARN\" # Suppress too many messages, but try \"INFO\" to see what can be printed.\n", + "\n", "# Other settings we might adjust:\n", - "config['num_workers'] = 1 # Use > 1 for using more CPU cores, including over a cluster\n", - "config['num_sgd_iter'] = 10 # Number of SGD (stochastic gradient descent) iterations per training minibatch.\n", + "config[\"num_workers\"] = 1 # Use > 1 for using more CPU cores, including over a cluster\n", + "config[\"num_sgd_iter\"] = 10 # Number of SGD (stochastic gradient descent) iterations per training minibatch.\n", " # I.e., for each minibatch of data, do this many passes over it to train. 
\n", - "config['sgd_minibatch_size'] = 250 # The amount of data records per minibatch\n", - "config['model']['fcnet_hiddens'] = [100, 50] #\n", - "config['num_cpus_per_worker'] = 0 # This avoids running out of resources in the notebook environment when this cell is re-executed" + "config[\"sgd_minibatch_size\"] = 250 # The amount of data records per minibatch\n", + "config[\"model\"][\"fcnet_hiddens\"] = [100, 50] #\n", + "config[\"num_cpus_per_worker\"] = 0 # This avoids running out of resources in the notebook environment when this cell is re-executed" ] }, { @@ -204,17 +198,21 @@ "results = []\n", "episode_data = []\n", "episode_json = []\n", + "\n", "for n in range(N_ITER):\n", " result = agent.train()\n", " results.append(result)\n", + " \n", " episode = {'n': n, \n", " 'episode_reward_min': result['episode_reward_min'], \n", " 'episode_reward_mean': result['episode_reward_mean'], \n", " 'episode_reward_max': result['episode_reward_max'], \n", " 'episode_len_mean': result['episode_len_mean']}\n", + " \n", " episode_data.append(episode)\n", " episode_json.append(json.dumps(episode))\n", " file_name = agent.save(checkpoint_root)\n", + " \n", " print(f'{n:3d}: Min/Mean/Max reward: {result[\"episode_reward_min\"]:8.4f}/{result[\"episode_reward_mean\"]:8.4f}/{result[\"episode_reward_max\"]:8.4f}. Checkpoint saved to {file_name}')" ] }, @@ -244,20 +242,7 @@ "metadata": {}, "outputs": [], "source": [ - "import bokeh.io\n", - "# The next two lines prevent Bokeh from opening the graph in a new window.\n", - "bokeh.io.reset_output()\n", - "bokeh.io.output_notebook()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "plot_line_with_min_max(df, x_col='n', y_col='episode_reward_mean', min_col='episode_reward_min', max_col='episode_reward_max',\n", - " title='Cart Pole Episode Rewards', x_axis_label = 'n', y_axis_label='reward')" + "df.plot(x=\"n\", y=[\"episode_reward_mean\", \"episode_reward_min\", \"episode_reward_max\"], secondary_y=True)" ] }, { @@ -267,8 +252,6 @@ "id": "Gp1LgeCJjGLk" }, "source": [ - "([image](../../images/rllib/Cart-Pole-Episode-Rewards3.png))\n", - "\n", "Also, print out the policy and model to see the results of training in detail…" ] }, @@ -344,7 +327,7 @@ "source": [ "from IPython.display import Video\n", "\n", - "cart_pole_sample_video='../../images/rllib/Cart-Pole-Example-Video.mp4'\n", + "cart_pole_sample_video = \"../../images/rllib/Cart-Pole-Example-Video.mp4\"\n", "Video(cart_pole_sample_video)" ] }, @@ -390,7 +373,7 @@ "metadata": {}, "outputs": [], "source": [ - "ray.shutdown() # \"Undo ray.init()\"." 
+ "ray.shutdown()" ] } ], @@ -415,7 +398,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.7" + "version": "3.7.4" } }, "nbformat": 4, diff --git a/ray-rllib/explore-rllib/02-Bipedal-Walker.ipynb b/ray-rllib/explore-rllib/02-Bipedal-Walker.ipynb index e163f5e..2c830b8 100644 --- a/ray-rllib/explore-rllib/02-Bipedal-Walker.ipynb +++ b/ray-rllib/explore-rllib/02-Bipedal-Walker.ipynb @@ -48,17 +48,10 @@ "outputs": [], "source": [ "import pandas as pd\n", - "import json, os, shutil, sys" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sys.path.append('../..') # so we can import from \"util\"\n", - "from util.line_plots import plot_line, plot_line_with_min_max, plot_line_with_stddev" + "import json\n", + "import os\n", + "import shutil\n", + "import sys" ] }, { @@ -77,7 +70,7 @@ "metadata": {}, "outputs": [], "source": [ - "checkpoint_root = 'tmp/ppo/bipedal-walker'" + "checkpoint_root = \"tmp/ppo/bipedal-walker\"" ] }, { @@ -94,11 +87,11 @@ "outputs": [], "source": [ "# Where checkpoints are written:\n", - "#shutil.rmtree(checkpoint_root, ignore_errors=True, onerror=None)\n", + "shutil.rmtree(checkpoint_root, ignore_errors=True, onerror=None)\n", "\n", "# Where some data will be written and used by Tensorboard below:\n", "ray_results = f'{os.getenv(\"HOME\")}/ray_results/'\n", - "#shutil.rmtree(ray_results, ignore_errors=True, onerror=None)" + "shutil.rmtree(ray_results, ignore_errors=True, onerror=None)" ] }, { @@ -114,7 +107,7 @@ "metadata": {}, "outputs": [], "source": [ - "ray.init(ignore_reinit_error=True)" + "info = ray.init(ignore_reinit_error=True)" ] }, { @@ -130,7 +123,7 @@ "metadata": {}, "outputs": [], "source": [ - "print(f'Dashboard URL: http://{ray.get_webui_url()}')" + "print(\"Dashboard URL: http://{}\".format(info[\"webui_url\"]))" ] }, { @@ -154,14 +147,15 @@ "\n", "config = ppo.DEFAULT_CONFIG.copy() # PPO's default configuration. See the next code cell.\n", "config[\"log_level\"] = \"WARN\" # Suppress too many messages, but try \"INFO\" to see what can be printed.\n", - "config['framework'] = \"tf\" # TensorFlow\n", + "config[\"framework\"] = \"tf\" # TensorFlow\n", + "\n", "# Other settings we might adjust:\n", - "config['num_workers'] = 4 # Use > 1 for using more CPU cores, including over a cluster\n", - "config['num_sgd_iter'] = 50 # Number of SGD (stochastic gradient descent) iterations per training minibatch.\n", + "config[\"num_workers\"] = 4 # Use > 1 for using more CPU cores, including over a cluster\n", + "config[\"num_sgd_iter\"] = 50 # Number of SGD (stochastic gradient descent) iterations per training minibatch.\n", " # I.e., for each minibatch of data, do this many passes over it to train. 
\n", - "config['sgd_minibatch_size'] = 250 # The amount of data records per minibatch\n", - "config['model']['fcnet_hiddens'] = [512, 512] # Larger network than we used for CartPole.\n", - "config['num_cpus_per_worker'] = 0 # This avoids running out of resources in the notebook environment when this cell is re-executed" + "config[\"sgd_minibatch_size\"] = 250 # The amount of data records per minibatch\n", + "config[\"model\"][\"fcnet_hiddens\"] = [512, 512] # Larger network than we used for CartPole.\n", + "config[\"num_cpus_per_worker\"] = 0 # This avoids running out of resources in the notebook environment when this cell is re-executed" ] }, { @@ -222,7 +216,7 @@ }, "outputs": [], "source": [ - "agent.restore('bipedal-walker-checkpoint/checkpoint-100')" + "agent.restore(\"bipedal-walker-checkpoint/checkpoint-100\")" ] }, { @@ -245,17 +239,21 @@ "results = []\n", "episode_data = []\n", "episode_json = []\n", + "\n", "for n in range(N_ITER):\n", " result = agent.train()\n", " results.append(result)\n", + " \n", " episode = {'n': n, \n", " 'episode_reward_min': result['episode_reward_min'], \n", " 'episode_reward_mean': result['episode_reward_mean'], \n", " 'episode_reward_max': result['episode_reward_max'], \n", " 'episode_len_mean': result['episode_len_mean']}\n", + " \n", " episode_data.append(episode)\n", " episode_json.append(json.dumps(episode))\n", " file_name = agent.save(checkpoint_root)\n", + " \n", " print(f'{n:3d}: Min/Mean/Max reward: {result[\"episode_reward_min\"]:8.4f}/{result[\"episode_reward_mean\"]:8.4f}/{result[\"episode_reward_max\"]:8.4f}. Checkpoint saved to {file_name}')" ] }, @@ -281,18 +279,6 @@ "df" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import bokeh.io\n", - "# The next two lines prevent Bokeh from opening the graph in a new window.\n", - "bokeh.io.reset_output()\n", - "bokeh.io.output_notebook()" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -306,8 +292,7 @@ "metadata": {}, "outputs": [], "source": [ - "plot_line_with_min_max(df, x_col='n', y_col='episode_reward_mean', min_col='episode_reward_min', max_col='episode_reward_max',\n", - " title='Bipel Walker Episode Rewards', x_axis_label = 'n', y_axis_label='reward')" + "df.plot(x=\"n\", y=[\"episode_reward_mean\", \"episode_reward_min\", \"episode_reward_max\"], secondary_y=True)" ] }, { @@ -317,8 +302,6 @@ "id": "Gp1LgeCJjGLk" }, "source": [ - "([image](../../images/rllib/Bipedal-Walker-Rewards-120.png))\n", - "\n", "Compare with these images after 50 and 100 iterations. Note the sign of the `reward` in all graphs!\n", "\n", "After 100 iterations, starting from a checkpoint at 50 (so 50 _new_ iterations):\n", @@ -412,7 +395,7 @@ "source": [ "from IPython.display import Video\n", "\n", - "sample_video='../../images/rllib/Bipedal-Walker-Example-100.mp4'\n", + "sample_video =\"../../images/rllib/Bipedal-Walker-Example-100.mp4\"\n", "Video(sample_video, embed=True)" ] }, @@ -432,7 +415,7 @@ "metadata": {}, "outputs": [], "source": [ - "ray.shutdown() # \"Undo ray.init()\"." 
+ "ray.shutdown()" ] }, { @@ -477,7 +460,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.7" + "version": "3.7.4" } }, "nbformat": 4, diff --git a/ray-rllib/explore-rllib/03-Custom-Environments-Reward-Shaping.ipynb b/ray-rllib/explore-rllib/03-Custom-Environments-Reward-Shaping.ipynb index 0867fcb..02c01cd 100644 --- a/ray-rllib/explore-rllib/03-Custom-Environments-Reward-Shaping.ipynb +++ b/ray-rllib/explore-rllib/03-Custom-Environments-Reward-Shaping.ipynb @@ -28,7 +28,10 @@ "source": [ "import numpy as np\n", "import pandas as pd\n", - "import json, os, shutil, sys\n", + "import json\n", + "import os\n", + "import shutil\n", + "import sys\n", "import gym\n", "\n", "import ray\n", @@ -41,8 +44,7 @@ "metadata": {}, "outputs": [], "source": [ - "sys.path.append('../..') # so we can import from \"util\"\n", - "from util.line_plots import plot_line, plot_line_with_min_max, plot_line_with_stddev" + "info = ray.init(ignore_reinit_error=True)" ] }, { @@ -51,16 +53,7 @@ "metadata": {}, "outputs": [], "source": [ - "ray.init(ignore_reinit_error=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(f'Dashboard URL: http://{ray.get_webui_url()}')" + "print(\"Dashboard URL: http://{}\".format(info[\"webui_url\"]))" ] }, { @@ -158,11 +151,19 @@ "outputs": [], "source": [ "counts = {key:0 for key in range(10)}\n", - "counts\n", - "\n", + "counts" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ "for i in range(200):\n", " key = spaces.Discrete(10).sample()\n", " counts[key] = counts[key] + 1\n", + "\n", "counts" ] }, @@ -349,10 +350,13 @@ "def do_training(chainEnvClass, config = trainer_config, iterations=20):\n", " trainer = PPOTrainer(config, chainEnvClass)\n", " print(f'Training iterations: ', end='')\n", + " \n", " for i in range(iterations):\n", " print('.', end='')\n", " trainer.train()\n", + " \n", " print('')\n", + " \n", " return trainer" ] }, @@ -485,6 +489,7 @@ "done = False\n", "max_state = -1\n", "cumulative_reward = 0\n", + "\n", "while not done:\n", " action = trainer.compute_action(state)\n", " state, reward, done, results = env.step(action)\n", @@ -493,8 +498,10 @@ "\n", "print(f'Cumulative reward you received is: {cumulative_reward}!')\n", "print(f'Max state you visited is: {max_state}. (There are {env.n} states.)')\n", + "\n", "desired = env.done_percentage\n", "actual = (max_state+1)/env.n # add one because of zero indexing\n", + "\n", "print(f\"This policy traversed {actual*100:4.1f}% of the available states.\")\n", "assert actual > desired, f\"{actual*100:4.1f}% is less than the desired percentage of {desired*100:4.1f}%.\"" ] @@ -505,15 +512,8 @@ "metadata": {}, "outputs": [], "source": [ - "ray.shutdown() # \"Undo ray.init()\"." 
+ "ray.shutdown()" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { @@ -532,7 +532,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.7" + "version": "3.7.4" } }, "nbformat": 4, diff --git a/ray-rllib/explore-rllib/extras/Extra-Application-Frozen-Lake.ipynb b/ray-rllib/explore-rllib/extras/Extra-Application-Frozen-Lake.ipynb index c153949..2f1ca4e 100644 --- a/ray-rllib/explore-rllib/extras/Extra-Application-Frozen-Lake.ipynb +++ b/ray-rllib/explore-rllib/extras/Extra-Application-Frozen-Lake.ipynb @@ -25,7 +25,10 @@ "outputs": [], "source": [ "import pandas as pd\n", - "import json, os, shutil, sys\n", + "import json\n", + "import os\n", + "import shutil\n", + "import sys\n", "import ray\n", "import ray.rllib.agents.ppo as ppo" ] @@ -36,7 +39,7 @@ "metadata": {}, "outputs": [], "source": [ - "ray.init(ignore_reinit_error=True)" + "info = ray.init(ignore_reinit_error=True)" ] }, { @@ -45,7 +48,7 @@ "metadata": {}, "outputs": [], "source": [ - "print(f'Dashboard URL: http://{ray.get_webui_url()}')" + "print(\"Dashboard URL: http://{}\".format(info[\"webui_url\"]))" ] }, { @@ -61,8 +64,8 @@ "metadata": {}, "outputs": [], "source": [ - "checkpoint_root = 'tmp/ppo/frozen-lake'\n", - "shutil.rmtree(checkpoint_root, ignore_errors=True, onerror=None) # clean up old runs" + "checkpoint_root = \"tmp/ppo/frozen-lake\"\n", + "shutil.rmtree(checkpoint_root, ignore_errors=True, onerror=None)" ] }, { @@ -107,16 +110,19 @@ "for n in range(N_ITER):\n", " result = agent.train()\n", " results.append(result)\n", + " \n", " episode = {'n': n, \n", " 'episode_reward_min': result['episode_reward_min'], \n", " 'episode_reward_mean': result['episode_reward_mean'], \n", " 'episode_reward_max': result['episode_reward_max'], \n", - " 'episode_len_mean': result['episode_len_mean']}\n", + " 'episode_len_mean': result['episode_len_mean']\n", + " }\n", + " \n", " episode_data.append(episode)\n", " episode_json.append(json.dumps(episode))\n", " file_name = agent.save(checkpoint_root)\n", - " print(f'{n+1:3d}: Min/Mean/Max reward: {result[\"episode_reward_min\"]:8.4f}/{result[\"episode_reward_mean\"]:8.4f}/{result[\"episode_reward_max\"]:8.4f}, len mean: {result[\"episode_len_mean\"]:8.4f}. Checkpoint saved to {file_name}')\n", - "reward_history = []" + " \n", + " print(f'{n+1:3d}: Min/Mean/Max reward: {result[\"episode_reward_min\"]:8.4f}/{result[\"episode_reward_mean\"]:8.4f}/{result[\"episode_reward_max\"]:8.4f}, len mean: {result[\"episode_len_mean\"]:8.4f}. Checkpoint saved to {file_name}')" ] }, { @@ -142,7 +148,7 @@ "metadata": {}, "outputs": [], "source": [ - "ray.shutdown() # \"Undo ray.init()\"." + "ray.shutdown()" ] }, { @@ -237,7 +243,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.7" + "version": "3.7.4" } }, "nbformat": 4, diff --git a/ray-rllib/explore-rllib/extras/Extra-Application-Mountain-Car.ipynb b/ray-rllib/explore-rllib/extras/Extra-Application-Mountain-Car.ipynb index c73640c..7ae7566 100644 --- a/ray-rllib/explore-rllib/extras/Extra-Application-Mountain-Car.ipynb +++ b/ray-rllib/explore-rllib/extras/Extra-Application-Mountain-Car.ipynb @@ -10,7 +10,7 @@ "\n", "![Anyscale Academy](../../../images/AnyscaleAcademyLogo.png)\n", "\n", - "This example uses [RLlib](https://ray.readthedocs.io/en/latest/rllib.html) to train a policy with the `MountainCar-v0` environment, ([gym.openai.com/envs/MountainCar-v0/]. 
The idea is that a cart starts at an arbitrar point on a hill. Without any \"pushes\", it will rock back and forth between the two sides of the valley below, never rising above the starting point. However, there are three actions, accelerate to the left (by some unit), accelerate to the right, or apply no acceleration. Timing accelerations in the appropriate directions at the appropriate steps is the key to getting to the top of the hill.\n", + "This example uses [RLlib](https://ray.readthedocs.io/en/latest/rllib.html) to train a policy with the `MountainCar-v0` environment ([gym.openai.com/envs/MountainCar-v0/](https://gym.openai.com/envs/MountainCar-v0/)). The idea is that a cart starts at an arbitrary point on a hill. Without any \"pushes\", it will rock back and forth between the two sides of the valley below, never rising above the starting point. However, there are three actions: accelerate to the left (by some unit), accelerate to the right, or apply no acceleration. Timing accelerations in the appropriate directions at the appropriate steps is the key to getting to the top of the hill.\n", "\n", "The primary idea demonstrated in this lesson is how to start from a previous checkpoint. A checkpoint is provided in the `mountain-car-checkpoint` directory, captured after 200 training episodes. Still, with the provided checkpoint and additional training of 50 episodes, the cart is unable to reach the top.\n", "\n", @@ -40,7 +40,10 @@ "outputs": [], "source": [ "import pandas as pd\n", - "import json, os, shutil, sys\n", + "import json\n", + "import os\n", + "import shutil\n", + "import sys\n", "import ray\n", "import ray.rllib.agents.ppo as ppo" ] }, @@ -51,7 +54,7 @@ "metadata": {}, "outputs": [], "source": [ - "ray.init(ignore_reinit_error=True)" + "info = ray.init(ignore_reinit_error=True)" ] }, { @@ -67,7 +70,7 @@ "metadata": {}, "outputs": [], "source": [ - "print(f'Dashboard URL: http://{ray.get_webui_url()}')" + "print(\"Dashboard URL: http://{}\".format(info[\"webui_url\"]))" ] }, { @@ -121,7 +124,7 @@ "metadata": {}, "outputs": [], "source": [ - "checkpoint_root = 'tmp/ppo/mountain-car'\n", + "checkpoint_root = \"tmp/ppo/mountain-car\"\n", "shutil.rmtree(checkpoint_root, ignore_errors=True, onerror=None) # clean up old runs" ] }, @@ -191,7 +194,7 @@ "metadata": {}, "outputs": [], "source": [ - "agent.restore('mountain-car-checkpoint/checkpoint-20')" + "agent.restore(\"mountain-car-checkpoint/checkpoint-20\")" ] }, { @@ -209,14 +212,18 @@ "for n in range(N_ITER):\n", " result = agent.train()\n", " results.append(result)\n", + " \n", " episode = {'n': n, \n", " 'episode_reward_min': result['episode_reward_min'], \n", " 'episode_reward_mean': result['episode_reward_mean'], \n", " 'episode_reward_max': result['episode_reward_max'], \n", - " 'episode_len_mean': result['episode_len_mean']}\n", + " 'episode_len_mean': result['episode_len_mean']\n", + " }\n", + " \n", " episode_data.append(episode)\n", " episode_json.append(json.dumps(episode))\n", " file_name = agent.save(checkpoint_root)\n", + " \n", " print(f'{n+1:3d}: Min/Mean/Max reward: {result[\"episode_reward_min\"]:8.4f}/{result[\"episode_reward_mean\"]:8.4f}/{result[\"episode_reward_max\"]:8.4f}, len mean: {result[\"episode_len_mean\"]:8.4f}. Checkpoint saved to {file_name}')" ] }, @@ -255,7 +262,7 @@ "metadata": {}, "outputs": [], "source": [ - "ray.shutdown() # \"Undo ray.init()\"." 
+ "ray.shutdown()" ] }, { @@ -325,7 +332,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.7" + "version": "3.7.4" } }, "nbformat": 4, diff --git a/ray-rllib/explore-rllib/extras/Extra-Application-Taxi.ipynb b/ray-rllib/explore-rllib/extras/Extra-Application-Taxi.ipynb index 6a996f5..4b99347 100644 --- a/ray-rllib/explore-rllib/extras/Extra-Application-Taxi.ipynb +++ b/ray-rllib/explore-rllib/extras/Extra-Application-Taxi.ipynb @@ -25,7 +25,10 @@ "outputs": [], "source": [ "import pandas as pd\n", - "import json, os, shutil, sys\n", + "import json\n", + "import os\n", + "import shutil\n", + "import sys\n", "import ray\n", "import ray.rllib.agents.ppo as ppo" ] @@ -36,7 +39,7 @@ "metadata": {}, "outputs": [], "source": [ - "ray.init(ignore_reinit_error=True)" + "info = ray.init(ignore_reinit_error=True)" ] }, { @@ -45,7 +48,7 @@ "metadata": {}, "outputs": [], "source": [ - "print(f'Dashboard URL: http://{ray.get_webui_url()}')" + "print(\"Dashboard URL: http://{}\".format(info[\"webui_url\"]))" ] }, { @@ -61,7 +64,7 @@ "metadata": {}, "outputs": [], "source": [ - "checkpoint_root = 'tmp/ppo/taxi'\n", + "checkpoint_root = \"tmp/ppo/taxi\"\n", "shutil.rmtree(checkpoint_root, ignore_errors=True, onerror=None) # clean up old runs" ] }, @@ -107,16 +110,19 @@ "for n in range(N_ITER):\n", " result = agent.train()\n", " results.append(result)\n", + " \n", " episode = {'n': n, \n", " 'episode_reward_min': result['episode_reward_min'], \n", " 'episode_reward_mean': result['episode_reward_mean'], \n", " 'episode_reward_max': result['episode_reward_max'], \n", - " 'episode_len_mean': result['episode_len_mean']}\n", + " 'episode_len_mean': result['episode_len_mean']\n", + " }\n", + " \n", " episode_data.append(episode)\n", " episode_json.append(json.dumps(episode))\n", " file_name = agent.save(checkpoint_root)\n", - " print(f'{n+1:3d}: Min/Mean/Max reward: {result[\"episode_reward_min\"]:8.4f}/{result[\"episode_reward_mean\"]:8.4f}/{result[\"episode_reward_max\"]:8.4f}, len mean: {result[\"episode_len_mean\"]:8.4f}. Checkpoint saved to {file_name}')\n", - "reward_history = []" + " \n", + " print(f'{n+1:3d}: Min/Mean/Max reward: {result[\"episode_reward_min\"]:8.4f}/{result[\"episode_reward_mean\"]:8.4f}/{result[\"episode_reward_max\"]:8.4f}, len mean: {result[\"episode_len_mean\"]:8.4f}. 
Checkpoint saved to {file_name}')" ] }, { @@ -244,7 +250,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.7" + "version": "3.7.4" } }, "nbformat": 4, diff --git a/ray-rllib/explore-rllib/solutions/Explore-RLlib-Solutions.ipynb b/ray-rllib/explore-rllib/solutions/Explore-RLlib-Solutions.ipynb index ad425ad..bd012b1 100644 --- a/ray-rllib/explore-rllib/solutions/Explore-RLlib-Solutions.ipynb +++ b/ray-rllib/explore-rllib/solutions/Explore-RLlib-Solutions.ipynb @@ -24,7 +24,9 @@ "import gym\n", "import numpy as np\n", "import pandas as pd\n", - "import json, sys, os" + "import json\n", + "import sys\n", + "import os" ] }, { @@ -44,8 +46,8 @@ "metadata": {}, "outputs": [], "source": [ - "env = gym.make('CartPole-v1')\n", - "print('Created env:', env)" + "env = gym.make(\"CartPole-v1\")\n", + "print(\"Created env:\", env)" ] }, { @@ -78,8 +80,15 @@ " return 0 if state[0] < 0 else 1\n", "\n", "def sample_policy2(state):\n", - " return 1 if state[0] < 0 else 0\n", - "\n", + " return 1 if state[0] < 0 else 0" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ "reward1 = np.mean([rollout_policy(env, sample_policy1) for _ in range(100)])\n", "reward2 = np.mean([rollout_policy(env, sample_policy2) for _ in range(100)])\n", "\n", @@ -135,11 +144,11 @@ "outputs": [], "source": [ "config = DEFAULT_CONFIG.copy()\n", - "config['num_workers'] = 3\n", - "config['num_sgd_iter'] = 10 # was 30\n", - "config['sgd_minibatch_size'] = 256 # was 128\n", - "config['model']['fcnet_hiddens'] = [20, 20] # was [100, 100]\n", - "config['num_cpus_per_worker'] = 0" + "config[\"num_workers\"] = 3\n", + "config[\"num_sgd_iter\"] = 10 # was 30\n", + "config[\"sgd_minibatch_size\"] = 256 # was 128\n", + "config[\"model\"][\"fcnet_hiddens\"] = [20, 20] # was [100, 100]\n", + "config[\"num_cpus_per_worker\"] = 0" ] }, { @@ -148,7 +157,7 @@ "metadata": {}, "outputs": [], "source": [ - "agent = PPOTrainer(config, 'CartPole-v1')" + "agent = PPOTrainer(config, \"CartPole-v1\")" ] }, { @@ -157,20 +166,25 @@ "metadata": {}, "outputs": [], "source": [ - "N=20 # was 10\n", + "N = 20 # was 10\n", "results = []\n", "episode_data = []\n", "episode_json = []\n", + "\n", "for n in range(N):\n", " result = agent.train()\n", " results.append(result)\n", + " \n", " episode = {'n': n, \n", " 'episode_reward_min': result['episode_reward_min'], \n", " 'episode_reward_mean': result['episode_reward_mean'], \n", " 'episode_reward_max': result['episode_reward_max'], \n", - " 'episode_len_mean': result['episode_len_mean']} \n", + " 'episode_len_mean': result['episode_len_mean']\n", + " }\n", + " \n", " episode_data.append(episode)\n", " episode_json.append(json.dumps(episode))\n", + " \n", " print(f'{n:3d}: Min/Mean/Max reward: {result[\"episode_reward_min\"]:8.4f}/{result[\"episode_reward_mean\"]:8.4f}/{result[\"episode_reward_max\"]:8.4f}')" ] }, @@ -190,33 +204,14 @@ "metadata": {}, "outputs": [], "source": [ - "import sys\n", - "sys.path.append(\"../../..\")\n", - "from util.line_plots import plot_line, plot_line_with_min_max\n", - "\n", - "import bokeh.io\n", - "# The next two lines prevent Bokeh from opening the graph in a new window.\n", - "bokeh.io.reset_output()\n", - "bokeh.io.output_notebook()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "plot_line_with_min_max(df, x_col='n', y_col='episode_reward_mean', min_col='episode_reward_min', 
max_col='episode_reward_max',\n", - " title='Episode Rewards', x_axis_label='n', y_axis_label='reward')" + "df.plot(x=\"n\", y=[\"episode_reward_mean\", \"episode_reward_min\", \"episode_reward_max\"], secondary_y=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "([image](../../../images/rllib/Cart-Pole-Episode-Rewards-Exercise.png))\n", - "\n", - "Compare this graph with the graph in the lesson, where we used a stronger network:\n", + "Compare your graph with the graph in the lesson, where we used more computing resources:\n", "\n", "![](../../../images/rllib/Cart-Pole-Episode-Rewards.png)\n", "\n", @@ -392,11 +387,13 @@ "source": [ "def do_training(chainEnvClass, config = trainer_config, iterations=20):\n", " trainer = PPOTrainer(config, chainEnvClass)\n", - " print(f'Training iterations: ', end='')\n", + " print(\"Training iterations: \", end=\"\")\n", + " \n", " for i in range(iterations):\n", - " print('.', end='')\n", + " print(\".\", end=\"\")\n", " trainer.train()\n", - " print('')\n", + " \n", + " print(\"\")\n", " return trainer" ] }, @@ -468,6 +465,7 @@ "done = False\n", "max_state = -1\n", "cumulative_reward = 0\n", + "\n", "while not done:\n", " action = trainer.compute_action(state)\n", " state, reward, done, results = env.step(action)\n", @@ -476,8 +474,10 @@ "\n", "print(f'Cumulative reward you received is: {cumulative_reward}!')\n", "print(f'Max state you visited is: {max_state}. (There are {env.n} states.)')\n", + "\n", "desired = env.done_percentage\n", "actual = (max_state+1)/env.n # add one because of zero indexing\n", + "\n", "print(f\"This policy traversed {actual*100:4.1f}% of the available states.\")\n", "assert actual >= desired, f\"{actual*100:4.1f}% is less than the desired percentage of {desired*100:4.1f}%.\"" ] @@ -495,7 +495,7 @@ "metadata": {}, "outputs": [], "source": [ - "ray.shutdown() # \"Undo ray.init()\"." + "ray.shutdown()" ] } ], @@ -515,7 +515,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.7" + "version": "3.7.4" } }, "nbformat": 4, diff --git a/ray-rllib/multi-armed-bandits/00-Multi-Armed-Bandits-Overview.ipynb b/ray-rllib/multi-armed-bandits/00-Multi-Armed-Bandits-Overview.ipynb index 4968cc7..7c9ad4d 100644 --- a/ray-rllib/multi-armed-bandits/00-Multi-Armed-Bandits-Overview.ipynb +++ b/ray-rllib/multi-armed-bandits/00-Multi-Armed-Bandits-Overview.ipynb @@ -10,7 +10,9 @@ "\n", "![Anyscale Academy](../../images/AnyscaleAcademyLogo.png)\n", "\n", - "This part of the [RLlib tutorial](../00-Ray-RLlib-Overview.ipynb) tutorial introduces _Multi-Armed Bandits_ (MABs) a popular approach that is very similar to \"classic\" reinforcement learning (RL), but with some differences, which we'll cover in the [first lesson](01-Introduction-to-Multi-Armed-Bandits.ipynb). 
[RLlib](https://ray.readthedocs.io/en/latest/rllib.html) provides several bandit algorithms:\n", + "This part of the [RLlib tutorial](../00-Ray-RLlib-Overview.ipynb) introduces _Multi-Armed Bandits_ (MABs), a popular approach that is very similar to \"classic\" reinforcement learning (RL), but with some differences, which we'll cover in the [first lesson](01-Introduction-to-Multi-Armed-Bandits.ipynb).\n", + "\n", + "[RLlib](https://ray.readthedocs.io/en/latest/rllib.html) provides several bandit algorithms:\n", "\n", "* [Linear Upper Confidence Bound (contrib/LinUCB)](https://docs.ray.io/en/latest/rllib-algorithms.html#linear-upper-confidence-bound-contrib-linucb)\n", "* [Linear Thompson Sampling (contrib/LinTS)](https://docs.ray.io/en/latest/rllib-algorithms.html#linear-thompson-sampling-contrib-lints)" @@ -42,19 +44,11 @@ "## Getting Help\n", "\n", "* The [#tutorial channel](https://ray-distributed.slack.com/archives/C011ML23W5B) on the [Ray Slack](https://ray-distributed.slack.com). [Click here](https://forms.gle/9TSdDYUgxYs8SA9e8) to join.\n", - "* [Email](mailto:academy@anyscale.com)\n", "\n", "Find an issue? Please report it!\n", "\n", "* [GitHub issues](https://github.com/anyscale/academy/issues)" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { @@ -73,7 +67,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.6" + "version": "3.7.4" } }, "nbformat": 4, diff --git a/ray-rllib/multi-armed-bandits/03-Simple-Multi-Armed-Bandit.ipynb b/ray-rllib/multi-armed-bandits/03-Simple-Multi-Armed-Bandit.ipynb index a8be0c7..e1e92a7 100644 --- a/ray-rllib/multi-armed-bandits/03-Simple-Multi-Armed-Bandit.ipynb +++ b/ray-rllib/multi-armed-bandits/03-Simple-Multi-Armed-Bandit.ipynb @@ -27,7 +27,8 @@ "import gym\n", "from gym.spaces import Discrete, Box\n", "import numpy as np\n", - "import random, time\n", + "import random\n", + "import time\n", "import ray" ] }, @@ -245,7 +246,7 @@ }, "outputs": [], "source": [ - "df = analysis.dataframe()\n", + "df = analysis.dataframe(metric=\"episode_reward_mean\", mode=\"max\")\n", "df" ] }, @@ -360,7 +361,7 @@ "metadata": {}, "outputs": [], "source": [ - "ray.shutdown() # \"Undo ray.init()\"." 
+ "ray.shutdown()" ] } ], @@ -380,7 +381,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.7" + "version": "3.7.4" } }, "nbformat": 4, diff --git a/ray-rllib/multi-armed-bandits/04-Linear-Upper-Confidence-Bound.ipynb b/ray-rllib/multi-armed-bandits/04-Linear-Upper-Confidence-Bound.ipynb index ee0e05a..50d1fb6 100644 --- a/ray-rllib/multi-armed-bandits/04-Linear-Upper-Confidence-Bound.ipynb +++ b/ray-rllib/multi-armed-bandits/04-Linear-Upper-Confidence-Bound.ipynb @@ -227,7 +227,7 @@ "metadata": {}, "outputs": [], "source": [ - "df = analysis.dataframe()\n", + "df = analysis.dataframe(metric=\"episode_reward_mean\", mode=\"max\")\n", "df" ] }, @@ -257,15 +257,8 @@ " frame = frame.append(df, ignore_index=True)\n", "\n", "df = frame.groupby(\"info/num_steps_trained\")[\n", - " \"info/learner/default_policy/cumulative_regret\"].aggregate([\"mean\", \"max\", \"min\", \"std\"])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ + " \"info/learner/default_policy/cumulative_regret\"].aggregate([\"mean\", \"max\", \"min\", \"std\"])\n", + "\n", "df" ] }, @@ -282,28 +275,13 @@ "metadata": {}, "outputs": [], "source": [ - "from bokeh_util import plot_cumulative_regret\n", - "# The next two lines prevent Bokeh from opening the graph in a new window.\n", - "import bokeh\n", - "bokeh.io.reset_output()\n", - "bokeh.io.output_notebook()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "plot_cumulative_regret(df)" + "df.plot(y=\"mean\", yerr=\"std\", xlabel=\"steps\", ylabel=\"regret\", title=\"Cumulative Regret\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "([image](../../images/rllib/LinUCB-Cumulative-Regret.png))\n", - "\n", "So the _cumulative_ regret increases for the entire number of training steps for all five trials, but for larger step numbers, the amount of regret added decreases as we learn, so the graph begins to level off as the system gets better at optimizing the mean reward." ] }, @@ -331,7 +309,7 @@ "metadata": {}, "outputs": [], "source": [ - "ray.shutdown() # \"Undo ray.init()\"." 
+ "ray.shutdown()" ] } ], @@ -351,7 +329,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.7" + "version": "3.7.4" } }, "nbformat": 4, diff --git a/ray-rllib/multi-armed-bandits/05-Linear-Thompson-Sampling.ipynb b/ray-rllib/multi-armed-bandits/05-Linear-Thompson-Sampling.ipynb index 4cfc448..199adfd 100644 --- a/ray-rllib/multi-armed-bandits/05-Linear-Thompson-Sampling.ipynb +++ b/ray-rllib/multi-armed-bandits/05-Linear-Thompson-Sampling.ipynb @@ -101,19 +101,6 @@ "from ray.rllib.contrib.bandits.envs import WheelBanditEnv" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from bokeh_util import plot_cumulative_regret, plot_wheel_bandit_model_weights\n", - "# The next two lines prevent Bokeh from opening the graph in a new window.\n", - "import bokeh\n", - "bokeh.io.reset_output()\n", - "bokeh.io.output_notebook()" - ] - }, { "cell_type": "code", "execution_count": null, @@ -235,15 +222,8 @@ "\n", "regrets = df \\\n", " .groupby(\"info/num_steps_trained\")[\"info/learner/default_policy/cumulative_regret\"] \\\n", - " .aggregate([\"mean\", \"max\", \"min\", \"std\"])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ + " .aggregate([\"mean\", \"max\", \"min\", \"std\"])\n", + "\n", "regrets" ] }, @@ -253,7 +233,7 @@ "metadata": {}, "outputs": [], "source": [ - "plot_cumulative_regret(regrets)" + "regrets.plot(y=\"mean\", yerr=\"std\", xlabel=\"steps\", title=\"Cumulative Regrets\")" ] }, { @@ -304,7 +284,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Plot weight distributions for different arms" + "Plot the weight distributions for the different arms:" ] }, { @@ -313,14 +293,25 @@ "metadata": {}, "outputs": [], "source": [ - "plot_wheel_bandit_model_weights(means, covs)" + "%matplotlib inline\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "\n", + "colors = [\"blue\", \"black\", \"green\", \"red\", \"yellow\"]\n", + "labels = [\"arm{}\".format(i) for i in range(5)]\n", + "\n", + "for i in range(0, 5):\n", + " x, y = np.random.multivariate_normal(means[i] / 30, covs[i], 5000).T\n", + " plt.scatter(x, y, color=colors[i])\n", + " \n", + "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Here is an [image](../../images/rllib/LinTS-Weight-Distribution-of-Arms-05.png) from a previous run. How similar is your graph?" + "Here's an [image](../../images/rllib/LinTS-Weight-Distribution-of-Arms-05.png) from a previous run. How similar is your graph?" ] }, { @@ -346,7 +337,7 @@ "metadata": {}, "outputs": [], "source": [ - "ray.shutdown() # \"Undo ray.init()\"." + "ray.shutdown()" ] } ], @@ -366,7 +357,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.7" + "version": "3.7.4" } }, "nbformat": 4, diff --git a/ray-rllib/multi-armed-bandits/06-Market-Example.ipynb b/ray-rllib/multi-armed-bandits/06-Market-Example.ipynb index 60d62e8..5303b0c 100644 --- a/ray-rllib/multi-armed-bandits/06-Market-Example.ipynb +++ b/ray-rllib/multi-armed-bandits/06-Market-Example.ipynb @@ -15,9 +15,11 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Now that we've learned about multi-armed bandits and methods for optimizing rewards, let's look at real-world applications, starting with a stock market example. 
We'll also learn a little more about configuring RLlib trainers.\n", + "Now that we've learned about multi-armed bandits and methods for optimizing rewards, let's look at real-world applications, starting with a stock market example.\n", "\n", - "We'll load a dataset derived from this [NYU Stern table](http://pages.stern.nyu.edu/~adamodar/New_Home_Page/datafile/histretSP.html) that shows returns for nearly a century of market data, including dividends and adjustments for inflation. The `market.tsv` file in this folder contains the data." + "How well could you invest in the public markets, if you could only observe one macroeconomic signal *inflation* and could only update your investments once each year?\n", + "\n", + "To explore this, first we'll load a dataset derived from this [NYU Stern table](http://pages.stern.nyu.edu/~adamodar/New_Home_Page/datafile/histretSP.html) that shows returns for nearly a century of market data, including dividends and adjustments for inflation. The `market.tsv` file in this folder contains the data." ] }, { @@ -28,7 +30,8 @@ "source": [ "import pandas as pd\n", "import numpy as np\n", - "import os, sys" + "import os\n", + "import sys" ] }, { @@ -40,7 +43,7 @@ "# Some properties we'll need:\n", "DEFAULT_MAX_INFLATION = 100.0\n", "DEFAULT_TICKERS = [\"sp500\", \"t.bill\", \"t.bond\", \"corp\"]\n", - "DEFAULT_DATA_FILE = os.path.abspath(os.path.curdir) + '/market.tsv' # full path" + "DEFAULT_DATA_FILE = os.path.abspath(os.path.curdir) + \"/market.tsv\" # full path" ] }, { @@ -54,6 +57,13 @@ " return pd.read_table(f)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's load and examine the data:" + ] + }, { "cell_type": "code", "execution_count": null, @@ -68,37 +78,39 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "As you can see the data spans 92 years, from 1928 to 2019. The columns represent:\n", - " * the year\n", - " * inflation rate at the time\n", - " * [S&P500](https://en.wikipedia.org/wiki/S%26P_500_Index) (composite stock index)\n", - " * [Treasury Bills](https://www.investopedia.com/terms/t/treasurybill.asp) (short-term gov bonds)\n", - " * [Treasury Bonds](https://www.investopedia.com/terms/t/treasurybond.asp) (long-term gov bonds)\n", - " * [Moody's Baa Corporate Bonds](https://en.wikipedia.org/wiki/Moody%27s_Investors_Service#Moody's_credit_ratings) (moderate risk)" + "As you can see the data spans 92 years, from 1928 to 2019. \n", + "\n", + "The columns represent:\n", + " * `year`: the year\n", + " * `inflation`: the inflation rate at the time\n", + " * `sp500`: [S&P500](https://en.wikipedia.org/wiki/S%26P_500_Index) (composite stock index)\n", + " * `t.bill`: [Treasury Bills](https://www.investopedia.com/terms/t/treasurybill.asp) (short-term gov bonds)\n", + " * `t.bond`: [Treasury Bonds](https://www.investopedia.com/terms/t/treasurybond.asp) (long-term gov bonds)\n", + " * `corp`: [Moody's Baa Corporate Bonds](https://en.wikipedia.org/wiki/Moody%27s_Investors_Service#Moody's_credit_ratings) (moderate risk)" ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "df.describe()" + "## Analysis of the Data\n", + "\n", + "Let's also look at descriptions statistics for each column:" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ - "\"Corp\" refers to corporate bonds." 
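The `min_list` and `max_list` series used in the cells further below are built earlier in that notebook and do not appear in this diff. One way to compute them, assuming each ticker column holds the yearly percent return as described above (the notebook's own code may differ slightly):

```python
import pandas as pd

df = pd.read_table("market.tsv")                 # same file loaded above
tickers = ["sp500", "t.bill", "t.bond", "corp"]  # the DEFAULT_TICKERS columns

# Worst- and best-performing sector for each year; these feed the
# "Best vs. Worst" band plotted further below.
min_list = df[tickers].min(axis=1)
max_list = df[tickers].max(axis=1)

min_max = pd.DataFrame({"year": df["year"], "min": min_list, "max": max_list})
print(min_max.head())
```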
+ "df.describe()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Analysis of the Data\n", - "\n", "What are the worst case and best case scenarios? In other words, if one could predict the future market performance, what are the possible ranges of total failure vs. total success over the past century? By \"total\", we mean what if you had all your money in a given year invested in the worst performing _sector_ (S&P500, T bills, or other) or you were invested in the best performing sector for that year." ] }, @@ -127,30 +139,16 @@ "metadata": {}, "outputs": [], "source": [ - "sys.path.append('../..')\n", - "from util.line_plots import plot_line, plot_line_with_stddev, plot_between_lines\n", - "from bokeh_util import plot_cumulative_regret" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import bokeh\n", - "bokeh.io.reset_output()\n", - "bokeh.io.output_notebook()" + "min_max = pd.DataFrame.from_dict({'year': df['year'], 'min':min_list, 'max':max_list})\n", + "min_max" ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "min_max = pd.DataFrame.from_dict({'year': df['year'], 'min':min_list, 'max':max_list})\n", - "min_max" + "We can visualize the best and worst returns, year over year.\n", + "Overall this should look like a [*random walk*](https://en.wikipedia.org/wiki/Random_walk):" ] }, { @@ -159,15 +157,28 @@ "metadata": {}, "outputs": [], "source": [ - "plot_between_lines(min_max, x_col='year', lower_col='min', upper_col='max', \n", - " title='Best to Worst', x_axis_label='year', y_axis_label='%')" + "%matplotlib inline\n", + "from matplotlib import pyplot as plt\n", + "\n", + "plt.fill_between(\n", + " df[\"year\"],\n", + " min_list,\n", + " max_list,\n", + " color=\"b\",\n", + " alpha=0.2\n", + ")\n", + "\n", + "plt.title(\"Best vs. Worst Market Return\")\n", + "plt.xlabel(\"Year\")\n", + "plt.ylabel(\"Return\")\n", + "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "From the graph, there are some years where the performance varies widely, while other years everything returns about the same performance." + "There are some years where the performance varies widely, while other years everything returns about the same performance." ] }, { @@ -188,7 +199,6 @@ "import gym\n", "from gym.spaces import Discrete, Box\n", "from gym.utils import seeding\n", - "import numpy as np\n", "import random" ] }, @@ -208,9 +218,9 @@ "class MarketBandit (gym.Env):\n", " \n", " def __init__ (self, config={}):\n", - " self.max_inflation = config.get('max-inflation', DEFAULT_MAX_INFLATION)\n", - " self.tickers = config.get('tickers', DEFAULT_TICKERS)\n", - " self.data_file = config.get('data-file', DEFAULT_DATA_FILE)\n", + " self.max_inflation = config.get(\"max-inflation\", DEFAULT_MAX_INFLATION)\n", + " self.tickers = config.get(\"tickers\", DEFAULT_TICKERS)\n", + " self.data_file = config.get(\"data-file\", DEFAULT_DATA_FILE)\n", " print(f\"MarketBandit: max_inflation: {self.max_inflation}, tickers: {self.tickers}, data file: {self.data_file} (config: {config})\")\n", "\n", " self.action_space = Discrete(4)\n", @@ -310,7 +320,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We can use this environment in a kind of *monte carlo simulation* to measure a baseline for what the rewards would be over a long period if you always used a random action." 
+ "We can use this environment in a kind of [*monte carlo simulation*](https://en.wikipedia.org/wiki/Monte_Carlo_method) to measure a baseline for what the rewards would be over a long period if you merely used actions selected at random." ] }, { @@ -357,7 +367,7 @@ "metadata": {}, "outputs": [], "source": [ - "plot_line(df_mc, x_col='index', y_col='reward', title='Reward Over Iterations')" + "df_mc.plot(y=\"reward\", title=\"Reward Over Iterations\")" ] }, { @@ -366,7 +376,7 @@ "source": [ "([image](../../images/rllib/MarketReward-Random.png))\n", "\n", - "Yes, it looks quite random... There's no improvement happening at all." + "Yes, it looks quite random... There is no improvement (i.e., *learning*) happening at all." ] }, { @@ -382,7 +392,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Recall in the `__init__()` method for `MarketBandit` that we set some parameters from the passed in `config` object (with defaults). We don't construct this explicitly ourselves. Rather, RLlib will do this. So, we need to construct the canonical `config` object we want to use. To do this, we use the idioms shown in the next several cells:" + "Recall in the `__init__()` method for `MarketBandit` that we set some parameters from the passed in `config` object. \n", + "So we need to create a custom config object with our parameters, by building on the default `TS_CONFIG` object for _LinTS_:" ] }, { @@ -391,17 +402,25 @@ "metadata": {}, "outputs": [], "source": [ - "from ray.rllib.agents.trainer import with_base_config, with_common_config\n", + "import copy\n", "from ray.rllib.contrib.bandits.agents.lin_ts import TS_CONFIG\n", - "from ray.rllib.contrib.bandits.agents.lin_ts import LinTSTrainer\n", - "import ray" + "\n", + "market_config = copy.deepcopy(TS_CONFIG)\n", + "\n", + "market_config[\"env\"] = MarketBandit\n", + "market_config[\"max-inflation\"] = DEFAULT_MAX_INFLATION;\n", + "market_config[\"tickers\"] = DEFAULT_TICKERS;\n", + "market_config[\"data-file\"] = DEFAULT_DATA_FILE;" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Initialize Ray:" + "We'll also define a custom trainer, which builds on the `LinTSTrainer` with \"updates\". \n", + "This will be the first argument that we'll pass to `ray.tune.run()` later. \n", + "\n", + "Note: if all we needed was the default `LinTSTrainer` trainer, as is and with no customized config settings, we could instead just pass the string `\"contrib/LinTS\"` to `ray.tune.run()`. " ] }, { @@ -410,14 +429,20 @@ "metadata": {}, "outputs": [], "source": [ - "ray.init(ignore_reinit_error=True)" + "from ray.rllib.contrib.bandits.agents.lin_ts import LinTSTrainer\n", + "\n", + "MarketLinTSTrainer = LinTSTrainer.with_updates(\n", + " name=\"MarketLinTSTrainer\",\n", + " default_config=market_config, # Will be merged with Trainer.COMMON_CONFIG (rllib/agent/trainer.py)\n", + " #default_policy=[somePolicyClass] # If we had a policy...\n", + ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "We need a custom config object with our parameters for `MarketBandit`. 
We do this building on the default `TS_CONFIG` object for _LinTS_:" + "Then initialize Ray:" ] }, { @@ -426,36 +451,16 @@ "metadata": {}, "outputs": [], "source": [ - "market_config = with_base_config(TS_CONFIG, {\n", - " \"env\": MarketBandit,\n", - " 'max-inflation': DEFAULT_MAX_INFLATION,\n", - " 'tickers': DEFAULT_TICKERS,\n", - " 'data-file': DEFAULT_DATA_FILE\n", - "})\n", + "import ray\n", "\n", - "stop = {\n", - " \"training_iteration\": 100\n", - "}" + "ray.init(ignore_reinit_error=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Also, we'll define a custom trainer, which builds on the `LinTSTrainer`, with \"updates\". Note that it's the first argument we'll pass to `tune.run()` in the following cell. When all we need is `LinTSTrainer`, as is, and no extra custom config settings, we can just pass the string `contrib/LinTS` to `tune.run()`. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "MarketLinTSTrainer = LinTSTrainer.with_updates(\n", - " name=\"MarketLinTSTrainer\",\n", - " default_config=market_config, # Will be merged with Trainer.COMMON_CONFIG (rllib/agent/trainer.py)\n", - " #default_policy=[somePolicyClass] # If we had a policy...\n", - ")" + "Then run Tune:" ] }, { @@ -466,6 +471,10 @@ }, "outputs": [], "source": [ + "stop = {\n", + " \"training_iteration\": 100\n", + "}\n", + "\n", "analysis = ray.tune.run(\n", " MarketLinTSTrainer,\n", " config=market_config,\n", @@ -493,7 +502,7 @@ "source": [ "## Analyzing the results\n", "\n", - "Let's analyze the rewards and cumulative regrets of these trials." + "Let's analyze the rewards and cumulative regrets from these trials." ] }, { @@ -529,17 +538,14 @@ "metadata": {}, "outputs": [], "source": [ - "plot_line_with_stddev(rewards, x_col='info/num_steps_trained', y_col='mean', stddev_col='std', \n", - " title='Rewards vs. Steps', x_axis_label='step', y_axis_label='reward')" + "rewards.plot(y=[\"mean\", \"max\"], secondary_y=True, title=\"Rewards vs. Steps\", xlabel=\"step\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "([image](../../images/rllib/Market-Bandit-Rewards-vs-Steps.png))\n", - "\n", - "The rewards reach what appears to be nearly optimal by 3000 steps, then shows some slow improvement beyond 8000." + "The rewards bounce around at first, then appear to stabilize after 5000 steps, with slow improvement afterwards." ] }, { @@ -561,14 +567,7 @@ "metadata": {}, "outputs": [], "source": [ - "plot_cumulative_regret(regrets)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "([image](../../images/rllib/Market-Bandit-Cumulative-Regret.png))" + "regrets.plot(y=\"mean\", yerr=\"std\", title=\"Regrets vs. Steps\", xlabel=\"step\")" ] }, { @@ -593,7 +592,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "You should see a number between 6-8%. That's better than the random action baseline of 3.75%, but no where near the best case scenario of 15.18% return. Hence, our regrets continue to grow over time...\n", + "You should see a number between near 6%. That's better than the random action baseline of 3.75%, but no where near the best case scenario of 15.18% return. Hence, our regrets continue to grow over time...\n", "\n", "Note that investing solely in the S&P stock index which would have produced better than 8% return over that period -- that is, if one could wait 92 years. 
However, investing one's entire portfolio into stocks can become quite a risky policy in the short-term, so we were exploring how to balance a portfolio given only limited information.\n", "\n", @@ -606,7 +605,20 @@ "source": [ "## Exercise 1\n", "\n", - "Try using a `LinUCBTrainer`-based trainer. How does the annualized return compare?" + "Try using a `LinUCBTrainer`-based trainer.\n", + "\n", + "How does the annualized return compare?" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Exercise 2\n", + "\n", + "Inflation rates tend to get reported months after they've occurred. To be more accurate with using this dataset, offset the *inflation* observation one step (1 year) ahead.\n", + "\n", + "How does the annualized return compare?" ] }, { @@ -666,7 +678,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "A final note: when you checkpoint it will change how the training performs in this notebook, if you rerun the training! So be sure to start from scratch when doing experiments here, if that's what you intend!" + "A final note: using checkpoints will change how the training performs in this notebook, if you rerun it. So be sure to start from scratch when doing experiments here, if that's what you intend!" ] }, { @@ -675,7 +687,7 @@ "metadata": {}, "outputs": [], "source": [ - "ray.shutdown() # \"Undo ray.init()\"." + "ray.shutdown()" ] } ], @@ -695,7 +707,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.7" + "version": "3.7.4" } }, "nbformat": 4, diff --git a/ray-rllib/multi-armed-bandits/bokeh_util.py b/ray-rllib/multi-armed-bandits/bokeh_util.py deleted file mode 100644 index 631dbca..0000000 --- a/ray-rllib/multi-armed-bandits/bokeh_util.py +++ /dev/null @@ -1,37 +0,0 @@ -from bokeh.plotting import figure, show, output_file -from bokeh.models import Band, ColumnDataSource, Range1d -import bokeh.io -import numpy as np -import sys - -sys.path.append('../..') -from util.line_plots import plot_line_with_stddev - -def plot_cumulative_regret(df): - plot_line_with_stddev(df, - x_col='info/num_steps_trained', y_col='mean', stddev_col='std', - title='Cumulative Regret', x_axis_label='step', y_axis_label='cumulative regret') - -def plot_wheel_bandit_model_weights(means, covs): - # markers = ["asterisk", "circle", "diamond", "square", "x"] - colors = ["blue", "black", "green", "red", "yellow"] - labels = ["arm{}".format(i) for i in range(5)] - - tooltips = [ - ("name", "$name"), - ("array size", "$x"), - ("time", "$y")] - - TOOLS = "pan,wheel_zoom,box_zoom,reset,save" - p = figure(tools=TOOLS, tooltips=tooltips, match_aspect=True) - - for i in range(0, 5): - x, y = np.random.multivariate_normal(means[i] / 30, covs[i], 5000).T - p.scatter(x=x, y=y, size=2, - color=colors[i], marker='circle', legend_label=labels[i], name=labels[i]) - - p.title.text = "Weight distributions of arms" - p.xgrid[0].grid_line_alpha=0.5 - p.ygrid[0].grid_line_alpha=0.5 - - show(p) diff --git a/ray-rllib/multi-armed-bandits/solutions/Multi-Armed-Bandits-Solutions.ipynb b/ray-rllib/multi-armed-bandits/solutions/Multi-Armed-Bandits-Solutions.ipynb index 78f4d2c..64423db 100644 --- a/ray-rllib/multi-armed-bandits/solutions/Multi-Armed-Bandits-Solutions.ipynb +++ b/ray-rllib/multi-armed-bandits/solutions/Multi-Armed-Bandits-Solutions.ipynb @@ -28,7 +28,9 @@ "from gym.spaces import Discrete, Box\n", "import numpy as np\n", "import pandas as pd\n", - "import os, time, random\n", + "import os\n", + "import time\n", + "import random\n", 
"import ray\n", "from ray.tune.progress_reporter import JupyterNotebookReporter" ] @@ -163,7 +165,7 @@ "metadata": {}, "outputs": [], "source": [ - "df = analysis.dataframe()\n", + "df = analysis.dataframe(metric=\"episode_reward_mean\", mode=\"max\")\n", "df" ] }, @@ -289,7 +291,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "jupyter": { + "source_hidden": true + } + }, "outputs": [], "source": [ "stats = analysis.stats()\n", @@ -303,7 +309,7 @@ "metadata": {}, "outputs": [], "source": [ - "df = analysis.dataframe()\n", + "df = analysis.dataframe(metric=\"episode_reward_mean\", mode=\"max\")\n", "df" ] }, @@ -311,7 +317,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "It ran the maximum of 20,000 steps and the best it does falls between 4.8 to 5.2 (for different runs), not 10.0. the `episode_reward_mean` is chaotic:\n", + "It ran the maximum of 20,000 steps and the best it does (for different runs) is well below 10.0. the `episode_reward_mean` is chaotic:\n", "\n", "![Nonlinear model with LinUCB](../../../images/rllib/TensorBoard2.png).\n", "\n", @@ -369,7 +375,7 @@ }, "outputs": [], "source": [ - "df = analysis.dataframe()\n", + "df = analysis.dataframe(metric=\"episode_reward_mean\", mode=\"max\")\n", "df" ] }, @@ -416,7 +422,7 @@ }, "outputs": [], "source": [ - "df = analysis.dataframe()\n", + "df = analysis.dataframe(metric=\"episode_reward_mean\", mode=\"max\")\n", "df" ] }, @@ -504,33 +510,7 @@ "metadata": {}, "outputs": [], "source": [ - "import sys\n", - "sys.path.append('../../..') # ... and line_plot functions from \"util\"\n", - "from util.line_plots import plot_line, plot_line_with_stddev, plot_line_with_min_max\n", - "\n", - "sys.path.append(\"..\") # So we can load the bokeh_util from the parent directory...\n", - "from bokeh_util import plot_cumulative_regret, plot_wheel_bandit_model_weights" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# The next two lines prevent Bokeh from opening the graph in a new window.\n", - "import bokeh\n", - "bokeh.io.reset_output()\n", - "bokeh.io.output_notebook()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "plot_cumulative_regret(df)" + "df.plot(y=\"mean\", yerr=\"std\")" ] }, { @@ -586,7 +566,7 @@ "metadata": {}, "outputs": [], "source": [ - "def run_ts(delta):\n", + "def run_ts (delta):\n", " TS_CONFIG[\"delta\"] = delta\n", "\n", " start_time = time.time()\n", @@ -615,7 +595,7 @@ "metadata": {}, "outputs": [], "source": [ - "def process_df(df, analysis):\n", + "def process_df (df, analysis):\n", " ts_regrets = df \\\n", " .groupby(\"info/num_steps_trained\")[\"info/learner/default_policy/cumulative_regret\"] \\\n", " .aggregate([\"mean\", \"max\", \"min\", \"std\"])\n", @@ -651,7 +631,8 @@ }, "outputs": [], "source": [ - "ts_regrets7, model7, means7, covs7 = process_df(ts_df7, analysis7)" + "ts_regrets7, model7, means7, covs7 = process_df(ts_df7, analysis7)\n", + "ts_regrets7.head()" ] }, { @@ -660,7 +641,7 @@ "metadata": {}, "outputs": [], "source": [ - "plot_cumulative_regret(ts_regrets7)" + "ts_regrets7.plot(y=\"mean\", yerr=\"std\")" ] }, { @@ -669,7 +650,7 @@ "source": [ "([image](../../../images/rllib/LinTS-Cumulative-Regret-07.png))\n", "\n", - "The cumulative regret values are much higher than for $\\delta = 0.5$ in the lesson, and the standard deviation is ... well crazy. 
We mentioned in the lesson that the problem becomes harder for higher $\\delta$, which fits this result." + "The cumulative regret values are much higher than for $\\delta = 0.5$ in the lesson, and the standard deviation may diverge. We mentioned in the lesson that the problem becomes harder for higher $\\delta$, which fits this result." ] }, { @@ -678,7 +659,17 @@ "metadata": {}, "outputs": [], "source": [ - "plot_wheel_bandit_model_weights(means7, covs7)" + "%matplotlib inline\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "\n", + "colors = [\"blue\", \"black\", \"green\", \"red\", \"yellow\"]\n", + "\n", + "for i in range(0, 5):\n", + " x, y = np.random.multivariate_normal(means7[i] / 30, covs7[i], 5000).T\n", + " plt.scatter(x, y, color=colors[i])\n", + " \n", + "plt.show()" ] }, { @@ -721,7 +712,7 @@ "metadata": {}, "outputs": [], "source": [ - "plot_cumulative_regret(ts_regrets9)" + "ts_regrets9.plot(y=\"mean\", yerr=\"std\")" ] }, { @@ -739,7 +730,13 @@ "metadata": {}, "outputs": [], "source": [ - "plot_wheel_bandit_model_weights(means9, covs9)" + "colors = [\"blue\", \"black\", \"green\", \"red\", \"yellow\"]\n", + "\n", + "for i in range(0, 5):\n", + " x, y = np.random.multivariate_normal(means9[i] / 30, covs9[i], 5000).T\n", + " plt.scatter(x, y, color=colors[i])\n", + " \n", + "plt.show()" ] }, { @@ -767,7 +764,7 @@ "# Some properties we'll need:\n", "DEFAULT_MAX_INFLATION = 100.0\n", "DEFAULT_TICKERS = [\"sp500\", \"t.bill\", \"t.bond\", \"corp\"]\n", - "DEFAULT_DATA_FILE = os.path.abspath(os.path.curdir) + '/../market.tsv' # full path\n", + "DEFAULT_DATA_FILE = os.path.abspath(os.path.curdir) + \"/../market.tsv\" # full path\n", "\n", "def load_market_data (file_name):\n", " with open(file_name, \"r\") as f:\n", @@ -799,7 +796,6 @@ "metadata": {}, "outputs": [], "source": [ - "from ray.rllib.agents.trainer import with_base_config, with_common_config\n", "from ray.rllib.contrib.bandits.agents.lin_ucb import UCB_CONFIG\n", "from ray.rllib.contrib.bandits.agents.lin_ucb import LinUCBTrainer\n", "import ray" @@ -896,12 +892,14 @@ "metadata": {}, "outputs": [], "source": [ - "market_config = with_base_config(UCB_CONFIG, {\n", - " \"env\": MarketBandit,\n", - " 'max-inflation': DEFAULT_MAX_INFLATION,\n", - " 'tickers': DEFAULT_TICKERS,\n", - " 'data-file': DEFAULT_DATA_FILE\n", - "})\n", + "import copy\n", + "\n", + "market_config = copy.deepcopy(UCB_CONFIG)\n", + "\n", + "market_config[\"env\"] = MarketBandit\n", + "market_config[\"max-inflation\"] = DEFAULT_MAX_INFLATION;\n", + "market_config[\"tickers\"] = DEFAULT_TICKERS;\n", + "market_config[\"data-file\"] = DEFAULT_DATA_FILE;\n", "\n", "stop = {\n", " \"training_iteration\": 100\n", @@ -994,7 +992,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The results for _LinTS_ were ~570 for reward mean and the regret stayed under 10000. So, training with _LinUCB_ isn't as successful." + "The results for _LinTS_ were ~340 for reward mean. So, training with _LinUCB_ isn't as successful." ] }, { @@ -1003,8 +1001,7 @@ "metadata": {}, "outputs": [], "source": [ - "plot_line_with_stddev(rewards, x_col='info/num_steps_trained', y_col='mean', stddev_col='std', \n", - " title='Rewards vs. 
Steps', x_axis_label='step', y_axis_label='reward')" + "rewards.plot(y=\"mean\", yerr=\"std\")" ] }, { @@ -1020,7 +1017,7 @@ "metadata": {}, "outputs": [], "source": [ - "plot_cumulative_regret(regrets)" + "regrets.plot(y=\"mean\", yerr=\"std\")" ] }, { @@ -1056,7 +1053,7 @@ "metadata": {}, "outputs": [], "source": [ - "ray.shutdown() # \"Undo ray.init()\"." + "ray.shutdown()" ] } ], @@ -1076,7 +1073,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.7" + "version": "3.7.4" } }, "nbformat": 4, diff --git a/ray-rllib/recsys/00-Recsys-Overview.ipynb b/ray-rllib/recsys/00-Recsys-Overview.ipynb index 1b1ee9b..58611c6 100644 --- a/ray-rllib/recsys/00-Recsys-Overview.ipynb +++ b/ray-rllib/recsys/00-Recsys-Overview.ipynb @@ -40,19 +40,11 @@ "## Getting Help\n", "\n", "* The [#tutorial channel](https://ray-distributed.slack.com/archives/C011ML23W5B) on the [Ray Slack](https://ray-distributed.slack.com). [Click here](https://forms.gle/9TSdDYUgxYs8SA9e8) to join.\n", - "* [Email](mailto:academy@anyscale.com)\n", "\n", "Find an issue? Please report it!\n", "\n", "* [GitHub issues](https://github.com/anyscale/academy/issues)" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { @@ -71,7 +63,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.7" + "version": "3.7.4" } }, "nbformat": 4, diff --git a/ray-rllib/recsys/01-Recsys.ipynb b/ray-rllib/recsys/01-Recsys.ipynb index 96b29e1..78213ca 100644 --- a/ray-rllib/recsys/01-Recsys.ipynb +++ b/ray-rllib/recsys/01-Recsys.ipynb @@ -823,7 +823,7 @@ "metadata": {}, "outputs": [], "source": [ - "ray.init(ignore_reinit_error=True)" + "info = ray.init(ignore_reinit_error=True)" ] }, { @@ -1010,25 +1010,41 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Exercises\n", + "---" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Exercise 1\n", "\n", - "For the exercises, there are several ways to modify the Gym environment or the RLlib training parameters, then compare how the outcomes differ:\n", + "Compare use of the other datasets `\"jester-data-2.csv\"` and `\"jester-data-3.csv\"` by substituting them during the rollout.\n", "\n", - " 1. Re-run using smaller and larger K values\n", - " 2. Adjust the rewards for depleted and unrated actions\n", - " 3. Increase the number of training iterations\n", - " 4. Compare use of the other dataset partitions during rollout: `\"jester-data-2.csv\"` or `\"jester-data-3.csv\"`\n", + "How do the mean cumulative reward differ from the metrics in the lesson?" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Exercise 2\n", + "\n", + "Compare the effect of using a larger `K` value for the number of clusters.\n", "\n", - "For each of these variations compare:\n", + "Show the difference, if any, by comparing:\n", "\n", " * baseline with random actions \n", " * baseline with the naïve strategy\n", " * predicted average reward from training\n", - " * stats from the rollout\n", - "\n", - "Let's discuss the results as a group.\n", - "\n", - "Other questions to discuss:\n", + " * stats from the rollout" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Discussion Questions\n", "\n", " 1. In what ways could the \"warm start\" be improved?\n", " 2. How could this code be modified to scale to millions of users? Or to thousands of items?" 
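For Exercise 1, the general pattern is to rerun the rollout once per dataset partition and compare the resulting mean cumulative rewards. The sketch below is only a scaffold: `run_rollout()` is a hypothetical stand-in for the rollout code defined earlier in that notebook, and `jester-data-1.csv` is presumed to be the partition the lesson already uses:

```python
import pandas as pd

def run_rollout(dataset_file):
    """Hypothetical placeholder: run the lesson's rollout against the given
    Jester partition and return the mean cumulative reward. Substitute the
    rollout code defined earlier in the notebook here."""
    raise NotImplementedError

datasets = ["jester-data-1.csv", "jester-data-2.csv", "jester-data-3.csv"]
results = {name: run_rollout(name) for name in datasets}

print(pd.Series(results, name="mean cumulative reward"))
```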
diff --git a/ray-rllib/solutions/Ray-RLlib-Solutions.ipynb b/ray-rllib/solutions/Ray-RLlib-Solutions.ipynb index 88ce67e..f2c0aea 100644 --- a/ray-rllib/solutions/Ray-RLlib-Solutions.ipynb +++ b/ray-rllib/solutions/Ray-RLlib-Solutions.ipynb @@ -24,7 +24,9 @@ "import gym\n", "import numpy as np\n", "import pandas as pd\n", - "import json, sys, os" + "import json\n", + "import sys\n", + "import os" ] }, { @@ -44,8 +46,8 @@ "metadata": {}, "outputs": [], "source": [ - "env = gym.make('CartPole-v1')\n", - "print('Created env:', env)" + "env = gym.make(\"CartPole-v1\")\n", + "print(\"Created env:\", env)" ] }, { @@ -78,8 +80,15 @@ " return 0 if state[0] < 0 else 1\n", "\n", "def sample_policy2(state):\n", - " return 1 if state[0] < 0 else 0\n", - "\n", + " return 1 if state[0] < 0 else 0" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ "reward1 = np.mean([rollout_policy(env, sample_policy1) for _ in range(100)])\n", "reward2 = np.mean([rollout_policy(env, sample_policy2) for _ in range(100)])\n", "\n", @@ -135,11 +144,11 @@ "outputs": [], "source": [ "config = DEFAULT_CONFIG.copy()\n", - "config['num_workers'] = 3\n", - "config['num_sgd_iter'] = 10 # was 30\n", - "config['sgd_minibatch_size'] = 256 # was 128\n", - "config['model']['fcnet_hiddens'] = [20, 20] # was [100, 100]\n", - "config['num_cpus_per_worker'] = 0" + "config[\"num_workers\"] = 3\n", + "config[\"num_sgd_iter\"] = 10 # was 30\n", + "config[\"sgd_minibatch_size\"] = 256 # was 128\n", + "config[\"model\"][\"fcnet_hiddens\"] = [20, 20] # was [100, 100]\n", + "config[\"num_cpus_per_worker\"] = 0" ] }, { @@ -148,7 +157,7 @@ "metadata": {}, "outputs": [], "source": [ - "agent = PPOTrainer(config, 'CartPole-v1')" + "agent = PPOTrainer(config, \"CartPole-v1\")" ] }, { @@ -157,20 +166,25 @@ "metadata": {}, "outputs": [], "source": [ - "N=20 # was 10\n", + "N = 20 # was 10\n", "results = []\n", "episode_data = []\n", "episode_json = []\n", + "\n", "for n in range(N):\n", " result = agent.train()\n", " results.append(result)\n", + " \n", " episode = {'n': n, \n", " 'episode_reward_min': result['episode_reward_min'], \n", " 'episode_reward_mean': result['episode_reward_mean'], \n", " 'episode_reward_max': result['episode_reward_max'], \n", - " 'episode_len_mean': result['episode_len_mean']} \n", + " 'episode_len_mean': result['episode_len_mean']\n", + " }\n", + " \n", " episode_data.append(episode)\n", " episode_json.append(json.dumps(episode))\n", + " \n", " print(f'{n:3d}: Min/Mean/Max reward: {result[\"episode_reward_min\"]:8.4f}/{result[\"episode_reward_mean\"]:8.4f}/{result[\"episode_reward_max\"]:8.4f}')" ] }, @@ -190,33 +204,14 @@ "metadata": {}, "outputs": [], "source": [ - "import sys\n", - "sys.path.append(\"../..\")\n", - "from util.line_plots import plot_line, plot_line_with_min_max\n", - "\n", - "import bokeh.io\n", - "# The next two lines prevent Bokeh from opening the graph in a new window.\n", - "bokeh.io.reset_output()\n", - "bokeh.io.output_notebook()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "plot_line_with_min_max(df, x_col='n', y_col='episode_reward_mean', min_col='episode_reward_min', max_col='episode_reward_max',\n", - " title='Episode Rewards', x_axis_label='n', y_axis_label='reward')" + "df.plot(x=\"n\", y=[\"episode_reward_mean\", \"episode_reward_min\", \"episode_reward_max\"], secondary_y=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - 
"([image](../../images/rllib/Cart-Pole-Episode-Rewards-Exercise.png))\n", - "\n", - "Compare this graph with the graph in the lesson, where we used a stronger network:\n", + "Compare this graph with the graph in the lesson, where we used more computing resources:\n", "\n", "![](../../images/rllib/Cart-Pole-Episode-Rewards.png)\n", "\n", @@ -231,7 +226,7 @@ "metadata": {}, "outputs": [], "source": [ - "ray.shutdown() # \"Undo ray.init()\"." + "ray.shutdown()" ] } ], @@ -251,7 +246,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.7" + "version": "3.7.4" } }, "nbformat": 4, diff --git a/requirements.extra b/requirements.extra new file mode 100644 index 0000000..3574ef2 --- /dev/null +++ b/requirements.extra @@ -0,0 +1,4 @@ +nodejs +holoviews +bokeh==2.1.1 +atoma diff --git a/requirements.txt b/requirements.txt index 5bc0e70..654ac51 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,5 @@ gym >= 0.17.2 numpy >= 1.18.5 -nodejs pandas requests torch @@ -10,8 +9,6 @@ tensorflow >= 2.3.0 tqdm >= 4.37.0 keras scikit-learn -holoviews -bokeh==2.1.1 ipywidgets psutil jupyterlab @@ -19,6 +16,5 @@ jupyter-server-proxy beautifulsoup4 lxml pytz -ray[all]==0.8.7 -atoma +ray box2d-py