From 2edbb6e9d4a0101ad82ee591f4ac8ec33963644e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Geron?= Date: Sun, 26 May 2019 23:30:39 +0800 Subject: [PATCH] Add Reinforcement Learning notebook --- 18_reinforcement_learning.ipynb | 3262 ++++++++++++++++++------------- 1 file changed, 1857 insertions(+), 1405 deletions(-) diff --git a/18_reinforcement_learning.ipynb b/18_reinforcement_learning.ipynb index 7713752..7b9809b 100644 --- a/18_reinforcement_learning.ipynb +++ b/18_reinforcement_learning.ipynb @@ -4,14 +4,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "**Chapter 16 – Reinforcement Learning**" + "**Chapter 18 – Reinforcement Learning**" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "This notebook contains all the sample code and solutions to the exersices in chapter 16." + "_This notebook contains all the sample code in chapter 18_." ] }, { @@ -25,7 +25,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "First, let's make sure this notebook works well in both python 2 and 3, import a few common modules, ensure MatplotLib plots figures inline and prepare a function to save the figures:" + "First, let's import a few common modules, ensure MatplotLib plots figures inline and prepare a function to save the figures. We also check that Python 3.5 or later is installed (although Python 2.x may work, it is deprecated so we strongly recommend you use Python 3 instead), as well as Scikit-Learn ≥0.20 and TensorFlow ≥2.0-preview." ] }, { @@ -34,46 +34,51 @@ "metadata": {}, "outputs": [], "source": [ - "# To support both python 2 and python 3\n", - "from __future__ import division, print_function, unicode_literals\n", + "# Python ≥3.5 is required\n", + "import sys\n", + "assert sys.version_info >= (3, 5)\n", + "\n", + "# Scikit-Learn ≥0.20 is required\n", + "import sklearn\n", + "assert sklearn.__version__ >= \"0.20\"\n", + "\n", + "# TensorFlow ≥2.0-preview is required\n", + "import tensorflow as tf\n", + "from tensorflow import keras\n", + "assert tf.__version__ >= \"2.0\"\n", "\n", "# Common imports\n", "import numpy as np\n", "import os\n", - "import sys\n", "\n", "# to make this notebook's output stable across runs\n", - "def reset_graph(seed=42):\n", - " tf.reset_default_graph()\n", - " tf.set_random_seed(seed)\n", - " np.random.seed(seed)\n", + "np.random.seed(42)\n", + "tf.random.set_seed(42)\n", "\n", - "# To plot pretty figures and animations\n", - "%matplotlib nbagg\n", - "import matplotlib\n", - "import matplotlib.animation as animation\n", + "# To plot pretty figures\n", + "%matplotlib inline\n", + "import matplotlib as mpl\n", "import matplotlib.pyplot as plt\n", - "plt.rcParams['axes.labelsize'] = 14\n", - "plt.rcParams['xtick.labelsize'] = 12\n", - "plt.rcParams['ytick.labelsize'] = 12\n", + "mpl.rc('axes', labelsize=14)\n", + "mpl.rc('xtick', labelsize=12)\n", + "mpl.rc('ytick', labelsize=12)\n", + "\n", + "# To get smooth animations\n", + "import matplotlib.animation as animation\n", + "mpl.rc('animation', html='jshtml')\n", "\n", "# Where to save the figures\n", "PROJECT_ROOT_DIR = \".\"\n", "CHAPTER_ID = \"rl\"\n", + "IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, \"images\", CHAPTER_ID)\n", + "os.makedirs(IMAGES_PATH, exist_ok=True)\n", "\n", - "def save_fig(fig_id, tight_layout=True):\n", - " path = os.path.join(PROJECT_ROOT_DIR, \"images\", CHAPTER_ID, fig_id + \".png\")\n", + "def save_fig(fig_id, tight_layout=True, fig_extension=\"png\", resolution=300):\n", + " path = os.path.join(IMAGES_PATH, fig_id + \".\" + 
fig_extension)\n", " print(\"Saving figure\", fig_id)\n", " if tight_layout:\n", " plt.tight_layout()\n", - " plt.savefig(path, format='png', dpi=300)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Note: there may be minor differences between the output of this notebook and the examples shown in the book. You can safely ignore these differences. They are mainly due to the fact that most of the environments provided by OpenAI gym have some randomness." + " plt.savefig(path, format=fig_extension, dpi=resolution)" ] }, { @@ -103,7 +108,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Next we will load the MsPacman environment, version 0." + "Let's list all the available environments:" ] }, { @@ -112,7 +117,23 @@ "metadata": {}, "outputs": [], "source": [ - "env = gym.make('MsPacman-v0')" + "gym.envs.registry.all()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The Cart-Pole is a very simple environment composed of a cart that can move left or right, and pole placed vertically on top of it. The agent must move the cart left or right to keep the pole upright." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "env = gym.make('CartPole-v1')" ] }, { @@ -124,10 +145,11 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ + "env.seed(42)\n", "obs = env.reset()" ] }, @@ -135,23 +157,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Observations vary depending on the environment. In this case it is an RGB image represented as a 3D NumPy array of shape [width, height, channels] (with 3 channels: Red, Green and Blue). In other environments it may return different objects, as we will see later." - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "obs.shape" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "An environment can be visualized by calling its `render()` method, and you can pick the rendering mode (the rendering options depend on the environment). In this example we will set `mode=\"rgb_array\"` to get an image of the environment as a NumPy array:" + "Observations vary depending on the environment. In this case it is a 1D NumPy array composed of 4 floats: they represent the cart's horizontal position, its velocity, the angle of the pole (0 = vertical), and the angular velocity." ] }, { @@ -160,43 +166,30 @@ "metadata": {}, "outputs": [], "source": [ - "img = env.render(mode=\"rgb_array\")" + "obs" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Let's plot this image:" + "An environment can be visualized by calling its `render()` method, and you can pick the rendering mode (the rendering options depend on the environment)." ] }, { "cell_type": "code", "execution_count": 7, - "metadata": { - "scrolled": true - }, + "metadata": {}, "outputs": [], "source": [ - "plt.figure(figsize=(5,4))\n", - "plt.imshow(img)\n", - "plt.axis(\"off\")\n", - "save_fig(\"MsPacman\")\n", - "plt.show()" + "env.render()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Welcome back to the 1980s! 
:)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In this environment, the rendered image is simply equal to the observation (but in many environments this is not the case):" + "In this example we will set `mode=\"rgb_array\"` to get an image of the environment as a NumPy array:" ] }, { @@ -205,14 +198,8 @@ "metadata": {}, "outputs": [], "source": [ - "(img == obs).all()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's create a little helper function to plot an environment:" + "img = env.render(mode=\"rgb_array\")\n", + "img.shape" ] }, { @@ -222,12 +209,60 @@ "outputs": [], "source": [ "def plot_environment(env, figsize=(5,4)):\n", - " plt.close() # or else nbagg sometimes plots in the previous cell\n", " plt.figure(figsize=figsize)\n", " img = env.render(mode=\"rgb_array\")\n", " plt.imshow(img)\n", " plt.axis(\"off\")\n", - " plt.show()" + " return img" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Warning**: some environments (including the Cart-Pole) require access to your display, which opens up a separate window, even if you specify `mode=\"rgb_array\"`. In general you can safely ignore that window. However, if Jupyter is running on a headless server (ie. without a screen) it will raise an exception. One way to avoid this is to install a fake X server like [Xvfb](http://en.wikipedia.org/wiki/Xvfb). On Debian or Ubuntu:\n", + "\n", + "```bash\n", + "$ apt update\n", + "$ apt install -y xvfb\n", + "```\n", + "\n", + "You can then start Jupyter using the `xvfb-run` command:\n", + "\n", + "```bash\n", + "$ xvfb-run -s \"-screen 0 1400x900x24\" jupyter notebook\n", + "```\n", + "\n", + "Alternatively, you can install the `pyvirtualdisplay` Python library which wraps Xvfb:\n", + "\n", + "```bash\n", + "python3 -m pip install -U pyvirtualdisplay\n", + "```\n", + "\n", + "And run the following code:" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "try:\n", + " import pyvirtualdisplay\n", + " display = pyvirtualdisplay.Display(visible=0, size=(1400, 900)).start()\n", + "except ImportError:\n", + " pass" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "plot_environment(env)\n", + "plt.show()" ] }, { @@ -239,7 +274,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ @@ -250,50 +285,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "`Discrete(9)` means that the possible actions are integers 0 through 8, which represents the 9 possible positions of the joystick (0=center, 1=up, 2=right, 3=left, 4=down, 5=upper-right, 6=upper-left, 7=lower-right, 8=lower-left)." + "Yep, just two possible actions: accelerate towards the left or towards the right." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Next we need to tell the environment which action to play, and it will compute the next step of the game. Let's go left for 110 steps, then lower left for 40 steps:" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "env.reset()\n", - "for step in range(110):\n", - " env.step(3) #left\n", - "for step in range(40):\n", - " env.step(8) #lower-left" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Where are we now?" 
- ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "plot_environment(env)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The `step()` function actually returns several important objects:" + "Since the pole is leaning toward the right (`obs[2] > 0`), let's accelerate the cart toward the right:" ] }, { @@ -302,14 +301,16 @@ "metadata": {}, "outputs": [], "source": [ - "obs, reward, done, info = env.step(0)" + "action = 1 # accelerate right\n", + "obs, reward, done, info = env.step(action)\n", + "obs" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "The observation tells the agent what the environment looks like, as discussed earlier. This is a 210x160 RGB image:" + "Notice that the cart is now moving toward the right (`obs[1] > 0`). The pole is still tilted toward the right (`obs[2] > 0`), but its angular velocity is now negative (`obs[3] < 0`), so it will likely be tilted toward the left after the next step." ] }, { @@ -318,7 +319,15 @@ "metadata": {}, "outputs": [], "source": [ - "obs.shape" + "plot_environment(env)\n", + "save_fig(\"cart_pole_plot\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Looks like it's doing what we're telling it to do!" ] }, { @@ -357,7 +366,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Finally, `info` is an environment-specific dictionary that can provide some extra information about the internal state of the environment. This is useful for debugging, but your agent should not use this information for learning (it would be cheating)." + "Finally, `info` is an environment-specific dictionary that can provide some extra information that you may find useful for debugging or for training. For example, in some games it may indicate how many lives the agent has." ] }, { @@ -373,7 +382,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Let's play one full game (with 3 lives), by moving in random directions for 10 steps at a time, recording each frame:" + "The sequence of steps between the moment the environment is reset until it is done is called an \"episode\". At the end of an episode (i.e., when `step()` returns `done=True`), you should reset the environment before you continue to use it." 
] }, { @@ -382,299 +391,15 @@ "metadata": {}, "outputs": [], "source": [ - "frames = []\n", - "\n", - "n_max_steps = 1000\n", - "n_change_steps = 10\n", - "\n", - "obs = env.reset()\n", - "for step in range(n_max_steps):\n", - " img = env.render(mode=\"rgb_array\")\n", - " frames.append(img)\n", - " if step % n_change_steps == 0:\n", - " action = env.action_space.sample() # play randomly\n", - " obs, reward, done, info = env.step(action)\n", - " if done:\n", - " break" + "if done:\n", + " obs = env.reset()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Now show the animation (it's a bit jittery within Jupyter):" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [], - "source": [ - "def update_scene(num, frames, patch):\n", - " patch.set_data(frames[num])\n", - " return patch,\n", - "\n", - "def plot_animation(frames, repeat=False, interval=40):\n", - " plt.close() # or else nbagg sometimes plots in the previous cell\n", - " fig = plt.figure()\n", - " patch = plt.imshow(frames[0])\n", - " plt.axis('off')\n", - " return animation.FuncAnimation(fig, update_scene, fargs=(frames, patch), frames=len(frames), repeat=repeat, interval=interval)" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [], - "source": [ - "video = plot_animation(frames)\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Once you have finished playing with an environment, you should close it to free up resources:" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [], - "source": [ - "env.close()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To code our first learning agent, we will be using a simpler environment: the Cart-Pole. " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# A simple environment: the Cart-Pole" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The Cart-Pole is a very simple environment composed of a cart that can move left or right, and pole placed vertically on top of it. The agent must move the cart left or right to keep the pole upright." - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [], - "source": [ - "env = gym.make(\"CartPole-v0\")" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [], - "source": [ - "obs = env.reset()" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [], - "source": [ - "obs" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The observation is a 1D NumPy array composed of 4 floats: they represent the cart's horizontal position, its velocity, the angle of the pole (0 = vertical), and the angular velocity. Let's render the environment... unfortunately we need to fix an annoying rendering issue first." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Fixing the rendering issue" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Some environments (including the Cart-Pole) require access to your display, which opens up a separate window, even if you specify the `rgb_array` mode. In general you can safely ignore that window. However, if Jupyter is running on a headless server (ie. without a screen) it will raise an exception. One way to avoid this is to install a fake X server like Xvfb. 
You can start Jupyter using the `xvfb-run` command:\n", - "\n", - " $ xvfb-run -s \"-screen 0 1400x900x24\" jupyter notebook\n", - "\n", - "If Jupyter is running on a headless server but you don't want to worry about Xvfb, then you can just use the following rendering function for the Cart-Pole:" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [], - "source": [ - "from PIL import Image, ImageDraw\n", - "\n", - "try:\n", - " from pyglet.gl import gl_info\n", - " openai_cart_pole_rendering = True # no problem, let's use OpenAI gym's rendering function\n", - "except Exception:\n", - " openai_cart_pole_rendering = False # probably no X server available, let's use our own rendering function\n", - "\n", - "def render_cart_pole(env, obs):\n", - " if openai_cart_pole_rendering:\n", - " # use OpenAI gym's rendering function\n", - " return env.render(mode=\"rgb_array\")\n", - " else:\n", - " # rendering for the cart pole environment (in case OpenAI gym can't do it)\n", - " img_w = 600\n", - " img_h = 400\n", - " cart_w = img_w // 12\n", - " cart_h = img_h // 15\n", - " pole_len = img_h // 3.5\n", - " pole_w = img_w // 80 + 1\n", - " x_width = 2\n", - " max_ang = 0.2\n", - " bg_col = (255, 255, 255)\n", - " cart_col = 0x000000 # Blue Green Red\n", - " pole_col = 0x669acc # Blue Green Red\n", - "\n", - " pos, vel, ang, ang_vel = obs\n", - " img = Image.new('RGB', (img_w, img_h), bg_col)\n", - " draw = ImageDraw.Draw(img)\n", - " cart_x = pos * img_w // x_width + img_w // x_width\n", - " cart_y = img_h * 95 // 100\n", - " top_pole_x = cart_x + pole_len * np.sin(ang)\n", - " top_pole_y = cart_y - cart_h // 2 - pole_len * np.cos(ang)\n", - " draw.line((0, cart_y, img_w, cart_y), fill=0)\n", - " draw.rectangle((cart_x - cart_w // 2, cart_y - cart_h // 2, cart_x + cart_w // 2, cart_y + cart_h // 2), fill=cart_col) # draw cart\n", - " draw.line((cart_x, cart_y - cart_h // 2, top_pole_x, top_pole_y), fill=pole_col, width=pole_w) # draw pole\n", - " return np.array(img)\n", - "\n", - "def plot_cart_pole(env, obs):\n", - " plt.close() # or else nbagg sometimes plots in the previous cell\n", - " img = render_cart_pole(env, obs)\n", - " plt.imshow(img)\n", - " plt.axis(\"off\")\n", - " plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [], - "source": [ - "plot_cart_pole(env, obs)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now let's look at the action space:" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [], - "source": [ - "env.action_space" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Yep, just two possible actions: accelerate towards the left or towards the right. 
Let's push the cart left until the pole falls:" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": {}, - "outputs": [], - "source": [ - "obs = env.reset()\n", - "while True:\n", - " obs, reward, done, info = env.step(0)\n", - " if done:\n", - " break" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": {}, - "outputs": [], - "source": [ - "plt.close() # or else nbagg sometimes plots in the previous cell\n", - "img = render_cart_pole(env, obs)\n", - "plt.imshow(img)\n", - "plt.axis(\"off\")\n", - "save_fig(\"cart_pole_plot\")" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [], - "source": [ - "img.shape" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Notice that the game is over when the pole tilts too much, not when it actually falls. Now let's reset the environment and push the cart to right instead:" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": {}, - "outputs": [], - "source": [ - "obs = env.reset()\n", - "while True:\n", - " obs, reward, done, info = env.step(1)\n", - " if done:\n", - " break" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": {}, - "outputs": [], - "source": [ - "plot_cart_pole(env, obs)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Looks like it's doing what we're telling it to do. Now how can we make the poll remain upright? We will need to define a _policy_ for that. This is the strategy that the agent will use to select an action at each step. It can use all the past actions and observations to decide what to do." + "Now how can we make the poll remain upright? We will need to define a _policy_ for that. This is the strategy that the agent will use to select an action at each step. It can use all the past actions and observations to decide what to do." ] }, { @@ -693,47 +418,115 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 19, "metadata": {}, "outputs": [], "source": [ + "env.seed(42)\n", + "\n", + "def basic_policy(obs):\n", + " angle = obs[2]\n", + " return 0 if angle < 0 else 1\n", + "\n", + "totals = []\n", + "for episode in range(500):\n", + " episode_rewards = 0\n", + " obs = env.reset()\n", + " for step in range(200):\n", + " action = basic_policy(obs)\n", + " obs, reward, done, info = env.step(action)\n", + " episode_rewards += reward\n", + " if done:\n", + " break\n", + " totals.append(episode_rewards)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "np.mean(totals), np.std(totals), np.min(totals), np.max(totals)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Well, as expected, this strategy is a bit too basic: the best it did was to keep the poll up for only 68 steps. This environment is considered solved when the agent keeps the poll up for 200 steps." 
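+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "As a quick sanity check (this cell is an addition, not in the book), we can plot the distribution of the episode rewards obtained with this basic policy: all episodes end well before the 200-step target."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "plt.figure(figsize=(5, 3))\n",
+    "plt.hist(totals, bins=20)\n",
+    "plt.xlabel(\"Total reward per episode\")\n",
+    "plt.ylabel(\"Episode count\")\n",
+    "plt.show()"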
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's visualize one episode:" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "env.seed(42)\n", + "\n", "frames = []\n", "\n", - "n_max_steps = 1000\n", - "n_change_steps = 10\n", - "\n", "obs = env.reset()\n", - "for step in range(n_max_steps):\n", - " img = render_cart_pole(env, obs)\n", + "for step in range(200):\n", + " img = env.render(mode=\"rgb_array\")\n", " frames.append(img)\n", - "\n", - " # hard-coded policy\n", - " position, velocity, angle, angular_velocity = obs\n", - " if angle < 0:\n", - " action = 0\n", - " else:\n", - " action = 1\n", + " action = basic_policy(obs)\n", "\n", " obs, reward, done, info = env.step(action)\n", " if done:\n", " break" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now show the animation:" + ] + }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 22, "metadata": {}, "outputs": [], "source": [ - "video = plot_animation(frames)\n", - "plt.show()" + "def update_scene(num, frames, patch):\n", + " patch.set_data(frames[num])\n", + " return patch,\n", + "\n", + "def plot_animation(frames, repeat=False, interval=40):\n", + " fig = plt.figure()\n", + " patch = plt.imshow(frames[0])\n", + " plt.axis('off')\n", + " anim = animation.FuncAnimation(\n", + " fig, update_scene, fargs=(frames, patch),\n", + " frames=len(frames), repeat=repeat, interval=interval)\n", + " plt.close()\n", + " return anim" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "plot_animation(frames)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Nope, the system is unstable and after just a few wobbles, the pole ends up too tilted: game over. We will need to be smarter than that!" + "Clearly the system is unstable and after just a few wobbles, the pole ends up too tilted: game over. We will need to be smarter than that!" ] }, { @@ -747,47 +540,24 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Let's create a neural network that will take observations as inputs, and output the action to take for each observation. To choose an action, the network will first estimate a probability for each action, then select an action randomly according to the estimated probabilities. In the case of the Cart-Pole environment, there are just two possible actions (left or right), so we only need one output neuron: it will output the probability `p` of the action 0 (left), and of course the probability of action 1 (right) will be `1 - p`." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Note: instead of using the `fully_connected()` function from the `tensorflow.contrib.layers` module (as in the book), we now use the `dense()` function from the `tf.layers` module, which did not exist when this chapter was written. This is preferable because anything in contrib may change or be deleted without notice, while `tf.layers` is part of the official API. 
As you will see, the code is mostly the same.\n", - "\n", - "The main differences relevant to this chapter are:\n", - "* the `_fn` suffix was removed in all the parameters that had it (for example the `activation_fn` parameter was renamed to `activation`).\n", - "* the `weights` parameter was renamed to `kernel`,\n", - "* the default activation is `None` instead of `tf.nn.relu`" + "Let's create a neural network that will take observations as inputs, and output the action to take for each observation. To choose an action, the network will estimate a probability for each action, then we will select an action randomly according to the estimated probabilities. In the case of the Cart-Pole environment, there are just two possible actions (left or right), so we only need one output neuron: it will output the probability `p` of the action 0 (left), and of course the probability of action 1 (right) will be `1 - p`." ] }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 24, "metadata": {}, "outputs": [], "source": [ - "import tensorflow as tf\n", + "tf.random.set_seed(42)\n", + "np.random.seed(42)\n", "\n", - "# 1. Specify the network architecture\n", - "n_inputs = 4 # == env.observation_space.shape[0]\n", - "n_hidden = 4 # it's a simple task, we don't need more than this\n", - "n_outputs = 1 # only outputs the probability of accelerating left\n", - "initializer = tf.variance_scaling_initializer()\n", + "n_inputs = 4 # == env.observation_space.shape[0]\n", "\n", - "# 2. Build the neural network\n", - "X = tf.placeholder(tf.float32, shape=[None, n_inputs])\n", - "hidden = tf.layers.dense(X, n_hidden, activation=tf.nn.elu,\n", - " kernel_initializer=initializer)\n", - "outputs = tf.layers.dense(hidden, n_outputs, activation=tf.nn.sigmoid,\n", - " kernel_initializer=initializer)\n", - "\n", - "# 3. Select a random action based on the estimated probabilities\n", - "p_left_and_right = tf.concat(axis=1, values=[outputs, 1 - outputs])\n", - "action = tf.multinomial(tf.log(p_left_and_right), num_samples=1)\n", - "\n", - "init = tf.global_variables_initializer()" + "model = keras.models.Sequential([\n", + " keras.layers.Dense(5, activation=\"elu\", input_shape=[n_inputs]),\n", + " keras.layers.Dense(1, activation=\"sigmoid\"),\n", + "])" ] }, { @@ -801,126 +571,101 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "You may wonder why we are picking a random action based on the probability given by the policy network, rather than just picking the action with the highest probability. This approach lets the agent find the right balance between _exploring_ new actions and _exploiting_ the actions that are known to work well. Here's an analogy: suppose you go to a restaurant for the first time, and all the dishes look equally appealing so you randomly pick one. If it turns out to be good, you can increase the probability to order it next time, but you shouldn't increase that probability to 100%, or else you will never try out the other dishes, some of which may be even better than the one you tried." + "You may wonder why we plan to pick a random action based on the probability given by the policy network, rather than just picking the action with the highest probability. This approach lets the agent find the right balance between _exploring_ new actions and _exploiting_ the actions that are known to work well. Here's an analogy: suppose you go to a restaurant for the first time, and all the dishes look equally appealing so you randomly pick one. 
If it turns out to be good, you can increase the probability to order it next time, but you shouldn't increase that probability to 100%, or else you will never try out the other dishes, some of which may be even better than the one you tried." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Let's randomly initialize this policy neural network and use it to play one game:" + "Let's write a small function that will run the model to play one episode, and return the frames so we can display an animation:" ] }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 25, "metadata": {}, "outputs": [], "source": [ - "n_max_steps = 1000\n", - "frames = []\n", - "\n", - "with tf.Session() as sess:\n", - " init.run()\n", + "def render_policy_net(model, n_max_steps=200, seed=42):\n", + " frames = []\n", + " env = gym.make(\"CartPole-v1\")\n", + " env.seed(seed)\n", + " np.random.seed(seed)\n", " obs = env.reset()\n", " for step in range(n_max_steps):\n", - " img = render_cart_pole(env, obs)\n", - " frames.append(img)\n", - " action_val = action.eval(feed_dict={X: obs.reshape(1, n_inputs)})\n", - " obs, reward, done, info = env.step(action_val[0][0])\n", + " frames.append(env.render(mode=\"rgb_array\"))\n", + " left_proba = model.predict(obs.reshape(1, -1))\n", + " action = int(np.random.rand() > left_proba)\n", + " obs, reward, done, info = env.step(action)\n", " if done:\n", " break\n", - "\n", - "env.close()" + " env.close()\n", + " return frames" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Now let's look at how well this randomly initialized policy network performed:" + "Now let's look at how well this randomly initialized policy network performs:" ] }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 26, "metadata": {}, "outputs": [], "source": [ - "video = plot_animation(frames)\n", - "plt.show()" + "frames = render_policy_net(model)\n", + "plot_animation(frames)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Yeah... pretty bad. The neural network will have to learn to do better. First let's see if it is capable of learning the basic policy we used earlier: go left if the pole is tilting left, and go right if it is tilting right. 
The following code defines the same neural network but we add the target probabilities `y`, and the training operations (`cross_entropy`, `optimizer` and `training_op`):" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "metadata": {}, - "outputs": [], - "source": [ - "import tensorflow as tf\n", - "\n", - "reset_graph()\n", - "\n", - "n_inputs = 4\n", - "n_hidden = 4\n", - "n_outputs = 1\n", - "\n", - "learning_rate = 0.01\n", - "\n", - "initializer = tf.variance_scaling_initializer()\n", - "\n", - "X = tf.placeholder(tf.float32, shape=[None, n_inputs])\n", - "y = tf.placeholder(tf.float32, shape=[None, n_outputs])\n", - "\n", - "hidden = tf.layers.dense(X, n_hidden, activation=tf.nn.elu, kernel_initializer=initializer)\n", - "logits = tf.layers.dense(hidden, n_outputs)\n", - "outputs = tf.nn.sigmoid(logits) # probability of action 0 (left)\n", - "p_left_and_right = tf.concat(axis=1, values=[outputs, 1 - outputs])\n", - "action = tf.multinomial(tf.log(p_left_and_right), num_samples=1)\n", - "\n", - "cross_entropy = tf.nn.sigmoid_cross_entropy_with_logits(labels=y, logits=logits)\n", - "optimizer = tf.train.AdamOptimizer(learning_rate)\n", - "training_op = optimizer.minimize(cross_entropy)\n", - "\n", - "init = tf.global_variables_initializer()\n", - "saver = tf.train.Saver()" + "Yeah... pretty bad. The neural network will have to learn to do better. First let's see if it is capable of learning the basic policy we used earlier: go left if the pole is tilting left, and go right if it is tilting right." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "We can make the same net play in 10 different environments in parallel, and train for 1000 iterations. We also reset environments when they are done." + "We can make the same net play in 50 different environments in parallel (this will give us a diverse training batch at each step), and train for 5000 iterations. We also reset environments when they are done. We train the model using a custom training loop so we can easily use the predictions at each training step to advance the environments." ] }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 27, "metadata": {}, "outputs": [], "source": [ - "n_environments = 10\n", - "n_iterations = 1000\n", + "n_environments = 50\n", + "n_iterations = 5000\n", "\n", - "envs = [gym.make(\"CartPole-v0\") for _ in range(n_environments)]\n", + "envs = [gym.make(\"CartPole-v1\") for _ in range(n_environments)]\n", + "for index, env in enumerate(envs):\n", + " env.seed(index)\n", + "np.random.seed(42)\n", "observations = [env.reset() for env in envs]\n", + "optimizer = keras.optimizers.RMSprop()\n", + "loss_fn = keras.losses.binary_crossentropy\n", "\n", - "with tf.Session() as sess:\n", - " init.run()\n", - " for iteration in range(n_iterations):\n", - " target_probas = np.array([([1.] if obs[2] < 0 else [0.]) for obs in observations]) # if angle<0 we want proba(left)=1., or else proba(left)=0.\n", - " action_val, _ = sess.run([action, training_op], feed_dict={X: np.array(observations), y: target_probas})\n", - " for env_index, env in enumerate(envs):\n", - " obs, reward, done, info = env.step(action_val[env_index][0])\n", - " observations[env_index] = obs if not done else env.reset()\n", - " saver.save(sess, \"./my_policy_net_basic.ckpt\")\n", + "for iteration in range(n_iterations):\n", + " # if angle < 0, we want proba(left) = 1., or else proba(left) = 0.\n", + " target_probas = np.array([([1.] 
if obs[2] < 0 else [0.])\n", + " for obs in observations])\n", + " with tf.GradientTape() as tape:\n", + " left_probas = model(np.array(observations))\n", + " loss = tf.reduce_mean(loss_fn(target_probas, left_probas))\n", + " print(\"\\rIteration: {}, Loss: {:.3f}\".format(iteration, loss.numpy()), end=\"\")\n", + " grads = tape.gradient(loss, model.trainable_variables)\n", + " optimizer.apply_gradients(zip(grads, model.trainable_variables))\n", + " actions = (np.random.rand(n_environments, 1) > left_probas.numpy()).astype(np.int32)\n", + " for env_index, env in enumerate(envs):\n", + " obs, reward, done, info = env.step(actions[env_index][0])\n", + " observations[env_index] = obs if not done else env.reset()\n", "\n", "for env in envs:\n", " env.close()" @@ -928,43 +673,19 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 28, "metadata": {}, "outputs": [], "source": [ - "def render_policy_net(model_path, action, X, n_max_steps = 1000):\n", - " frames = []\n", - " env = gym.make(\"CartPole-v0\")\n", - " obs = env.reset()\n", - " with tf.Session() as sess:\n", - " saver.restore(sess, model_path)\n", - " for step in range(n_max_steps):\n", - " img = render_cart_pole(env, obs)\n", - " frames.append(img)\n", - " action_val = action.eval(feed_dict={X: obs.reshape(1, n_inputs)})\n", - " obs, reward, done, info = env.step(action_val[0][0])\n", - " if done:\n", - " break\n", - " env.close()\n", - " return frames " - ] - }, - { - "cell_type": "code", - "execution_count": 41, - "metadata": {}, - "outputs": [], - "source": [ - "frames = render_policy_net(\"./my_policy_net_basic.ckpt\", action, X)\n", - "video = plot_animation(frames)\n", - "plt.show()" + "frames = render_policy_net(model)\n", + "plot_animation(frames)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Looks like it learned the policy correctly. Now let's see if it can learn a better policy on its own." + "Looks like it learned the policy correctly. Now let's see if it can learn a better policy on its own. One that does not wobble as much." ] }, { @@ -978,79 +699,112 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "To train this neural network we will need to define the target probabilities `y`. If an action is good we should increase its probability, and conversely if it is bad we should reduce it. But how do we know whether an action is good or bad? The problem is that most actions have delayed effects, so when you win or lose points in a game, it is not clear which actions contributed to this result: was it just the last action? Or the last 10? Or just one action 50 steps earlier? This is called the _credit assignment problem_.\n", + "To train this neural network we will need to define the target probabilities `y`. If an action is good we should increase its probability, and conversely if it is bad we should reduce it. But how do we know whether an action is good or bad? The problem is that most actions have delayed effects, so when you win or lose points in an episode, it is not clear which actions contributed to this result: was it just the last action? Or the last 10? Or just one action 50 steps earlier? This is called the _credit assignment problem_.\n", "\n", - "The _Policy Gradients_ algorithm tackles this problem by first playing multiple games, then making the actions in good games slightly more likely, while actions in bad games are made slightly less likely. First we play, then we go back and think about what we did." 
+ "The _Policy Gradients_ algorithm tackles this problem by first playing multiple episodes, then making the actions in good episodes slightly more likely, while actions in bad episodes are made slightly less likely. First we play, then we go back and think about what we did." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's start by creating a function to play a single step using the model. We will also pretend for now that whatever action it takes is the right one, so we can compute the loss and its gradients (we will just save these gradients for now, and modify them later depending on how good or bad the action turned out to be):" ] }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 29, "metadata": {}, "outputs": [], "source": [ - "import tensorflow as tf\n", - "\n", - "reset_graph()\n", - "\n", - "n_inputs = 4\n", - "n_hidden = 4\n", - "n_outputs = 1\n", - "\n", - "learning_rate = 0.01\n", - "\n", - "initializer = tf.variance_scaling_initializer()\n", - "\n", - "X = tf.placeholder(tf.float32, shape=[None, n_inputs])\n", - "\n", - "hidden = tf.layers.dense(X, n_hidden, activation=tf.nn.elu, kernel_initializer=initializer)\n", - "logits = tf.layers.dense(hidden, n_outputs)\n", - "outputs = tf.nn.sigmoid(logits) # probability of action 0 (left)\n", - "p_left_and_right = tf.concat(axis=1, values=[outputs, 1 - outputs])\n", - "action = tf.multinomial(tf.log(p_left_and_right), num_samples=1)\n", - "\n", - "y = 1. - tf.to_float(action)\n", - "cross_entropy = tf.nn.sigmoid_cross_entropy_with_logits(labels=y, logits=logits)\n", - "optimizer = tf.train.AdamOptimizer(learning_rate)\n", - "grads_and_vars = optimizer.compute_gradients(cross_entropy)\n", - "gradients = [grad for grad, variable in grads_and_vars]\n", - "gradient_placeholders = []\n", - "grads_and_vars_feed = []\n", - "for grad, variable in grads_and_vars:\n", - " gradient_placeholder = tf.placeholder(tf.float32, shape=grad.get_shape())\n", - " gradient_placeholders.append(gradient_placeholder)\n", - " grads_and_vars_feed.append((gradient_placeholder, variable))\n", - "training_op = optimizer.apply_gradients(grads_and_vars_feed)\n", - "\n", - "init = tf.global_variables_initializer()\n", - "saver = tf.train.Saver()" + "def play_one_step(env, obs, model, loss_fn):\n", + " with tf.GradientTape() as tape:\n", + " left_proba = model(obs[np.newaxis])\n", + " action = (tf.random.uniform([1, 1]) > left_proba)\n", + " y_target = tf.constant([[1.]]) - tf.cast(action, tf.float32)\n", + " loss = tf.reduce_mean(loss_fn(y_target, left_proba))\n", + " grads = tape.gradient(loss, model.trainable_variables)\n", + " obs, reward, done, info = env.step(int(action[0, 0].numpy()))\n", + " return obs, reward, done, grads" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If `left_proba` is high, then `action` will most likely be `False` (since a random number uniformally sampled between 0 and 1 will probably not be greater than `left_proba`). And `False` means 0 when you cast it to a number, so `y_target` would be equal to 1 - 0 = 1. In other words, we set the target to 1, meaning we pretend that the probability of going left should have been 100% (so we took the right action)." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's create another function that will rely on the `play_one_step()` function to play multiple episodes, returning all the rewards and gradients, for each episode and each step:" ] }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "def play_multiple_episodes(env, n_episodes, n_max_steps, model, loss_fn):\n", + " all_rewards = []\n", + " all_grads = []\n", + " for episode in range(n_episodes):\n", + " current_rewards = []\n", + " current_grads = []\n", + " obs = env.reset()\n", + " for step in range(n_max_steps):\n", + " obs, reward, done, grads = play_one_step(env, obs, model, loss_fn)\n", + " current_rewards.append(reward)\n", + " current_grads.append(grads)\n", + " if done:\n", + " break\n", + " all_rewards.append(current_rewards)\n", + " all_grads.append(current_grads)\n", + " return all_rewards, all_grads" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The Policy Gradients algorithm uses the model to play the episode several times (e.g., 10 times), then it goes back and looks at all the rewards, discounts them and normalizes them. So let's create couple functions for that: the first will compute discounted rewards; the second will normalize the discounted rewards across many episodes." + ] + }, + { + "cell_type": "code", + "execution_count": 31, "metadata": {}, "outputs": [], "source": [ "def discount_rewards(rewards, discount_rate):\n", - " discounted_rewards = np.zeros(len(rewards))\n", - " cumulative_rewards = 0\n", - " for step in reversed(range(len(rewards))):\n", - " cumulative_rewards = rewards[step] + cumulative_rewards * discount_rate\n", - " discounted_rewards[step] = cumulative_rewards\n", - " return discounted_rewards\n", + " discounted = np.array(rewards)\n", + " for step in range(len(rewards) - 2, -1, -1):\n", + " discounted[step] += discounted[step + 1] * discount_rate\n", + " return discounted\n", "\n", "def discount_and_normalize_rewards(all_rewards, discount_rate):\n", - " all_discounted_rewards = [discount_rewards(rewards, discount_rate) for rewards in all_rewards]\n", + " all_discounted_rewards = [discount_rewards(rewards, discount_rate)\n", + " for rewards in all_rewards]\n", " flat_rewards = np.concatenate(all_discounted_rewards)\n", " reward_mean = flat_rewards.mean()\n", " reward_std = flat_rewards.std()\n", - " return [(discounted_rewards - reward_mean)/reward_std for discounted_rewards in all_discounted_rewards]" + " return [(discounted_rewards - reward_mean) / reward_std\n", + " for discounted_rewards in all_discounted_rewards]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Say there were 3 actions, and after each action there was a reward: first 10, then 0, then -50. 
If we use a discount factor of 80%, then the 3rd action will get -50 (full credit for the last reward), but the 2nd action will only get -40 (80% credit for the last reward), and the 1st action will get 80% of -40 (-32) plus full credit for the first reward (+10), which leads to a discounted reward of -22:" ] }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 32, "metadata": {}, "outputs": [], "source": [ @@ -1058,9 +812,18 @@ ] }, { - "cell_type": "code", - "execution_count": 45, + "cell_type": "markdown", "metadata": {}, + "source": [ + "To normalize all discounted rewards across all episodes, we compute the mean and standard deviation of all the discounted rewards, and we subtract the mean from each discounted reward, and divide by the standard deviation:" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": { + "scrolled": true + }, "outputs": [], "source": [ "discount_and_normalize_rewards([[10, 0, -50], [10, 20]], discount_rate=0.8)" @@ -1068,68 +831,78 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 34, "metadata": {}, "outputs": [], "source": [ - "env = gym.make(\"CartPole-v0\")\n", - "\n", - "n_games_per_update = 10\n", - "n_max_steps = 1000\n", - "n_iterations = 250\n", - "save_iterations = 10\n", - "discount_rate = 0.95\n", - "\n", - "with tf.Session() as sess:\n", - " init.run()\n", - " for iteration in range(n_iterations):\n", - " print(\"\\rIteration: {}\".format(iteration), end=\"\")\n", - " all_rewards = []\n", - " all_gradients = []\n", - " for game in range(n_games_per_update):\n", - " current_rewards = []\n", - " current_gradients = []\n", - " obs = env.reset()\n", - " for step in range(n_max_steps):\n", - " action_val, gradients_val = sess.run([action, gradients], feed_dict={X: obs.reshape(1, n_inputs)})\n", - " obs, reward, done, info = env.step(action_val[0][0])\n", - " current_rewards.append(reward)\n", - " current_gradients.append(gradients_val)\n", - " if done:\n", - " break\n", - " all_rewards.append(current_rewards)\n", - " all_gradients.append(current_gradients)\n", - "\n", - " all_rewards = discount_and_normalize_rewards(all_rewards, discount_rate=discount_rate)\n", - " feed_dict = {}\n", - " for var_index, gradient_placeholder in enumerate(gradient_placeholders):\n", - " mean_gradients = np.mean([reward * all_gradients[game_index][step][var_index]\n", - " for game_index, rewards in enumerate(all_rewards)\n", - " for step, reward in enumerate(rewards)], axis=0)\n", - " feed_dict[gradient_placeholder] = mean_gradients\n", - " sess.run(training_op, feed_dict=feed_dict)\n", - " if iteration % save_iterations == 0:\n", - " saver.save(sess, \"./my_policy_net_pg.ckpt\")" + "n_iterations = 150\n", + "n_episodes_per_update = 10\n", + "n_max_steps = 200\n", + "discount_rate = 0.95" ] }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 35, "metadata": {}, "outputs": [], "source": [ + "optimizer = keras.optimizers.Adam(lr=0.01)\n", + "loss_fn = keras.losses.binary_crossentropy" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [], + "source": [ + "np.random.seed(42)\n", + "tf.random.set_seed(42)\n", + "\n", + "model = keras.models.Sequential([\n", + " keras.layers.Dense(5, activation=\"elu\", input_shape=[4]),\n", + " keras.layers.Dense(1, activation=\"sigmoid\"),\n", + "])" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [], + "source": [ + "env = gym.make(\"CartPole-v1\")\n", + "env.seed(42);\n", 
+ "\n", + "for iteration in range(n_iterations):\n", + " all_rewards, all_grads = play_multiple_episodes(\n", + " env, n_episodes_per_update, n_max_steps, model, loss_fn)\n", + " total_rewards = sum(map(sum, all_rewards)) # Not shown in the book\n", + " print(\"\\rIteration: {}, mean rewards: {:.1f}\".format( # Not shown\n", + " iteration, total_rewards / n_episodes_per_update), end=\"\") # Not shown\n", + " all_final_rewards = discount_and_normalize_rewards(all_rewards,\n", + " discount_rate)\n", + " all_mean_grads = []\n", + " for var_index in range(len(model.trainable_variables)):\n", + " mean_grads = tf.reduce_mean(\n", + " [final_reward * all_grads[episode_index][step][var_index]\n", + " for episode_index, final_rewards in enumerate(all_final_rewards)\n", + " for step, final_reward in enumerate(final_rewards)], axis=0)\n", + " all_mean_grads.append(mean_grads)\n", + " optimizer.apply_gradients(zip(all_mean_grads, model.trainable_variables))\n", + "\n", "env.close()" ] }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 38, "metadata": {}, "outputs": [], "source": [ - "frames = render_policy_net(\"./my_policy_net_pg.ckpt\", action, X, n_max_steps=1000)\n", - "video = plot_animation(frames)\n", - "plt.show()" + "frames = render_policy_net(model)\n", + "plot_animation(frames)" ] }, { @@ -1141,21 +914,22 @@ }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 39, "metadata": {}, "outputs": [], "source": [ - "transition_probabilities = [\n", + "np.random.seed(42)\n", + "\n", + "transition_probabilities = [ # shape=[s, s']\n", " [0.7, 0.2, 0.0, 0.1], # from s0 to s0, s1, s2, s3\n", " [0.0, 0.0, 0.9, 0.1], # from s1 to ...\n", " [0.0, 1.0, 0.0, 0.0], # from s2 to ...\n", - " [0.0, 0.0, 0.0, 1.0], # from s3 to ...\n", - " ]\n", + " [0.0, 0.0, 0.0, 1.0]] # from s3 to ...\n", "\n", "n_max_steps = 50\n", "\n", - "def print_sequence(start_state=0):\n", - " current_state = start_state\n", + "def print_sequence():\n", + " current_state = 0\n", " print(\"States:\", end=\" \")\n", " for step in range(n_max_steps):\n", " print(current_state, end=\" \")\n", @@ -1177,75 +951,154 @@ "# Markov Decision Process" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's define some transition probabilities, rewards and possible actions. 
For example, in state s0, if action a0 is chosen then with proba 0.7 we will go to state s0 with reward +10, with probability 0.3 we will go to state s1 with no reward, and with never go to state s2 (so the transition probabilities are `[0.7, 0.3, 0.0]`, and the rewards are `[+10, 0, 0]`):" + ] + }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 40, "metadata": {}, "outputs": [], "source": [ - "transition_probabilities = [\n", - " [[0.7, 0.3, 0.0], [1.0, 0.0, 0.0], [0.8, 0.2, 0.0]], # in s0, if action a0 then proba 0.7 to state s0 and 0.3 to state s1, etc.\n", + "transition_probabilities = [ # shape=[s, a, s']\n", + " [[0.7, 0.3, 0.0], [1.0, 0.0, 0.0], [0.8, 0.2, 0.0]],\n", " [[0.0, 1.0, 0.0], None, [0.0, 0.0, 1.0]],\n", - " [None, [0.8, 0.1, 0.1], None],\n", - " ]\n", - "\n", - "rewards = [\n", + " [None, [0.8, 0.1, 0.1], None]]\n", + "rewards = [ # shape=[s, a, s']\n", " [[+10, 0, 0], [0, 0, 0], [0, 0, 0]],\n", " [[0, 0, 0], [0, 0, 0], [0, 0, -50]],\n", - " [[0, 0, 0], [+40, 0, 0], [0, 0, 0]],\n", - " ]\n", + " [[0, 0, 0], [+40, 0, 0], [0, 0, 0]]]\n", + "possible_actions = [[0, 1, 2], [0, 2], [1]]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Q-Value Iteration" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [], + "source": [ + "Q_values = np.full((3, 3), -np.inf) # -np.inf for impossible actions\n", + "for state, actions in enumerate(possible_actions):\n", + " Q_values[state, actions] = 0.0 # for all possible actions" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [], + "source": [ + "gamma = 0.90 # the discount factor\n", "\n", - "possible_actions = [[0, 1, 2], [0, 2], [1]]\n", + "history1 = [] # Not shown in the book (for the figure below)\n", + "for iteration in range(50):\n", + " Q_prev = Q_values.copy()\n", + " history1.append(Q_prev) # Not shown\n", + " for s in range(3):\n", + " for a in possible_actions[s]:\n", + " Q_values[s, a] = np.sum([\n", + " transition_probabilities[s][a][sp]\n", + " * (rewards[s][a][sp] + gamma * np.max(Q_prev[sp]))\n", + " for sp in range(3)])\n", "\n", - "def policy_fire(state):\n", - " return [0, 2, 1][state]\n", + "history1 = np.array(history1) # Not shown" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [], + "source": [ + "Q_values" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [], + "source": [ + "np.argmax(Q_values, axis=1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The optimal policy for this MDP, when using a discount factor of 0.90, is to choose action a0 when in state s0, and choose action a0 when in state s1, and finally choose action a1 (the only possible action) when in state s2." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's try again with a discount factor of 0.95:" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [], + "source": [ + "Q_values = np.full((3, 3), -np.inf) # -np.inf for impossible actions\n", + "for state, actions in enumerate(possible_actions):\n", + " Q_values[state, actions] = 0.0 # for all possible actions" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [], + "source": [ + "gamma = 0.95 # the discount factor\n", "\n", - "def policy_random(state):\n", - " return np.random.choice(possible_actions[state])\n", - "\n", - "def policy_safe(state):\n", - " return [0, 0, 1][state]\n", - "\n", - "class MDPEnvironment(object):\n", - " def __init__(self, start_state=0):\n", - " self.start_state=start_state\n", - " self.reset()\n", - " def reset(self):\n", - " self.total_rewards = 0\n", - " self.state = self.start_state\n", - " def step(self, action):\n", - " next_state = np.random.choice(range(3), p=transition_probabilities[self.state][action])\n", - " reward = rewards[self.state][action][next_state]\n", - " self.state = next_state\n", - " self.total_rewards += reward\n", - " return self.state, reward\n", - "\n", - "def run_episode(policy, n_steps, start_state=0, display=True):\n", - " env = MDPEnvironment()\n", - " if display:\n", - " print(\"States (+rewards):\", end=\" \")\n", - " for step in range(n_steps):\n", - " if display:\n", - " if step == 10:\n", - " print(\"...\", end=\" \")\n", - " elif step < 10:\n", - " print(env.state, end=\" \")\n", - " action = policy(env.state)\n", - " state, reward = env.step(action)\n", - " if display and step < 10:\n", - " if reward:\n", - " print(\"({})\".format(reward), end=\" \")\n", - " if display:\n", - " print(\"Total rewards =\", env.total_rewards)\n", - " return env.total_rewards\n", - "\n", - "for policy in (policy_fire, policy_random, policy_safe):\n", - " all_totals = []\n", - " print(policy.__name__)\n", - " for episode in range(1000):\n", - " all_totals.append(run_episode(policy, n_steps=100, display=(episode<5)))\n", - " print(\"Summary: mean={:.1f}, std={:1f}, min={}, max={}\".format(np.mean(all_totals), np.std(all_totals), np.min(all_totals), np.max(all_totals)))\n", - " print()" + "for iteration in range(50):\n", + " Q_prev = Q_values.copy()\n", + " for s in range(3):\n", + " for a in possible_actions[s]:\n", + " Q_values[s, a] = np.sum([\n", + " transition_probabilities[s][a][sp]\n", + " * (rewards[s][a][sp] + gamma * np.max(Q_prev[sp]))\n", + " for sp in range(3)])" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [], + "source": [ + "Q_values" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + "outputs": [], + "source": [ + "np.argmax(Q_values, axis=1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now the policy has changed! In state s1, we now prefer to go through the fire (choose action a2). This is because the discount factor is larger so the agent values the future more, and it is therefore ready to pay an immediate penalty in order to get more future rewards." ] }, { @@ -1262,29 +1115,79 @@ "Q-Learning works by watching an agent play (e.g., randomly) and gradually improving its estimates of the Q-Values. Once it has accurate Q-Value estimates (or close enough), then the optimal policy consists in choosing the action that has the highest Q-Value (i.e., the greedy policy)." 
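+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "For reference (this cell is an addition), the update rule implemented in the Q-Learning code below is:\n",
+    "\n",
+    "$Q(s, a) \\leftarrow (1 - \\alpha) \\, Q(s, a) + \\alpha \\left( r + \\gamma \\cdot \\max_{a'} Q(s', a') \\right)$\n",
+    "\n",
+    "where $\\alpha$ is the learning rate (decayed over time in the code below) and $\\gamma$ is the discount factor."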
] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We will need to simulate an agent moving around in the environment, so let's define a function to perform some action and get the new state and a reward:" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [], + "source": [ + "def step(state, action):\n", + " probas = transition_probabilities[state][action]\n", + " next_state = np.random.choice([0, 1, 2], p=probas)\n", + " reward = rewards[state][action][next_state]\n", + " return next_state, reward" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We also need an exploration policy, which can be any policy, as long as it visits every possible state many times. We will just use a random policy, since the state space is very small:" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [], + "source": [ + "def exploration_policy(state):\n", + " return np.random.choice(possible_actions[state])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's initialize the Q-Values like earlier, and run the Q-Learning algorithm:" + ] + }, { "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [], "source": [ - "n_states = 3\n", - "n_actions = 3\n", - "n_steps = 20000\n", - "alpha = 0.01\n", - "gamma = 0.99\n", - "exploration_policy = policy_random\n", - "q_values = np.full((n_states, n_actions), -np.inf)\n", - "for state, actions in enumerate(possible_actions):\n", - " q_values[state][actions]=0\n", + "np.random.seed(42)\n", "\n", - "env = MDPEnvironment()\n", - "for step in range(n_steps):\n", - " action = exploration_policy(env.state)\n", - " state = env.state\n", - " next_state, reward = env.step(action)\n", - " next_value = np.max(q_values[next_state]) # greedy policy\n", - " q_values[state, action] = (1-alpha)*q_values[state, action] + alpha*(reward + gamma * next_value)" + "Q_values = np.full((3, 3), -np.inf)\n", + "for state, actions in enumerate(possible_actions):\n", + " Q_values[state][actions] = 0\n", + "\n", + "alpha0 = 0.05 # initial learning rate\n", + "decay = 0.005 # learning rate decay\n", + "gamma = 0.90 # discount factor\n", + "state = 0 # initial state\n", + "history2 = [] # Not shown in the book\n", + "\n", + "for iteration in range(10000):\n", + " history2.append(Q_values.copy()) # Not shown\n", + " action = exploration_policy(state)\n", + " next_state, reward = step(state, action)\n", + " next_value = np.max(Q_values[next_state]) # greedy policy at the next step\n", + " alpha = alpha0 / (1 + iteration * decay)\n", + " Q_values[state, action] *= 1 - alpha\n", + " Q_values[state, action] += alpha * (reward + gamma * next_value)\n", + " state = next_state\n", + "\n", + "history2 = np.array(history2) # Not shown" ] }, { @@ -1293,8 +1196,7 @@ "metadata": {}, "outputs": [], "source": [ - "def optimal_policy(state):\n", - " return np.argmax(q_values[state])" + "Q_values" ] }, { @@ -1303,7 +1205,7 @@ "metadata": {}, "outputs": [], "source": [ - "q_values" + "np.argmax(Q_values, axis=1) # optimal action for each state" ] }, { @@ -1312,46 +1214,33 @@ "metadata": {}, "outputs": [], "source": [ - "all_totals = []\n", - "for episode in range(1000):\n", - " all_totals.append(run_episode(optimal_policy, n_steps=100, display=(episode<5)))\n", - "print(\"Summary: mean={:.1f}, std={:1f}, min={}, max={}\".format(np.mean(all_totals), np.std(all_totals), np.min(all_totals), np.max(all_totals)))\n", - "print()" + "true_Q_value = 
history1[-1, 0, 0]\n", + "\n", + "fig, axes = plt.subplots(1, 2, figsize=(10, 4), sharey=True)\n", + "axes[0].set_ylabel(\"Q-Value$(s_0, a_0)$\", fontsize=14)\n", + "axes[0].set_title(\"Q-Value Iteration\", fontsize=14)\n", + "axes[1].set_title(\"Q-Learning\", fontsize=14)\n", + "for ax, width, history in zip(axes, (50, 10000), (history1, history2)):\n", + " ax.plot([0, width], [true_Q_value, true_Q_value], \"k--\")\n", + " ax.plot(np.arange(width), history[:, 0, 0], \"b-\", linewidth=2)\n", + " ax.set_xlabel(\"Iterations\", fontsize=14)\n", + " ax.axis([0, width, 0, 24])\n", + "\n", + "save_fig(\"q_value_plot\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "# Learning to Play MsPacman Using the DQN Algorithm" + "# Deep Q-Network" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "**Warning**: Unfortunately, the first version of the book contained two important errors in this section.\n", - "\n", - "1. The actor DQN and critic DQN should have been named _online DQN_ and _target DQN_ respectively. Actor-critic algorithms are a distinct class of algorithms.\n", - "2. The online DQN is the one that learns and is copied to the target DQN at regular intervals. The target DQN's only role is to estimate the next state's Q-Values for each possible action. This is needed to compute the target Q-Values for training the online DQN, as shown in this equation:\n", - "\n", - "$y(s,a) = \\text{r} + \\gamma . \\underset{a'}{\\max} \\, Q_\\text{target}(s', a')$\n", - "\n", - "* $y(s,a)$ is the target Q-Value to train the online DQN for the state-action pair $(s, a)$.\n", - "* $r$ is the reward actually collected after playing action $a$ in state $s$.\n", - "* $\\gamma$ is the discount rate.\n", - "* $s'$ is the state actually reached after played action $a$ in state $s$.\n", - "* $a'$ is one of the possible actions in state $s'$.\n", - "* $Q_\\text{target}(s', a')$ is the target DQN's estimate of the Q-Value of playing action $a'$ while in state $s'$.\n", - "\n", - "I hope these errors did not affect you, and if they did, I sincerely apologize." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Creating the MsPacman environment" + "Let's build the DQN. Given a state, it will estimate, for each possible action, the sum of discounted future rewards it can expect after it plays that action (but before it sees its outcome):" ] }, { @@ -1360,9 +1249,25 @@ "metadata": {}, "outputs": [], "source": [ - "env = gym.make(\"MsPacman-v0\")\n", - "obs = env.reset()\n", - "obs.shape" + "tf.random.set_seed(42)\n", + "np.random.seed(42)\n", + "\n", + "env = gym.make(\"CartPole-v1\")\n", + "input_shape = [4] # == env.observation_space.shape\n", + "n_outputs = 2 # == env.action_space.n\n", + "\n", + "model = keras.models.Sequential([\n", + " keras.layers.Dense(32, activation=\"elu\", input_shape=input_shape),\n", + " keras.layers.Dense(32, activation=\"elu\"),\n", + " keras.layers.Dense(n_outputs)\n", + "])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To select an action using this DQN, we just pick the action with the largest predicted Q-value. However, to ensure that the agent explores the environment, we choose a random action with probability `epsilon`." 
] }, { @@ -1371,21 +1276,19 @@ "metadata": {}, "outputs": [], "source": [ - "env.action_space" + "def epsilon_greedy_policy(state, epsilon=0):\n", + " if np.random.rand() < epsilon:\n", + " return np.random.randint(2)\n", + " else:\n", + " Q_values = model.predict(state[np.newaxis])\n", + " return np.argmax(Q_values[0])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Preprocessing" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Preprocessing the images is optional but greatly speeds up training." + "We will also need a replay memory. It will contain the agent's experiences, in the form of tuples: `(obs, action, reward, next_obs, done)`. We can use the `deque` class for that:" ] }, { @@ -1394,23 +1297,16 @@ "metadata": {}, "outputs": [], "source": [ - "mspacman_color = 210 + 164 + 74\n", + "from collections import deque\n", "\n", - "def preprocess_observation(obs):\n", - " img = obs[1:176:2, ::2] # crop and downsize\n", - " img = img.sum(axis=2) # to greyscale\n", - " img[img==mspacman_color] = 0 # Improve contrast\n", - " img = (img // 3 - 128).astype(np.int8) # normalize from -128 to 127\n", - " return img.reshape(88, 80, 1)\n", - "\n", - "img = preprocess_observation(obs)" + "replay_memory = deque(maxlen=2000)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Note: the `preprocess_observation()` function is slightly different from the one in the book: instead of representing pixels as 64-bit floats from -1.0 to 1.0, it represents them as signed bytes (from -128 to 127). The benefit is that the replay memory will take up roughly 8 times less RAM (about 6.5 GB instead of 52 GB). The reduced precision has no visible impact on training." + "And let's create a function to sample experiences from the replay memory. It will return 5 NumPy arrays: `[obs, actions, rewards, next_obs, dones]`." ] }, { @@ -1419,37 +1315,20 @@ "metadata": {}, "outputs": [], "source": [ - "plt.figure(figsize=(11, 7))\n", - "plt.subplot(121)\n", - "plt.title(\"Original observation (160×210 RGB)\")\n", - "plt.imshow(obs)\n", - "plt.axis(\"off\")\n", - "plt.subplot(122)\n", - "plt.title(\"Preprocessed observation (88×80 greyscale)\")\n", - "plt.imshow(img.reshape(88, 80), interpolation=\"nearest\", cmap=\"gray\")\n", - "plt.axis(\"off\")\n", - "save_fig(\"preprocessing_plot\")\n", - "plt.show()" + "def sample_experiences(batch_size):\n", + " indices = np.random.randint(len(replay_memory), size=batch_size)\n", + " batch = [replay_memory[index] for index in indices]\n", + " states, actions, rewards, next_states, dones = [\n", + " np.array([experience[index] for experience in batch])\n", + " for index in range(5)]\n", + " return states, actions, rewards, next_states, dones" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Build DQN" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Note: instead of using `tf.contrib.layers.convolution2d()` or `tf.contrib.layers.conv2d()` (as in the first version of the book), we now use the `tf.layers.conv2d()`, which did not exist when this chapter was written. This is preferable because anything in contrib may change or be deleted without notice, while `tf.layers` is part of the official API. 
As you will see, the code is mostly the same, except that the parameter names have changed slightly:\n", - "* the `num_outputs` parameter was renamed to `filters`,\n", - "* the `stride` parameter was renamed to `strides`,\n", - "* the `_fn` suffix was removed from parameter names that had it (e.g., `activation_fn` was renamed to `activation`),\n", - "* the `weights_initializer` parameter was renamed to `kernel_initializer`,\n", - "* the weights variable was renamed to `\"kernel\"` (instead of `\"weights\"`), and the biases variable was renamed from `\"biases\"` to `\"bias\"`,\n", - "* and the default `activation` is now `None` instead of `tf.nn.relu`." + "Now we can create a function that will use the DQN to play one step, and record its experience in the replay memory:" ] }, { @@ -1458,43 +1337,18 @@ "metadata": {}, "outputs": [], "source": [ - "reset_graph()\n", - "\n", - "input_height = 88\n", - "input_width = 80\n", - "input_channels = 1\n", - "conv_n_maps = [32, 64, 64]\n", - "conv_kernel_sizes = [(8,8), (4,4), (3,3)]\n", - "conv_strides = [4, 2, 1]\n", - "conv_paddings = [\"SAME\"] * 3 \n", - "conv_activation = [tf.nn.relu] * 3\n", - "n_hidden_in = 64 * 11 * 10 # conv3 has 64 maps of 11x10 each\n", - "n_hidden = 512\n", - "hidden_activation = tf.nn.relu\n", - "n_outputs = env.action_space.n # 9 discrete actions are available\n", - "initializer = tf.variance_scaling_initializer()\n", - "\n", - "def q_network(X_state, name):\n", - " prev_layer = X_state / 128.0 # scale pixel intensities to the [-1.0, 1.0] range.\n", - " with tf.variable_scope(name) as scope:\n", - " for n_maps, kernel_size, strides, padding, activation in zip(\n", - " conv_n_maps, conv_kernel_sizes, conv_strides,\n", - " conv_paddings, conv_activation):\n", - " prev_layer = tf.layers.conv2d(\n", - " prev_layer, filters=n_maps, kernel_size=kernel_size,\n", - " strides=strides, padding=padding, activation=activation,\n", - " kernel_initializer=initializer)\n", - " last_conv_layer_flat = tf.reshape(prev_layer, shape=[-1, n_hidden_in])\n", - " hidden = tf.layers.dense(last_conv_layer_flat, n_hidden,\n", - " activation=hidden_activation,\n", - " kernel_initializer=initializer)\n", - " outputs = tf.layers.dense(hidden, n_outputs,\n", - " kernel_initializer=initializer)\n", - " trainable_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,\n", - " scope=scope.name)\n", - " trainable_vars_by_name = {var.name[len(scope.name):]: var\n", - " for var in trainable_vars}\n", - " return outputs, trainable_vars_by_name" + "def play_one_step(env, state, epsilon):\n", + " action = epsilon_greedy_policy(state, epsilon)\n", + " next_state, reward, done, info = env.step(action)\n", + " replay_memory.append((state, action, reward, next_state, done))\n", + " return next_state, reward, done, info" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Lastly, let's create a function that will sample some experiences from the replay memory and perform a training step:" ] }, { @@ -1503,14 +1357,31 @@ "metadata": {}, "outputs": [], "source": [ - "X_state = tf.placeholder(tf.float32, shape=[None, input_height, input_width,\n", - " input_channels])\n", - "online_q_values, online_vars = q_network(X_state, name=\"q_networks/online\")\n", - "target_q_values, target_vars = q_network(X_state, name=\"q_networks/target\")\n", + "batch_size = 32\n", + "discount_rate = 0.95\n", + "optimizer = keras.optimizers.Adam(lr=1e-3)\n", + "loss_fn = keras.losses.mean_squared_error\n", "\n", - "copy_ops = 
[target_var.assign(online_vars[var_name])\n", - " for var_name, target_var in target_vars.items()]\n", - "copy_online_to_target = tf.group(*copy_ops)" + "def training_step(batch_size):\n", + " experiences = sample_experiences(batch_size)\n", + " states, actions, rewards, next_states, dones = experiences\n", + " next_Q_values = model.predict(next_states)\n", + " max_next_Q_values = np.max(next_Q_values, axis=1)\n", + " target_Q_values = rewards + (1 - dones) * discount_rate * max_next_Q_values\n", + " mask = tf.one_hot(actions, n_outputs)\n", + " with tf.GradientTape() as tape:\n", + " all_Q_values = model(states)\n", + " Q_values = tf.reduce_sum(all_Q_values * mask, axis=1, keepdims=True)\n", + " loss = tf.reduce_mean(loss_fn(target_Q_values, Q_values))\n", + " grads = tape.gradient(loss, model.trainable_variables)\n", + " optimizer.apply_gradients(zip(grads, model.trainable_variables))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And now, let's train the model!" ] }, { @@ -1519,48 +1390,38 @@ "metadata": {}, "outputs": [], "source": [ - "online_vars" + "env.seed(42)\n", + "np.random.seed(42)\n", + "tf.random.set_seed(42)\n", + "\n", + "rewards = [] \n", + "best_score = 0" ] }, { "cell_type": "code", "execution_count": 62, - "metadata": {}, + "metadata": { + "scrolled": true + }, "outputs": [], "source": [ - "learning_rate = 0.001\n", - "momentum = 0.95\n", + "for episode in range(600):\n", + " obs = env.reset() \n", + " for step in range(200):\n", + " epsilon = max(1 - episode / 500, 0.01)\n", + " obs, reward, done, info = play_one_step(env, obs, epsilon)\n", + " if done:\n", + " break\n", + " rewards.append(step) # Not shown in the book\n", + " if step > best_score: # Not shown\n", + " best_weights = model.get_weights() # Not shown\n", + " best_score = step # Not shown\n", + " print(\"\\rEpisode: {}, Steps: {}, eps: {:.3f}\".format(episode, step + 1, epsilon), end=\"\") # Not shown\n", + " if episode > 50:\n", + " training_step(batch_size)\n", "\n", - "with tf.variable_scope(\"train\"):\n", - " X_action = tf.placeholder(tf.int32, shape=[None])\n", - " y = tf.placeholder(tf.float32, shape=[None, 1])\n", - " q_value = tf.reduce_sum(online_q_values * tf.one_hot(X_action, n_outputs),\n", - " axis=1, keepdims=True)\n", - " error = tf.abs(y - q_value)\n", - " clipped_error = tf.clip_by_value(error, 0.0, 1.0)\n", - " linear_error = 2 * (error - clipped_error)\n", - " loss = tf.reduce_mean(tf.square(clipped_error) + linear_error)\n", - "\n", - " global_step = tf.Variable(0, trainable=False, name='global_step')\n", - " optimizer = tf.train.MomentumOptimizer(learning_rate, momentum, use_nesterov=True)\n", - " training_op = optimizer.minimize(loss, global_step=global_step)\n", - "\n", - "init = tf.global_variables_initializer()\n", - "saver = tf.train.Saver()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Note: in the first version of the book, the loss function was simply the squared error between the target Q-Values (`y`) and the estimated Q-Values (`q_value`). However, because the experiences are very noisy, it is better to use a quadratic loss only for small errors (below 1.0) and a linear loss (twice the absolute error) for larger errors, which is what the code above computes. This way large errors don't push the model parameters around as much. 
Note that we also tweaked some hyperparameters (using a smaller learning rate, and using Nesterov Accelerated Gradients rather than Adam optimization, since adaptive gradient algorithms may sometimes be bad, according to this [paper](https://arxiv.org/abs/1705.08292)). We also tweaked a few other hyperparameters below (a larger replay memory, longer decay for the $\\epsilon$-greedy policy, larger discount rate, less frequent copies of the online DQN to the target DQN, etc.)." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We use this `ReplayMemory` class instead of a `deque` because it is much faster for random access (thanks to @NileshPS who contributed it). Moreover, we default to sampling with replacement, which is much faster than sampling without replacement for large replay memories." + "model.set_weights(best_weights)" ] }, { @@ -1569,24 +1430,12 @@ "metadata": {}, "outputs": [], "source": [ - "class ReplayMemory:\n", - " def __init__(self, maxlen):\n", - " self.maxlen = maxlen\n", - " self.buf = np.empty(shape=maxlen, dtype=np.object)\n", - " self.index = 0\n", - " self.length = 0\n", - " \n", - " def append(self, data):\n", - " self.buf[self.index] = data\n", - " self.length = min(self.length + 1, self.maxlen)\n", - " self.index = (self.index + 1) % self.maxlen\n", - " \n", - " def sample(self, batch_size, with_replacement=True):\n", - " if with_replacement:\n", - " indices = np.random.randint(self.length, size=batch_size) # faster\n", - " else:\n", - " indices = np.random.permutation(self.length)[:batch_size]\n", - " return self.buf[indices]" + "plt.figure(figsize=(8, 4))\n", + "plt.plot(rewards)\n", + "plt.xlabel(\"Episode\", fontsize=14)\n", + "plt.ylabel(\"Sum of rewards\", fontsize=14)\n", + "save_fig(\"dqn_rewards_plot\")\n", + "plt.show()" ] }, { @@ -1595,8 +1444,34 @@ "metadata": {}, "outputs": [], "source": [ - "replay_memory_size = 500000\n", - "replay_memory = ReplayMemory(replay_memory_size)" + "env.seed(42)\n", + "state = env.reset()\n", + "\n", + "frames = []\n", + "\n", + "for step in range(200):\n", + " action = epsilon_greedy_policy(state)\n", + " state, reward, done, info = env.step(action)\n", + " if done:\n", + " break\n", + " img = env.render(mode=\"rgb_array\")\n", + " frames.append(img)\n", + " \n", + "plot_animation(frames)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Not bad at all!" 
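+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "To put a rough number on it (this quick check is not in the book), we can run the greedy policy for a few episodes and look at the mean episode length, capping each episode at 200 steps as during training:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "env.seed(43)  # a different seed than the one used for training\n",
+    "lengths = []\n",
+    "for episode in range(10):\n",
+    "    state = env.reset()\n",
+    "    for step in range(200):\n",
+    "        action = epsilon_greedy_policy(state)  # epsilon defaults to 0, i.e. greedy\n",
+    "        state, reward, done, info = env.step(action)\n",
+    "        if done:\n",
+    "            break\n",
+    "    lengths.append(step + 1)\n",
+    "np.mean(lengths)"
+   ]
+  },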
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Double DQN" ] }, { @@ -1605,13 +1480,17 @@ "metadata": {}, "outputs": [], "source": [ - "def sample_memories(batch_size):\n", - " cols = [[], [], [], [], []] # state, action, reward, next_state, continue\n", - " for memory in replay_memory.sample(batch_size):\n", - " for col, value in zip(cols, memory):\n", - " col.append(value)\n", - " cols = [np.array(col) for col in cols]\n", - " return cols[0], cols[1], cols[2].reshape(-1, 1), cols[3], cols[4].reshape(-1, 1)" + "tf.random.set_seed(42)\n", + "np.random.seed(42)\n", + "\n", + "model = keras.models.Sequential([\n", + " keras.layers.Dense(32, activation=\"elu\", input_shape=[4]),\n", + " keras.layers.Dense(32, activation=\"elu\"),\n", + " keras.layers.Dense(n_outputs)\n", + "])\n", + "\n", + "target = keras.models.clone_model(model)\n", + "target.set_weights(model.get_weights())" ] }, { @@ -1620,16 +1499,26 @@ "metadata": {}, "outputs": [], "source": [ - "eps_min = 0.1\n", - "eps_max = 1.0\n", - "eps_decay_steps = 2000000\n", + "batch_size = 32\n", + "discount_rate = 0.95\n", + "optimizer = keras.optimizers.Adam(lr=1e-3)\n", + "loss_fn = keras.losses.Huber()\n", "\n", - "def epsilon_greedy(q_values, step):\n", - " epsilon = max(eps_min, eps_max - (eps_max-eps_min) * step/eps_decay_steps)\n", - " if np.random.rand() < epsilon:\n", - " return np.random.randint(n_outputs) # random action\n", - " else:\n", - " return np.argmax(q_values) # optimal action" + "def training_step(batch_size):\n", + " experiences = sample_experiences(batch_size)\n", + " states, actions, rewards, next_states, dones = experiences\n", + " next_Q_values = model.predict(next_states)\n", + " best_next_actions = np.argmax(next_Q_values, axis=1)\n", + " next_mask = tf.one_hot(best_next_actions, n_outputs).numpy()\n", + " next_best_Q_values = (target.predict(next_states) * next_mask).sum(axis=1)\n", + " target_Q_values = rewards + (1 - dones) * discount_rate * next_best_Q_values\n", + " mask = tf.one_hot(actions, n_outputs)\n", + " with tf.GradientTape() as tape:\n", + " all_Q_values = model(states)\n", + " Q_values = tf.reduce_sum(all_Q_values * mask, axis=1, keepdims=True)\n", + " loss = tf.reduce_mean(loss_fn(target_Q_values, Q_values))\n", + " grads = tape.gradient(loss, model.trainable_variables)\n", + " optimizer.apply_gradients(zip(grads, model.trainable_variables))" ] }, { @@ -1638,24 +1527,7 @@ "metadata": {}, "outputs": [], "source": [ - "n_steps = 4000000 # total number of training steps\n", - "training_start = 10000 # start training after 10,000 game iterations\n", - "training_interval = 4 # run a training step every 4 game iterations\n", - "save_steps = 1000 # save the model every 1,000 training steps\n", - "copy_steps = 10000 # copy online DQN to target DQN every 10,000 training steps\n", - "discount_rate = 0.99\n", - "skip_start = 90 # Skip the start of every game (it's just waiting time).\n", - "batch_size = 50\n", - "iteration = 0 # game iterations\n", - "checkpoint_path = \"./my_dqn.ckpt\"\n", - "done = True # env needs to be reset" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "A few variables for tracking progress:" + "replay_memory = deque(maxlen=2000)" ] }, { @@ -1664,17 +1536,38 @@ "metadata": {}, "outputs": [], "source": [ - "loss_val = np.infty\n", - "game_length = 0\n", - "total_max_q = 0\n", - "mean_max_q = 0.0" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "And now the main training loop!" 
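+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The `training_step()` function above differs from the vanilla DQN in one key way: the online model _selects_ the best next action, while the target model _evaluates_ it. Here is a tiny illustration of that decoupling on made-up Q-value estimates (not in the book, the numbers are arbitrary):\n",
+    "\n",
+    "(One caveat about the training loop below: in the commented-out soft-update variant, the blend should mix in the _online_ weights, i.e. `0.99 * target_weights[index] + 0.01 * online_weights[index]`, otherwise the target network never changes.)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "next_Q_online = np.array([[1.0, 3.0], [2.0, 0.5]])  # hypothetical online estimates\n",
+    "next_Q_target = np.array([[0.8, 2.5], [1.9, 0.7]])  # hypothetical target estimates\n",
+    "\n",
+    "best_next_actions = np.argmax(next_Q_online, axis=1)          # selection: online model\n",
+    "next_mask = tf.one_hot(best_next_actions, n_outputs).numpy()\n",
+    "(next_Q_target * next_mask).sum(axis=1)                       # evaluation: target model"
+   ]
+  },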
+ "env.seed(42)\n", + "np.random.seed(42)\n", + "tf.random.set_seed(42)\n", + "\n", + "rewards = []\n", + "best_score = 0\n", + "\n", + "for episode in range(600):\n", + " obs = env.reset() \n", + " for step in range(200):\n", + " epsilon = max(1 - episode / 500, 0.01)\n", + " obs, reward, done, info = play_one_step(env, obs, epsilon)\n", + " if done:\n", + " break\n", + " rewards.append(step)\n", + " if step > best_score:\n", + " best_weights = model.get_weights()\n", + " best_score = step\n", + " print(\"\\rEpisode: {}, Steps: {}, eps: {:.3f}\".format(episode, step + 1, epsilon), end=\"\")\n", + " if episode > 50:\n", + " training_step(batch_size)\n", + " if episode % 50 == 0:\n", + " target.set_weights(model.get_weights())\n", + " # Alternatively, you can do soft updates at each step:\n", + " #if episode > 50:\n", + " #target_weights = target.get_weights()\n", + " #online_weights = model.get_weights()\n", + " #for index in range(len(target_weights)):\n", + " # target_weights[index] = 0.99 * target_weights[index] + 0.01 * target_weights[index]\n", + " #target.set_weights(target_weights)\n", + "\n", + "model.set_weights(best_weights)" ] }, { @@ -1683,117 +1576,941 @@ "metadata": {}, "outputs": [], "source": [ - "with tf.Session() as sess:\n", - " if os.path.isfile(checkpoint_path + \".index\"):\n", - " saver.restore(sess, checkpoint_path)\n", - " else:\n", - " init.run()\n", - " copy_online_to_target.run()\n", - " while True:\n", - " step = global_step.eval()\n", - " if step >= n_steps:\n", - " break\n", - " iteration += 1\n", - " print(\"\\rIteration {}\\tTraining step {}/{} ({:.1f})%\\tLoss {:5f}\\tMean Max-Q {:5f} \".format(\n", - " iteration, step, n_steps, step * 100 / n_steps, loss_val, mean_max_q), end=\"\")\n", - " if done: # game over, start again\n", - " obs = env.reset()\n", - " for skip in range(skip_start): # skip the start of each game\n", - " obs, reward, done, info = env.step(0)\n", - " state = preprocess_observation(obs)\n", + "plt.figure(figsize=(8, 4))\n", + "plt.plot(rewards)\n", + "plt.xlabel(\"Episode\", fontsize=14)\n", + "plt.ylabel(\"Sum of rewards\", fontsize=14)\n", + "save_fig(\"dqn_rewards_plot\")\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "env.seed(42)\n", + "state = env.reset()\n", "\n", - " # Online DQN evaluates what to do\n", - " q_values = online_q_values.eval(feed_dict={X_state: [state]})\n", - " action = epsilon_greedy(q_values, step)\n", + "frames = []\n", "\n", - " # Online DQN plays\n", - " obs, reward, done, info = env.step(action)\n", - " next_state = preprocess_observation(obs)\n", - "\n", - " # Let's memorize what happened\n", - " replay_memory.append((state, action, reward, next_state, 1.0 - done))\n", - " state = next_state\n", - "\n", - " # Compute statistics for tracking progress (not shown in the book)\n", - " total_max_q += q_values.max()\n", - " game_length += 1\n", - " if done:\n", - " mean_max_q = total_max_q / game_length\n", - " total_max_q = 0.0\n", - " game_length = 0\n", - "\n", - " if iteration < training_start or iteration % training_interval != 0:\n", - " continue # only train after warmup period and at regular intervals\n", - " \n", - " # Sample memories and use the target DQN to produce the target Q-Value\n", - " X_state_val, X_action_val, rewards, X_next_state_val, continues = (\n", - " sample_memories(batch_size))\n", - " next_q_values = target_q_values.eval(\n", - " feed_dict={X_state: X_next_state_val})\n", - " 
max_next_q_values = np.max(next_q_values, axis=1, keepdims=True)\n", - " y_val = rewards + continues * discount_rate * max_next_q_values\n", - "\n", - " # Train the online DQN\n", - " _, loss_val = sess.run([training_op, loss], feed_dict={\n", - " X_state: X_state_val, X_action: X_action_val, y: y_val})\n", - "\n", - " # Regularly copy the online DQN to the target DQN\n", - " if step % copy_steps == 0:\n", - " copy_online_to_target.run()\n", - "\n", - " # And save regularly\n", - " if step % save_steps == 0:\n", - " saver.save(sess, checkpoint_path)" + "for step in range(200):\n", + " action = epsilon_greedy_policy(state)\n", + " state, reward, done, info = env.step(action)\n", + " if done:\n", + " break\n", + " img = env.render(mode=\"rgb_array\")\n", + " frames.append(img)\n", + " \n", + "plot_animation(frames)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "You can interrupt the cell above at any time to test your agent using the cell below. You can then run the cell above once again, it will load the last parameters saved and resume training." - ] - }, - { - "cell_type": "code", - "execution_count": 70, - "metadata": {}, - "outputs": [], - "source": [ - "frames = []\n", - "n_max_steps = 10000\n", - "\n", - "with tf.Session() as sess:\n", - " saver.restore(sess, checkpoint_path)\n", - "\n", - " obs = env.reset()\n", - " for step in range(n_max_steps):\n", - " state = preprocess_observation(obs)\n", - "\n", - " # Online DQN evaluates what to do\n", - " q_values = online_q_values.eval(feed_dict={X_state: [state]})\n", - " action = np.argmax(q_values)\n", - "\n", - " # Online DQN plays\n", - " obs, reward, done, info = env.step(action)\n", - "\n", - " img = env.render(mode=\"rgb_array\")\n", - " frames.append(img)\n", - "\n", - " if done:\n", - " break" + "# Dueling Double DQN" ] }, { "cell_type": "code", "execution_count": 71, + "metadata": {}, + "outputs": [], + "source": [ + "tf.random.set_seed(42)\n", + "np.random.seed(42)\n", + "\n", + "K = keras.backend\n", + "input_states = keras.layers.Input(shape=[4])\n", + "hidden1 = keras.layers.Dense(32, activation=\"elu\")(input_states)\n", + "hidden2 = keras.layers.Dense(32, activation=\"elu\")(hidden1)\n", + "state_values = keras.layers.Dense(1)(hidden2)\n", + "raw_advantages = keras.layers.Dense(n_outputs)(hidden2)\n", + "advantages = raw_advantages - K.max(raw_advantages, axis=1, keepdims=True)\n", + "Q_values = state_values + advantages\n", + "model = keras.models.Model(inputs=[input_states], outputs=[Q_values])\n", + "\n", + "target = keras.models.clone_model(model)\n", + "target.set_weights(model.get_weights())" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "metadata": {}, + "outputs": [], + "source": [ + "batch_size = 32\n", + "discount_rate = 0.95\n", + "optimizer = keras.optimizers.Adam(lr=1e-2)\n", + "loss_fn = keras.losses.Huber()\n", + "\n", + "def training_step(batch_size):\n", + " experiences = sample_experiences(batch_size)\n", + " states, actions, rewards, next_states, dones = experiences\n", + " next_Q_values = model.predict(next_states)\n", + " best_next_actions = np.argmax(next_Q_values, axis=1)\n", + " next_mask = tf.one_hot(best_next_actions, n_outputs).numpy()\n", + " next_best_Q_values = (target.predict(next_states) * next_mask).sum(axis=1)\n", + " target_Q_values = rewards + (1 - dones) * discount_rate * next_best_Q_values\n", + " mask = tf.one_hot(actions, n_outputs)\n", + " with tf.GradientTape() as tape:\n", + " all_Q_values = model(states)\n", + " Q_values = 
tf.reduce_sum(all_Q_values * mask, axis=1, keepdims=True)\n", + " loss = tf.reduce_mean(loss_fn(target_Q_values, Q_values))\n", + " grads = tape.gradient(loss, model.trainable_variables)\n", + " optimizer.apply_gradients(zip(grads, model.trainable_variables))" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "metadata": {}, + "outputs": [], + "source": [ + "replay_memory = deque(maxlen=2000)" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "metadata": {}, + "outputs": [], + "source": [ + "env.seed(42)\n", + "np.random.seed(42)\n", + "tf.random.set_seed(42)\n", + "\n", + "rewards = []\n", + "best_score = 0\n", + "\n", + "for episode in range(600):\n", + " obs = env.reset() \n", + " for step in range(200):\n", + " epsilon = max(1 - episode / 500, 0.01)\n", + " obs, reward, done, info = play_one_step(env, obs, epsilon)\n", + " if done:\n", + " break\n", + " rewards.append(step)\n", + " if step > best_score:\n", + " best_weights = model.get_weights()\n", + " best_score = step\n", + " print(\"\\rEpisode: {}, Steps: {}, eps: {:.3f}\".format(episode, step + 1, epsilon), end=\"\")\n", + " if episode > 50:\n", + " training_step(batch_size)\n", + " if episode % 200 == 0:\n", + " target.set_weights(model.get_weights())\n", + "\n", + "model.set_weights(best_weights)" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "metadata": {}, + "outputs": [], + "source": [ + "plt.plot(rewards)\n", + "plt.xlabel(\"Episode\")\n", + "plt.ylabel(\"Sum of rewards\")\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 76, "metadata": { "scrolled": true }, "outputs": [], "source": [ + "env.seed(42)\n", + "state = env.reset()\n", + "\n", + "frames = []\n", + "\n", + "for step in range(200):\n", + " action = epsilon_greedy_policy(state)\n", + " state, reward, done, info = env.step(action)\n", + " if done:\n", + " break\n", + " img = env.render(mode=\"rgb_array\")\n", + " frames.append(img)\n", + " \n", "plot_animation(frames)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This looks like a pretty robust agent!" + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "metadata": {}, + "outputs": [], + "source": [ + "env.close()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Using TF-Agents to Beat Breakout" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's use TF-Agents to create an agent that will learn to play Breakout. We will use the Deep Q-Learning algorithm, so you can easily compare the components with the previous implementation, but TF-Agents implements many other (and more sophisticated) algorithms!" 
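+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "TF-Agents is a separate library, so it must be installed first (this note is not in the book; at the time of writing something like `pip install tf-agents` should work, but adjust to your setup). A quick import check:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Assumes TF-Agents is installed (e.g., `pip install tf-agents`)\n",
+    "import tf_agents\n",
+    "\n",
+    "getattr(tf_agents, \"__version__\", \"(no __version__ attribute in this build)\")"
+   ]
+  },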
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## TF-Agents Environments" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "metadata": {}, + "outputs": [], + "source": [ + "tf.random.set_seed(42)\n", + "np.random.seed(42)" + ] + }, + { + "cell_type": "code", + "execution_count": 79, + "metadata": {}, + "outputs": [], + "source": [ + "from tf_agents.environments import suite_gym\n", + "\n", + "env = suite_gym.load(\"Breakout-v4\")\n", + "env" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "metadata": {}, + "outputs": [], + "source": [ + "env.gym" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "metadata": {}, + "outputs": [], + "source": [ + "env.seed(42)\n", + "env.reset()" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "metadata": {}, + "outputs": [], + "source": [ + "env.step(1) # Fire" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "metadata": {}, + "outputs": [], + "source": [ + "img = env.render(mode=\"rgb_array\")\n", + "\n", + "plt.figure(figsize=(6, 8))\n", + "plt.imshow(img)\n", + "plt.axis(\"off\")\n", + "plt.show()\n", + "save_fig(\"breakout_plot\")" + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "metadata": {}, + "outputs": [], + "source": [ + "env.current_time_step()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Environment Specifications" + ] + }, + { + "cell_type": "code", + "execution_count": 85, + "metadata": {}, + "outputs": [], + "source": [ + "env.observation_spec()" + ] + }, + { + "cell_type": "code", + "execution_count": 86, + "metadata": {}, + "outputs": [], + "source": [ + "env.action_spec()" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "metadata": {}, + "outputs": [], + "source": [ + "env.time_step_spec()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Environment Wrappers" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can wrap a TF-Agents environments in a TF-Agents wrapper:" + ] + }, + { + "cell_type": "code", + "execution_count": 88, + "metadata": {}, + "outputs": [], + "source": [ + "from tf_agents.environments.wrappers import ActionRepeat\n", + "\n", + "repeating_env = ActionRepeat(env, times=4)\n", + "repeating_env" + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "metadata": {}, + "outputs": [], + "source": [ + "repeating_env.unwrapped" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here is the list of available wrappers:" + ] + }, + { + "cell_type": "code", + "execution_count": 90, + "metadata": {}, + "outputs": [], + "source": [ + "import tf_agents.environments.wrappers\n", + "\n", + "for name in dir(tf_agents.environments.wrappers):\n", + " obj = getattr(tf_agents.environments.wrappers, name)\n", + " if hasattr(obj, \"__base__\") and issubclass(obj, tf_agents.environments.wrappers.PyEnvironmentBaseWrapper):\n", + " print(\"{:27s} {}\".format(name, obj.__doc__.split(\"\\n\")[0]))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The `suite_gym.load()` function can create an env and wrap it for you, both with TF-Agents environment wrappers and Gym environment wrappers (the latter are applied first)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 91, + "metadata": {}, + "outputs": [], + "source": [ + "from functools import partial\n", + "from gym.wrappers import TimeLimit\n", + "\n", + "limited_repeating_env = suite_gym.load(\n", + " \"Breakout-v4\",\n", + " gym_env_wrappers=[partial(TimeLimit, max_episode_steps=10000)],\n", + " env_wrappers=[partial(ActionRepeat, times=4)],\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 92, + "metadata": {}, + "outputs": [], + "source": [ + "limited_repeating_env" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create an Atari Breakout environment, and wrap it to apply the default Atari preprocessing steps:" + ] + }, + { + "cell_type": "code", + "execution_count": 93, + "metadata": {}, + "outputs": [], + "source": [ + "limited_repeating_env.unwrapped" + ] + }, + { + "cell_type": "code", + "execution_count": 94, + "metadata": {}, + "outputs": [], + "source": [ + "from tf_agents.environments import suite_atari\n", + "from tf_agents.environments.atari_preprocessing import AtariPreprocessing\n", + "from tf_agents.environments.atari_wrappers import FrameStack4\n", + "\n", + "max_episode_steps = 27000 # <=> 108k ALE frames since 1 step = 4 frames\n", + "environment_name = \"BreakoutNoFrameskip-v4\"\n", + "\n", + "env = suite_atari.load(\n", + " environment_name,\n", + " max_episode_steps=max_episode_steps,\n", + " gym_env_wrappers=[AtariPreprocessing, FrameStack4])" + ] + }, + { + "cell_type": "code", + "execution_count": 95, + "metadata": {}, + "outputs": [], + "source": [ + "env" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Play a few steps just to see what happens:" + ] + }, + { + "cell_type": "code", + "execution_count": 96, + "metadata": {}, + "outputs": [], + "source": [ + "env.seed(42)\n", + "env.reset()\n", + "time_step = env.step(1) # FIRE\n", + "for _ in range(4):\n", + " time_step = env.step(3) # LEFT" + ] + }, + { + "cell_type": "code", + "execution_count": 97, + "metadata": {}, + "outputs": [], + "source": [ + "def plot_observation(obs):\n", + " # Since there are only 3 color channels, you cannot display 4 frames\n", + " # with one primary color per frame. 
So this code computes the delta between\n", + " # the current frame and the mean of the other frames, and it adds this delta\n", + " # to the red and blue channels to get a pink color for the current frame.\n", + " obs = obs.astype(np.float32)\n", + " img = obs[..., :3]\n", + " current_frame_delta = np.maximum(obs[..., 3] - obs[..., :3].mean(axis=-1), 0.)\n", + " img[..., 0] += current_frame_delta\n", + " img[..., 2] += current_frame_delta\n", + " img = (img - img.min()) / (img.max() - img.min())\n", + " plt.imshow(img)\n", + " plt.axis(\"off\")" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "metadata": {}, + "outputs": [], + "source": [ + "plt.figure(figsize=(6, 6))\n", + "plot_observation(time_step.observation)\n", + "save_fig(\"preprocessed_breakout_plot\")\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Convert the Python environment to a TF environment:" + ] + }, + { + "cell_type": "code", + "execution_count": 99, + "metadata": {}, + "outputs": [], + "source": [ + "from tf_agents.environments.tf_py_environment import TFPyEnvironment\n", + "\n", + "tf_env = TFPyEnvironment(env)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Creating the DQN" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create a small class to normalize the observations. Images are stored using bytes from 0 to 255 to use less RAM, but we want to pass floats from 0.0 to 1.0 to the neural network:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create the Q-Network:" + ] + }, + { + "cell_type": "code", + "execution_count": 100, + "metadata": {}, + "outputs": [], + "source": [ + "from tf_agents.networks.q_network import QNetwork\n", + "\n", + "preprocessing_layer = keras.layers.Lambda(\n", + " lambda obs: tf.cast(obs, np.float32) / 255.)\n", + "conv_layer_params=[(32, (8, 8), 4), (64, (4, 4), 2), (64, (3, 3), 1)]\n", + "fc_layer_params=[512]\n", + "\n", + "q_net = QNetwork(\n", + " tf_env.observation_spec(),\n", + " tf_env.action_spec(),\n", + " preprocessing_layers=preprocessing_layer,\n", + " conv_layer_params=conv_layer_params,\n", + " fc_layer_params=fc_layer_params)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create the DQN Agent:" + ] + }, + { + "cell_type": "code", + "execution_count": 101, + "metadata": {}, + "outputs": [], + "source": [ + "from tf_agents.agents.dqn.dqn_agent import DqnAgent\n", + "\n", + "# see TF-agents issue #113\n", + "#optimizer = keras.optimizers.RMSprop(lr=2.5e-4, rho=0.95, momentum=0.0,\n", + "# epsilon=0.00001, centered=True)\n", + "\n", + "train_step = tf.Variable(0)\n", + "update_period = 4 # run a training step every 4 collect steps\n", + "optimizer = tf.compat.v1.train.RMSPropOptimizer(learning_rate=2.5e-4, decay=0.95, momentum=0.0,\n", + " epsilon=0.00001, centered=True)\n", + "epsilon_fn = keras.optimizers.schedules.PolynomialDecay(\n", + " initial_learning_rate=1.0, # initial ε\n", + " decay_steps=250000 // update_period, # <=> 1,000,000 ALE frames\n", + " end_learning_rate=0.01) # final ε\n", + "agent = DqnAgent(tf_env.time_step_spec(),\n", + " tf_env.action_spec(),\n", + " q_network=q_net,\n", + " optimizer=optimizer,\n", + " target_update_period=2000, # <=> 32,000 ALE frames\n", + " td_errors_loss_fn=keras.losses.Huber(reduction=\"none\"),\n", + " gamma=0.99, # discount factor\n", + " train_step_counter=train_step,\n", + " epsilon_greedy=lambda: epsilon_fn(train_step))\n", + "agent.initialize()" + 
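+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Note that `PolynomialDecay` comes from the optimizer schedules module, so its argument is named `initial_learning_rate`, but here it drives ε rather than a learning rate. As a quick sanity check (not in the book), we can evaluate the schedule at a few training steps:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# ε decays linearly from 1.0 down to 0.01 over 62,500 training steps\n",
+    "# (i.e., 250,000 collect steps, or 1,000,000 ALE frames), then stays at 0.01\n",
+    "for step in (0, 10000, 62500, 100000):\n",
+    "    print(step, float(epsilon_fn(step)))"
+   ]
+  },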
] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create the replay buffer:" + ] + }, + { + "cell_type": "code", + "execution_count": 102, + "metadata": {}, + "outputs": [], + "source": [ + "from tf_agents.replay_buffers import tf_uniform_replay_buffer\n", + "\n", + "replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(\n", + " data_spec=agent.collect_data_spec,\n", + " batch_size=tf_env.batch_size,\n", + " max_length=1000000)\n", + "\n", + "replay_buffer_observer = replay_buffer.add_batch" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create a simple custom observer that counts and displays the number of times it is called (except when it is passed a trajectory that represents the boundary between two episodes, as this does not count as a step):" + ] + }, + { + "cell_type": "code", + "execution_count": 103, + "metadata": {}, + "outputs": [], + "source": [ + "class ShowProgress:\n", + " def __init__(self, total):\n", + " self.counter = 0\n", + " self.total = total\n", + " def __call__(self, trajectory):\n", + " if not trajectory.is_boundary():\n", + " self.counter += 1\n", + " if self.counter % 100 == 0:\n", + " print(\"\\r{}/{}\".format(self.counter, self.total), end=\"\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's add some training metrics:" + ] + }, + { + "cell_type": "code", + "execution_count": 104, + "metadata": {}, + "outputs": [], + "source": [ + "from tf_agents.metrics import tf_metrics\n", + "\n", + "train_metrics = [\n", + " tf_metrics.NumberOfEpisodes(),\n", + " tf_metrics.EnvironmentSteps(),\n", + " tf_metrics.AverageReturnMetric(),\n", + " tf_metrics.AverageEpisodeLengthMetric(),\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 105, + "metadata": {}, + "outputs": [], + "source": [ + "train_metrics[0].result()" + ] + }, + { + "cell_type": "code", + "execution_count": 106, + "metadata": {}, + "outputs": [], + "source": [ + "from tf_agents.eval.metric_utils import log_metrics\n", + "import logging\n", + "logging.getLogger().setLevel(logging.INFO)\n", + "log_metrics(train_metrics)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create the collect driver:" + ] + }, + { + "cell_type": "code", + "execution_count": 107, + "metadata": {}, + "outputs": [], + "source": [ + "from tf_agents.drivers.dynamic_step_driver import DynamicStepDriver\n", + "\n", + "collect_driver = DynamicStepDriver(\n", + " tf_env,\n", + " agent.collect_policy,\n", + " observers=[replay_buffer_observer] + train_metrics,\n", + " num_steps=update_period) # collect 4 steps for each training iteration" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Collect the initial experiences, before training:" + ] + }, + { + "cell_type": "code", + "execution_count": 108, + "metadata": {}, + "outputs": [], + "source": [ + "from tf_agents.policies.random_tf_policy import RandomTFPolicy\n", + "\n", + "initial_collect_policy = RandomTFPolicy(tf_env.time_step_spec(),\n", + " tf_env.action_spec())\n", + "init_driver = DynamicStepDriver(\n", + " tf_env,\n", + " initial_collect_policy,\n", + " observers=[replay_buffer.add_batch, ShowProgress(20000)],\n", + " num_steps=20000) # <=> 80,000 ALE frames\n", + "final_time_step, final_policy_state = init_driver.run()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's sample 2 sub-episodes, with 3 time steps each and display them:" + ] + }, + { + "cell_type": "code", + "execution_count": 
109, + "metadata": {}, + "outputs": [], + "source": [ + "tf.random.set_seed(888) # chosen to show an example of trajectory at the end of an episode\n", + "\n", + "trajectories, buffer_info = replay_buffer.get_next(\n", + " sample_batch_size=2, num_steps=3)" + ] + }, + { + "cell_type": "code", + "execution_count": 110, + "metadata": {}, + "outputs": [], + "source": [ + "trajectories._fields" + ] + }, + { + "cell_type": "code", + "execution_count": 111, + "metadata": {}, + "outputs": [], + "source": [ + "trajectories.observation.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 112, + "metadata": {}, + "outputs": [], + "source": [ + "from tf_agents.trajectories.trajectory import to_transition\n", + "\n", + "time_steps, action_steps, next_time_steps = to_transition(trajectories)\n", + "time_steps.observation.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 113, + "metadata": {}, + "outputs": [], + "source": [ + "trajectories.step_type.numpy()" + ] + }, + { + "cell_type": "code", + "execution_count": 114, + "metadata": {}, + "outputs": [], + "source": [ + "plt.figure(figsize=(10, 6.8))\n", + "for row in range(2):\n", + " for col in range(3):\n", + " plt.subplot(2, 3, row * 3 + col + 1)\n", + " plot_observation(trajectories.observation[row, col].numpy())\n", + "plt.subplots_adjust(left=0, right=1, bottom=0, top=1, hspace=0, wspace=0.02)\n", + "save_fig(\"sub_episodes_plot\")\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's create the dataset:" + ] + }, + { + "cell_type": "code", + "execution_count": 115, + "metadata": {}, + "outputs": [], + "source": [ + "dataset = replay_buffer.as_dataset(\n", + " sample_batch_size=64,\n", + " num_steps=2,\n", + " num_parallel_calls=3).prefetch(3)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Convert the main functions to TF Functions for better performance:" + ] + }, + { + "cell_type": "code", + "execution_count": 116, + "metadata": {}, + "outputs": [], + "source": [ + "from tf_agents.utils.common import function\n", + "\n", + "collect_driver.run = function(collect_driver.run)\n", + "agent.train = function(agent.train)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And now we are ready to run the main loop!" + ] + }, + { + "cell_type": "code", + "execution_count": 117, + "metadata": {}, + "outputs": [], + "source": [ + "def train_agent(n_iterations):\n", + " time_step = None\n", + " policy_state = agent.collect_policy.get_initial_state(tf_env.batch_size)\n", + " iterator = iter(dataset)\n", + " for iteration in range(n_iterations):\n", + " time_step, policy_state = collect_driver.run(time_step, policy_state)\n", + " trajectories, buffer_info = next(iterator)\n", + " train_loss = agent.train(trajectories)\n", + " print(\"\\r{} loss:{:.5f}\".format(\n", + " iteration, train_loss.loss.numpy()), end=\"\")\n", + " if iteration % 1000 == 0:\n", + " log_metrics(train_metrics)" + ] + }, + { + "cell_type": "code", + "execution_count": 118, + "metadata": {}, + "outputs": [], + "source": [ + "train_agent(n_iterations=200) # change this to 10 million or more!" 
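+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Training for real will take many hours, so you will probably want to keep the trained policy around. Here is a minimal sketch (not in the book) using TF-Agents' `PolicySaver`, assuming that class is available in your version of the library, and using a hypothetical output directory:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from tf_agents.policies.policy_saver import PolicySaver\n",
+    "\n",
+    "policy_saver = PolicySaver(agent.policy)\n",
+    "policy_saver.save(\"my_breakout_policy\")  # hypothetical directory, pick any path"
+   ]
+  },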
+ ] + }, + { + "cell_type": "code", + "execution_count": 119, + "metadata": {}, + "outputs": [], + "source": [ + "num_eval_episodes = 10\n", + "eval_metrics = [\n", + " tf_metrics.AverageReturnMetric(buffer_size=num_eval_episodes),\n", + " tf_metrics.AverageEpisodeLengthMetric(buffer_size=num_eval_episodes),\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 120, + "metadata": {}, + "outputs": [], + "source": [ + "eval_tf_env = suite_atari.load(\n", + " environment_name,\n", + " max_episode_steps=max_episode_steps,\n", + " gym_env_wrappers=[AtariPreprocessing, FrameStack4])\n", + "\n", + "eval_tf_env = TFPyEnvironment(eval_tf_env)" + ] + }, + { + "cell_type": "code", + "execution_count": 121, + "metadata": {}, + "outputs": [], + "source": [ + "from tf_agents.eval import metric_utils\n", + "\n", + "results = metric_utils.eager_compute(\n", + " eval_metrics,\n", + " eval_tf_env,\n", + " agent.policy,\n", + " num_episodes=num_eval_episodes,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 122, + "metadata": {}, + "outputs": [], + "source": [ + "results" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -1805,455 +2522,190 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Preprocessing for Breakout" + "## Deque vs rotating list" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Here is a preprocessing function you can use to train a DQN for the Breakout-v0 Atari game:" + "The `deque` class offers fast append, but fairly slow random access (for large replay memories):" ] }, { "cell_type": "code", - "execution_count": 73, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "def preprocess_observation(obs):\n", - " img = obs[34:194:2, ::2] # crop and downsize\n", - " return np.mean(img, axis=2).reshape(80, 80) / 255.0" - ] - }, - { - "cell_type": "code", - "execution_count": 74, + "execution_count": 123, "metadata": {}, "outputs": [], "source": [ - "env = gym.make(\"Breakout-v0\")\n", - "obs = env.reset()\n", - "for step in range(10):\n", - " obs, _, _, _ = env.step(1)\n", + "np.random.seed(42)\n", "\n", - "img = preprocess_observation(obs)" + "mem = deque(maxlen=1000000)\n", + "for i in range(1000000):\n", + " mem.append(i)\n", + "[mem[i] for i in np.random.randint(1000000, size=5)]" ] }, { "cell_type": "code", - "execution_count": 75, + "execution_count": 124, "metadata": {}, "outputs": [], "source": [ - "plt.figure(figsize=(11, 7))\n", - "plt.subplot(121)\n", - "plt.title(\"Original observation (160×210 RGB)\")\n", - "plt.imshow(obs)\n", - "plt.axis(\"off\")\n", - "plt.subplot(122)\n", - "plt.title(\"Preprocessed observation (80×80 grayscale)\")\n", - "plt.imshow(img, interpolation=\"nearest\", cmap=\"gray\")\n", - "plt.axis(\"off\")\n", - "plt.show()" + "%timeit mem.append(1)" + ] + }, + { + "cell_type": "code", + "execution_count": 125, + "metadata": {}, + "outputs": [], + "source": [ + "%timeit [mem[i] for i in np.random.randint(1000000, size=5)]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "As you can see, a single image does not give you the direction and speed of the ball, which are crucial informations for playing this game. For this reason, it is best to actually combine several consecutive observations to create the environment's state representation. One way to do that is to create a multi-channel image, with one channel per recent observation. Another is to merge all recent observations into a single-channel image, using `np.max()`. 
In this case, we need to dim the older images so that the DQN can distinguish the past from the present." + "Alternatively, you could use a rotating list like this `ReplayMemory` class. This would make random access faster for large replay memories:" ] }, { "cell_type": "code", - "execution_count": 76, + "execution_count": 126, "metadata": {}, "outputs": [], "source": [ - "from collections import deque\n", + "class ReplayMemory:\n", + " def __init__(self, max_size):\n", + " self.buffer = np.empty(max_size, dtype=np.object)\n", + " self.max_size = max_size\n", + " self.index = 0\n", + " self.size = 0\n", "\n", - "def combine_observations_multichannel(preprocessed_observations):\n", - " return np.array(preprocessed_observations).transpose([1, 2, 0])\n", + " def append(self, obj):\n", + " self.buffer[self.index] = obj\n", + " self.size = min(self.size + 1, self.max_size)\n", + " self.index = (self.index + 1) % self.max_size\n", "\n", - "def combine_observations_singlechannel(preprocessed_observations, dim_factor=0.5):\n", - " dimmed_observations = [obs * dim_factor**index\n", - " for index, obs in enumerate(reversed(preprocessed_observations))]\n", - " return np.max(np.array(dimmed_observations), axis=0)\n", - "\n", - "n_observations_per_state = 3\n", - "preprocessed_observations = deque([], maxlen=n_observations_per_state)\n", - "\n", - "obs = env.reset()\n", - "for step in range(10):\n", - " obs, _, _, _ = env.step(1)\n", - " preprocessed_observations.append(preprocess_observation(obs))" + " def sample(self, batch_size):\n", + " indices = np.random.randint(self.size, size=batch_size)\n", + " return self.buffer[indices]" ] }, { "cell_type": "code", - "execution_count": 77, + "execution_count": 127, "metadata": {}, "outputs": [], "source": [ - "img1 = combine_observations_multichannel(preprocessed_observations)\n", - "img2 = combine_observations_singlechannel(preprocessed_observations)\n", - "\n", - "plt.figure(figsize=(11, 7))\n", - "plt.subplot(121)\n", - "plt.title(\"Multichannel state\")\n", - "plt.imshow(img1, interpolation=\"nearest\")\n", - "plt.axis(\"off\")\n", - "plt.subplot(122)\n", - "plt.title(\"Singlechannel state\")\n", - "plt.imshow(img2, interpolation=\"nearest\", cmap=\"gray\")\n", - "plt.axis(\"off\")\n", - "plt.show()" + "mem = ReplayMemory(max_size=1000000)\n", + "for i in range(1000000):\n", + " mem.append(i)\n", + "mem.sample(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 128, + "metadata": {}, + "outputs": [], + "source": [ + "%timeit mem.append(1)" + ] + }, + { + "cell_type": "code", + "execution_count": 129, + "metadata": {}, + "outputs": [], + "source": [ + "%timeit mem.sample(5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "# Exercise solutions" + "## Creating a Custom TF-Agents Environment" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## 1. to 7." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "See Appendix A." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 8. BipedalWalker-v2" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Exercise: _Use policy gradients to tackle OpenAI gym's \"BipedalWalker-v2\"._" + "To create a custom TF-Agent environment, you just need to write a class that inherits from the `PyEnvironment` class and implements a few methods. For example, the following minimal environment represents a simple 4x4 grid. The agent starts in one corner (0,0) and must move to the opposite corner (3,3). 
The episode is done if the agent reaches the goal (it gets a +10 reward) or if the agent goes out of bounds (-1 reward). The actions are up (0), down (1), left (2) and right (3)." ] }, { "cell_type": "code", - "execution_count": 78, + "execution_count": 130, "metadata": {}, "outputs": [], "source": [ - "import gym" - ] - }, - { - "cell_type": "code", - "execution_count": 79, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "env = gym.make(\"BipedalWalker-v2\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Note: if you run into [this issue](https://github.com/openai/gym/issues/100) (\"`module 'Box2D._Box2D' has no attribute 'RAND_LIMIT'`\") when making the `BipedalWalker-v2` environment, then try this workaround:\n", + "class MyEnvironment(tf_agents.environments.py_environment.PyEnvironment):\n", + " def __init__(self, discount=1.0):\n", + " super().__init__()\n", + " self._action_spec = tf_agents.specs.BoundedArraySpec(\n", + " shape=(), dtype=np.int32, name=\"action\", minimum=0, maximum=3)\n", + " self._observation_spec = tf_agents.specs.BoundedArraySpec(\n", + " shape=(4, 4), dtype=np.int32, name=\"observation\", minimum=0, maximum=1)\n", + " self.discount = discount\n", "\n", - "```\n", - "$ pip uninstall Box2D-kengz\n", - "$ pip install git+https://github.com/pybox2d/pybox2d\n", - "```" - ] - }, - { - "cell_type": "code", - "execution_count": 80, - "metadata": {}, - "outputs": [], - "source": [ - "obs = env.reset()" - ] - }, - { - "cell_type": "code", - "execution_count": 81, - "metadata": {}, - "outputs": [], - "source": [ - "img = env.render(mode=\"rgb_array\")" - ] - }, - { - "cell_type": "code", - "execution_count": 82, - "metadata": {}, - "outputs": [], - "source": [ - "plt.imshow(img)\n", - "plt.axis(\"off\")\n", - "plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": 83, - "metadata": {}, - "outputs": [], - "source": [ - "obs" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You can find the meaning of each of these 24 numbers in the [documentation](https://github.com/openai/gym/wiki/BipedalWalker-v2)." - ] - }, - { - "cell_type": "code", - "execution_count": 84, - "metadata": {}, - "outputs": [], - "source": [ - "env.action_space" - ] - }, - { - "cell_type": "code", - "execution_count": 85, - "metadata": {}, - "outputs": [], - "source": [ - "env.action_space.low" - ] - }, - { - "cell_type": "code", - "execution_count": 86, - "metadata": {}, - "outputs": [], - "source": [ - "env.action_space.high" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This is a 4D continuous action space controling each leg's hip torque and knee torque (from -1 to 1). To deal with a continuous action space, one method is to discretize it. For example, let's limit the possible torque values to these 3 values: -1.0, 0.0, and 1.0. This means that we are left with $3^4=81$ possible actions." 
- ] - }, - { - "cell_type": "code", - "execution_count": 87, - "metadata": {}, - "outputs": [], - "source": [ - "from itertools import product" - ] - }, - { - "cell_type": "code", - "execution_count": 88, - "metadata": {}, - "outputs": [], - "source": [ - "possible_torques = np.array([-1.0, 0.0, 1.0])\n", - "possible_actions = np.array(list(product(possible_torques, possible_torques, possible_torques, possible_torques)))\n", - "possible_actions.shape" - ] - }, - { - "cell_type": "code", - "execution_count": 89, - "metadata": {}, - "outputs": [], - "source": [ - "tf.reset_default_graph()\n", + " def action_spec(self):\n", + " return self._action_spec\n", "\n", - "# 1. Specify the network architecture\n", - "n_inputs = env.observation_space.shape[0] # == 24\n", - "n_hidden = 10\n", - "n_outputs = len(possible_actions) # == 625\n", - "initializer = tf.variance_scaling_initializer()\n", + " def observation_spec(self):\n", + " return self._observation_spec\n", "\n", - "# 2. Build the neural network\n", - "X = tf.placeholder(tf.float32, shape=[None, n_inputs])\n", + " def _reset(self):\n", + " self._state = np.zeros(2, dtype=np.int32)\n", + " obs = np.zeros((4, 4), dtype=np.int32)\n", + " obs[self._state[0], self._state[1]] = 1\n", + " return tf_agents.trajectories.time_step.restart(obs)\n", "\n", - "hidden = tf.layers.dense(X, n_hidden, activation=tf.nn.selu,\n", - " kernel_initializer=initializer)\n", - "logits = tf.layers.dense(hidden, n_outputs,\n", - " kernel_initializer=initializer)\n", - "outputs = tf.nn.softmax(logits)\n", - "\n", - "# 3. Select a random action based on the estimated probabilities\n", - "action_index = tf.squeeze(tf.multinomial(logits, num_samples=1), axis=-1)\n", - "\n", - "# 4. Training\n", - "learning_rate = 0.01\n", - "\n", - "y = tf.one_hot(action_index, depth=len(possible_actions))\n", - "cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(labels=y, logits=logits)\n", - "optimizer = tf.train.AdamOptimizer(learning_rate)\n", - "grads_and_vars = optimizer.compute_gradients(cross_entropy)\n", - "gradients = [grad for grad, variable in grads_and_vars]\n", - "gradient_placeholders = []\n", - "grads_and_vars_feed = []\n", - "for grad, variable in grads_and_vars:\n", - " gradient_placeholder = tf.placeholder(tf.float32, shape=grad.get_shape())\n", - " gradient_placeholders.append(gradient_placeholder)\n", - " grads_and_vars_feed.append((gradient_placeholder, variable))\n", - "training_op = optimizer.apply_gradients(grads_and_vars_feed)\n", - "\n", - "init = tf.global_variables_initializer()\n", - "saver = tf.train.Saver()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's try running this policy network, although it is not trained yet." 
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 90,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def run_bipedal_walker(model_path=None, n_max_steps = 1000):\n",
-    "    env = gym.make(\"BipedalWalker-v2\")\n",
-    "    frames = []\n",
-    "    with tf.Session() as sess:\n",
-    "        if model_path is None:\n",
-    "            init.run()\n",
+    "    def _step(self, action):\n",
+    "        self._state += [(-1, 0), (+1, 0), (0, -1), (0, +1)][action]\n",
+    "        reward = 0\n",
+    "        obs = np.zeros((4, 4), dtype=np.int32)\n",
+    "        done = (self._state.min() < 0 or self._state.max() > 3)\n",
+    "        if not done:\n",
+    "            obs[self._state[0], self._state[1]] = 1\n",
+    "        if done or np.all(self._state == np.array([3, 3])):\n",
+    "            reward = -1 if done else +10\n",
+    "            return tf_agents.trajectories.time_step.termination(obs, reward)\n",
+    "        else:\n",
-    "            saver.restore(sess, model_path)\n",
-    "        obs = env.reset()\n",
-    "        for step in range(n_max_steps):\n",
-    "            img = env.render(mode=\"rgb_array\")\n",
-    "            frames.append(img)\n",
-    "            action_index_val = action_index.eval(feed_dict={X: obs.reshape(1, n_inputs)})\n",
-    "            action = possible_actions[action_index_val]\n",
-    "            obs, reward, done, info = env.step(action[0])\n",
-    "            if done:\n",
-    "                break\n",
-    "    env.close()\n",
-    "    return frames"
+    "            return tf_agents.trajectories.time_step.transition(obs, reward,\n",
+    "                                                                self.discount)"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
+    "The action and observation specs will generally be instances of the `ArraySpec` or `BoundedArraySpec` classes from the `tf_agents.specs` package (check out the other specs in this package as well). Optionally, you can also define a `render()` method, a `close()` method to free resources, as well as a `time_step_spec()` method if you don't want the `reward` and `discount` to be 32-bit float scalars. Note that the base class takes care of keeping track of the current time step, which is why we must implement `_reset()` and `_step()` rather than `reset()` and `step()`.\n"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 91,
+   "execution_count": 131,
    "metadata": {},
    "outputs": [],
    "source": [
-    "frames = run_bipedal_walker()\n",
-    "video = plot_animation(frames)\n",
-    "plt.show()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Nope, it really can't walk. So let's train it!"
+    "my_env = MyEnvironment()\n",
+    "time_step = my_env.reset()\n",
+    "time_step"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 92,
+   "execution_count": 132,
    "metadata": {},
    "outputs": [],
    "source": [
-    "n_games_per_update = 10\n",
-    "n_max_steps = 1000\n",
-    "n_iterations = 1000\n",
-    "save_iterations = 10\n",
-    "discount_rate = 0.95\n",
-    "\n",
-    "with tf.Session() as sess:\n",
-    "    init.run()\n",
-    "    for iteration in range(n_iterations):\n",
-    "        print(\"\\rIteration: {}/{}\".format(iteration + 1, n_iterations), end=\"\")\n",
-    "        all_rewards = []\n",
-    "        all_gradients = []\n",
-    "        for game in range(n_games_per_update):\n",
-    "            current_rewards = []\n",
-    "            current_gradients = []\n",
-    "            obs = env.reset()\n",
-    "            for step in range(n_max_steps):\n",
-    "                action_index_val, gradients_val = sess.run([action_index, gradients],\n",
-    "                                                           feed_dict={X: obs.reshape(1, n_inputs)})\n",
-    "                action = possible_actions[action_index_val]\n",
-    "                obs, reward, done, info = env.step(action[0])\n",
-    "                current_rewards.append(reward)\n",
-    "                current_gradients.append(gradients_val)\n",
-    "                if done:\n",
-    "                    break\n",
-    "            all_rewards.append(current_rewards)\n",
-    "            all_gradients.append(current_gradients)\n",
-    "\n",
-    "        all_rewards = discount_and_normalize_rewards(all_rewards, discount_rate=discount_rate)\n",
-    "        feed_dict = {}\n",
-    "        for var_index, gradient_placeholder in enumerate(gradient_placeholders):\n",
-    "            mean_gradients = np.mean([reward * all_gradients[game_index][step][var_index]\n",
-    "                                      for game_index, rewards in enumerate(all_rewards)\n",
-    "                                      for step, reward in enumerate(rewards)], axis=0)\n",
-    "            feed_dict[gradient_placeholder] = mean_gradients\n",
-    "        sess.run(training_op, feed_dict=feed_dict)\n",
-    "        if iteration % save_iterations == 0:\n",
-    "            saver.save(sess, \"./my_bipedal_walker_pg.ckpt\")"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 93,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "frames = run_bipedal_walker(\"./my_bipedal_walker_pg.ckpt\")\n",
-    "video = plot_animation(frames)\n",
-    "plt.show()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Not the best walker, but at least it stays up and makes (slow) progress to the right.\n",
-    "A better solution for this problem is to use an actor-critic algorithm, as it does not require discretizing the action space, and it converges much faster. Check out this nice [blog post](https://towardsdatascience.com/reinforcement-learning-w-keras-openai-actor-critic-models-f084612cfd69) by Yash Patel for more details."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## 9."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "**Coming soon**"
+    "time_step = my_env.step(1)\n",
+    "time_step"
    ]
   },
   {
@@ -2280,7 +2732,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.5.2"
+   "version": "3.6.8"
   }
  },
 "nbformat": 4,
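
A quick sanity check for the custom environment introduced in this patch (a minimal sketch, not part of the diff above): assuming the notebook's `numpy` and `tf_agents` imports are in place and the `MyEnvironment` class from the hunks above is defined, the standard TF-Agents helpers `validate_py_environment` and `TFPyEnvironment` can be used to exercise it. The snippet below is an illustration under those assumptions, not code from the notebook itself.

```python
import numpy as np
from tf_agents.environments import utils
from tf_agents.environments.tf_py_environment import TFPyEnvironment

# MyEnvironment is the custom PyEnvironment defined in the patch above.
my_env = MyEnvironment()

# Run a few random-action episodes and check every TimeStep against the
# declared action/observation specs.
utils.validate_py_environment(my_env, episodes=5)

# Roll out one episode with a uniformly random policy on the raw Python env.
time_step = my_env.reset()
total_reward = 0.0
while not time_step.is_last():
    action = np.random.randint(0, 4)  # 0=up, 1=down, 2=left, 3=right
    time_step = my_env.step(action)
    total_reward += float(time_step.reward)
print("Episode return:", total_reward)

# TF-Agents drivers and agents expect a TF environment, so wrap it when needed:
tf_env = TFPyEnvironment(my_env)
```

Since all intermediate rewards in `_step()` are zero, the printed episode return should be exactly +10 when the random walk reaches the goal at (3, 3) and -1 when it wanders out of bounds, which makes this an easy way to eyeball that the termination logic behaves as the markdown describes.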