diff --git a/16_reinforcement_learning.ipynb b/16_reinforcement_learning.ipynb index c6b6da9..3c9d54d 100644 --- a/16_reinforcement_learning.ipynb +++ b/16_reinforcement_learning.ipynb @@ -57,7 +57,7 @@ "plt.rcParams['ytick.labelsize'] = 12\n", "\n", "# Where to save the figures\n", - "PROJECT_ROOT_DIR = \".\"\n", + "PROJECT_ROOT_DIR = \"../handson-ml\"\n", "CHAPTER_ID = \"rl\"\n", "\n", "def save_fig(fig_id, tight_layout=True):\n", @@ -184,6 +184,7 @@ "plt.figure(figsize=(5,4))\n", "plt.imshow(img)\n", "plt.axis(\"off\")\n", + "save_fig(\"MsPacman\")\n", "plt.show()" ] }, @@ -408,14 +409,14 @@ "source": [ "frames = []\n", "\n", - "n_max_iterations = 1000\n", + "n_max_steps = 1000\n", "n_change_steps = 10\n", "\n", "obs = env.reset()\n", - "for iteration in range(n_max_iterations):\n", + "for step in range(n_max_steps):\n", " img = env.render(mode=\"rgb_array\")\n", " frames.append(img)\n", - " if iteration % n_change_steps == 0:\n", + " if step % n_change_steps == 0:\n", " action = env.action_space.sample() # play randomly\n", " obs, reward, done, info = env.step(action)\n", " if done:\n", @@ -431,7 +432,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 19, "metadata": { "collapsed": false }, @@ -451,7 +452,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 20, "metadata": { "collapsed": false }, @@ -470,7 +471,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 21, "metadata": { "collapsed": false }, @@ -502,7 +503,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 22, "metadata": { "collapsed": false }, @@ -513,7 +514,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 23, "metadata": { "collapsed": true }, @@ -524,7 +525,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 24, "metadata": { "collapsed": false }, @@ -537,7 +538,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The observation is a 1D NumPy array composed of 4 floats: they represent the cart's horizontal position, its velocity, the angle of the pole (O = vertical), and the angular velocity. Let's render the environment... unfortunately we need to fix an annoying rendering issue first." + "The observation is a 1D NumPy array composed of 4 floats: they represent the cart's horizontal position, its velocity, the angle of the pole (0 = vertical), and the angular velocity. Let's render the environment... unfortunately we need to fix an annoying rendering issue first." ] }, { @@ -551,7 +552,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Some environments (including the CartPole) require access to your display, which opens up a separate window, even if you specify the `rgb_array` mode. In general you can safely ignore that window. However, if Jupyter is running on a headless server (ie. without a screen) it will raise an exception. One way to avoid this is to install a fake X server like Xvfb. You can start Jupyter using the `xvfb-run` command:\n", + "Some environments (including the Cart-Pole) require access to your display, which opens up a separate window, even if you specify the `rgb_array` mode. In general you can safely ignore that window. However, if Jupyter is running on a headless server (ie. without a screen) it will raise an exception. One way to avoid this is to install a fake X server like Xvfb. 
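For example, on Debian or Ubuntu you can typically install it with `apt-get install xvfb` (package names may vary on other systems).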
You can start Jupyter using the `xvfb-run` command:\n", "\n", " $ xvfb-run -s \"-screen 0 1400x900x24\" jupyter notebook\n", "\n", @@ -560,7 +561,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 25, "metadata": { "collapsed": false }, @@ -579,16 +580,18 @@ " # use OpenAI gym's rendering function\n", " return env.render(mode=\"rgb_array\")\n", " else:\n", - " # basic rendering for the cart pole environment if OpenAI can't render it\n", - " img_w = 100\n", - " img_h = 50\n", - " cart_w = 20\n", - " pole_len = 30\n", + " # rendering for the cart pole environment (in case OpenAI gym can't do it)\n", + " img_w = 600\n", + " img_h = 400\n", + " cart_w = img_w // 12\n", + " cart_h = img_h // 15\n", + " pole_len = img_h // 3.5\n", + " pole_w = img_w // 80 + 1\n", " x_width = 2\n", " max_ang = 0.2\n", " bg_col = (255, 255, 255)\n", " cart_col = 0x000000 # Blue Green Red\n", - " pole_col = 0x0000FF # Blue Green Red\n", + " pole_col = 0x669acc # Blue Green Red\n", "\n", " pos, vel, ang, ang_vel = obs\n", " img = Image.new('RGB', (img_w, img_h), bg_col)\n", @@ -596,10 +599,10 @@ " cart_x = pos * img_w // x_width + img_w // x_width\n", " cart_y = img_h * 95 // 100\n", " top_pole_x = cart_x + pole_len * np.sin(ang)\n", - " top_pole_y = cart_y - pole_len * np.cos(ang)\n", - " pole_col = int(np.minimum(np.abs(ang / max_ang), 1) * pole_col)\n", - " draw.line((cart_x, cart_y, top_pole_x, top_pole_y), fill=pole_col) # draw pole\n", - " draw.line((cart_x - cart_w // 2, cart_y, cart_x + cart_w // 2, cart_y), fill=cart_col) # draw cart\n", + " top_pole_y = cart_y - cart_h // 2 - pole_len * np.cos(ang)\n", + " draw.line((0, cart_y, img_w, cart_y), fill=0)\n", + " draw.rectangle((cart_x - cart_w // 2, cart_y - cart_h // 2, cart_x + cart_w // 2, cart_y + cart_h // 2), fill=cart_col) # draw cart\n", + " draw.line((cart_x, cart_y - cart_h // 2, top_pole_x, top_pole_y), fill=pole_col, width=pole_w) # draw pole\n", " return np.array(img)\n", "\n", "def plot_cart_pole(env, obs):\n", @@ -612,7 +615,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 26, "metadata": { "collapsed": false }, @@ -630,7 +633,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 27, "metadata": { "collapsed": false }, @@ -648,7 +651,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 28, "metadata": { "collapsed": false }, @@ -663,13 +666,17 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 29, "metadata": { "collapsed": false }, "outputs": [], "source": [ - "plot_cart_pole(env, obs)" + "plt.close() # or else nbagg sometimes plots in the previous cell\n", + "img = render_cart_pole(env, obs)\n", + "plt.imshow(img)\n", + "plt.axis(\"off\")\n", + "save_fig(\"cart_pole_plot\")" ] }, { @@ -681,7 +688,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 30, "metadata": { "collapsed": false }, @@ -696,7 +703,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 31, "metadata": { "collapsed": false }, @@ -728,7 +735,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 32, "metadata": { "collapsed": false }, @@ -736,11 +743,11 @@ "source": [ "frames = []\n", "\n", - "n_max_iterations = 1000\n", + "n_max_steps = 1000\n", "n_change_steps = 10\n", "\n", "obs = env.reset()\n", - "for iteration in range(n_max_iterations):\n", + "for step in range(n_max_steps):\n", " img = render_cart_pole(env, obs)\n", " frames.append(img)\n", "\n", @@ -758,7 +765,7 @@ }, { 
"cell_type": "code", - "execution_count": 40, + "execution_count": 33, "metadata": { "collapsed": false }, @@ -775,6 +782,241 @@ "Nope, the system is unstable and after just a few wobbles, the pole ends up too tilted: game over. We will need to be smarter than that!" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Neural Network Policies" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's create a neural network that will take observations as inputs, and output the action to take for each observation. To choose an action, the network will first estimate a probability for each action, then select an action randomly according to the estimated probabilities. In the case of the Cart-Pole environment, there are just two possible actions (left or right), so we only need one output neuron: it will output the probability `p` of the action 0 (left), and of course the probability of action 1 (right) will be `1 - p`." + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import tensorflow as tf\n", + "from tensorflow.contrib.layers import fully_connected\n", + "\n", + "# 1. Specify the network architecture\n", + "n_inputs = 4 # == env.observation_space.shape[0]\n", + "n_hidden = 4 # it's a simple task, we don't need more than this\n", + "n_outputs = 1 # only outputs the probability of accelerating left\n", + "initializer = tf.contrib.layers.variance_scaling_initializer()\n", + "\n", + "# 2. Build the neural network\n", + "X = tf.placeholder(tf.float32, shape=[None, n_inputs])\n", + "hidden = fully_connected(X, n_hidden, activation_fn=tf.nn.elu,\n", + " weights_initializer=initializer)\n", + "outputs = fully_connected(hidden, n_outputs, activation_fn=tf.nn.sigmoid,\n", + " weights_initializer=initializer)\n", + "\n", + "# 3. Select a random action based on the estimated probabilities\n", + "p_left_and_right = tf.concat(concat_dim=1, values=[outputs, 1 - outputs])\n", + "action = tf.multinomial(tf.log(p_left_and_right), num_samples=1)\n", + "\n", + "init = tf.initialize_all_variables()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this particular environment, the past actions and observations can safely be ignored, since each observation contains the environment's full state. If there were some hidden state then you may need to consider past actions and observations in order to try to infer the hidden state of the environment. For example, if the environment only revealed the position of the cart but not its velocity, you would have to consider not only the current observation but also the previous observation in order to estimate the current velocity. Another example is if the observations are noisy: you may want to use the past few observations to estimate the most likely current state. Our problem is thus as simple as can be: the current observation is noise-free and contains the environment's full state." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You may wonder why we are picking a random action based on the probability given by the policy network, rather than just picking the action with the highest probability. This approach lets the agent find the right balance between _exploring_ new actions and _exploiting_ the actions that are known to work well. Here's an analogy: suppose you go to a restaurant for the first time, and all the dishes look equally appealing so you randomly pick one. 
If it turns out to be good, you can increase the probability to order it next time, but you shouldn't increase that probability to 100%, or else you will never try out the other dishes, some of which may be even better than the one you tried." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's randomly initialize this policy neural network and use it to play one game:" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "n_max_steps = 1000\n", + "frames = []\n", + "\n", + "with tf.Session() as sess:\n", + " init.run()\n", + " obs = env.reset()\n", + " for step in range(n_max_steps):\n", + " img = render_cart_pole(env, obs)\n", + " frames.append(img)\n", + " action_val = action.eval(feed_dict={X: obs.reshape(1, n_inputs)})\n", + " obs, reward, done, info = env.step(action_val[0][0])\n", + " if done:\n", + " break\n", + "\n", + "env.close()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's look at how well this randomly initialized policy network performed:" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "video = plot_animation(frames)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Yeah... pretty bad. The neural network will have to learn to do better. First let's see if it is capable of learning the basic policy we used earlier: go left if the pole is tilting left, and go right if it is tilting right. The following code defines the same neural network but we add the target probabilities `y`, and the training operations (`cross_entropy`, `optimizer` and `training_op`):" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import tensorflow as tf\n", + "from tensorflow.contrib.layers import fully_connected\n", + "\n", + "tf.reset_default_graph()\n", + "\n", + "n_inputs = 4\n", + "n_hidden = 4\n", + "n_outputs = 1\n", + "\n", + "learning_rate = 0.01\n", + "\n", + "initializer = tf.contrib.layers.variance_scaling_initializer()\n", + "\n", + "X = tf.placeholder(tf.float32, shape=[None, n_inputs])\n", + "y = tf.placeholder(tf.float32, shape=[None, n_outputs])\n", + "\n", + "hidden = fully_connected(X, n_hidden, activation_fn=tf.nn.elu, weights_initializer=initializer)\n", + "logits = fully_connected(hidden, n_outputs, activation_fn=None)\n", + "outputs = tf.nn.sigmoid(logits) # probability of action 0 (left)\n", + "p_left_and_right = tf.concat(concat_dim=1, values=[outputs, 1 - outputs])\n", + "action = tf.multinomial(tf.log(p_left_and_right), num_samples=1)\n", + "\n", + "cross_entropy = tf.nn.sigmoid_cross_entropy_with_logits(logits, y)\n", + "optimizer = tf.train.AdamOptimizer(learning_rate)\n", + "training_op = optimizer.minimize(cross_entropy)\n", + "\n", + "init = tf.initialize_all_variables()\n", + "saver = tf.train.Saver()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can make the same net play in 10 different environments in parallel, and train for 1000 iterations. We also reset environments when they are done." 
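For reference, here is a minimal sketch (using a hypothetical helper, `basic_policy_target()`, that does not appear in the notebook) of how the supervised targets are derived in the training cell below: following the notebook's own convention, when the pole angle `obs[2]` is negative we want the probability of action 0 (left) to be 1.0, and 0.0 otherwise.

    def basic_policy_target(obs):
        angle = obs[2]                        # pole angle (0 = vertical)
        return [1.0] if angle < 0 else [0.0]  # target proba of action 0 (left)

    # equivalent to the `target_probas` line in the next cell:
    # target_probas = np.array([basic_policy_target(obs) for obs in observations])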
+ ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "n_environments = 10\n", + "n_iterations = 1000\n", + "\n", + "envs = [gym.make(\"CartPole-v0\") for _ in range(n_environments)]\n", + "observations = [env.reset() for env in envs]\n", + "\n", + "with tf.Session() as sess:\n", + " init.run()\n", + " for iteration in range(n_iterations):\n", + " target_probas = np.array([([1.] if obs[2] < 0 else [0.]) for obs in observations]) # if angle<0 we want proba(left)=1., or else proba(left)=0.\n", + " action_val, _ = sess.run([action, training_op], feed_dict={X: np.array(observations), y: target_probas})\n", + " for env_index, env in enumerate(envs):\n", + " obs, reward, done, info = env.step(action_val[env_index][0])\n", + " observations[env_index] = obs if not done else env.reset()\n", + " saver.save(sess, \"my_policy_net_basic.ckpt\")\n", + "\n", + "for env in envs:\n", + " env.close()" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "def render_policy_net(model_path, action, X, n_max_steps = 1000):\n", + " frames = []\n", + " env = gym.make(\"CartPole-v0\")\n", + " obs = env.reset()\n", + " with tf.Session() as sess:\n", + " saver.restore(sess, model_path)\n", + " for step in range(n_max_steps):\n", + " img = render_cart_pole(env, obs)\n", + " frames.append(img)\n", + " action_val = action.eval(feed_dict={X: obs.reshape(1, n_inputs)})\n", + " obs, reward, done, info = env.step(action_val[0][0])\n", + " if done:\n", + " break\n", + " env.close()\n", + " return frames " + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "frames = render_policy_net(\"my_policy_net_basic.ckpt\", action, X)\n", + "video = plot_animation(frames)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Looks like it learned the policy correctly. Now let's see if it can learn a better policy on its own." + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -786,7 +1028,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Let's create a neural network that will take the observations as inputs, and output the action to take. More precisely, it will output a probability for each action, and we will sample an action based on those probabilities. For example, if it says that the probability of pushing left should be 70%, and the probability of pushing right should be 30%, then we will pick a random number between 0 and 1 and if it is lower than 0.7 we will push left, or else we will push right. This approach lets the agent find the right balance between exploring new actions and exploiting the actions that are known to work well. Suppose you go to the same restaurant every week, and the first time you really enjoyed the caesar salad, you could order the same thing every week and be guaranteed to enjoy your meal. But you may be missing out on another great dish. Once in a while, you should try out something new." + "To train this neural network we will need to define the target probabilities `y`. If an action is good we should increase its probability, and conversely if it is bad we should reduce it. But how do we know whether an action is good or bad? 
The problem is that most actions have delayed effects, so when you win or lose points in a game, it is not clear which actions contributed to this result: was it just the last action? Or the last 10? Or just one action 50 steps earlier? This is called the _credit assignment problem_.\n", + "\n", + "The _Policy Gradients_ algorithm tackles this problem by first playing multiple games, then making the actions in good games slightly more likely, while actions in bad games are made slightly less likely. First we play, then we go back and think about what we did." ] }, { @@ -798,57 +1042,845 @@ "outputs": [], "source": [ "import tensorflow as tf\n", - "\n", "from tensorflow.contrib.layers import fully_connected\n", "\n", + "tf.reset_default_graph()\n", + "\n", "n_inputs = 4\n", "n_hidden = 4\n", "n_outputs = 1\n", "\n", - "learning_rate=0.01\n", + "learning_rate = 0.01\n", + "\n", + "initializer = tf.contrib.layers.variance_scaling_initializer()\n", "\n", "X = tf.placeholder(tf.float32, shape=[None, n_inputs])\n", - "y = tf.placeholder(tf.float32, shape=[None, n_outputs])\n", - "hidden = fully_connected(X, n_hidden, activation_fn=tf.nn.elu)\n", + "\n", + "hidden = fully_connected(X, n_hidden, activation_fn=tf.nn.elu, weights_initializer=initializer)\n", "logits = fully_connected(hidden, n_outputs, activation_fn=None)\n", - "outputs = tf.nn.softmax(logits)\n", - "cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits, y)\n", + "outputs = tf.nn.sigmoid(logits) # probability of action 0 (left)\n", + "p_left_and_right = tf.concat(concat_dim=1, values=[outputs, 1 - outputs])\n", + "action = tf.multinomial(tf.log(p_left_and_right), num_samples=1)\n", + "\n", + "y = 1. - tf.to_float(action)\n", + "cross_entropy = tf.nn.sigmoid_cross_entropy_with_logits(logits, y)\n", "optimizer = tf.train.AdamOptimizer(learning_rate)\n", - "training_op = optimizer.minimize(cross_entropy)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now to train this network we will need to feed the input batches `X` and the targets `y`. The inputs are easy enough, these will be the observations.\n", + "grads_and_vars = optimizer.compute_gradients(cross_entropy)\n", + "gradients = [grad for grad, variable in grads_and_vars]\n", + "gradient_placeholders = []\n", + "grads_and_vars_feed = []\n", + "for grad, variable in grads_and_vars:\n", + " gradient_placeholder = tf.placeholder(tf.float32, shape=grad.get_shape())\n", + " gradient_placeholders.append(gradient_placeholder)\n", + " grads_and_vars_feed.append((gradient_placeholder, variable))\n", + "training_op = optimizer.apply_gradients(grads_and_vars_feed)\n", "\n", - "_Note_: in this particular environment, the past actions and observations can safely be ignored, since you can observe the environment's full state. If there were some hidden state then you may need to consider all past actions and observations in order to try to infer the hidden state of the environment. For example, if the environment only revealed the position of the cart but not its velocity, you would have to consider not only the current observation but also the previous observation in order to estimate the current velocity. Another example is if the observations are noisy: you may want to use the past few observations to estimate the most likely current state. Our problem is thus as simple as can be: the current observation is noise-free and contains the environment's full state.\n", - "\n", - "But what about the labels? How can we tell what the target probabilities should be? 
One option is to let this policy network play the game say 100 times. Then rank the games according to the total reward they get. The actions taken during the best games were good, on average, so they should be made a bit more likely, while the actions taken during the worst games were bad, on average, so they should be made less likely. Of course, perhaps the policy network made a few good moves during a very bad game, and unfortunately these good moves will be made less likely, but that's ok because if we repeat the process many times, after a while the good moves should on average get more and more likely, and the bad moves should get less and less likely. A good basketball player sometimes plays in a really bad team: this obviously damages his reputation, but if he stars in a sufficient number of movies, overall his reputation should correspond to his talent." + "init = tf.initialize_all_variables()\n", + "saver = tf.train.Saver()" ] }, { "cell_type": "code", "execution_count": 42, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def discount_rewards(rewards, discount_rate):\n", + " discounted_rewards = np.zeros(len(rewards))\n", + " cumulative_rewards = 0\n", + " for step in reversed(range(len(rewards))):\n", + " cumulative_rewards = rewards[step] + cumulative_rewards * discount_rate\n", + " discounted_rewards[step] = cumulative_rewards\n", + " return discounted_rewards\n", + "\n", + "def discount_and_normalize_rewards(all_rewards, discount_rate):\n", + " all_discounted_rewards = [discount_rewards(rewards, discount_rate) for rewards in all_rewards]\n", + " flat_rewards = np.concatenate(all_discounted_rewards)\n", + " reward_mean = flat_rewards.mean()\n", + " reward_std = flat_rewards.std()\n", + " return [(discounted_rewards - reward_mean)/reward_std for discounted_rewards in all_discounted_rewards]" + ] + }, + { + "cell_type": "code", + "execution_count": 43, "metadata": { "collapsed": false }, "outputs": [], "source": [ - "obs = env.reset()\n", - "while True:\n", - " obs, reward, done, info = env.step(env.action_space.sample())\n", - " print(reward)\n", - " if done:\n", - " break" + "discount_rewards([10, 0, -50], discount_rate=0.8)" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "discount_and_normalize_rewards([[10, 0, -50], [10, 20]], discount_rate=0.8)" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "env = gym.make(\"CartPole-v0\")\n", + "\n", + "n_games_per_update = 10\n", + "n_max_steps = 1000\n", + "n_iterations = 250\n", + "save_iterations = 10\n", + "discount_rate = 0.95\n", + "\n", + "with tf.Session() as sess:\n", + " init.run()\n", + " for iteration in range(n_iterations):\n", + " print(\"\\r{}\\tTotal rewards: \".format(iteration), end=\"\")\n", + " all_rewards = []\n", + " all_gradients = []\n", + " for game in range(n_games_per_update):\n", + " current_rewards = []\n", + " current_gradients = []\n", + " obs = env.reset()\n", + " for step in range(n_max_steps):\n", + " action_val, gradients_val = sess.run([action, gradients], feed_dict={X: obs.reshape(1, n_inputs)})\n", + " obs, reward, done, info = env.step(action_val[0][0])\n", + " current_rewards.append(reward)\n", + " current_gradients.append(gradients_val)\n", + " if done:\n", + " break\n", + " all_rewards.append(current_rewards)\n", + " all_gradients.append(current_gradients)\n", + " print(np.sum(current_rewards), 
end=\" \")\n", + "\n", + " all_rewards = discount_and_normalize_rewards(all_rewards, discount_rate=discount_rate)\n", + " feed_dict = {}\n", + " for var_index, gradient_placeholder in enumerate(gradient_placeholders):\n", + " mean_gradients = np.mean([reward * all_gradients[game_index][step][var_index]\n", + " for game_index, rewards in enumerate(all_rewards)\n", + " for step, reward in enumerate(rewards)], axis=0)\n", + " feed_dict[gradient_placeholder] = mean_gradients\n", + " sess.run(training_op, feed_dict=feed_dict)\n", + " if iteration % save_iterations == 0:\n", + " saver.save(sess, \"my_policy_net_pg.ckpt\")" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "env.close()" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "frames = render_policy_net(\"my_policy_net_pg.ckpt\", action, X, n_max_steps=1000)\n", + "video = plot_animation(frames)\n", + "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "**Work in progress – more content coming soon...**" + "# Markov Chains" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "transition_probabilities = [\n", + " [0.7, 0.2, 0.0, 0.1], # from s0 to s0, s1, s2, s3\n", + " [0.0, 0.0, 0.9, 0.1], # from s1 to ...\n", + " [0.0, 1.0, 0.0, 0.0], # from s2 to ...\n", + " [0.0, 0.0, 0.0, 1.0], # from s3 to ...\n", + " ]\n", + "\n", + "n_max_steps = 50\n", + "\n", + "def print_sequence(start_state=0):\n", + " current_state = start_state\n", + " print(\"States:\", end=\" \")\n", + " for step in range(n_max_steps):\n", + " print(current_state, end=\" \")\n", + " if current_state == 3:\n", + " break\n", + " current_state = rnd.choice(range(4), p=transition_probabilities[current_state])\n", + " else:\n", + " print(\"...\", end=\"\")\n", + " print()\n", + "\n", + "for _ in range(10):\n", + " print_sequence()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Markov Decision Process" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "transition_probabilities = [\n", + " [[0.7, 0.3, 0.0], [1.0, 0.0, 0.0], [0.8, 0.2, 0.0]], # in s0, if action a0 then proba 0.7 to state s0 and 0.3 to state s1, etc.\n", + " [[0.0, 1.0, 0.0], None, [0.0, 0.0, 1.0]],\n", + " [None, [0.8, 0.1, 0.1], None],\n", + " ]\n", + "\n", + "rewards = [\n", + " [[+10, 0, 0], [0, 0, 0], [0, 0, 0]],\n", + " [[0, 0, 0], [0, 0, 0], [0, 0, -50]],\n", + " [[0, 0, 0], [+40, 0, 0], [0, 0, 0]],\n", + " ]\n", + "\n", + "possible_actions = [[0, 1, 2], [0, 2], [1]]\n", + "\n", + "def policy_fire(state):\n", + " return [0, 2, 1][state]\n", + "\n", + "def policy_random(state):\n", + " return rnd.choice(possible_actions[state])\n", + "\n", + "def policy_safe(state):\n", + " return [0, 0, 1][state]\n", + "\n", + "class MDPEnvironment(object):\n", + " def __init__(self, start_state=0):\n", + " self.start_state=start_state\n", + " self.reset()\n", + " def reset(self):\n", + " self.total_rewards = 0\n", + " self.state = self.start_state\n", + " def step(self, action):\n", + " next_state = rnd.choice(range(3), p=transition_probabilities[self.state][action])\n", + " reward = rewards[self.state][action][next_state]\n", + " self.state = next_state\n", + " self.total_rewards += reward\n", + " return self.state, reward\n", + 
"\n", + "def run_episode(policy, n_steps, start_state=0, display=True):\n", + " env = MDPEnvironment()\n", + " if display:\n", + " print(\"States (+rewards):\", end=\" \")\n", + " for step in range(n_steps):\n", + " if display:\n", + " if step == 10:\n", + " print(\"...\", end=\" \")\n", + " elif step < 10:\n", + " print(env.state, end=\" \")\n", + " action = policy(env.state)\n", + " state, reward = env.step(action)\n", + " if display and step < 10:\n", + " if reward:\n", + " print(\"({})\".format(reward), end=\" \")\n", + " if display:\n", + " print(\"Total rewards =\", env.total_rewards)\n", + " return env.total_rewards\n", + "\n", + "for policy in (policy_fire, policy_random, policy_safe):\n", + " all_totals = []\n", + " print(policy.__name__)\n", + " for episode in range(1000):\n", + " all_totals.append(run_episode(policy, n_steps=100, display=(episode<5)))\n", + " print(\"Summary: mean={:.1f}, std={:1f}, min={}, max={}\".format(np.mean(all_totals), np.std(all_totals), np.min(all_totals), np.max(all_totals)))\n", + " print()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Q-Learning" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Q-Learning will learn the optimal policy by watching the random policy play." + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "n_states = 3\n", + "n_actions = 3\n", + "n_steps = 20000\n", + "alpha = 0.01\n", + "gamma = 0.99\n", + "exploration_policy = policy_random\n", + "q_values = np.full((n_states, n_actions), -np.inf)\n", + "for state, actions in enumerate(possible_actions):\n", + " q_values[state][actions]=0\n", + "\n", + "env = MDPEnvironment()\n", + "for step in range(n_steps):\n", + " action = exploration_policy(env.state)\n", + " state = env.state\n", + " next_state, reward = env.step(action)\n", + " next_value = np.max(q_values[next_state]) # greedy policy\n", + " q_values[state, action] = (1-alpha)*q_values[state, action] + alpha*(reward + gamma * next_value)" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "def optimal_policy(state):\n", + " return np.argmax(q_values[state])" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "q_values" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "all_totals = []\n", + "for episode in range(1000):\n", + " all_totals.append(run_episode(optimal_policy, n_steps=100, display=(episode<5)))\n", + "print(\"Summary: mean={:.1f}, std={:1f}, min={}, max={}\".format(np.mean(all_totals), np.std(all_totals), np.min(all_totals), np.max(all_totals)))\n", + "print()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Learning to play MsPacman using Deep Q-Learning" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "env = gym.make(\"MsPacman-v0\")\n", + "obs = env.reset()" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "obs.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "env.action_space" + ] + }, + { + "cell_type": "markdown", + 
"metadata": {}, + "source": [ + "## Preprocessing" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Preprocessing the images is optional but greatly speeds up training." + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "mspacman_color = np.array([210, 164, 74]).mean()\n", + "\n", + "def preprocess_observation(obs):\n", + " img = obs[1:176:2, ::2] # crop and downsize\n", + " img = img.mean(axis=2) # to greyscale\n", + " img[img==mspacman_color] = 0 # Improve contrast\n", + " img = (img - 128) / 128 - 1 # normalize from -1. to 1.\n", + " return img.reshape(88, 80, 1)\n", + "\n", + "img = preprocess_observation(obs)" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "plt.figure(figsize=(11, 7))\n", + "plt.subplot(121)\n", + "plt.title(\"Original observation (160×210 RGB)\")\n", + "plt.imshow(obs)\n", + "plt.axis(\"off\")\n", + "plt.subplot(122)\n", + "plt.title(\"Preprocessed observation (88×80 greyscale)\")\n", + "plt.imshow(img.reshape(88, 80), interpolation=\"nearest\", cmap=\"gray\")\n", + "plt.axis(\"off\")\n", + "save_fig(\"preprocessing_plot\")\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Build DQN" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "tf.reset_default_graph()\n", + "\n", + "from tensorflow.contrib.layers import convolution2d, fully_connected\n", + "\n", + "input_height = 88\n", + "input_width = 80\n", + "input_channels = 1\n", + "conv_n_maps = [32, 64, 64]\n", + "conv_kernel_sizes = [(8,8), (4,4), (3,3)]\n", + "conv_strides = [4, 2, 1]\n", + "conv_paddings = [\"SAME\"]*3 \n", + "conv_activation = [tf.nn.relu]*3\n", + "n_hidden_inputs = 64 * 11 * 10 # conv3 has 64 maps of 11x10 each\n", + "n_hidden = 512\n", + "hidden_activation = tf.nn.relu\n", + "n_outputs = env.action_space.n\n", + "initializer = tf.contrib.layers.variance_scaling_initializer()\n", + "\n", + "learning_rate = 0.01\n", + "\n", + "def q_network(X_state, scope):\n", + " prev_layer = X_state\n", + " conv_layers = []\n", + " with tf.variable_scope(scope) as scope:\n", + " for n_maps, kernel_size, stride, padding, activation in zip(conv_n_maps, conv_kernel_sizes, conv_strides, conv_paddings, conv_activation):\n", + " prev_layer = convolution2d(prev_layer, num_outputs=n_maps, kernel_size=kernel_size, stride=stride, padding=padding, activation_fn=activation, weights_initializer=initializer)\n", + " conv_layers.append(prev_layer)\n", + " last_conv_layer_flat = tf.reshape(prev_layer, shape=[-1, n_hidden_inputs])\n", + " hidden = fully_connected(last_conv_layer_flat, n_hidden, activation_fn=hidden_activation, weights_initializer=initializer)\n", + " outputs = fully_connected(hidden, n_outputs, activation_fn=None)\n", + " trainable_vars = {var.name[len(scope.name):]: var for var in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope.name)}\n", + " return outputs, trainable_vars\n", + "\n", + "X_state = tf.placeholder(tf.float32, shape=[None, input_height, input_width, input_channels])\n", + "actor_q_values, actor_vars = q_network(X_state, scope=\"q_networks/actor\") # acts\n", + "critic_q_values, critic_vars = q_network(X_state, scope=\"q_networks/critic\") # learns\n", + "\n", + "copy_ops = [actor_var.assign(critic_vars[var_name])\n", + " for var_name, actor_var in 
actor_vars.items()]\n", + "copy_critic_to_actor = tf.group(*copy_ops)\n", + "\n", + "with tf.variable_scope(\"train\"):\n", + " X_action = tf.placeholder(tf.int32, shape=[None])\n", + " y = tf.placeholder(tf.float32, shape=[None, 1])\n", + " q_value = tf.reduce_sum(critic_q_values * tf.one_hot(X_action, n_outputs),\n", + " reduction_indices=1, keep_dims=True)\n", + " cost = tf.reduce_mean(tf.square(y - q_value))\n", + " global_step = tf.Variable(0, trainable=False, name='global_step')\n", + " optimizer = tf.train.AdamOptimizer(learning_rate)\n", + " training_op = optimizer.minimize(cost, global_step=global_step)\n", + " \n", + "init = tf.initialize_all_variables()\n", + "saver = tf.train.Saver()" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "actor_vars" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from collections import deque\n", + "\n", + "replay_memory_size = 10000\n", + "replay_memory = deque([], maxlen=replay_memory_size)\n", + "\n", + "def sample_memories(batch_size):\n", + " indices = rnd.permutation(len(replay_memory))[:batch_size]\n", + " cols = [[], [], [], [], []] # state, action, reward, next_state, continue\n", + " for idx in indices:\n", + " memory = replay_memory[idx]\n", + " for col, value in zip(cols, memory):\n", + " col.append(value)\n", + " cols = [np.array(col) for col in cols]\n", + " return cols[0], cols[1], cols[2].reshape(-1, 1), cols[3], cols[4].reshape(-1, 1)" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "eps_min = 0.05\n", + "eps_max = 1.0\n", + "eps_decay_steps = 50000\n", + "import sys\n", + "\n", + "def epsilon_greedy(q_values, step):\n", + " epsilon = max(eps_min, eps_max - (eps_max-eps_min) * step/eps_decay_steps)\n", + " print(\" epsilon {}\".format(epsilon), end=\"\")\n", + " sys.stdout.flush()\n", + " if rnd.rand() < epsilon:\n", + " return rnd.randint(n_outputs) # random action\n", + " else:\n", + " return np.argmax(q_values) # optimal action" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "n_steps = 100000 # total number of training steps\n", + "training_start = 1000 # start training after 1,000 game iterations\n", + "training_interval = 3 # run a training step every 3 game iterations\n", + "save_steps = 50 # save the model every 50 training steps\n", + "copy_steps = 25 # copy the critic to the actor every 25 training steps\n", + "discount_rate = 0.95\n", + "skip_start = 90 # Skip the start of every game (it's just waiting time).\n", + "batch_size = 50\n", + "iteration = 0 # game iterations\n", + "checkpoint_path = \"my_dqn.ckpt\"\n", + "done = True # env needs to be reset\n", + "\n", + "with tf.Session() as sess:\n", + " if os.path.isfile(checkpoint_path):\n", + " saver.restore(sess, checkpoint_path)\n", + " else:\n", + " init.run()\n", + " while True:\n", + " step = global_step.eval()\n", + " if step >= n_steps:\n", + " break\n", + " iteration += 1\n", + " print(\"\\rIteration {}\\tTraining step {}/{} ({:.1f}%)\".format(iteration, step, n_steps, step * 100 / n_steps), end=\"\")\n", + " if done: # game over, start again\n", + " obs = env.reset()\n", + " for skip in range(skip_start): # skip boring game iterations at the start of each game\n", + " obs, reward, done, info = env.step(0)\n", + " 
state = preprocess_observation(obs)\n", + "\n", + " # Actor evaluates what to do\n", + " q_values = actor_q_values.eval(feed_dict={X_state: [state]})\n", + " action = epsilon_greedy(q_values, step)\n", + "\n", + " # Actor plays\n", + " obs, reward, done, info = env.step(action)\n", + " next_state = preprocess_observation(obs)\n", + "\n", + " # Let's memorize what happened\n", + " replay_memory.append((state, action, reward, next_state, 1.0 - done))\n", + " state = next_state\n", + "\n", + " if iteration < training_start or iteration % training_interval != 0:\n", + " continue\n", + " \n", + " # Critic learns\n", + " X_state_val, X_action_val, rewards, X_next_state_val, continues = sample_memories(batch_size)\n", + " next_q_values = actor_q_values.eval(feed_dict={X_state: X_next_state_val})\n", + " y_val = rewards + continues * discount_rate * np.max(next_q_values, axis=1, keepdims=True)\n", + " training_op.run(feed_dict={X_state: X_state_val, X_action: X_action_val, y: y_val})\n", + "\n", + " # Regularly copy critic to actor\n", + " if step % copy_steps == 0:\n", + " copy_critic_to_actor.run()\n", + "\n", + " # And save regularly\n", + " if step % save_steps == 0:\n", + " saver.save(sess, checkpoint_path)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## DQN for the Cart-Pole" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "eps_min = 0.1\n", + "eps_max = 1.0\n", + "eps_decay_steps = 20000\n", + "import sys\n", + "\n", + "def epsilon_greedy(q_values, step):\n", + " epsilon = max(eps_min, eps_max - (eps_max-eps_min) * step/eps_decay_steps)\n", + " print(\" epsilon {}\".format(epsilon), end=\"\")\n", + " sys.stdout.flush()\n", + " if rnd.rand() < epsilon:\n", + " return rnd.randint(n_outputs) # random action\n", + " else:\n", + " return np.argmax(q_values) # optimal action" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import tensorflow as tf\n", + "from tensorflow.contrib.layers import fully_connected\n", + "\n", + "tf.reset_default_graph()\n", + "\n", + "n_inputs = 4\n", + "n_hidden = 4\n", + "n_outputs = 2\n", + "\n", + "learning_rate = 0.01\n", + "\n", + "initializer = tf.contrib.layers.variance_scaling_initializer()\n", + "\n", + "def q_network(X_state, scope):\n", + " with tf.variable_scope(scope) as scope:\n", + " hidden = fully_connected(X_state, n_hidden, activation_fn=tf.nn.elu, weights_initializer=initializer)\n", + " outputs = fully_connected(hidden, n_outputs, activation_fn=None)\n", + " trainable_vars = {var.name[len(scope.name):]: var for var in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope.name)}\n", + " return outputs, trainable_vars \n", + "\n", + "X_state = tf.placeholder(tf.float32, shape=[None, n_inputs])\n", + "actor_q_values, actor_vars = q_network(X_state, scope=\"q_networks/actor\") # acts\n", + "critic_q_values, critic_vars = q_network(X_state, scope=\"q_networks/critic\") # learns\n", + "\n", + "copy_ops = [actor_var.assign(critic_vars[var_name])\n", + " for var_name, actor_var in actor_vars.items()]\n", + "copy_critic_to_actor = tf.group(*copy_ops)\n", + "\n", + "with tf.variable_scope(\"train\"):\n", + " X_action = tf.placeholder(tf.int32, shape=[None])\n", + " y = tf.placeholder(tf.float32, shape=[None, 1])\n", + " q_value = tf.reduce_sum(critic_q_values * tf.one_hot(X_action, n_outputs),\n", + " reduction_indices=1, keep_dims=True)\n", + " 
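    # Note: the placeholder `y` defined above is fed, in the training loop further
    # down, with the standard Q-Learning target
    #     y = reward + continue * discount_rate * max_a' Q(next_state, a')
    # where `continue` is 0.0 when the episode ended (it is stored as 1.0 - done in
    # the replay memory) and the max is taken over the actor network's Q-values for
    # the next state. The cost defined next is the mean squared error between that
    # target and the critic's Q-value for the action that was actually played.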
cost = tf.reduce_mean(tf.square(y - q_value))\n", + " global_step = tf.Variable(0, trainable=False, name='global_step')\n", + " optimizer = tf.train.AdamOptimizer(learning_rate)\n", + " training_op = optimizer.minimize(cost, global_step=global_step)\n", + " \n", + "init = tf.initialize_all_variables()\n", + "saver = tf.train.Saver()" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "n_steps = 50000 # total number of training steps\n", + "training_start = 1000 # start training after 1,000 game iterations\n", + "training_interval = 3 # run a training step every 3 game iterations\n", + "save_steps = 50 # save the model every 50 training steps\n", + "copy_steps = 25 # copy the critic to the actor every 25 training steps\n", + "discount_rate = 0.95\n", + "batch_size = 50\n", + "iteration = 0 # game iterations\n", + "checkpoint_path = \"my_dqn.ckpt\"\n", + "done = True # env needs to be reset\n", + "\n", + "env = gym.make(\"CartPole-v0\")\n", + "\n", + "replay_memory.clear()\n", + "\n", + "with tf.Session() as sess:\n", + " if os.path.isfile(checkpoint_path):\n", + " saver.restore(sess, checkpoint_path)\n", + " else:\n", + " init.run()\n", + " while True:\n", + " step = global_step.eval()\n", + " if step >= n_steps:\n", + " break\n", + " iteration += 1\n", + " print(\"\\rIteration {}\\tTraining step {}/{} ({:.1f}%)\".format(iteration, step, n_steps, step * 100 / n_steps), end=\"\")\n", + " if done: # game over, start again\n", + " obs = env.reset()\n", + " state = obs\n", + "\n", + " # Actor evaluates what to do\n", + " q_values = actor_q_values.eval(feed_dict={X_state: [state]})\n", + " action = epsilon_greedy(q_values, step)\n", + "\n", + " # Actor plays\n", + " obs, reward, done, info = env.step(action)\n", + " next_state = obs\n", + "\n", + " # Let's memorize what happened\n", + " replay_memory.append((state, action, reward, next_state, 1.0 - done))\n", + " state = next_state\n", + "\n", + " if iteration < training_start or iteration % training_interval != 0:\n", + " continue\n", + " \n", + " # Critic learns\n", + " X_state_val, X_action_val, rewards, X_next_state_val, continues = sample_memories(batch_size)\n", + " next_q_values = actor_q_values.eval(feed_dict={X_state: X_next_state_val})\n", + " y_val = rewards + continues * discount_rate * np.max(next_q_values, axis=1, keepdims=True)\n", + " training_op.run(feed_dict={X_state: X_state_val, X_action: X_action_val, y: y_val})\n", + "\n", + " # Regularly copy critic to actor\n", + " if step % copy_steps == 0:\n", + " copy_critic_to_actor.run()\n", + "\n", + " # And save regularly\n", + " if step % save_steps == 0:\n", + " saver.save(sess, checkpoint_path)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "n_max_steps = 1000\n", + "\n", + "frames = []\n", + "obs = env.reset()\n", + "with tf.Session() as sess:\n", + " saver.restore(sess, checkpoint_path)\n", + " for step in range(n_max_steps):\n", + " img = render_cart_pole(env, obs)\n", + " frames.append(img)\n", + " actor_q_values_val = actor_q_values.eval(feed_dict={X_state: obs.reshape(1, n_inputs)})\n", + " action_val = np.argmax(actor_q_values_val)\n", + " obs, reward, done, info = env.step(action_val)\n", + " if done:\n", + " break" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "len(frames)" + ] + }, + { + "cell_type": 
"code", + "execution_count": 73, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "video = plot_animation(frames)\n", + "plt.show()" ] }, { @@ -882,7 +1914,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", - "version": "2.7.11" + "version": "2.7.12" }, "nav_menu": {}, "toc": {