diff --git a/01_the_machine_learning_landscape.ipynb b/01_the_machine_learning_landscape.ipynb index 6a080af..8b99fce 100644 --- a/01_the_machine_learning_landscape.ipynb +++ b/01_the_machine_learning_landscape.ipynb @@ -2,10 +2,7 @@ "cells": [ { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "**Chapter 1 – The Machine Learning landscape**\n", "\n", @@ -14,20 +11,14 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "# Setup" ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "First, let's make sure this notebook works well in both python 2 and 3, import a few common modules, ensure MatplotLib plots figures inline and prepare a function to save the figures:" ] @@ -36,9 +27,6 @@ "cell_type": "code", "execution_count": 1, "metadata": { - "collapsed": false, - "deletable": true, - "editable": true, "slideshow": { "slide_type": "-" } @@ -50,11 +38,10 @@ "\n", "# Common imports\n", "import numpy as np\n", - "import numpy.random as rnd\n", "import os\n", "\n", "# to make this notebook's output stable across runs\n", - "rnd.seed(42)\n", + "np.random.seed(42)\n", "\n", "# To plot pretty figures\n", "%matplotlib inline\n", @@ -73,35 +60,173 @@ " print(\"Saving figure\", fig_id)\n", " if tight_layout:\n", " plt.tight_layout()\n", - " plt.savefig(path, format='png', dpi=300)" + " plt.savefig(path, format='png', dpi=300)\n", + "\n", + "# Ignore useless warnings (see SciPy issue #5998)\n", + "import warnings\n", + "warnings.filterwarnings(action=\"ignore\", module=\"scipy\", message=\"^internal gelsd\")" ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ - "# Load and prepare Life satisfaction data" + "# Code example 1-1" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This function just merges the OECD's life satisfaction data and the IMF's GDP per capita data. It's a bit too long and boring and it's not specific to Machine Learning, which is why I left it out of the book." ] }, { "cell_type": "code", "execution_count": 2, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ + "def prepare_country_stats(oecd_bli, gdp_per_capita):\n", + " oecd_bli = oecd_bli[oecd_bli[\"INEQUALITY\"]==\"TOT\"]\n", + " oecd_bli = oecd_bli.pivot(index=\"Country\", columns=\"Indicator\", values=\"Value\")\n", + " gdp_per_capita.rename(columns={\"2015\": \"GDP per capita\"}, inplace=True)\n", + " gdp_per_capita.set_index(\"Country\", inplace=True)\n", + " full_country_stats = pd.merge(left=oecd_bli, right=gdp_per_capita,\n", + " left_index=True, right_index=True)\n", + " full_country_stats.sort_values(by=\"GDP per capita\", inplace=True)\n", + " remove_indices = [0, 1, 6, 8, 33, 34, 35]\n", + " keep_indices = list(set(range(36)) - set(remove_indices))\n", + " return full_country_stats[[\"GDP per capita\", 'Life satisfaction']].iloc[keep_indices]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The code in the book expects the data files to be located in the current directory. I just tweaked it here to fetch the files in datasets/lifesat." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "datapath = os.path.join(\"datasets\", \"lifesat\", \"\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# Code example\n", + "import matplotlib\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", "import pandas as pd\n", + "import sklearn.linear_model\n", "\n", - "# Download CSV from http://stats.oecd.org/index.aspx?DataSetCode=BLI\n", - "datapath = \"datasets/lifesat/\"\n", + "# Load the data\n", + "oecd_bli = pd.read_csv(datapath + \"oecd_bli_2015.csv\", thousands=',')\n", + "gdp_per_capita = pd.read_csv(datapath + \"gdp_per_capita.csv\",thousands=',',delimiter='\\t',\n", + " encoding='latin1', na_values=\"n/a\")\n", "\n", - "oecd_bli = pd.read_csv(datapath+\"oecd_bli_2015.csv\", thousands=',')\n", + "# Prepare the data\n", + "country_stats = prepare_country_stats(oecd_bli, gdp_per_capita)\n", + "X = np.c_[country_stats[\"GDP per capita\"]]\n", + "y = np.c_[country_stats[\"Life satisfaction\"]]\n", + "\n", + "# Visualize the data\n", + "country_stats.plot(kind='scatter', x=\"GDP per capita\", y='Life satisfaction')\n", + "plt.show()\n", + "\n", + "# Select a linear model\n", + "model = sklearn.linear_model.LinearRegression()\n", + "\n", + "# Train the model\n", + "model.fit(X, y)\n", + "\n", + "# Make a prediction for Cyprus\n", + "X_new = [[22587]] # Cyprus' GDP per capita\n", + "print(model.predict(X_new)) # outputs [[ 5.96242338]]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Note: you can ignore the rest of this notebook, it just generates many of the figures in chapter 1." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Load and prepare Life satisfaction data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you want, you can get fresh data from the OECD's website.\n", + "Download the CSV from http://stats.oecd.org/index.aspx?DataSetCode=BLI\n", + "and save it to `datasets/lifesat/`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "oecd_bli = pd.read_csv(datapath + \"oecd_bli_2015.csv\", thousands=',')\n", "oecd_bli = oecd_bli[oecd_bli[\"INEQUALITY\"]==\"TOT\"]\n", "oecd_bli = oecd_bli.pivot(index=\"Country\", columns=\"Indicator\", values=\"Value\")\n", "oecd_bli.head(2)" @@ -109,12 +234,8 @@ }, { "cell_type": "code", - "execution_count": 3, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 6, + "metadata": {}, "outputs": [], "source": [ "oecd_bli[\"Life satisfaction\"].head()" @@ -122,25 +243,24 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "# Load and prepare GDP per capita data" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Just like above, you can update the GDP per capita data if you want. Just download data from http://goo.gl/j1MSKe (=> imf.org) and save it to `datasets/lifesat/`." + ] + }, { "cell_type": "code", - "execution_count": 4, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 7, + "metadata": {}, "outputs": [], "source": [ - "# Download data from http://goo.gl/j1MSKe (=> imf.org)\n", "gdp_per_capita = pd.read_csv(datapath+\"gdp_per_capita.csv\", thousands=',', delimiter='\\t',\n", " encoding='latin1', na_values=\"n/a\")\n", "gdp_per_capita.rename(columns={\"2015\": \"GDP per capita\"}, inplace=True)\n", @@ -150,12 +270,8 @@ }, { "cell_type": "code", - "execution_count": 5, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 8, + "metadata": {}, "outputs": [], "source": [ "full_country_stats = pd.merge(left=oecd_bli, right=gdp_per_capita, left_index=True, right_index=True)\n", @@ -165,12 +281,8 @@ }, { "cell_type": "code", - "execution_count": 6, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 9, + "metadata": {}, "outputs": [], "source": [ "full_country_stats[[\"GDP per capita\", 'Life satisfaction']].loc[\"United States\"]" @@ -178,12 +290,8 @@ }, { "cell_type": "code", - "execution_count": 7, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 10, + "metadata": {}, "outputs": [], "source": [ "remove_indices = [0, 1, 6, 8, 33, 34, 35]\n", @@ -195,12 +303,8 @@ }, { "cell_type": "code", - "execution_count": 8, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 11, + "metadata": {}, "outputs": [], "source": [ "sample_data.plot(kind='scatter', x=\"GDP per capita\", y='Life satisfaction', figsize=(5,3))\n", @@ -224,25 +328,17 @@ }, { "cell_type": "code", - "execution_count": 9, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 12, + "metadata": {}, "outputs": [], "source": [ - "sample_data.to_csv(\"life_satisfaction_vs_gdp_per_capita.csv\")" + "sample_data.to_csv(os.path.join(\"datasets\", \"lifesat\", \"lifesat.csv\"))" ] }, { "cell_type": "code", - "execution_count": 10, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 13, + "metadata": {}, "outputs": [], "source": [ "sample_data.loc[list(position_text.keys())]" @@ -250,12 +346,8 @@ }, { "cell_type": "code", - "execution_count": 11, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + 
"execution_count": 14, + "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", @@ -278,12 +370,8 @@ }, { "cell_type": "code", - "execution_count": 12, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 15, + "metadata": {}, "outputs": [], "source": [ "from sklearn import linear_model\n", @@ -297,12 +385,8 @@ }, { "cell_type": "code", - "execution_count": 13, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 16, + "metadata": {}, "outputs": [], "source": [ "sample_data.plot(kind='scatter', x=\"GDP per capita\", y='Life satisfaction', figsize=(5,3))\n", @@ -317,12 +401,8 @@ }, { "cell_type": "code", - "execution_count": 14, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 17, + "metadata": {}, "outputs": [], "source": [ "cyprus_gdp_per_capita = gdp_per_capita.loc[\"Cyprus\"][\"GDP per capita\"]\n", @@ -333,12 +413,8 @@ }, { "cell_type": "code", - "execution_count": 15, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 18, + "metadata": {}, "outputs": [], "source": [ "sample_data.plot(kind='scatter', x=\"GDP per capita\", y='Life satisfaction', figsize=(5,3), s=1)\n", @@ -356,12 +432,8 @@ }, { "cell_type": "code", - "execution_count": 16, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 19, + "metadata": {}, "outputs": [], "source": [ "sample_data[7:10]" @@ -369,12 +441,8 @@ }, { "cell_type": "code", - "execution_count": 17, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 20, + "metadata": {}, "outputs": [], "source": [ "(5.1+5.7+6.5)/3" @@ -382,28 +450,29 @@ }, { "cell_type": "code", - "execution_count": 18, - "metadata": { - "collapsed": true, - "deletable": true, - "editable": true - }, + "execution_count": 21, + "metadata": {}, "outputs": [], "source": [ "backup = oecd_bli, gdp_per_capita\n", "\n", "def prepare_country_stats(oecd_bli, gdp_per_capita):\n", - " return sample_data" + " oecd_bli = oecd_bli[oecd_bli[\"INEQUALITY\"]==\"TOT\"]\n", + " oecd_bli = oecd_bli.pivot(index=\"Country\", columns=\"Indicator\", values=\"Value\")\n", + " gdp_per_capita.rename(columns={\"2015\": \"GDP per capita\"}, inplace=True)\n", + " gdp_per_capita.set_index(\"Country\", inplace=True)\n", + " full_country_stats = pd.merge(left=oecd_bli, right=gdp_per_capita,\n", + " left_index=True, right_index=True)\n", + " full_country_stats.sort_values(by=\"GDP per capita\", inplace=True)\n", + " remove_indices = [0, 1, 6, 8, 33, 34, 35]\n", + " keep_indices = list(set(range(36)) - set(remove_indices))\n", + " return full_country_stats[[\"GDP per capita\", 'Life satisfaction']].iloc[keep_indices]" ] }, { "cell_type": "code", - "execution_count": 19, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 22, + "metadata": {}, "outputs": [], "source": [ "# Code example\n", @@ -440,12 +509,8 @@ }, { "cell_type": "code", - "execution_count": 20, - "metadata": { - "collapsed": true, - "deletable": true, - "editable": true - }, + "execution_count": 23, + "metadata": {}, "outputs": [], "source": [ "oecd_bli, gdp_per_capita = backup" @@ -453,12 +518,8 @@ }, { "cell_type": "code", - "execution_count": 21, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 24, + "metadata": {}, "outputs": [], 
"source": [ "missing_data" @@ -466,12 +527,8 @@ }, { "cell_type": "code", - "execution_count": 22, - "metadata": { - "collapsed": true, - "deletable": true, - "editable": true - }, + "execution_count": 25, + "metadata": {}, "outputs": [], "source": [ "position_text2 = {\n", @@ -487,12 +544,8 @@ }, { "cell_type": "code", - "execution_count": 23, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 26, + "metadata": {}, "outputs": [], "source": [ "sample_data.plot(kind='scatter', x=\"GDP per capita\", y='Life satisfaction', figsize=(8,3))\n", @@ -522,12 +575,8 @@ }, { "cell_type": "code", - "execution_count": 24, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 27, + "metadata": {}, "outputs": [], "source": [ "full_country_stats.plot(kind='scatter', x=\"GDP per capita\", y='Life satisfaction', figsize=(8,3))\n", @@ -550,12 +599,8 @@ }, { "cell_type": "code", - "execution_count": 25, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 28, + "metadata": {}, "outputs": [], "source": [ "full_country_stats.loc[[c for c in full_country_stats.index if \"W\" in c.upper()]][\"Life satisfaction\"]" @@ -563,12 +608,8 @@ }, { "cell_type": "code", - "execution_count": 26, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 29, + "metadata": {}, "outputs": [], "source": [ "gdp_per_capita.loc[[c for c in gdp_per_capita.index if \"W\" in c.upper()]].head()" @@ -576,12 +617,8 @@ }, { "cell_type": "code", - "execution_count": 27, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 30, + "metadata": {}, "outputs": [], "source": [ "plt.figure(figsize=(8,3))\n", @@ -611,12 +648,8 @@ }, { "cell_type": "code", - "execution_count": 28, - "metadata": { - "collapsed": true, - "deletable": true, - "editable": true - }, + "execution_count": 31, + "metadata": {}, "outputs": [], "source": [ "backup = oecd_bli, gdp_per_capita\n", @@ -627,12 +660,8 @@ }, { "cell_type": "code", - "execution_count": 29, - "metadata": { - "collapsed": true, - "deletable": true, - "editable": true - }, + "execution_count": 32, + "metadata": {}, "outputs": [], "source": [ "# Replace this linear model:\n", @@ -641,12 +670,8 @@ }, { "cell_type": "code", - "execution_count": 30, - "metadata": { - "collapsed": true, - "deletable": true, - "editable": true - }, + "execution_count": 33, + "metadata": {}, "outputs": [], "source": [ "# with this k-neighbors regression model:\n", @@ -655,12 +680,8 @@ }, { "cell_type": "code", - "execution_count": 31, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 34, + "metadata": {}, "outputs": [], "source": [ "X = np.c_[country_stats[\"GDP per capita\"]]\n", @@ -677,11 +698,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [] } @@ -702,7 +719,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.3" + "version": "3.6.3" }, "nav_menu": {}, "toc": { @@ -723,5 +740,5 @@ } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } diff --git a/02_end_to_end_machine_learning_project.ipynb b/02_end_to_end_machine_learning_project.ipynb index 1cadabb..2eed4e3 100644 --- a/02_end_to_end_machine_learning_project.ipynb +++ 
b/02_end_to_end_machine_learning_project.ipynb @@ -66,7 +66,11 @@ " print(\"Saving figure\", fig_id)\n", " if tight_layout:\n", " plt.tight_layout()\n", - " plt.savefig(path, format=fig_extension, dpi=resolution)" + " plt.savefig(path, format=fig_extension, dpi=resolution)\n", + "\n", + "# Ignore useless warnings (see SciPy issue #5998)\n", + "import warnings\n", + "warnings.filterwarnings(action=\"ignore\", module=\"scipy\", message=\"^internal gelsd\")" ] }, { @@ -215,26 +219,41 @@ "metadata": {}, "outputs": [], "source": [ - "import hashlib\n", + "from zlib import crc32\n", "\n", - "def test_set_check(identifier, test_ratio, hash):\n", - " return hash(np.int64(identifier)).digest()[-1] < 256 * test_ratio\n", + "def test_set_check(identifier, test_ratio):\n", + " return crc32(np.int64(identifier)) & 0xffffffff < test_ratio * 2**32\n", "\n", - "def split_train_test_by_id(data, test_ratio, id_column, hash=hashlib.md5):\n", + "def split_train_test_by_id(data, test_ratio, id_column):\n", " ids = data[id_column]\n", - " in_test_set = ids.apply(lambda id_: test_set_check(id_, test_ratio, hash))\n", + " in_test_set = ids.apply(lambda id_: test_set_check(id_, test_ratio))\n", " return data.loc[~in_test_set], data.loc[in_test_set]" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The implementation of `test_set_check()` above works fine in both Python 2 and Python 3. In earlier releases, the following implementation was proposed, which supported any hash function, but was much slower and did not support Python 2:" + ] + }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ - "# This version supports both Python 2 and Python 3, instead of just Python 3.\n", - "def test_set_check(identifier, test_ratio, hash):\n", - " return bytearray(hash(np.int64(identifier)).digest())[-1] < 256 * test_ratio" + "import hashlib\n", + "\n", + "def test_set_check(identifier, test_ratio, hash=hashlib.md5):\n", + " return hash(np.int64(identifier)).digest()[-1] < 256 * test_ratio" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you want an implementation that supports any hash function and is compatible with both Python 2 and Python 3, here is one:" ] }, { @@ -242,6 +261,16 @@ "execution_count": 15, "metadata": {}, "outputs": [], + "source": [ + "def test_set_check(identifier, test_ratio, hash=hashlib.md5):\n", + " return bytearray(hash(np.int64(identifier)).digest())[-1] < 256 * test_ratio" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], "source": [ "housing_with_id = housing.reset_index() # adds an `index` column\n", "train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, \"index\")" @@ -249,7 +278,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 17, "metadata": {}, "outputs": [], "source": [ @@ -259,7 +288,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 18, "metadata": {}, "outputs": [], "source": [ @@ -268,7 +297,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 19, "metadata": {}, "outputs": [], "source": [ @@ -279,7 +308,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 20, "metadata": {}, "outputs": [], "source": [ @@ -288,7 +317,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 21, "metadata": {}, "outputs": [], "source": [ @@ -297,7 +326,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 22, 
"metadata": {}, "outputs": [], "source": [ @@ -309,7 +338,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 23, "metadata": {}, "outputs": [], "source": [ @@ -318,7 +347,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 24, "metadata": {}, "outputs": [], "source": [ @@ -327,7 +356,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 25, "metadata": {}, "outputs": [], "source": [ @@ -341,7 +370,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 26, "metadata": {}, "outputs": [], "source": [ @@ -350,7 +379,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 27, "metadata": {}, "outputs": [], "source": [ @@ -359,7 +388,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 28, "metadata": {}, "outputs": [], "source": [ @@ -379,7 +408,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 29, "metadata": {}, "outputs": [], "source": [ @@ -388,7 +417,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 30, "metadata": {}, "outputs": [], "source": [ @@ -405,10 +434,8 @@ }, { "cell_type": "code", - "execution_count": 30, - "metadata": { - "collapsed": true - }, + "execution_count": 31, + "metadata": {}, "outputs": [], "source": [ "housing = strat_train_set.copy()" @@ -416,7 +443,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 32, "metadata": {}, "outputs": [], "source": [ @@ -426,7 +453,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 33, "metadata": {}, "outputs": [], "source": [ @@ -443,7 +470,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 34, "metadata": {}, "outputs": [], "source": [ @@ -457,7 +484,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 35, "metadata": {}, "outputs": [], "source": [ @@ -468,7 +495,8 @@ " c=\"median_house_value\", cmap=plt.get_cmap(\"jet\"),\n", " colorbar=False, alpha=0.4,\n", " )\n", - "plt.imshow(california_img, extent=[-124.55, -113.80, 32.45, 42.05], alpha=0.5)\n", + "plt.imshow(california_img, extent=[-124.55, -113.80, 32.45, 42.05], alpha=0.5,\n", + " cmap=plt.get_cmap(\"jet\"))\n", "plt.ylabel(\"Latitude\", fontsize=14)\n", "plt.xlabel(\"Longitude\", fontsize=14)\n", "\n", @@ -485,10 +513,8 @@ }, { "cell_type": "code", - "execution_count": 35, - "metadata": { - "collapsed": true - }, + "execution_count": 36, + "metadata": {}, "outputs": [], "source": [ "corr_matrix = housing.corr()" @@ -496,7 +522,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 37, "metadata": {}, "outputs": [], "source": [ @@ -505,7 +531,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 38, "metadata": {}, "outputs": [], "source": [ @@ -520,7 +546,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 39, "metadata": {}, "outputs": [], "source": [ @@ -532,10 +558,8 @@ }, { "cell_type": "code", - "execution_count": 39, - "metadata": { - "collapsed": true - }, + "execution_count": 40, + "metadata": {}, "outputs": [], "source": [ "housing[\"rooms_per_household\"] = housing[\"total_rooms\"]/housing[\"households\"]\n", @@ -552,7 +576,7 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 41, "metadata": {}, "outputs": [], "source": [ @@ -562,7 +586,7 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 42, "metadata": {}, "outputs": [], "source": [ @@ -574,7 +598,7 @@ }, 
{ "cell_type": "code", - "execution_count": 42, + "execution_count": 43, "metadata": {}, "outputs": [], "source": [ @@ -590,10 +614,8 @@ }, { "cell_type": "code", - "execution_count": 43, - "metadata": { - "collapsed": true - }, + "execution_count": 44, + "metadata": {}, "outputs": [], "source": [ "housing = strat_train_set.drop(\"median_house_value\", axis=1) # drop labels for training set\n", @@ -602,7 +624,7 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 45, "metadata": {}, "outputs": [], "source": [ @@ -612,7 +634,7 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 46, "metadata": {}, "outputs": [], "source": [ @@ -621,7 +643,7 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 47, "metadata": {}, "outputs": [], "source": [ @@ -630,7 +652,7 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 48, "metadata": {}, "outputs": [], "source": [ @@ -641,10 +663,8 @@ }, { "cell_type": "code", - "execution_count": 48, - "metadata": { - "collapsed": true - }, + "execution_count": 49, + "metadata": {}, "outputs": [], "source": [ "from sklearn.preprocessing import Imputer\n", @@ -661,10 +681,8 @@ }, { "cell_type": "code", - "execution_count": 49, - "metadata": { - "collapsed": true - }, + "execution_count": 50, + "metadata": {}, "outputs": [], "source": [ "housing_num = housing.drop('ocean_proximity', axis=1)\n", @@ -673,7 +691,7 @@ }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 51, "metadata": {}, "outputs": [], "source": [ @@ -682,7 +700,7 @@ }, { "cell_type": "code", - "execution_count": 51, + "execution_count": 52, "metadata": {}, "outputs": [], "source": [ @@ -698,7 +716,7 @@ }, { "cell_type": "code", - "execution_count": 52, + "execution_count": 53, "metadata": {}, "outputs": [], "source": [ @@ -714,34 +732,21 @@ }, { "cell_type": "code", - "execution_count": 53, - "metadata": { - "collapsed": true - }, + "execution_count": 54, + "metadata": {}, "outputs": [], "source": [ "X = imputer.transform(housing_num)" ] }, - { - "cell_type": "code", - "execution_count": 54, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "housing_tr = pd.DataFrame(X, columns=housing_num.columns,\n", - " index = list(housing.index.values))" - ] - }, { "cell_type": "code", "execution_count": 55, "metadata": {}, "outputs": [], "source": [ - "housing_tr.loc[sample_incomplete_rows.index.values]" + "housing_tr = pd.DataFrame(X, columns=housing_num.columns,\n", + " index = list(housing.index.values))" ] }, { @@ -750,7 +755,7 @@ "metadata": {}, "outputs": [], "source": [ - "imputer.strategy" + "housing_tr.loc[sample_incomplete_rows.index.values]" ] }, { @@ -758,6 +763,15 @@ "execution_count": 57, "metadata": {}, "outputs": [], + "source": [ + "imputer.strategy" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": {}, + "outputs": [], "source": [ "housing_tr = pd.DataFrame(X, columns=housing_num.columns)\n", "housing_tr.head()" @@ -772,7 +786,7 @@ }, { "cell_type": "code", - "execution_count": 58, + "execution_count": 59, "metadata": {}, "outputs": [], "source": [ @@ -789,7 +803,7 @@ }, { "cell_type": "code", - "execution_count": 59, + "execution_count": 60, "metadata": {}, "outputs": [], "source": [ @@ -799,7 +813,7 @@ }, { "cell_type": "code", - "execution_count": 60, + "execution_count": 61, "metadata": {}, "outputs": [], "source": [ @@ -822,7 +836,7 @@ }, { "cell_type": "code", - "execution_count": 61, + "execution_count": 62, "metadata": {}, "outputs": 
[], "source": [ @@ -842,7 +856,7 @@ }, { "cell_type": "code", - "execution_count": 62, + "execution_count": 63, "metadata": {}, "outputs": [], "source": [ @@ -858,10 +872,8 @@ }, { "cell_type": "code", - "execution_count": 63, - "metadata": { - "collapsed": true - }, + "execution_count": 64, + "metadata": {}, "outputs": [], "source": [ "# Definition of the CategoricalEncoder class, copied from PR #9151.\n", @@ -1061,7 +1073,7 @@ }, { "cell_type": "code", - "execution_count": 64, + "execution_count": 65, "metadata": {}, "outputs": [], "source": [ @@ -1082,7 +1094,7 @@ }, { "cell_type": "code", - "execution_count": 65, + "execution_count": 66, "metadata": {}, "outputs": [], "source": [ @@ -1098,7 +1110,7 @@ }, { "cell_type": "code", - "execution_count": 66, + "execution_count": 67, "metadata": {}, "outputs": [], "source": [ @@ -1109,7 +1121,7 @@ }, { "cell_type": "code", - "execution_count": 67, + "execution_count": 68, "metadata": {}, "outputs": [], "source": [ @@ -1125,10 +1137,8 @@ }, { "cell_type": "code", - "execution_count": 68, - "metadata": { - "collapsed": true - }, + "execution_count": 69, + "metadata": {}, "outputs": [], "source": [ "from sklearn.base import BaseEstimator, TransformerMixin\n", @@ -1157,7 +1167,7 @@ }, { "cell_type": "code", - "execution_count": 69, + "execution_count": 70, "metadata": {}, "outputs": [], "source": [ @@ -1174,10 +1184,8 @@ }, { "cell_type": "code", - "execution_count": 70, - "metadata": { - "collapsed": true - }, + "execution_count": 71, + "metadata": {}, "outputs": [], "source": [ "from sklearn.pipeline import Pipeline\n", @@ -1194,7 +1202,7 @@ }, { "cell_type": "code", - "execution_count": 71, + "execution_count": 72, "metadata": {}, "outputs": [], "source": [ @@ -1210,10 +1218,8 @@ }, { "cell_type": "code", - "execution_count": 72, - "metadata": { - "collapsed": true - }, + "execution_count": 73, + "metadata": {}, "outputs": [], "source": [ "from sklearn.base import BaseEstimator, TransformerMixin\n", @@ -1238,7 +1244,7 @@ }, { "cell_type": "code", - "execution_count": 73, + "execution_count": 74, "metadata": {}, "outputs": [], "source": [ @@ -1260,10 +1266,8 @@ }, { "cell_type": "code", - "execution_count": 74, - "metadata": { - "collapsed": true - }, + "execution_count": 75, + "metadata": {}, "outputs": [], "source": [ "from sklearn.pipeline import FeatureUnion\n", @@ -1276,7 +1280,7 @@ }, { "cell_type": "code", - "execution_count": 75, + "execution_count": 76, "metadata": {}, "outputs": [], "source": [ @@ -1286,7 +1290,7 @@ }, { "cell_type": "code", - "execution_count": 76, + "execution_count": 77, "metadata": {}, "outputs": [], "source": [ @@ -1302,7 +1306,7 @@ }, { "cell_type": "code", - "execution_count": 77, + "execution_count": 78, "metadata": {}, "outputs": [], "source": [ @@ -1314,7 +1318,7 @@ }, { "cell_type": "code", - "execution_count": 78, + "execution_count": 79, "metadata": {}, "outputs": [], "source": [ @@ -1335,7 +1339,7 @@ }, { "cell_type": "code", - "execution_count": 79, + "execution_count": 80, "metadata": {}, "outputs": [], "source": [ @@ -1344,7 +1348,7 @@ }, { "cell_type": "code", - "execution_count": 80, + "execution_count": 81, "metadata": {}, "outputs": [], "source": [ @@ -1353,7 +1357,7 @@ }, { "cell_type": "code", - "execution_count": 81, + "execution_count": 82, "metadata": {}, "outputs": [], "source": [ @@ -1367,7 +1371,7 @@ }, { "cell_type": "code", - "execution_count": 82, + "execution_count": 83, "metadata": {}, "outputs": [], "source": [ @@ -1379,7 +1383,7 @@ }, { "cell_type": "code", - "execution_count": 83, + 
"execution_count": 84, "metadata": {}, "outputs": [], "source": [ @@ -1391,7 +1395,7 @@ }, { "cell_type": "code", - "execution_count": 84, + "execution_count": 85, "metadata": {}, "outputs": [], "source": [ @@ -1410,10 +1414,8 @@ }, { "cell_type": "code", - "execution_count": 85, - "metadata": { - "collapsed": true - }, + "execution_count": 86, + "metadata": {}, "outputs": [], "source": [ "from sklearn.model_selection import cross_val_score\n", @@ -1425,7 +1427,7 @@ }, { "cell_type": "code", - "execution_count": 86, + "execution_count": 87, "metadata": {}, "outputs": [], "source": [ @@ -1439,7 +1441,7 @@ }, { "cell_type": "code", - "execution_count": 87, + "execution_count": 88, "metadata": {}, "outputs": [], "source": [ @@ -1451,7 +1453,7 @@ }, { "cell_type": "code", - "execution_count": 88, + "execution_count": 89, "metadata": {}, "outputs": [], "source": [ @@ -1463,7 +1465,7 @@ }, { "cell_type": "code", - "execution_count": 89, + "execution_count": 90, "metadata": {}, "outputs": [], "source": [ @@ -1475,7 +1477,7 @@ }, { "cell_type": "code", - "execution_count": 90, + "execution_count": 91, "metadata": {}, "outputs": [], "source": [ @@ -1489,7 +1491,7 @@ }, { "cell_type": "code", - "execution_count": 91, + "execution_count": 92, "metadata": {}, "outputs": [], "source": [ @@ -1499,7 +1501,7 @@ }, { "cell_type": "code", - "execution_count": 92, + "execution_count": 93, "metadata": {}, "outputs": [], "source": [ @@ -1515,7 +1517,7 @@ }, { "cell_type": "code", - "execution_count": 93, + "execution_count": 94, "metadata": {}, "outputs": [], "source": [ @@ -1531,7 +1533,7 @@ "forest_reg = RandomForestRegressor(random_state=42)\n", "# train across 5 folds, that's a total of (12+6)*5=90 rounds of training \n", "grid_search = GridSearchCV(forest_reg, param_grid, cv=5,\n", - " scoring='neg_mean_squared_error')\n", + " scoring='neg_mean_squared_error', return_train_score=True)\n", "grid_search.fit(housing_prepared, housing_labels)" ] }, @@ -1544,7 +1546,7 @@ }, { "cell_type": "code", - "execution_count": 94, + "execution_count": 95, "metadata": {}, "outputs": [], "source": [ @@ -1553,7 +1555,7 @@ }, { "cell_type": "code", - "execution_count": 95, + "execution_count": 96, "metadata": {}, "outputs": [], "source": [ @@ -1569,7 +1571,7 @@ }, { "cell_type": "code", - "execution_count": 96, + "execution_count": 97, "metadata": {}, "outputs": [], "source": [ @@ -1580,7 +1582,7 @@ }, { "cell_type": "code", - "execution_count": 97, + "execution_count": 98, "metadata": {}, "outputs": [], "source": [ @@ -1589,7 +1591,7 @@ }, { "cell_type": "code", - "execution_count": 98, + "execution_count": 99, "metadata": {}, "outputs": [], "source": [ @@ -1609,7 +1611,7 @@ }, { "cell_type": "code", - "execution_count": 99, + "execution_count": 100, "metadata": {}, "outputs": [], "source": [ @@ -1620,7 +1622,7 @@ }, { "cell_type": "code", - "execution_count": 100, + "execution_count": 101, "metadata": {}, "outputs": [], "source": [ @@ -1630,7 +1632,7 @@ }, { "cell_type": "code", - "execution_count": 101, + "execution_count": 102, "metadata": {}, "outputs": [], "source": [ @@ -1643,10 +1645,8 @@ }, { "cell_type": "code", - "execution_count": 102, - "metadata": { - "collapsed": true - }, + "execution_count": 103, + "metadata": {}, "outputs": [], "source": [ "final_model = grid_search.best_estimator_\n", @@ -1663,7 +1663,7 @@ }, { "cell_type": "code", - "execution_count": 103, + "execution_count": 104, "metadata": {}, "outputs": [], "source": [ @@ -1686,7 +1686,7 @@ }, { "cell_type": "code", - "execution_count": 104, + 
"execution_count": 105, "metadata": {}, "outputs": [], "source": [ @@ -1708,10 +1708,8 @@ }, { "cell_type": "code", - "execution_count": 105, - "metadata": { - "collapsed": true - }, + "execution_count": 106, + "metadata": {}, "outputs": [], "source": [ "my_model = full_pipeline_with_predictor" @@ -1719,10 +1717,8 @@ }, { "cell_type": "code", - "execution_count": 106, - "metadata": { - "collapsed": true - }, + "execution_count": 107, + "metadata": {}, "outputs": [], "source": [ "from sklearn.externals import joblib\n", @@ -1740,7 +1736,7 @@ }, { "cell_type": "code", - "execution_count": 107, + "execution_count": 108, "metadata": {}, "outputs": [], "source": [ @@ -1778,7 +1774,7 @@ }, { "cell_type": "code", - "execution_count": 108, + "execution_count": 109, "metadata": {}, "outputs": [], "source": [ @@ -1804,7 +1800,7 @@ }, { "cell_type": "code", - "execution_count": 109, + "execution_count": 110, "metadata": {}, "outputs": [], "source": [ @@ -1822,7 +1818,7 @@ }, { "cell_type": "code", - "execution_count": 110, + "execution_count": 111, "metadata": {}, "outputs": [], "source": [ @@ -1852,7 +1848,7 @@ }, { "cell_type": "code", - "execution_count": 111, + "execution_count": 112, "metadata": {}, "outputs": [], "source": [ @@ -1885,7 +1881,7 @@ }, { "cell_type": "code", - "execution_count": 112, + "execution_count": 113, "metadata": {}, "outputs": [], "source": [ @@ -1903,7 +1899,7 @@ }, { "cell_type": "code", - "execution_count": 113, + "execution_count": 114, "metadata": {}, "outputs": [], "source": [ @@ -1926,7 +1922,7 @@ }, { "cell_type": "code", - "execution_count": 114, + "execution_count": 115, "metadata": {}, "outputs": [], "source": [ @@ -1951,7 +1947,7 @@ }, { "cell_type": "code", - "execution_count": 115, + "execution_count": 116, "metadata": {}, "outputs": [], "source": [ @@ -1990,10 +1986,8 @@ }, { "cell_type": "code", - "execution_count": 116, - "metadata": { - "collapsed": true - }, + "execution_count": 117, + "metadata": {}, "outputs": [], "source": [ "from sklearn.base import BaseEstimator, TransformerMixin\n", @@ -2028,10 +2022,8 @@ }, { "cell_type": "code", - "execution_count": 117, - "metadata": { - "collapsed": true - }, + "execution_count": 118, + "metadata": {}, "outputs": [], "source": [ "k = 5" @@ -2046,7 +2038,7 @@ }, { "cell_type": "code", - "execution_count": 118, + "execution_count": 119, "metadata": {}, "outputs": [], "source": [ @@ -2056,7 +2048,7 @@ }, { "cell_type": "code", - "execution_count": 119, + "execution_count": 120, "metadata": {}, "outputs": [], "source": [ @@ -2072,7 +2064,7 @@ }, { "cell_type": "code", - "execution_count": 120, + "execution_count": 121, "metadata": {}, "outputs": [], "source": [ @@ -2088,10 +2080,8 @@ }, { "cell_type": "code", - "execution_count": 121, - "metadata": { - "collapsed": true - }, + "execution_count": 122, + "metadata": {}, "outputs": [], "source": [ "preparation_and_feature_selection_pipeline = Pipeline([\n", @@ -2102,10 +2092,8 @@ }, { "cell_type": "code", - "execution_count": 122, - "metadata": { - "collapsed": true - }, + "execution_count": 123, + "metadata": {}, "outputs": [], "source": [ "housing_prepared_top_k_features = preparation_and_feature_selection_pipeline.fit_transform(housing)" @@ -2120,7 +2108,7 @@ }, { "cell_type": "code", - "execution_count": 123, + "execution_count": 124, "metadata": {}, "outputs": [], "source": [ @@ -2136,7 +2124,7 @@ }, { "cell_type": "code", - "execution_count": 124, + "execution_count": 125, "metadata": {}, "outputs": [], "source": [ @@ -2166,10 +2154,8 @@ }, { "cell_type": 
"code", - "execution_count": 125, - "metadata": { - "collapsed": true - }, + "execution_count": 126, + "metadata": {}, "outputs": [], "source": [ "prepare_select_and_predict_pipeline = Pipeline([\n", @@ -2181,7 +2167,7 @@ }, { "cell_type": "code", - "execution_count": 126, + "execution_count": 127, "metadata": {}, "outputs": [], "source": [ @@ -2197,7 +2183,7 @@ }, { "cell_type": "code", - "execution_count": 127, + "execution_count": 128, "metadata": {}, "outputs": [], "source": [ @@ -2231,13 +2217,13 @@ }, { "cell_type": "code", - "execution_count": 128, + "execution_count": 129, "metadata": {}, "outputs": [], "source": [ "param_grid = [\n", " {'preparation__num_pipeline__imputer__strategy': ['mean', 'median', 'most_frequent'],\n", - " 'feature_selection__k': [3, 4, 5, 6, 7]}\n", + " 'feature_selection__k': list(range(1, len(feature_importances) + 1))}\n", "]\n", "\n", "grid_search_prep = GridSearchCV(prepare_select_and_predict_pipeline, param_grid, cv=5,\n", @@ -2247,7 +2233,7 @@ }, { "cell_type": "code", - "execution_count": 129, + "execution_count": 130, "metadata": {}, "outputs": [], "source": [ @@ -2258,16 +2244,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Great! It seems that we had the right imputer strategy (median), and apparently only the top 7 features are useful (out of 9), the last 2 seem to just add some noise." - ] - }, - { - "cell_type": "code", - "execution_count": 130, - "metadata": {}, - "outputs": [], - "source": [ - "housing.shape" + "The best imputer strategy is `most_frequent` and apparently almost all features are useful (15 out of 16). The last one (`ISLAND`) seems to just add some noise." ] }, { @@ -2294,7 +2271,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.2" + "version": "3.6.4" }, "nav_menu": { "height": "279px", diff --git a/03_classification.ipynb b/03_classification.ipynb index 1e7960a..284d0a6 100644 --- a/03_classification.ipynb +++ b/03_classification.ipynb @@ -241,7 +241,7 @@ "source": [ "from sklearn.linear_model import SGDClassifier\n", "\n", - "sgd_clf = SGDClassifier(random_state=42)\n", + "sgd_clf = SGDClassifier(max_iter=5, random_state=42)\n", "sgd_clf.fit(X_train, y_train_5)" ] }, @@ -766,7 +766,7 @@ "outputs": [], "source": [ "from sklearn.multiclass import OneVsOneClassifier\n", - "ovo_clf = OneVsOneClassifier(SGDClassifier(random_state=42))\n", + "ovo_clf = OneVsOneClassifier(SGDClassifier(max_iter=5, random_state=42))\n", "ovo_clf.fit(X_train, y_train)\n", "ovo_clf.predict([some_digit])" ] @@ -948,7 +948,7 @@ "metadata": {}, "outputs": [], "source": [ - "y_train_knn_pred = cross_val_predict(knn_clf, X_train, y_multilabel, cv=3)\n", + "y_train_knn_pred = cross_val_predict(knn_clf, X_train, y_multilabel, cv=3, n_jobs=-1)\n", "f1_score(y_multilabel, y_train_knn_pred, average=\"macro\")" ] }, @@ -1185,7 +1185,7 @@ "param_grid = [{'weights': [\"uniform\", \"distance\"], 'n_neighbors': [3, 4, 5]}]\n", "\n", "knn_clf = KNeighborsClassifier()\n", - "grid_search = GridSearchCV(knn_clf, param_grid, cv=5, verbose=3)\n", + "grid_search = GridSearchCV(knn_clf, param_grid, cv=5, verbose=3, n_jobs=-1)\n", "grid_search.fit(X_train, y_train)" ] }, @@ -2281,7 +2281,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "It seems that the ham emails are more often plain text, while spam has quite a lot of HTML. Moreover, quite a few ham emails are signed using PGP, while no spam is. In short, it seems that the email structure is a usual information to have." 
+ "It seems that the ham emails are more often plain text, while spam has quite a lot of HTML. Moreover, quite a few ham emails are signed using PGP, while no spam is. In short, it seems that the email structure is useful information to have." ] }, { @@ -2714,8 +2714,8 @@ "\n", "y_pred = log_clf.predict(X_test_transformed)\n", "\n", - "print(\"Precision: {:.2f}%\".format(precision_score(y_test, y_pred)))\n", - "print(\"Recall: {:.2f}%\".format(recall_score(y_test, y_pred)))" + "print(\"Precision: {:.2f}%\".format(100 * precision_score(y_test, y_pred)))\n", + "print(\"Recall: {:.2f}%\".format(100 * recall_score(y_test, y_pred)))" ] } ], diff --git a/04_training_linear_models.ipynb b/04_training_linear_models.ipynb index c0bea14..1845e8e 100644 --- a/04_training_linear_models.ipynb +++ b/04_training_linear_models.ipynb @@ -2,40 +2,28 @@ "cells": [ { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "**Chapter 4 – Training Linear Models**" ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "_This notebook contains all the sample code and solutions to the exercises in chapter 4._" ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "# Setup" ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "First, let's make sure this notebook works well in both python 2 and 3, import a few common modules, ensure MatplotLib plots figures inline and prepare a function to save the figures:" ] @@ -43,11 +31,7 @@ { "cell_type": "code", "execution_count": 1, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "# To support both python 2 and python 3\n", @@ -77,15 +61,16 @@ " print(\"Saving figure\", fig_id)\n", " if tight_layout:\n", " plt.tight_layout()\n", - " plt.savefig(path, format='png', dpi=300)\n" + " plt.savefig(path, format='png', dpi=300)\n", + "\n", + "# Ignore useless warnings (see SciPy issue #5998)\n", + "import warnings\n", + "warnings.filterwarnings(action=\"ignore\", module=\"scipy\", message=\"^internal gelsd\")" ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "# Linear regression using the Normal Equation" ] @@ -93,11 +78,7 @@ { "cell_type": "code", "execution_count": 2, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", @@ -109,11 +90,7 @@ { "cell_type": "code", "execution_count": 3, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "plt.plot(X, y, \"b.\")\n", @@ -127,11 +104,7 @@ { "cell_type": "code", "execution_count": 4, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "X_b = np.c_[np.ones((100, 1)), X] # add x0 = 1 to each instance\n", @@ -141,11 +114,7 @@ { "cell_type": "code", "execution_count": 5, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "theta_best" @@ -154,11 +123,7 @@ { "cell_type": "code", "execution_count": 6, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, 
"outputs": [], "source": [ "X_new = np.array([[0], [2]])\n", @@ -170,11 +135,7 @@ { "cell_type": "code", "execution_count": 7, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "plt.plot(X_new, y_predict, \"r-\")\n", @@ -185,10 +146,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "The figure in the book actually corresponds to the following code, with a legend and axis labels:" ] @@ -196,11 +154,7 @@ { "cell_type": "code", "execution_count": 8, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "plt.plot(X_new, y_predict, \"r-\", linewidth=2, label=\"Predictions\")\n", @@ -216,11 +170,7 @@ { "cell_type": "code", "execution_count": 9, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.linear_model import LinearRegression\n", @@ -232,11 +182,7 @@ { "cell_type": "code", "execution_count": 10, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "lin_reg.predict(X_new)" @@ -244,22 +190,55 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, + "source": [ + "The `LinearRegression` class is based on the `scipy.linalg.lstsq()` function (the name stands for \"least squares\"), which you could call directly:" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "theta_best_svd, residuals, rank, s = np.linalg.lstsq(X_b, y, rcond=1e-6)\n", + "theta_best_svd" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This function computes $\\mathbf{X}^+\\mathbf{y}$, where $\\mathbf{X}^{+}$ is the _pseudoinverse_ of $\\mathbf{X}$ (specifically the Moore-Penrose inverse). You can use `np.linalg.pinv()` to compute the pseudoinverse directly:" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "np.linalg.pinv(X_b).dot(y)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Note**: the first releases of the book implied that the `LinearRegression` class was based on the Normal Equation. This was an error, my apologies: as explained above, it is based on the pseudoinverse, which ultimately relies on the SVD matrix decomposition of $\\mathbf{X}$ (see chapter 8 for details about the SVD decomposition). Its time complexity is $O(n^2)$ and it works even when $m < n$ or when some features are linear combinations of other features (in these cases, $\\mathbf{X}^T \\mathbf{X}$ is not invertible so the Normal Equation fails), see [issue #184](https://github.com/ageron/handson-ml/issues/184) for more details. However, this does not change the rest of the description of the `LinearRegression` class, in particular, it is based on an analytical solution, it does not scale well with the number of features, it scales linearly with the number of instances, all the data must fit in memory, it does not require feature scaling and the order of the instances in the training set does not matter." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, "source": [ "# Linear regression using batch gradient descent" ] }, { "cell_type": "code", - "execution_count": 11, - "metadata": { - "collapsed": true, - "deletable": true, - "editable": true - }, + "execution_count": 13, + "metadata": {}, "outputs": [], "source": [ "eta = 0.1\n", @@ -274,12 +253,8 @@ }, { "cell_type": "code", - "execution_count": 12, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 14, + "metadata": {}, "outputs": [], "source": [ "theta" @@ -287,12 +262,8 @@ }, { "cell_type": "code", - "execution_count": 13, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 15, + "metadata": {}, "outputs": [], "source": [ "X_new_b.dot(theta)" @@ -300,12 +271,8 @@ }, { "cell_type": "code", - "execution_count": 14, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 16, + "metadata": {}, "outputs": [], "source": [ "theta_path_bgd = []\n", @@ -325,29 +292,13 @@ " theta_path.append(theta)\n", " plt.xlabel(\"$x_1$\", fontsize=18)\n", " plt.axis([0, 2, 0, 15])\n", - " plt.title(r\"$\\eta = {}$\".format(eta), fontsize=16)\n", - "\n", - "np.random.seed(42)\n", - "theta = np.random.randn(2,1) # random initialization\n", - "\n", - "plt.figure(figsize=(10,4))\n", - "plt.subplot(131); plot_gradient_descent(theta, eta=0.02)\n", - "plt.ylabel(\"$y$\", rotation=0, fontsize=18)\n", - "plt.subplot(132); plot_gradient_descent(theta, eta=0.1, theta_path=theta_path_bgd)\n", - "plt.subplot(133); plot_gradient_descent(theta, eta=0.5)\n", - "\n", - "save_fig(\"gradient_descent_plot\")\n", - "plt.show()" + " plt.title(r\"$\\eta = {}$\".format(eta), fontsize=16)" ] }, { "cell_type": "code", - "execution_count": 15, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 17, + "metadata": {}, "outputs": [], "source": [ "np.random.seed(42)\n", @@ -365,22 +316,15 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "# Stochastic Gradient Descent" ] }, { "cell_type": "code", - "execution_count": 16, - "metadata": { - "collapsed": true, - "deletable": true, - "editable": true - }, + "execution_count": 18, + "metadata": {}, "outputs": [], "source": [ "theta_path_sgd = []\n", @@ -390,12 +334,8 @@ }, { "cell_type": "code", - "execution_count": 17, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 19, + "metadata": {}, "outputs": [], "source": [ "n_epochs = 50\n", @@ -430,12 +370,8 @@ }, { "cell_type": "code", - "execution_count": 18, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 20, + "metadata": {}, "outputs": [], "source": [ "theta" @@ -443,27 +379,19 @@ }, { "cell_type": "code", - "execution_count": 19, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 21, + "metadata": {}, "outputs": [], "source": [ "from sklearn.linear_model import SGDRegressor\n", - "sgd_reg = SGDRegressor(n_iter=50, penalty=None, eta0=0.1, random_state=42)\n", + "sgd_reg = SGDRegressor(max_iter=50, penalty=None, eta0=0.1, random_state=42)\n", "sgd_reg.fit(X, y.ravel())" ] }, { "cell_type": "code", - "execution_count": 20, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 22, + "metadata": {}, 
"outputs": [], "source": [ "sgd_reg.intercept_, sgd_reg.coef_" @@ -471,22 +399,15 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "# Mini-batch gradient descent" ] }, { "cell_type": "code", - "execution_count": 21, - "metadata": { - "collapsed": true, - "deletable": true, - "editable": true - }, + "execution_count": 23, + "metadata": {}, "outputs": [], "source": [ "theta_path_mgd = []\n", @@ -497,7 +418,7 @@ "np.random.seed(42)\n", "theta = np.random.randn(2,1) # random initialization\n", "\n", - "t0, t1 = 10, 1000\n", + "t0, t1 = 200, 1000\n", "def learning_schedule(t):\n", " return t0 / (t + t1)\n", "\n", @@ -510,7 +431,7 @@ " t += 1\n", " xi = X_b_shuffled[i:i+minibatch_size]\n", " yi = y_shuffled[i:i+minibatch_size]\n", - " gradients = 2 * xi.T.dot(xi.dot(theta) - yi)\n", + " gradients = 2/minibatch_size * xi.T.dot(xi.dot(theta) - yi)\n", " eta = learning_schedule(t)\n", " theta = theta - eta * gradients\n", " theta_path_mgd.append(theta)" @@ -518,12 +439,8 @@ }, { "cell_type": "code", - "execution_count": 22, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 24, + "metadata": {}, "outputs": [], "source": [ "theta" @@ -531,12 +448,8 @@ }, { "cell_type": "code", - "execution_count": 23, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 25, + "metadata": {}, "outputs": [], "source": [ "theta_path_bgd = np.array(theta_path_bgd)\n", @@ -546,12 +459,8 @@ }, { "cell_type": "code", - "execution_count": 24, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 26, + "metadata": {}, "outputs": [], "source": [ "plt.figure(figsize=(7,4))\n", @@ -568,22 +477,15 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "# Polynomial regression" ] }, { "cell_type": "code", - "execution_count": 25, - "metadata": { - "collapsed": true, - "deletable": true, - "editable": true - }, + "execution_count": 27, + "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", @@ -594,12 +496,8 @@ }, { "cell_type": "code", - "execution_count": 26, - "metadata": { - "collapsed": true, - "deletable": true, - "editable": true - }, + "execution_count": 28, + "metadata": {}, "outputs": [], "source": [ "m = 100\n", @@ -609,12 +507,8 @@ }, { "cell_type": "code", - "execution_count": 27, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 29, + "metadata": {}, "outputs": [], "source": [ "plt.plot(X, y, \"b.\")\n", @@ -627,12 +521,8 @@ }, { "cell_type": "code", - "execution_count": 28, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 30, + "metadata": {}, "outputs": [], "source": [ "from sklearn.preprocessing import PolynomialFeatures\n", @@ -643,12 +533,8 @@ }, { "cell_type": "code", - "execution_count": 29, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 31, + "metadata": {}, "outputs": [], "source": [ "X_poly[0]" @@ -656,12 +542,8 @@ }, { "cell_type": "code", - "execution_count": 30, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 32, + "metadata": {}, "outputs": [], "source": [ "lin_reg = LinearRegression()\n", @@ -671,12 +553,8 @@ }, { "cell_type": "code", - "execution_count": 31, - "metadata": { - 
"collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 33, + "metadata": {}, "outputs": [], "source": [ "X_new=np.linspace(-3, 3, 100).reshape(100, 1)\n", @@ -694,12 +572,8 @@ }, { "cell_type": "code", - "execution_count": 32, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 34, + "metadata": {}, "outputs": [], "source": [ "from sklearn.preprocessing import StandardScaler\n", @@ -729,12 +603,8 @@ }, { "cell_type": "code", - "execution_count": 33, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 35, + "metadata": {}, "outputs": [], "source": [ "from sklearn.metrics import mean_squared_error\n", @@ -747,8 +617,8 @@ " model.fit(X_train[:m], y_train[:m])\n", " y_train_predict = model.predict(X_train[:m])\n", " y_val_predict = model.predict(X_val)\n", - " train_errors.append(mean_squared_error(y_train_predict, y_train[:m]))\n", - " val_errors.append(mean_squared_error(y_val_predict, y_val))\n", + " train_errors.append(mean_squared_error(y_train[:m], y_train_predict))\n", + " val_errors.append(mean_squared_error(y_val, y_val_predict))\n", "\n", " plt.plot(np.sqrt(train_errors), \"r-+\", linewidth=2, label=\"train\")\n", " plt.plot(np.sqrt(val_errors), \"b-\", linewidth=3, label=\"val\")\n", @@ -759,12 +629,8 @@ }, { "cell_type": "code", - "execution_count": 34, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 36, + "metadata": {}, "outputs": [], "source": [ "lin_reg = LinearRegression()\n", @@ -776,12 +642,8 @@ }, { "cell_type": "code", - "execution_count": 35, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 37, + "metadata": {}, "outputs": [], "source": [ "from sklearn.pipeline import Pipeline\n", @@ -799,22 +661,15 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "# Regularized models" ] }, { "cell_type": "code", - "execution_count": 36, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 38, + "metadata": {}, "outputs": [], "source": [ "from sklearn.linear_model import Ridge\n", @@ -856,12 +711,8 @@ }, { "cell_type": "code", - "execution_count": 37, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 39, + "metadata": {}, "outputs": [], "source": [ "from sklearn.linear_model import Ridge\n", @@ -872,27 +723,19 @@ }, { "cell_type": "code", - "execution_count": 38, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 40, + "metadata": {}, "outputs": [], "source": [ - "sgd_reg = SGDRegressor(penalty=\"l2\", random_state=42)\n", + "sgd_reg = SGDRegressor(max_iter=5, penalty=\"l2\", random_state=42)\n", "sgd_reg.fit(X, y.ravel())\n", "sgd_reg.predict([[1.5]])" ] }, { "cell_type": "code", - "execution_count": 39, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 41, + "metadata": {}, "outputs": [], "source": [ "ridge_reg = Ridge(alpha=1, solver=\"sag\", random_state=42)\n", @@ -902,12 +745,8 @@ }, { "cell_type": "code", - "execution_count": 40, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 42, + "metadata": {}, "outputs": [], "source": [ "from sklearn.linear_model import Lasso\n", @@ -925,12 +764,8 @@ }, { 
"cell_type": "code", - "execution_count": 41, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 43, + "metadata": {}, "outputs": [], "source": [ "from sklearn.linear_model import Lasso\n", @@ -941,12 +776,8 @@ }, { "cell_type": "code", - "execution_count": 42, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 44, + "metadata": {}, "outputs": [], "source": [ "from sklearn.linear_model import ElasticNet\n", @@ -957,11 +788,8 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 45, "metadata": { - "collapsed": false, - "deletable": true, - "editable": true, "scrolled": true }, "outputs": [], @@ -981,7 +809,7 @@ "X_train_poly_scaled = poly_scaler.fit_transform(X_train)\n", "X_val_poly_scaled = poly_scaler.transform(X_val)\n", "\n", - "sgd_reg = SGDRegressor(n_iter=1,\n", + "sgd_reg = SGDRegressor(max_iter=1,\n", " penalty=None,\n", " eta0=0.0005,\n", " warm_start=True,\n", @@ -994,8 +822,8 @@ " sgd_reg.fit(X_train_poly_scaled, y_train)\n", " y_train_predict = sgd_reg.predict(X_train_poly_scaled)\n", " y_val_predict = sgd_reg.predict(X_val_poly_scaled)\n", - " train_errors.append(mean_squared_error(y_train_predict, y_train))\n", - " val_errors.append(mean_squared_error(y_val_predict, y_val))\n", + " train_errors.append(mean_squared_error(y_train, y_train_predict))\n", + " val_errors.append(mean_squared_error(y_val, y_val_predict))\n", "\n", "best_epoch = np.argmin(val_errors)\n", "best_val_rmse = np.sqrt(val_errors[best_epoch])\n", @@ -1021,16 +849,12 @@ }, { "cell_type": "code", - "execution_count": 44, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 46, + "metadata": {}, "outputs": [], "source": [ "from sklearn.base import clone\n", - "sgd_reg = SGDRegressor(n_iter=1, warm_start=True, penalty=None,\n", + "sgd_reg = SGDRegressor(max_iter=1, warm_start=True, penalty=None,\n", " learning_rate=\"constant\", eta0=0.0005, random_state=42)\n", "\n", "minimum_val_error = float(\"inf\")\n", @@ -1039,7 +863,7 @@ "for epoch in range(1000):\n", " sgd_reg.fit(X_train_poly_scaled, y_train) # continues where it left off\n", " y_val_predict = sgd_reg.predict(X_val_poly_scaled)\n", - " val_error = mean_squared_error(y_val_predict, y_val)\n", + " val_error = mean_squared_error(y_val, y_val_predict)\n", " if val_error < minimum_val_error:\n", " minimum_val_error = val_error\n", " best_epoch = epoch\n", @@ -1048,12 +872,8 @@ }, { "cell_type": "code", - "execution_count": 45, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 47, + "metadata": {}, "outputs": [], "source": [ "best_epoch, best_model" @@ -1061,12 +881,8 @@ }, { "cell_type": "code", - "execution_count": 46, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 48, + "metadata": {}, "outputs": [], "source": [ "%matplotlib inline\n", @@ -1076,12 +892,8 @@ }, { "cell_type": "code", - "execution_count": 47, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 49, + "metadata": {}, "outputs": [], "source": [ "t1a, t1b, t2a, t2b = -1, 3, -1.5, 1.5\n", @@ -1107,12 +919,8 @@ }, { "cell_type": "code", - "execution_count": 48, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 50, + "metadata": {}, "outputs": [], "source": [ "def bgd_path(theta, X, y, l1, l2, core = 1, eta = 
0.1, n_iterations = 50):\n", @@ -1150,6 +958,9 @@ " plt.plot(t1_min, t2_min, \"rs\")\n", " plt.title(r\"$\\ell_{}$ penalty\".format(i + 1), fontsize=16)\n", " plt.axis([t1a, t1b, t2a, t2b])\n", + " if i == 1:\n", + " plt.xlabel(r\"$\\theta_1$\", fontsize=20)\n", + " plt.ylabel(r\"$\\theta_2$\", fontsize=20, rotation=0)\n", "\n", " plt.subplot(222 + i * 2)\n", " plt.grid(True)\n", @@ -1160,14 +971,8 @@ " plt.plot(t1r_min, t2r_min, \"rs\")\n", " plt.title(title, fontsize=16)\n", " plt.axis([t1a, t1b, t2a, t2b])\n", - "\n", - "for subplot in (221, 223):\n", - " plt.subplot(subplot)\n", - " plt.ylabel(r\"$\\theta_2$\", fontsize=20, rotation=0)\n", - "\n", - "for subplot in (223, 224):\n", - " plt.subplot(subplot)\n", - " plt.xlabel(r\"$\\theta_1$\", fontsize=20)\n", + " if i == 1:\n", + " plt.xlabel(r\"$\\theta_1$\", fontsize=20)\n", "\n", "save_fig(\"lasso_vs_ridge_plot\")\n", "plt.show()" @@ -1175,22 +980,15 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "# Logistic regression" ] }, { "cell_type": "code", - "execution_count": 49, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 51, + "metadata": {}, "outputs": [], "source": [ "t = np.linspace(-10, 10, 100)\n", @@ -1210,12 +1008,8 @@ }, { "cell_type": "code", - "execution_count": 50, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 52, + "metadata": {}, "outputs": [], "source": [ "from sklearn import datasets\n", @@ -1225,12 +1019,8 @@ }, { "cell_type": "code", - "execution_count": 51, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 53, + "metadata": {}, "outputs": [], "source": [ "print(iris.DESCR)" @@ -1238,12 +1028,8 @@ }, { "cell_type": "code", - "execution_count": 52, - "metadata": { - "collapsed": true, - "deletable": true, - "editable": true - }, + "execution_count": 54, + "metadata": {}, "outputs": [], "source": [ "X = iris[\"data\"][:, 3:] # petal width\n", @@ -1252,12 +1038,8 @@ }, { "cell_type": "code", - "execution_count": 53, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 55, + "metadata": {}, "outputs": [], "source": [ "from sklearn.linear_model import LogisticRegression\n", @@ -1267,12 +1049,8 @@ }, { "cell_type": "code", - "execution_count": 54, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 56, + "metadata": {}, "outputs": [], "source": [ "X_new = np.linspace(0, 3, 1000).reshape(-1, 1)\n", @@ -1284,22 +1062,15 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "The figure in the book actually is actually a bit fancier:" ] }, { "cell_type": "code", - "execution_count": 55, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 57, + "metadata": {}, "outputs": [], "source": [ "X_new = np.linspace(0, 3, 1000).reshape(-1, 1)\n", @@ -1325,12 +1096,8 @@ }, { "cell_type": "code", - "execution_count": 56, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 58, + "metadata": {}, "outputs": [], "source": [ "decision_boundary" @@ -1338,12 +1105,8 @@ }, { "cell_type": "code", - "execution_count": 57, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 59, + 
"metadata": {}, "outputs": [], "source": [ "log_reg.predict([[1.7], [1.5]])" @@ -1351,12 +1114,8 @@ }, { "cell_type": "code", - "execution_count": 58, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 60, + "metadata": {}, "outputs": [], "source": [ "from sklearn.linear_model import LogisticRegression\n", @@ -1399,12 +1158,8 @@ }, { "cell_type": "code", - "execution_count": 59, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 61, + "metadata": {}, "outputs": [], "source": [ "X = iris[\"data\"][:, (2, 3)] # petal length, petal width\n", @@ -1416,12 +1171,8 @@ }, { "cell_type": "code", - "execution_count": 60, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 62, + "metadata": {}, "outputs": [], "source": [ "x0, x1 = np.meshgrid(\n", @@ -1445,7 +1196,7 @@ "from matplotlib.colors import ListedColormap\n", "custom_cmap = ListedColormap(['#fafab0','#9898ff','#a0faa0'])\n", "\n", - "plt.contourf(x0, x1, zz, cmap=custom_cmap, linewidth=5)\n", + "plt.contourf(x0, x1, zz, cmap=custom_cmap)\n", "contour = plt.contour(x0, x1, zz1, cmap=plt.cm.brg)\n", "plt.clabel(contour, inline=1, fontsize=12)\n", "plt.xlabel(\"Petal length\", fontsize=14)\n", @@ -1458,12 +1209,8 @@ }, { "cell_type": "code", - "execution_count": 61, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 63, + "metadata": {}, "outputs": [], "source": [ "softmax_reg.predict([[5, 2]])" @@ -1471,12 +1218,8 @@ }, { "cell_type": "code", - "execution_count": 62, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 64, + "metadata": {}, "outputs": [], "source": [ "softmax_reg.predict_proba([[5, 2]])" @@ -1484,40 +1227,28 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "# Exercise solutions" ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "## 1. to 11." ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "See appendix A." ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "## 12. Batch Gradient Descent with early stopping for Softmax Regression\n", "(without using Scikit-Learn)" @@ -1525,22 +1256,15 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Let's start by loading the data. We will just reuse the Iris dataset we loaded earlier." 
] }, { "cell_type": "code", - "execution_count": 63, - "metadata": { - "collapsed": true, - "deletable": true, - "editable": true - }, + "execution_count": 65, + "metadata": {}, "outputs": [], "source": [ "X = iris[\"data\"][:, (2, 3)] # petal length, petal width\n", @@ -1549,22 +1273,15 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "We need to add the bias term for every instance ($x_0 = 1$):" ] }, { "cell_type": "code", - "execution_count": 64, - "metadata": { - "collapsed": true, - "deletable": true, - "editable": true - }, + "execution_count": 66, + "metadata": {}, "outputs": [], "source": [ "X_with_bias = np.c_[np.ones([len(X), 1]), X]" @@ -1572,22 +1289,15 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "And let's set the random seed so the output of this exercise solution is reproducible:" ] }, { "cell_type": "code", - "execution_count": 65, - "metadata": { - "collapsed": true, - "deletable": true, - "editable": true - }, + "execution_count": 67, + "metadata": {}, "outputs": [], "source": [ "np.random.seed(2042)" @@ -1595,22 +1305,15 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "The easiest option to split the dataset into a training set, a validation set and a test set would be to use Scikit-Learn's `train_test_split()` function, but the point of this exercise is to try understand the algorithms by implementing them manually. So here is one possible implementation:" ] }, { "cell_type": "code", - "execution_count": 66, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 68, + "metadata": {}, "outputs": [], "source": [ "test_ratio = 0.2\n", @@ -1633,22 +1336,15 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "The targets are currently class indices (0, 1 or 2), but we need target class probabilities to train the Softmax Regression model. Each instance will have target class probabilities equal to 0.0 for all classes except for the target class which will have a probability of 1.0 (in other words, the vector of class probabilities for ay given instance is a one-hot vector). 
Let's write a small function to convert the vector of class indices into a matrix containing a one-hot vector for each instance:" ] }, { "cell_type": "code", - "execution_count": 67, - "metadata": { - "collapsed": true, - "deletable": true, - "editable": true - }, + "execution_count": 69, + "metadata": {}, "outputs": [], "source": [ "def to_one_hot(y):\n", @@ -1661,22 +1357,15 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Let's test this function on the first 10 instances:" ] }, { "cell_type": "code", - "execution_count": 68, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 70, + "metadata": {}, "outputs": [], "source": [ "y_train[:10]" @@ -1684,12 +1373,8 @@ }, { "cell_type": "code", - "execution_count": 69, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 71, + "metadata": {}, "outputs": [], "source": [ "to_one_hot(y_train[:10])" @@ -1697,22 +1382,15 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Looks good, so let's create the target class probabilities matrix for the training set and the test set:" ] }, { "cell_type": "code", - "execution_count": 70, - "metadata": { - "collapsed": true, - "deletable": true, - "editable": true - }, + "execution_count": 72, + "metadata": {}, "outputs": [], "source": [ "Y_train_one_hot = to_one_hot(y_train)\n", @@ -1722,10 +1400,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Now let's implement the Softmax function. Recall that it is defined by the following equation:\n", "\n", @@ -1734,12 +1409,8 @@ }, { "cell_type": "code", - "execution_count": 71, - "metadata": { - "collapsed": true, - "deletable": true, - "editable": true - }, + "execution_count": 73, + "metadata": {}, "outputs": [], "source": [ "def softmax(logits):\n", @@ -1750,22 +1421,15 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "We are almost ready to start training. Let's define the number of inputs and outputs:" ] }, { "cell_type": "code", - "execution_count": 72, - "metadata": { - "collapsed": true, - "deletable": true, - "editable": true - }, + "execution_count": 74, + "metadata": {}, "outputs": [], "source": [ "n_inputs = X_train.shape[1] # == 3 (2 features plus the bias term)\n", @@ -1774,10 +1438,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Now here comes the hardest part: training! Theoretically, it's simple: it's just a matter of translating the math equations into Python code. But in practice, it can be quite tricky: in particular, it's easy to mix up the order of the terms, or the indices. You can even end up with code that looks like it's working but is actually not computing exactly the right thing. When unsure, you should write down the shape of each term in the equation and make sure the corresponding terms in your code match closely. It can also help to evaluate each term independently and print them out. 
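To make that tip concrete, here is a tiny self-contained sketch (toy random data, not part of the original notebook) of the kind of shape bookkeeping involved: the batch gradient `1/m * X.T.dot(Y_proba - Y_one_hot)` must come out with the same shape as `Theta`, namely (n_inputs, n_outputs).

import numpy as np

m, n_inputs, n_outputs = 5, 3, 3                     # toy sizes, not the real split
X_toy = np.random.rand(m, n_inputs)                  # (m, n_inputs), bias column included
Theta_toy = np.random.randn(n_inputs, n_outputs)     # (n_inputs, n_outputs)
Y_toy = np.eye(n_outputs)[np.random.randint(n_outputs, size=m)]  # one-hot targets, (m, n_outputs)

logits = X_toy.dot(Theta_toy)                        # (m, n_outputs)
exps = np.exp(logits - logits.max(axis=1, keepdims=True))        # max subtracted only for numerical stability
Y_proba_toy = exps / exps.sum(axis=1, keepdims=True)             # (m, n_outputs), rows sum to 1
gradients = 1/m * X_toy.T.dot(Y_proba_toy - Y_toy)   # (n_inputs, n_outputs), same shape as Theta_toy

print(Theta_toy.shape, gradients.shape)              # both (3, 3)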
The good news is that you won't have to do this every day, since all this is well implemented by Scikit-Learn, but it will help you understand what's going on under the hood.\n", "\n", @@ -1795,12 +1456,8 @@ }, { "cell_type": "code", - "execution_count": 73, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 75, + "metadata": {}, "outputs": [], "source": [ "eta = 0.01\n", @@ -1823,22 +1480,15 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "And that's it! The Softmax model is trained. Let's look at the model parameters:" ] }, { "cell_type": "code", - "execution_count": 74, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 76, + "metadata": {}, "outputs": [], "source": [ "Theta" @@ -1846,22 +1496,15 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Let's make predictions for the validation set and check the accuracy score:" ] }, { "cell_type": "code", - "execution_count": 75, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 77, + "metadata": {}, "outputs": [], "source": [ "logits = X_valid.dot(Theta)\n", @@ -1874,22 +1517,15 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Well, this model looks pretty good. For the sake of the exercise, let's add a bit of $\\ell_2$ regularization. The following training code is similar to the one above, but the loss now has an additional $\\ell_2$ penalty, and the gradients have the proper additional term (note that we don't regularize the first element of `Theta` since this corresponds to the bias term). Also, let's try increasing the learning rate `eta`." ] }, { "cell_type": "code", - "execution_count": 76, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 78, + "metadata": {}, "outputs": [], "source": [ "eta = 0.1\n", @@ -1909,28 +1545,21 @@ " error = Y_proba - Y_train_one_hot\n", " if iteration % 500 == 0:\n", " print(iteration, loss)\n", - " gradients = 1/m * X_train.T.dot(error) + np.r_[np.zeros([1, n_inputs]), alpha * Theta[1:]]\n", + " gradients = 1/m * X_train.T.dot(error) + np.r_[np.zeros([1, n_outputs]), alpha * Theta[1:]]\n", " Theta = Theta - eta * gradients" ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Because of the additional $\\ell_2$ penalty, the loss seems greater than earlier, but perhaps this model will perform better? Let's find out:" ] }, { "cell_type": "code", - "execution_count": 77, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 79, + "metadata": {}, "outputs": [], "source": [ "logits = X_valid.dot(Theta)\n", @@ -1943,32 +1572,22 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Cool, perfect accuracy! We probably just got lucky with this validation set, but still, it's pleasant." ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Now let's add early stopping. For this we just need to measure the loss on the validation set at every iteration and stop when the error starts growing." 
] }, { "cell_type": "code", - "execution_count": 78, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 80, + "metadata": {}, "outputs": [], "source": [ "eta = 0.1 \n", @@ -1987,7 +1606,7 @@ " l2_loss = 1/2 * np.sum(np.square(Theta[1:]))\n", " loss = xentropy_loss + alpha * l2_loss\n", " error = Y_proba - Y_train_one_hot\n", - " gradients = 1/m * X_train.T.dot(error) + np.r_[np.zeros([1, n_inputs]), alpha * Theta[1:]]\n", + " gradients = 1/m * X_train.T.dot(error) + np.r_[np.zeros([1, n_outputs]), alpha * Theta[1:]]\n", " Theta = Theta - eta * gradients\n", "\n", " logits = X_valid.dot(Theta)\n", @@ -2007,12 +1626,8 @@ }, { "cell_type": "code", - "execution_count": 79, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 81, + "metadata": {}, "outputs": [], "source": [ "logits = X_valid.dot(Theta)\n", @@ -2025,32 +1640,22 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Still perfect, but faster." ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Now let's plot the model's predictions on the whole dataset:" ] }, { "cell_type": "code", - "execution_count": 80, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 82, + "metadata": {}, "outputs": [], "source": [ "x0, x1 = np.meshgrid(\n", @@ -2075,7 +1680,7 @@ "from matplotlib.colors import ListedColormap\n", "custom_cmap = ListedColormap(['#fafab0','#9898ff','#a0faa0'])\n", "\n", - "plt.contourf(x0, x1, zz, cmap=custom_cmap, linewidth=5)\n", + "plt.contourf(x0, x1, zz, cmap=custom_cmap)\n", "contour = plt.contour(x0, x1, zz1, cmap=plt.cm.brg)\n", "plt.clabel(contour, inline=1, fontsize=12)\n", "plt.xlabel(\"Petal length\", fontsize=14)\n", @@ -2087,22 +1692,15 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "And now let's measure the final model's accuracy on the test set:" ] }, { "cell_type": "code", - "execution_count": 81, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 83, + "metadata": {}, "outputs": [], "source": [ "logits = X_test.dot(Theta)\n", @@ -2115,10 +1713,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Our perfect model turns out to have slight imperfections. This variability is likely due to the very small size of the dataset: depending on how you sample the training set, validation set and the test set, you can get quite different results. Try changing the random seed and running the code again a few times, you will see that the results will vary." 
] @@ -2126,11 +1721,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [] } @@ -2151,7 +1742,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.3" + "version": "3.6.4" }, "nav_menu": {}, "toc": { @@ -2165,5 +1756,5 @@ } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } diff --git a/05_support_vector_machines.ipynb b/05_support_vector_machines.ipynb index 687d74b..abbc1c1 100644 --- a/05_support_vector_machines.ipynb +++ b/05_support_vector_machines.ipynb @@ -2,10 +2,7 @@ "cells": [ { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "**Chapter 5 – Support Vector Machines**\n", "\n", @@ -14,20 +11,14 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "# Setup" ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "First, let's make sure this notebook works well in both python 2 and 3, import a few common modules, ensure MatplotLib plots figures inline and prepare a function to save the figures:" ] @@ -35,11 +26,7 @@ { "cell_type": "code", "execution_count": 1, - "metadata": { - "collapsed": true, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "# To support both python 2 and python 3\n", @@ -74,20 +61,14 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "# Large margin classification" ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "The next few code cells generate the first figures in chapter 5. 
The first actual code sample comes after:" ] @@ -95,11 +76,7 @@ { "cell_type": "code", "execution_count": 2, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.svm import SVC\n", @@ -121,11 +98,7 @@ { "cell_type": "code", "execution_count": 3, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "# Bad models\n", @@ -179,10 +152,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "# Sensitivity to feature scales" ] @@ -190,11 +160,7 @@ { "cell_type": "code", "execution_count": 4, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "Xs = np.array([[1, 50], [5, 20], [3, 80], [5, 60]]).astype(np.float64)\n", @@ -230,10 +196,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "# Sensitivity to outliers" ] @@ -241,11 +204,7 @@ { "cell_type": "code", "execution_count": 5, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "X_outliers = np.array([[3.4, 1.3], [3.2, 0.8]])\n", @@ -295,20 +254,14 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "# Large margin *vs* margin violations" ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "This is the first code example in chapter 5:" ] @@ -316,11 +269,7 @@ { "cell_type": "code", "execution_count": 6, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", @@ -344,11 +293,7 @@ { "cell_type": "code", "execution_count": 7, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "svm_clf.predict([[5.5, 1.7]])" @@ -356,10 +301,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Now let's generate the graph comparing different regularization settings:" ] @@ -367,11 +309,7 @@ { "cell_type": "code", "execution_count": 8, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "scaler = StandardScaler()\n", @@ -394,11 +332,7 @@ { "cell_type": "code", "execution_count": 9, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "# Convert to unscaled parameters\n", @@ -422,11 +356,7 @@ { "cell_type": "code", "execution_count": 10, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "plt.figure(figsize=(12,3.2))\n", @@ -454,9 +384,7 @@ { "cell_type": "markdown", "metadata": { - "collapsed": true, - "deletable": true, - "editable": true + "collapsed": true }, "source": [ "# Non-linear classification" @@ -465,11 +393,7 @@ { "cell_type": "code", "execution_count": 11, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "X1D = np.linspace(-4, 4, 9).reshape(-1, 1)\n", @@ -508,11 +432,7 @@ { "cell_type": "code", "execution_count": 12, - 
"metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.datasets import make_moons\n", @@ -533,11 +453,7 @@ { "cell_type": "code", "execution_count": 13, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.datasets import make_moons\n", @@ -556,11 +472,7 @@ { "cell_type": "code", "execution_count": 14, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "def plot_predictions(clf, axes):\n", @@ -583,11 +495,7 @@ { "cell_type": "code", "execution_count": 15, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.svm import SVC\n", @@ -602,11 +510,7 @@ { "cell_type": "code", "execution_count": 16, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "poly100_kernel_svm_clf = Pipeline([\n", @@ -619,11 +523,7 @@ { "cell_type": "code", "execution_count": 17, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "plt.figure(figsize=(11, 4))\n", @@ -646,9 +546,6 @@ "cell_type": "code", "execution_count": 18, "metadata": { - "collapsed": false, - "deletable": true, - "editable": true, "scrolled": true }, "outputs": [], @@ -716,11 +613,7 @@ { "cell_type": "code", "execution_count": 19, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "x1_example = X1D[3, 0]\n", @@ -732,11 +625,7 @@ { "cell_type": "code", "execution_count": 20, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "rbf_kernel_svm_clf = Pipeline([\n", @@ -750,9 +639,6 @@ "cell_type": "code", "execution_count": 21, "metadata": { - "collapsed": false, - "deletable": true, - "editable": true, "scrolled": true }, "outputs": [], @@ -787,10 +673,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "# Regression\n" ] @@ -798,11 +681,7 @@ { "cell_type": "code", "execution_count": 22, - "metadata": { - "collapsed": true, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "np.random.seed(42)\n", @@ -814,11 +693,7 @@ { "cell_type": "code", "execution_count": 23, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.svm import LinearSVR\n", @@ -830,11 +705,7 @@ { "cell_type": "code", "execution_count": 24, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "svm_reg1 = LinearSVR(epsilon=1.5, random_state=42)\n", @@ -857,11 +728,7 @@ { "cell_type": "code", "execution_count": 25, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "def plot_svm_regression(svm_reg, X, y, axes):\n", @@ -898,11 +765,7 @@ { "cell_type": "code", "execution_count": 26, - "metadata": { - "collapsed": true, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "np.random.seed(42)\n", @@ -914,11 +777,7 @@ { "cell_type": "code", "execution_count": 27, - "metadata": { 
- "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.svm import SVR\n", @@ -930,11 +789,7 @@ { "cell_type": "code", "execution_count": 28, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.svm import SVR\n", @@ -948,11 +803,7 @@ { "cell_type": "code", "execution_count": 29, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "plt.figure(figsize=(9, 4))\n", @@ -969,10 +820,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "# Under the hood" ] @@ -980,11 +828,7 @@ { "cell_type": "code", "execution_count": 30, - "metadata": { - "collapsed": true, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "iris = datasets.load_iris()\n", @@ -995,11 +839,7 @@ { "cell_type": "code", "execution_count": 31, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "from mpl_toolkits.mplot3d import Axes3D\n", @@ -1042,10 +882,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "# Small weight vector results in a large margin" ] @@ -1053,11 +890,7 @@ { "cell_type": "code", "execution_count": 32, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "def plot_2D_decision_function(w, b, ylabel=True, x1_lim=[-3, 3]):\n", @@ -1091,11 +924,7 @@ { "cell_type": "code", "execution_count": 33, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.svm import SVC\n", @@ -1112,10 +941,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "# Hinge loss" ] @@ -1123,11 +949,7 @@ { "cell_type": "code", "execution_count": 34, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "t = np.linspace(-2, 4, 200)\n", @@ -1148,20 +970,14 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "# Extra material" ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "## Training time" ] @@ -1169,11 +985,7 @@ { "cell_type": "code", "execution_count": 35, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "X, y = make_moons(n_samples=1000, noise=0.4, random_state=42)\n", @@ -1184,11 +996,7 @@ { "cell_type": "code", "execution_count": 36, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "import time\n", @@ -1210,10 +1018,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "## Linear SVM classifier implementation using Batch Gradient Descent" ] @@ -1221,11 +1026,7 @@ { "cell_type": "code", "execution_count": 37, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "# Training set\n", @@ -1236,11 +1037,7 @@ { 
"cell_type": "code", "execution_count": 38, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.base import BaseEstimator\n", @@ -1286,7 +1083,7 @@ "\n", " self.intercept_ = np.array([b])\n", " self.coef_ = np.array([w])\n", - " support_vectors_idx = (X_t.dot(w) + b < 1).ravel()\n", + " support_vectors_idx = (X_t.dot(w) + t * b < 1).ravel()\n", " self.support_vectors_ = X[support_vectors_idx]\n", " return self\n", "\n", @@ -1305,11 +1102,7 @@ { "cell_type": "code", "execution_count": 39, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "plt.plot(range(svm_clf.n_epochs), svm_clf.Js)\n", @@ -1319,11 +1112,7 @@ { "cell_type": "code", "execution_count": 40, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "print(svm_clf.intercept_, svm_clf.coef_)" @@ -1332,11 +1121,7 @@ { "cell_type": "code", "execution_count": 41, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "svm_clf2 = SVC(kernel=\"linear\", C=C)\n", @@ -1347,11 +1132,7 @@ { "cell_type": "code", "execution_count": 42, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "yr = y.ravel()\n", @@ -1378,9 +1159,6 @@ "cell_type": "code", "execution_count": 43, "metadata": { - "collapsed": false, - "deletable": true, - "editable": true, "scrolled": true }, "outputs": [], @@ -1412,20 +1190,14 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "# Exercise solutions" ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "## 1. to 7." ] @@ -1433,9 +1205,7 @@ { "cell_type": "markdown", "metadata": { - "collapsed": true, - "deletable": true, - "editable": true + "collapsed": true }, "source": [ "See appendix A." @@ -1443,30 +1213,21 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "# 8." ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "_Exercise: train a `LinearSVC` on a linearly separable dataset. Then train an `SVC` and a `SGDClassifier` on the same dataset. See if you can get them to produce roughly the same model._" ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Let's use the Iris dataset: the Iris Setosa and Iris Versicolor classes are linearly separable." 
] @@ -1475,9 +1236,7 @@ "cell_type": "code", "execution_count": 44, "metadata": { - "collapsed": true, - "deletable": true, - "editable": true + "collapsed": true }, "outputs": [], "source": [ @@ -1495,11 +1254,7 @@ { "cell_type": "code", "execution_count": 45, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.svm import SVC, LinearSVC\n", @@ -1528,10 +1283,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Let's plot the decision boundaries of these three models:" ] @@ -1539,11 +1291,7 @@ { "cell_type": "code", "execution_count": 46, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "# Compute the slope and bias of each decision boundary\n", @@ -1576,40 +1324,28 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Close enough!" ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "# 9." ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "_Exercise: train an SVM classifier on the MNIST dataset. Since SVM classifiers are binary classifiers, you will need to use one-versus-all to classify all 10 digits. You may want to tune the hyperparameters using small validation sets to speed up the process. What accuracy can you reach?_" ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "First, let's load the dataset and split it into a training set and a test set. We could use `train_test_split()` but people usually just take the first 60,000 instances for the training set, and the last 10,000 instances for the test set (this makes it possible to compare your model's performance with others): " ] @@ -1617,11 +1353,7 @@ { "cell_type": "code", "execution_count": 47, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.datasets import fetch_mldata\n", @@ -1638,10 +1370,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Many training algorithms are sensitive to the order of the training instances, so it's generally good practice to shuffle them first:" ] @@ -1650,9 +1379,7 @@ "cell_type": "code", "execution_count": 48, "metadata": { - "collapsed": true, - "deletable": true, - "editable": true + "collapsed": true }, "outputs": [], "source": [ @@ -1664,10 +1391,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Let's start simple, with a linear SVM classifier. It will automatically use the One-vs-All (also called One-vs-the-Rest, OvR) strategy, so there's nothing special we need to do. Easy!" 
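As an aside (not in the original notebook), if you prefer the One-vs-the-Rest strategy to be explicit rather than relying on `LinearSVC`'s built-in multiclass handling, Scikit-Learn's `OneVsRestClassifier` wrapper expresses the same idea directly:

from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC

# Trains one binary LinearSVC per digit class under the hood.
ovr_clf = OneVsRestClassifier(LinearSVC(random_state=42))
# ovr_clf.fit(X_train, y_train)   # same training data as the cell below; slow on the full MNIST set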
] @@ -1675,11 +1399,7 @@ { "cell_type": "code", "execution_count": 49, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "lin_clf = LinearSVC(random_state=42)\n", @@ -1688,10 +1408,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Let's make predictions on the training set and measure the accuracy (we don't want to measure it on the test set yet, since we have not selected and trained the final model yet):" ] @@ -1699,11 +1416,7 @@ { "cell_type": "code", "execution_count": 50, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.metrics import accuracy_score\n", @@ -1714,10 +1427,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Wow, 82% accuracy on MNIST is a really bad performance. This linear model is certainly too simple for MNIST, but perhaps we just needed to scale the data first:" ] @@ -1725,11 +1435,7 @@ { "cell_type": "code", "execution_count": 51, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "scaler = StandardScaler()\n", @@ -1740,11 +1446,7 @@ { "cell_type": "code", "execution_count": 52, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "lin_clf = LinearSVC(random_state=42)\n", @@ -1754,11 +1456,7 @@ { "cell_type": "code", "execution_count": 53, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "y_pred = lin_clf.predict(X_train_scaled)\n", @@ -1767,10 +1465,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "That's much better (we cut the error rate in two), but still not great at all for MNIST. If we want to use an SVM, we will have to use a kernel. Let's try an `SVC` with an RBF kernel (the default).\n", "\n", @@ -1780,11 +1475,7 @@ { "cell_type": "code", "execution_count": 54, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "svm_clf = SVC(decision_function_shape=\"ovr\")\n", @@ -1794,11 +1485,7 @@ { "cell_type": "code", "execution_count": 55, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "y_pred = svm_clf.predict(X_train_scaled)\n", @@ -1807,10 +1494,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "That's promising, we get better performance even though we trained the model on 6 times less data. Let's tune the hyperparameters by doing a randomized search with cross validation. 
We will do this on a small dataset just to speed up the process:" ] @@ -1818,11 +1502,7 @@ { "cell_type": "code", "execution_count": 56, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.model_selection import RandomizedSearchCV\n", @@ -1836,11 +1516,7 @@ { "cell_type": "code", "execution_count": 57, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "rnd_search_cv.best_estimator_" @@ -1849,11 +1525,7 @@ { "cell_type": "code", "execution_count": 58, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "rnd_search_cv.best_score_" @@ -1861,10 +1533,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "This looks pretty low but remember we only trained the model on 1,000 instances. Let's retrain the best estimator on the whole training set (run this at night, it will take hours):" ] @@ -1872,11 +1541,7 @@ { "cell_type": "code", "execution_count": 59, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "rnd_search_cv.best_estimator_.fit(X_train_scaled, y_train)" @@ -1885,11 +1550,7 @@ { "cell_type": "code", "execution_count": 60, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "y_pred = rnd_search_cv.best_estimator_.predict(X_train_scaled)\n", @@ -1898,10 +1559,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Ah, this looks good! Let's select this model. Now we can test it on the test set:" ] @@ -1909,11 +1567,7 @@ { "cell_type": "code", "execution_count": 61, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "y_pred = rnd_search_cv.best_estimator_.predict(X_test_scaled)\n", @@ -1922,40 +1576,28 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Not too bad, but apparently the model is overfitting slightly. It's tempting to tweak the hyperparameters a bit more (e.g. decreasing `C` and/or `gamma`), but we would run the risk of overfitting the test set. Other people have found that the hyperparameters `C=5` and `gamma=0.005` yield even better performance (over 98% accuracy). By running the randomized search for longer and on a larger part of the training set, you may be able to find this as well." ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "## 10." 
] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "_Exercise: train an SVM regressor on the California housing dataset._" ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Let's load the dataset using Scikit-Learn's `fetch_california_housing()` function:" ] @@ -1964,9 +1606,7 @@ "cell_type": "code", "execution_count": 62, "metadata": { - "collapsed": true, - "deletable": true, - "editable": true + "collapsed": true }, "outputs": [], "source": [ @@ -1979,10 +1619,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Split it into a training set and a test set:" ] @@ -1991,9 +1628,7 @@ "cell_type": "code", "execution_count": 63, "metadata": { - "collapsed": true, - "deletable": true, - "editable": true + "collapsed": true }, "outputs": [], "source": [ @@ -2004,10 +1639,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Don't forget to scale the data:" ] @@ -2016,9 +1648,7 @@ "cell_type": "code", "execution_count": 64, "metadata": { - "collapsed": true, - "deletable": true, - "editable": true + "collapsed": true }, "outputs": [], "source": [ @@ -2031,10 +1661,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Let's train a simple `LinearSVR` first:" ] @@ -2042,11 +1669,7 @@ { "cell_type": "code", "execution_count": 65, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.svm import LinearSVR\n", @@ -2057,10 +1680,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Let's see how it performs on the training set:" ] @@ -2068,11 +1688,7 @@ { "cell_type": "code", "execution_count": 66, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.metrics import mean_squared_error\n", @@ -2084,10 +1700,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Let's look at the RMSE:" ] @@ -2095,11 +1708,7 @@ { "cell_type": "code", "execution_count": 67, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "np.sqrt(mse)" @@ -2107,10 +1716,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "In this training set, the targets are tens of thousands of dollars. The RMSE gives a rough idea of the kind of error you should expect (with a higher weight for large errors): so with this model we can expect errors somewhere around $10,000. Not great. Let's see if we can do better with an RBF Kernel. 
We will use randomized search with cross validation to find the appropriate hyperparameter values for `C` and `gamma`:" ] @@ -2118,11 +1724,7 @@ { "cell_type": "code", "execution_count": 68, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.svm import SVR\n", @@ -2137,11 +1739,7 @@ { "cell_type": "code", "execution_count": 69, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "rnd_search_cv.best_estimator_" @@ -2149,10 +1747,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Now let's measure the RMSE on the training set:" ] @@ -2160,11 +1755,7 @@ { "cell_type": "code", "execution_count": 70, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "y_pred = rnd_search_cv.best_estimator_.predict(X_train_scaled)\n", @@ -2174,10 +1765,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Looks much better than the linear model. Let's select this model and evaluate it on the test set:" ] @@ -2185,11 +1773,7 @@ { "cell_type": "code", "execution_count": 71, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "y_pred = rnd_search_cv.best_estimator_.predict(X_test_scaled)\n", @@ -2201,9 +1785,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": true, - "deletable": true, - "editable": true + "collapsed": true }, "outputs": [], "source": [] @@ -2225,7 +1807,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.3" + "version": "3.6.3" }, "nav_menu": {}, "toc": { @@ -2239,5 +1821,5 @@ } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } diff --git a/08_dimensionality_reduction.ipynb b/08_dimensionality_reduction.ipynb index d1bf0d2..5f0de13 100644 --- a/08_dimensionality_reduction.ipynb +++ b/08_dimensionality_reduction.ipynb @@ -31,9 +31,7 @@ { "cell_type": "code", "execution_count": 1, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "# To support both python 2 and python 3\n", @@ -77,9 +75,7 @@ { "cell_type": "code", "execution_count": 2, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "np.random.seed(4)\n", @@ -120,9 +116,7 @@ { "cell_type": "code", "execution_count": 3, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "X_centered = X - X.mean(axis=0)\n", @@ -134,9 +128,7 @@ { "cell_type": "code", "execution_count": 4, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "m, n = X.shape\n", @@ -157,9 +149,7 @@ { "cell_type": "code", "execution_count": 6, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "W2 = Vt.T[:, :2]\n", @@ -169,9 +159,7 @@ { "cell_type": "code", "execution_count": 7, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "X2D_using_svd = X2D" @@ -194,9 +182,7 @@ { "cell_type": "code", "execution_count": 8, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.decomposition import PCA\n", @@ -251,9 +237,7 @@ { "cell_type": "code", "execution_count": 12, - "metadata": { - 
"collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "X3D_inv = pca.inverse_transform(X2D)" @@ -301,9 +285,7 @@ { "cell_type": "code", "execution_count": 15, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "X3D_inv_using_svd = X2D_using_svd.dot(Vt[:2, :])" @@ -436,9 +418,7 @@ { "cell_type": "code", "execution_count": 22, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "from matplotlib.patches import FancyArrowPatch\n", @@ -466,9 +446,7 @@ { "cell_type": "code", "execution_count": 23, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "axes = [-1.8, 1.8, -1.3, 1.3, -1.0, 1.0]\n", @@ -563,9 +541,7 @@ { "cell_type": "code", "execution_count": 26, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.datasets import make_swiss_roll\n", @@ -785,9 +761,7 @@ { "cell_type": "code", "execution_count": 31, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "from six.moves import urllib\n", @@ -798,9 +772,7 @@ { "cell_type": "code", "execution_count": 32, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.model_selection import train_test_split\n", @@ -814,9 +786,7 @@ { "cell_type": "code", "execution_count": 33, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "pca = PCA()\n", @@ -837,9 +807,7 @@ { "cell_type": "code", "execution_count": 35, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "pca = PCA(n_components=0.95)\n", @@ -867,9 +835,7 @@ { "cell_type": "code", "execution_count": 38, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "pca = PCA(n_components = 154)\n", @@ -880,9 +846,7 @@ { "cell_type": "code", "execution_count": 39, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "def plot_digits(instances, images_per_row=5, **options):\n", @@ -921,9 +885,7 @@ { "cell_type": "code", "execution_count": 41, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "X_reduced_pca = X_reduced" @@ -956,9 +918,7 @@ { "cell_type": "code", "execution_count": 43, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "X_recovered_inc_pca = inc_pca.inverse_transform(X_reduced)" @@ -981,9 +941,7 @@ { "cell_type": "code", "execution_count": 45, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "X_reduced_inc_pca = X_reduced" @@ -1038,9 +996,7 @@ { "cell_type": "code", "execution_count": 48, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "filename = \"my_mnist.data\"\n", @@ -1060,9 +1016,7 @@ { "cell_type": "code", "execution_count": 49, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "del X_mm" @@ -1091,9 +1045,7 @@ { "cell_type": "code", "execution_count": 51, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "rnd_pca = PCA(n_components=154, svd_solver=\"randomized\", random_state=42)\n", @@ -1221,9 +1173,7 @@ { "cell_type": "code", "execution_count": 55, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "X, t = make_swiss_roll(n_samples=1000, noise=0.2, random_state=42)" @@ -1232,9 +1182,7 @@ { "cell_type": "code", "execution_count": 56, - "metadata": { - "collapsed": 
true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.decomposition import KernelPCA\n", @@ -1285,7 +1233,7 @@ "source": [ "plt.figure(figsize=(6, 5))\n", "\n", - "X_inverse = pca.inverse_transform(X_reduced_rbf)\n", + "X_inverse = rbf_pca.inverse_transform(X_reduced_rbf)\n", "\n", "ax = plt.subplot(111, projection='3d')\n", "ax.view_init(10, -70)\n", @@ -2339,7 +2287,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.2" + "version": "3.6.4" }, "nav_menu": { "height": "352px", diff --git a/11_deep_learning.ipynb b/11_deep_learning.ipynb index c002217..d83e660 100644 --- a/11_deep_learning.ipynb +++ b/11_deep_learning.ipynb @@ -31,9 +31,7 @@ { "cell_type": "code", "execution_count": 1, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "# To support both python 2 and python 3\n", @@ -79,9 +77,7 @@ { "cell_type": "code", "execution_count": 2, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "def logit(z):\n", @@ -134,9 +130,7 @@ { "cell_type": "code", "execution_count": 4, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "import tensorflow as tf" @@ -145,9 +139,7 @@ { "cell_type": "code", "execution_count": 5, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "reset_graph()\n", @@ -161,9 +153,7 @@ { "cell_type": "code", "execution_count": 6, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "he_init = tf.contrib.layers.variance_scaling_initializer()\n", @@ -188,9 +178,7 @@ { "cell_type": "code", "execution_count": 7, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "def leaky_relu(z, alpha=0.01):\n", @@ -226,9 +214,7 @@ { "cell_type": "code", "execution_count": 9, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "reset_graph()\n", @@ -239,9 +225,7 @@ { "cell_type": "code", "execution_count": 10, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "def leaky_relu(z, name=None):\n", @@ -260,9 +244,7 @@ { "cell_type": "code", "execution_count": 11, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "reset_graph()\n", @@ -276,9 +258,7 @@ { "cell_type": "code", "execution_count": 12, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "X = tf.placeholder(tf.float32, shape=(None, n_inputs), name=\"X\")\n", @@ -288,9 +268,7 @@ { "cell_type": "code", "execution_count": 13, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "with tf.name_scope(\"dnn\"):\n", @@ -302,9 +280,7 @@ { "cell_type": "code", "execution_count": 14, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "with tf.name_scope(\"loss\"):\n", @@ -315,9 +291,7 @@ { "cell_type": "code", "execution_count": 15, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "learning_rate = 0.01\n", @@ -330,9 +304,7 @@ { "cell_type": "code", "execution_count": 16, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "with tf.name_scope(\"eval\"):\n", @@ -343,9 +315,7 @@ { "cell_type": "code", "execution_count": 17, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "init = tf.global_variables_initializer()\n", @@ -404,9 +374,7 @@ { "cell_type": "code", 
"execution_count": 20, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "def elu(z, alpha=1):\n", @@ -441,9 +409,7 @@ { "cell_type": "code", "execution_count": 22, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "reset_graph()\n", @@ -454,9 +420,7 @@ { "cell_type": "code", "execution_count": 23, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "hidden1 = tf.layers.dense(X, n_hidden1, activation=tf.nn.elu, name=\"hidden1\")" @@ -479,9 +443,7 @@ { "cell_type": "code", "execution_count": 24, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "def selu(z,\n", @@ -543,9 +505,7 @@ { "cell_type": "code", "execution_count": 27, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "def selu(z,\n", @@ -571,9 +531,7 @@ { "cell_type": "code", "execution_count": 28, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "reset_graph()\n", @@ -668,9 +626,7 @@ { "cell_type": "code", "execution_count": 30, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "reset_graph()\n", @@ -702,9 +658,7 @@ { "cell_type": "code", "execution_count": 31, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "reset_graph()\n", @@ -723,9 +677,7 @@ { "cell_type": "code", "execution_count": 32, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "from functools import partial\n", @@ -753,9 +705,7 @@ { "cell_type": "code", "execution_count": 33, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "reset_graph()\n", @@ -811,9 +761,7 @@ { "cell_type": "code", "execution_count": 34, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "n_epochs = 20\n", @@ -912,9 +860,7 @@ { "cell_type": "code", "execution_count": 38, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "reset_graph()\n", @@ -946,9 +892,7 @@ { "cell_type": "code", "execution_count": 39, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "learning_rate = 0.01" @@ -964,9 +908,7 @@ { "cell_type": "code", "execution_count": 40, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "threshold = 1.0\n", @@ -988,9 +930,7 @@ { "cell_type": "code", "execution_count": 41, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "with tf.name_scope(\"eval\"):\n", @@ -1001,9 +941,7 @@ { "cell_type": "code", "execution_count": 42, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "init = tf.global_variables_initializer()\n", @@ -1013,9 +951,7 @@ { "cell_type": "code", "execution_count": 43, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "n_epochs = 20\n", @@ -1065,9 +1001,7 @@ { "cell_type": "code", "execution_count": 45, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "reset_graph()" @@ -1076,9 +1010,7 @@ { "cell_type": "code", "execution_count": 46, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "saver = tf.train.import_meta_graph(\"./my_model_final.ckpt.meta\")" @@ -1111,9 +1043,7 @@ { "cell_type": "code", "execution_count": 48, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "from 
IPython.display import clear_output, Image, display, HTML\n", @@ -1175,9 +1105,7 @@ { "cell_type": "code", "execution_count": 50, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "X = tf.get_default_graph().get_tensor_by_name(\"X:0\")\n", @@ -1198,9 +1126,7 @@ { "cell_type": "code", "execution_count": 51, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "for op in (X, y, accuracy, training_op):\n", @@ -1217,9 +1143,7 @@ { "cell_type": "code", "execution_count": 52, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "X, y, accuracy, training_op = tf.get_collection(\"my_important_ops\")" @@ -1280,9 +1204,7 @@ { "cell_type": "code", "execution_count": 55, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "reset_graph()\n", @@ -1363,9 +1285,7 @@ { "cell_type": "code", "execution_count": 57, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "reset_graph()\n", @@ -1437,9 +1357,7 @@ { "cell_type": "code", "execution_count": 59, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "reset_graph()\n", @@ -1489,8 +1407,7 @@ "source": [ "reuse_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,\n", " scope=\"hidden[123]\") # regular expression\n", - "reuse_vars_dict = dict([(var.op.name, var) for var in reuse_vars])\n", - "restore_saver = tf.train.Saver(reuse_vars_dict) # to restore layers 1-3\n", + "restore_saver = tf.train.Saver(reuse_vars) # to restore layers 1-3\n", "\n", "init = tf.global_variables_initializer()\n", "saver = tf.train.Saver()\n", @@ -1527,9 +1444,7 @@ { "cell_type": "code", "execution_count": 61, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "reset_graph()\n", @@ -1670,9 +1585,7 @@ { "cell_type": "code", "execution_count": 67, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "reset_graph()\n", @@ -1706,9 +1619,7 @@ { "cell_type": "code", "execution_count": 68, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "with tf.name_scope(\"train\"): # not shown in the book\n", @@ -1721,9 +1632,7 @@ { "cell_type": "code", "execution_count": 69, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "init = tf.global_variables_initializer()\n", @@ -1738,8 +1647,7 @@ "source": [ "reuse_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,\n", " scope=\"hidden[123]\") # regular expression\n", - "reuse_vars_dict = dict([(var.op.name, var) for var in reuse_vars])\n", - "restore_saver = tf.train.Saver(reuse_vars_dict) # to restore layers 1-3\n", + "restore_saver = tf.train.Saver(reuse_vars) # to restore layers 1-3\n", "\n", "init = tf.global_variables_initializer()\n", "saver = tf.train.Saver()\n", @@ -1762,9 +1670,7 @@ { "cell_type": "code", "execution_count": 71, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "reset_graph()\n", @@ -1783,9 +1689,7 @@ { "cell_type": "code", "execution_count": 72, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "with tf.name_scope(\"dnn\"):\n", @@ -1804,9 +1708,7 @@ { "cell_type": "code", "execution_count": 73, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "with tf.name_scope(\"loss\"):\n", @@ -1837,8 +1739,7 @@ "source": [ "reuse_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,\n", " 
scope=\"hidden[123]\") # regular expression\n", - "reuse_vars_dict = dict([(var.op.name, var) for var in reuse_vars])\n", - "restore_saver = tf.train.Saver(reuse_vars_dict) # to restore layers 1-3\n", + "restore_saver = tf.train.Saver(reuse_vars) # to restore layers 1-3\n", "\n", "init = tf.global_variables_initializer()\n", "saver = tf.train.Saver()\n", @@ -1868,9 +1769,7 @@ { "cell_type": "code", "execution_count": 75, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "reset_graph()\n", @@ -1913,15 +1812,12 @@ { "cell_type": "code", "execution_count": 76, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "reuse_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,\n", " scope=\"hidden[123]\") # regular expression\n", - "reuse_vars_dict = dict([(var.op.name, var) for var in reuse_vars])\n", - "restore_saver = tf.train.Saver(reuse_vars_dict) # to restore layers 1-3\n", + "restore_saver = tf.train.Saver(reuse_vars) # to restore layers 1-3\n", "\n", "init = tf.global_variables_initializer()\n", "saver = tf.train.Saver()" @@ -4941,7 +4837,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.2" + "version": "3.6.4" }, "nav_menu": { "height": "360px", diff --git a/15_autoencoders.ipynb b/15_autoencoders.ipynb index 0629c1f..1e8299a 100644 --- a/15_autoencoders.ipynb +++ b/15_autoencoders.ipynb @@ -31,9 +31,7 @@ { "cell_type": "code", "execution_count": 1, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "# To support both python 2 and python 3\n", @@ -80,9 +78,7 @@ { "cell_type": "code", "execution_count": 2, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "def plot_image(image, shape=[28, 28]):\n", @@ -93,9 +89,7 @@ { "cell_type": "code", "execution_count": 3, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "def plot_multiple_images(images, n_rows, n_cols, pad=2):\n", @@ -126,9 +120,7 @@ { "cell_type": "code", "execution_count": 4, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "import numpy.random as rnd\n", @@ -419,9 +411,7 @@ { "cell_type": "code", "execution_count": 14, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "reset_graph()\n", @@ -479,9 +469,7 @@ { "cell_type": "code", "execution_count": 16, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "saver = tf.train.Saver()" @@ -532,7 +520,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "There are many ways to train one Autoencoder at a time. The first approach it to train each Autoencoder using a different graph, then we create the Stacked Autoencoder by simply initializing it with the weights and biases copied from these Autoencoders." + "There are many ways to train one Autoencoder at a time. The first approach is to train each Autoencoder using a different graph, then we create the Stacked Autoencoder by simply initializing it with the weights and biases copied from these Autoencoders." 
] }, { @@ -545,9 +533,7 @@ { "cell_type": "code", "execution_count": 19, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "reset_graph()\n", @@ -555,8 +541,9 @@ "from functools import partial\n", "\n", "def train_autoencoder(X_train, n_neurons, n_epochs, batch_size,\n", - " learning_rate = 0.01, l2_reg = 0.0005,\n", - " activation=tf.nn.elu, seed=42):\n", + " learning_rate = 0.01, l2_reg = 0.0005, seed=42,\n", + " hidden_activation=tf.nn.elu,\n", + " output_activation=tf.nn.elu):\n", " graph = tf.Graph()\n", " with graph.as_default():\n", " tf.set_random_seed(seed)\n", @@ -567,12 +554,11 @@ " \n", " my_dense_layer = partial(\n", " tf.layers.dense,\n", - " activation=activation,\n", " kernel_initializer=tf.contrib.layers.variance_scaling_initializer(),\n", " kernel_regularizer=tf.contrib.layers.l2_regularizer(l2_reg))\n", "\n", - " hidden = my_dense_layer(X, n_neurons, name=\"hidden\")\n", - " outputs = my_dense_layer(hidden, n_inputs, activation=None, name=\"outputs\")\n", + " hidden = my_dense_layer(X, n_neurons, activation=hidden_activation, name=\"hidden\")\n", + " outputs = my_dense_layer(hidden, n_inputs, activation=output_activation, name=\"outputs\")\n", "\n", " reconstruction_loss = tf.reduce_mean(tf.square(outputs - X))\n", "\n", @@ -614,7 +600,8 @@ "metadata": {}, "outputs": [], "source": [ - "hidden_output, W1, b1, W4, b4 = train_autoencoder(mnist.train.images, n_neurons=300, n_epochs=4, batch_size=150)\n", + "hidden_output, W1, b1, W4, b4 = train_autoencoder(mnist.train.images, n_neurons=300, n_epochs=4, batch_size=150,\n", + " output_activation=None)\n", "_, W2, b2, W3, b3 = train_autoencoder(hidden_output, n_neurons=150, n_epochs=4, batch_size=150)" ] }, @@ -1748,7 +1735,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.2" + "version": "3.6.4" }, "nav_menu": { "height": "381px", diff --git a/16_reinforcement_learning.ipynb b/16_reinforcement_learning.ipynb index 15c258e..30f2ab2 100644 --- a/16_reinforcement_learning.ipynb +++ b/16_reinforcement_learning.ipynb @@ -574,6 +574,15 @@ " plt.show()" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "openai_cart_pole_rendering = False # don't try, just use the safe way?" + ] + }, { "cell_type": "code", "execution_count": 26, diff --git a/README.md b/README.md index bbec5aa..faa749c 100644 --- a/README.md +++ b/README.md @@ -32,7 +32,11 @@ Of course, you obviously need Python. Python 2 is already preinstalled on most s $ python --version # for Python 2 $ python3 --version # for Python 3 -Any Python 3 version should be fine, preferably ≥3.5. If you don't have Python 3, I recommend installing it (Python ≥2.6 should work, but it is deprecated so Python 3 is preferable). To do so, you have several options: on Windows or MacOSX, you can just download it from [python.org](https://www.python.org/downloads/). On MacOSX, you can alternatively use [MacPorts](https://www.macports.org/) or [Homebrew](https://brew.sh/). On Linux, unless you know what you are doing, you should use your system's packaging system. For example, on Debian or Ubuntu, type: +Any Python 3 version should be fine, preferably ≥3.5. If you don't have Python 3, I recommend installing it (Python ≥2.6 should work, but it is deprecated so Python 3 is preferable). To do so, you have several options: on Windows or MacOSX, you can just download it from [python.org](https://www.python.org/downloads/). 
On MacOSX, you can alternatively use [MacPorts](https://www.macports.org/) or [Homebrew](https://brew.sh/). If you are using Python 3.6 on MacOSX, you need to run the following command to install the `certifi` package of certificates because Python 3.6 on MacOSX has no certificates to validate SSL connections (see this [StackOverflow question](https://stackoverflow.com/questions/27835619/urllib-and-ssl-certificate-verify-failed-error)): + + $ /Applications/Python\ 3.6/Install\ Certificates.command + +On Linux, unless you know what you are doing, you should use your system's packaging system. For example, on Debian or Ubuntu, type: $ sudo apt-get update $ sudo apt-get install python3 @@ -49,9 +53,9 @@ When using Anaconda, you can optionally create an isolated Python environment de This creates a fresh Python 3.5 environment called `mlbook` (you can change the name if you want to), and it activates it. This environment contains all the scientific libraries that come with Anaconda. This includes all the libraries we will need (NumPy, Matplotlib, Pandas, Jupyter and a few others), except for TensorFlow, so let's install it: - $ conda install -n mlbook -c conda-forge tensorflow=1.0.0 + $ conda install -n mlbook -c conda-forge tensorflow=1.4.0 -This installs TensorFlow 1.0.0 in the `mlbook` environment (fetching it from the `conda-forge` repository). If you chose not to create an `mlbook` environment, then just remove the `-n mlbook` option. +This installs TensorFlow 1.4.0 in the `mlbook` environment (fetching it from the `conda-forge` repository). If you chose not to create an `mlbook` environment, then just remove the `-n mlbook` option. Next, you can optionally install Jupyter extensions. These are useful to have nice tables of contents in the notebooks, but they are not required. @@ -105,3 +109,6 @@ This should open up your browser, and you should see Jupyter's tree view, with t Note: you can also visit [http://localhost:8888/nbextensions](http://localhost:8888/nbextensions) to activate and configure Jupyter extensions. Congrats! You are ready to learn Machine Learning, hands on! + +# Contributors +I would like to thank everyone who contributed to this project, either by providing useful feedback, filing issues or submitting Pull Requests. Special thanks go to Steven Bunkley and Ziembla who created the `docker` directory. 
diff --git a/docker/.env b/docker/.env new file mode 100644 index 0000000..16adf41 --- /dev/null +++ b/docker/.env @@ -0,0 +1 @@ +COMPOSE_PROJECT_NAME=handson-ml diff --git a/docker/Dockerfile b/docker/Dockerfile index 54e5510..72a16f2 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -1,28 +1,33 @@ FROM continuumio/anaconda3 RUN apt-get update && apt-get upgrade -y \ - && apt-get install -y \ - libpq-dev \ - build-essential \ - git \ - sudo \ - && rm -rf /var/lib/apt/lists/* + && apt-get install -y \ + libpq-dev \ + build-essential \ + git \ + sudo \ + cmake zlib1g-dev libjpeg-dev xvfb libav-tools xorg-dev libboost-all-dev libsdl2-dev swig \ + && rm -rf /var/lib/apt/lists/* +RUN conda update -n base conda RUN conda install -y -c conda-forge \ - tensorflow=1.0.0 \ - jupyter_contrib_nbextensions + tensorflow \ + jupyter_contrib_nbextensions \ + pyopengl +RUN pip install "gym[atari,box2d,classic_control]" ARG username ARG userid +ARG home=/home/${username} +ARG workdir=${home}/handson-ml + RUN adduser ${username} --uid ${userid} --gecos '' --disabled-password \ - && echo "${username} ALL=(root) NOPASSWD:ALL" > /etc/sudoers.d/${username} \ - && chmod 0440 /etc/sudoers.d/${username} + && echo "${username} ALL=(root) NOPASSWD:ALL" > /etc/sudoers.d/${username} \ + && chmod 0440 /etc/sudoers.d/${username} -ENV HOME /home/${username} - -WORKDIR ${HOME}/handson-ml -RUN chown ${username}:${username} ${HOME}/handson-ml +WORKDIR ${workdir} +RUN chown ${username}:${username} ${workdir} USER ${username} @@ -30,6 +35,55 @@ RUN jupyter contrib nbextension install --user RUN jupyter nbextension enable toc2/main +# INFO: Jupyter and nbdime extension are not totally integrated (anaconda image is py36, +# nbdime checks for py35 at the moment, still the config below enables diffing +# notebooks with nbdiff (and nbdiff support in git diff command) after connecting +# to the container by "make exec" (or "docker-compose exec handson-ml bash") +# You may also try running: +# nbd NOTEBOOK_NAME.ipynb +# to get nbdiff between checkpointed version and current version of the given notebook +USER root +WORKDIR / +RUN conda install -y -c conda-forge nbdime +USER ${username} +WORKDIR ${workdir} + +RUN git-nbdiffdriver config --enable --global + +# INFO: Optionally uncomment any (one) of the following RUN commands below to ignore either +# metadata or details in nbdiff within git diff +#RUN git config --global diff.jupyternotebook.command 'git-nbdiffdriver diff --ignore-metadata' +RUN git config --global diff.jupyternotebook.command 'git-nbdiffdriver diff --ignore-details' + + +# INFO: Dirty nbdime patching (ignored if not matching) +COPY docker/nbdime-*.patch /tmp/ +USER root +WORKDIR / +RUN patch -d /opt/conda/lib/python3.6/site-packages -p1 --forward --reject-file=- < \ + /tmp/nbdime-1-details.patch || true \ + && patch -d /opt/conda/lib/python3.6/site-packages -p1 --forward --reject-file=- < \ + /tmp/nbdime-2-toc.patch || true +RUN rm /tmp/nbdime-*.patch +USER ${username} +WORKDIR ${workdir} + + +COPY docker/bashrc.bash /tmp/ +RUN cat /tmp/bashrc.bash >> ${home}/.bashrc +RUN echo "export PATH=\"${workdir}/docker/bin:$PATH\"" >> ${home}/.bashrc +RUN sudo rm /tmp/bashrc.bash + + +# INFO: Uncomment lines below to enable automatic save of python-only and html-only +# exports alongside the notebook +#COPY docker/jupyter_notebook_config.py /tmp/ +#RUN cat /tmp/jupyter_notebook_config.py >> ${home}/.jupyter/jupyter_notebook_config.py +#RUN sudo rm /tmp/jupyter_notebook_config.py + +# INFO: Uncomment the RUN command 
below to disable git diff paging +#RUN git config --global core.pager '' + # INFO: Uncomment the RUN command below for easy and constant notebook URL (just localhost:8888) # That will switch jupyter to using empty password instead of a token. # To avoid making a security hole you SHOULD in fact not only uncomment but @@ -38,34 +92,6 @@ RUN jupyter nbextension enable toc2/main # from notebook.auth import passwd # passwd() # and take the hash from the output -#RUN mkdir -p ${HOME}/.jupyter && \ -# echo 'c.NotebookApp.password = u"sha1:c6bbcba2d04b:f969e403db876dcfbe26f47affe41909bd53392e"' \ -# >> ${HOME}/.jupyter/jupyter_notebook_config.py - -# INFO: Uncomment the RUN command below to disable git diff paging -#RUN git config --global core.pager '' - - -# INFO: Below - work in progress, nbdime not totally integrated, still it enables diffing -# notebooks with nbdiff (and nbdiff support in git diff command) after connecting to -# the container by "make exec" (docker exec) -# Try: -# nbd NOTEBOOK_NAME.ipynb -# to get nbdiff between checkpointed version and current version of the given notebook -USER root -WORKDIR / - -RUN conda install -y -c conda-forge nbdime - -USER ${username} -WORKDIR ${HOME}/handson-ml - -RUN git-nbdiffdriver config --enable --global - -# INFO: Uncomment the RUN command below to ignore metadata in nbdiff within git diff -#RUN git config --global diff.jupyternotebook.command 'git-nbdiffdriver diff --ignore-metadata' - - -COPY docker/bashrc /tmp/bashrc -RUN cat /tmp/bashrc >> ${HOME}/.bashrc -RUN sudo rm -rf /tmp/bashrc +#RUN mkdir -p ${home}/.jupyter && \ +# echo 'c.NotebookApp.password = u"sha1:c6bbcba2d04b:f969e403db876dcfbe26f47affe41909bd53392e"' \ +# >> ${home}/.jupyter/jupyter_notebook_config.py diff --git a/docker/Makefile b/docker/Makefile index 6078fc9..f85c49a 100644 --- a/docker/Makefile +++ b/docker/Makefile @@ -4,7 +4,7 @@ help: run: docker-compose up exec: - docker-compose exec handson-ml /bin/bash + docker-compose exec handson-ml bash build: stop .FORCE docker-compose build rebuild: stop .FORCE diff --git a/docker/README.md b/docker/README.md index 50b6f12..037ae22 100644 --- a/docker/README.md +++ b/docker/README.md @@ -1,5 +1,5 @@ -# Hands-on Machine Learning in Docker :-) +# Hands-on Machine Learning in Docker This is the Docker configuration which allows you to run and tweak the book's notebooks without installing any dependencies on your machine!
OK, any except `docker`. With `docker-compose`. Well, you may also want `make` (but it is only used as thin layer to call a few simple `docker-compose` commands). @@ -32,7 +32,9 @@ You can close the server just by pressing `Ctrl-C` in terminal window. Run `make exec` (or `docker-compose exec handson-ml bash`) while the server is running to run an additional `bash` shell inside the `handson-ml` container. Now you're inside the environment prepared within the image. -One of the usefull things that can be done there may be comparing versions of the notebooks using the `nbdiff` command if you haven't got `nbdime` installed locally (it is **way** better than plain `diff` for notebooks). See [Tools for diffing and merging of Jupyter notebooks](https://github.com/jupyter/nbdime) for more details. +One of the useful things that can be done there would be starting TensorBoard (for example with the simple `tb` command, see bashrc file). + +Another one may be comparing versions of the notebooks using the `nbdiff` command if you haven't got `nbdime` installed locally (it is **way** better than plain `diff` for notebooks). See [Tools for diffing and merging of Jupyter notebooks](https://github.com/jupyter/nbdime) for more details. You can see changes you made relative to the version in git using `git diff` which is integrated with `nbdiff`. diff --git a/docker/bashrc b/docker/bashrc deleted file mode 100644 index 3535389..0000000 --- a/docker/bashrc +++ /dev/null @@ -1,12 +0,0 @@ -alias ll="ls -l" - -nbd() { - DIRNAME=$(dirname "$1") - BASENAME=$(basename "$1" .ipynb) - - WORKING_COPY=$DIRNAME/$BASENAME.ipynb - CHECKPOINT_COPY=$DIRNAME/.ipynb_checkpoints/$BASENAME-checkpoint.ipynb - - # echo "How change $CHECKPOINT_COPY into $WORKING_COPY" - nbdiff "$CHECKPOINT_COPY" "$WORKING_COPY" -} diff --git a/docker/bashrc.bash b/docker/bashrc.bash new file mode 100644 index 0000000..ff19745 --- /dev/null +++ b/docker/bashrc.bash @@ -0,0 +1,3 @@ +alias ll="ls -alF" +alias nbd="nbdiff_checkpoint" +alias tb="tensorboard --logdir=tf_logs" diff --git a/docker/bin/nbclean_checkpoints b/docker/bin/nbclean_checkpoints new file mode 100755 index 0000000..ba4aaf9 --- /dev/null +++ b/docker/bin/nbclean_checkpoints @@ -0,0 +1,116 @@ +#!/usr/bin/env python + +import collections +import glob +import hashlib +import os +import subprocess + + +class NotebookAnalyser: + + def __init__(self, dry_run=False, verbose=False, colorful=False): + self._dry_run = dry_run + self._verbose = verbose + self._colors = collections.defaultdict(lambda: "") + if colorful: + for color in [ + NotebookAnalyser.COLOR_WHITE, + NotebookAnalyser.COLOR_RED, + NotebookAnalyser.COLOR_GREEN, + NotebookAnalyser.COLOR_YELLOW, + ]: + self._colors[color] = "\033[{}m".format(color) + + NOTEBOOK_SUFFIX = ".ipynb" + CHECKPOINT_DIR = NOTEBOOK_SUFFIX + "_checkpoints" + CHECKPOINT_MASK = "*-checkpoint" + NOTEBOOK_SUFFIX + CHECKPOINT_MASK_LEN = len(CHECKPOINT_MASK) - 1 + + @staticmethod + def get_hash(file_path): + with open(file_path, "rb") as input: + hash = hashlib.md5() + for chunk in iter(lambda: input.read(4096), b""): + hash.update(chunk) + return hash.hexdigest() + + MESSAGE_ORPHANED = "missing " + MESSAGE_MODIFIED = "modified" + MESSAGE_DELETED = "DELETING" + + COLOR_WHITE = "0" + COLOR_RED = "31" + COLOR_GREEN = "32" + COLOR_YELLOW = "33" + + def log(self, message, file, color=COLOR_WHITE): + color_on = self._colors[color] + color_off = self._colors[NotebookAnalyser.COLOR_WHITE] + print("{}{}{}: {}".format(color_on, message, color_off, file)) + + def 
clean_checkpoints(self, directory): + for checkpoint_path in sorted(glob.glob(os.path.join(directory, NotebookAnalyser.CHECKPOINT_MASK))): + + workfile_dir = os.path.dirname(os.path.dirname(checkpoint_path)) + workfile_name = os.path.basename(checkpoint_path)[:-NotebookAnalyser.CHECKPOINT_MASK_LEN] + NotebookAnalyser.NOTEBOOK_SUFFIX + workfile_path = os.path.join(workfile_dir, workfile_name) + + status = "" + if not os.path.isfile(workfile_path): + if self._verbose: + self.log(NotebookAnalyser.MESSAGE_ORPHANED, workfile_path, NotebookAnalyser.COLOR_RED) + else: + checkpoint_stat = os.stat(checkpoint_path) + workfile_stat = os.stat(workfile_path) + + modified = workfile_stat.st_size != checkpoint_stat.st_size + + if not modified: + checkpoint_hash = NotebookAnalyser.get_hash(checkpoint_path) + workfile_hash = NotebookAnalyser.get_hash(workfile_path) + modified = checkpoint_hash != workfile_hash + + if modified: + if self._verbose: + self.log(NotebookAnalyser.MESSAGE_MODIFIED, workfile_path, NotebookAnalyser.COLOR_YELLOW) + else: + self.log(NotebookAnalyser.MESSAGE_DELETED, checkpoint_path, NotebookAnalyser.COLOR_GREEN) + if not self._dry_run: + os.remove(checkpoint_path) + + if not self._dry_run and not os.listdir(directory): + self.log(NotebookAnalyser.MESSAGE_DELETED, directory, NotebookAnalyser.COLOR_GREEN) + os.rmdir(directory) + + def clean_checkpoints_recursively(self, directory): + for (root, subdirs, files) in os.walk(directory): + subdirs.sort() # INFO: traverse alphabetically + if NotebookAnalyser.CHECKPOINT_DIR in subdirs: + subdirs.remove(NotebookAnalyser.CHECKPOINT_DIR) # INFO: don't recurse there + self.clean_checkpoints(os.path.join(root, NotebookAnalyser.CHECKPOINT_DIR)) + + +def main(): + import argparse + parser = argparse.ArgumentParser(description="Remove checkpointed versions of those jupyter notebooks that are identical to their working copies.", + epilog="""Notebooks will be reported as either + "DELETED" if the working copy and checkpointed version are identical + (checkpoint will be deleted), + "missing" if there is a checkpoint but no corresponding working file can be found + or "modified" if notebook and the checkpoint are not byte-to-byte identical. + If removal of checkpoints results in empty ".ipynb_checkpoints" directory + that directory is also deleted. 
+ """) #, formatter_class=argparse.RawDescriptionHelpFormatter) + parser.add_argument("dirs", metavar="DIR", type=str, nargs="*", default=".", help="directories to search") + parser.add_argument("-d", "--dry-run", action="store_true", help="only print messages, don't perform any removals") + parser.add_argument("-v", "--verbose", action="store_true", help="verbose mode") + parser.add_argument("-c", "--color", action="store_true", help="colorful mode") + args = parser.parse_args() + + analyser = NotebookAnalyser(args.dry_run, args.verbose, args.color) + for directory in args.dirs: + analyser.clean_checkpoints_recursively(directory) + +if __name__ == "__main__": + main() diff --git a/docker/bin/nbdiff_checkpoint b/docker/bin/nbdiff_checkpoint new file mode 100755 index 0000000..9ce7cd0 --- /dev/null +++ b/docker/bin/nbdiff_checkpoint @@ -0,0 +1,17 @@ +#!/bin/bash +if [[ "$#" -lt 1 || "$1" =~ ^((-h)|(--help))$ ]] ; then + echo "usage: nbdiff_checkpoint NOTEBOOK.ipynb" + echo + echo "Show differences between given jupyter notebook and its checkpointed version (in .ipynb_checkpoints subdirectory)" + exit +fi + +DIRNAME=$(dirname "$1") +BASENAME=$(basename "$1" .ipynb) +shift + +WORKING_COPY=$DIRNAME/$BASENAME.ipynb +CHECKPOINT_COPY=$DIRNAME/.ipynb_checkpoints/$BASENAME-checkpoint.ipynb + +echo "----- Analysing how to change $CHECKPOINT_COPY into $WORKING_COPY -----" +nbdiff "$CHECKPOINT_COPY" "$WORKING_COPY" --ignore-details "$@" diff --git a/docker/bin/rm_empty_subdirs b/docker/bin/rm_empty_subdirs new file mode 100755 index 0000000..34f3ea9 --- /dev/null +++ b/docker/bin/rm_empty_subdirs @@ -0,0 +1,54 @@ +#!/usr/bin/env python + +import os + +def remove_empty_directories(initial_dir, + allow_initial_delete=False, ignore_nonexistant_initial=False, + dry_run=False, quiet=False): + + FORBIDDEN_SUBDIRS = set([".git"]) + + if not os.path.isdir(initial_dir) and not ignore_nonexistant_initial: + raise RuntimeError("Initial directory '{}' not found!".format(initial_dir)) + + message = "removed" + if dry_run: + message = "to be " + message + + deleted = set() + + for (directory, subdirs, files) in os.walk(initial_dir, topdown=False): + forbidden = False + parent = directory + while parent: + parent, dirname = os.path.split(parent) + if dirname in FORBIDDEN_SUBDIRS: + forbidden = True + break + if forbidden: + continue + + is_empty = len(files) < 1 and len(set([os.path.join(directory, s) for s in subdirs]) - deleted) < 1 + + if is_empty and (initial_dir != directory or allow_initial_delete): + if not quiet: + print("{}: {}".format(message, directory)) + deleted.add(directory) + if not dry_run: + os.rmdir(directory) + +def main(): + import argparse + parser = argparse.ArgumentParser(description="Remove empty directories recursively in subtree.") + parser.add_argument("dir", metavar="DIR", type=str, nargs="+", help="directory to be searched") + parser.add_argument("-r", "--allow-dir-removal", action="store_true", help="allow deletion of DIR itself") + parser.add_argument("-i", "--ignore-nonexistent-dir", action="store_true", help="don't throw an error if DIR doesn't exist") + parser.add_argument("-d", "--dry-run", action="store_true", help="only print messages, don't perform any removals") + parser.add_argument("-q", "--quiet", action="store_true", help="don't print names of directories being removed") + args = parser.parse_args() + for directory in args.dir: + remove_empty_directories(directory, args.allow_dir_removal, args.ignore_nonexistent_dir, + args.dry_run, args.quiet) + +if __name__ == 
"__main__": + main() diff --git a/docker/bin/tensorboard b/docker/bin/tensorboard new file mode 100755 index 0000000..dd7294d --- /dev/null +++ b/docker/bin/tensorboard @@ -0,0 +1,2 @@ +#!/bin/bash +python -m tensorboard.main "$@" diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index 8a9718c..d4b46e4 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -15,6 +15,7 @@ services: max-size: 50m ports: - "8888:8888" + - "6006:6006" volumes: - ../:/home/devel/handson-ml command: /opt/conda/bin/jupyter notebook --ip='*' --port=8888 --no-browser diff --git a/docker/jupyter_notebook_config.py b/docker/jupyter_notebook_config.py new file mode 100644 index 0000000..971a49a --- /dev/null +++ b/docker/jupyter_notebook_config.py @@ -0,0 +1,15 @@ +import os +import subprocess + +def export_script_and_view(model, os_path, contents_manager): + if model["type"] != "notebook": + return + dir_name, file_name = os.path.split(os_path) + file_base, file_ext = os.path.splitext(file_name) + if file_base.startswith("Untitled"): + return + export_name = file_base if file_ext == ".ipynb" else file_name + subprocess.check_call(["jupyter", "nbconvert", "--to", "script", file_name, "--output", export_name + "_script"], cwd=dir_name) + subprocess.check_call(["jupyter", "nbconvert", "--to", "html", file_name, "--output", export_name + "_view"], cwd=dir_name) + +c.FileContentsManager.post_save_hook = export_script_and_view diff --git a/docker/nbdime-1-details.patch b/docker/nbdime-1-details.patch new file mode 100644 index 0000000..98f76d6 --- /dev/null +++ b/docker/nbdime-1-details.patch @@ -0,0 +1,17 @@ +--- a/nbdime/diffing/notebooks.py ++++ b/nbdime/diffing/notebooks.py +@@ -548,8 +548,12 @@ def set_notebook_diff_targets(sources=True, outputs=True, attachments=True, meta + metadata_keys = ("/cells/*/metadata", "/metadata", "/cells/*/outputs/*/metadata") + if metadata: + for key in metadata_keys: +- if key in notebook_differs: +- del notebook_differs[key] ++ if details: ++ if key in notebook_differs: ++ del notebook_differs[key] ++ else: ++ notebook_differs[key] = diff_ignore_keys( ++ inner_differ=diff, ignore_keys=['collapsed', 'autoscroll', 'deletable', 'editable']) + else: + for key in metadata_keys: + notebook_differs[key] = diff_ignore diff --git a/docker/nbdime-2-toc.patch b/docker/nbdime-2-toc.patch new file mode 100644 index 0000000..4924e66 --- /dev/null +++ b/docker/nbdime-2-toc.patch @@ -0,0 +1,11 @@ +--- a/nbdime/diffing/notebooks.py ++++ b/nbdime/diffing/notebooks.py +@@ -553,7 +553,7 @@ + del notebook_differs[key] + else: + notebook_differs[key] = diff_ignore_keys( +- inner_differ=diff, ignore_keys=['collapsed', 'autoscroll', 'deletable', 'editable']) ++ inner_differ=diff, ignore_keys=['toc', 'collapsed', 'autoscroll', 'deletable', 'editable']) + else: + for key in metadata_keys: + notebook_differs[key] = diff_ignore diff --git a/extra_capsnets.ipynb b/extra_capsnets.ipynb index 67e67bd..1569b4a 100644 --- a/extra_capsnets.ipynb +++ b/extra_capsnets.ipynb @@ -32,7 +32,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Watch [this video](https://www.youtube.com/embed/pPN8d0E3900) to understand the key ideas behind Capsule Networks:" + "Watch [this video](https://youtu.be/pPN8d0E3900) to understand the key ideas behind Capsule Networks:" ] }, { @@ -42,12 +42,23 @@ "outputs": [], "source": [ "from IPython.display import HTML\n", - "\n", - "# Display the video in an iframe:\n", - "HTML(\"\"\"\"\"\")" + "HTML(\"\"\"\"\"\")" + ] + }, + { + "cell_type": 
"markdown", + "metadata": {}, + "source": [ + "You may also want to watch [this video](https://youtu.be/2Kawrd5szHE), which presents the main difficulties in this notebook:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "HTML(\"\"\"\"\"\")" ] }, { @@ -66,10 +77,8 @@ }, { "cell_type": "code", - "execution_count": 2, - "metadata": { - "collapsed": true - }, + "execution_count": 3, + "metadata": {}, "outputs": [], "source": [ "from __future__ import division, print_function, unicode_literals" @@ -84,10 +93,8 @@ }, { "cell_type": "code", - "execution_count": 3, - "metadata": { - "collapsed": true - }, + "execution_count": 4, + "metadata": {}, "outputs": [], "source": [ "%matplotlib inline\n", @@ -104,10 +111,8 @@ }, { "cell_type": "code", - "execution_count": 4, - "metadata": { - "collapsed": true - }, + "execution_count": 5, + "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", @@ -130,10 +135,8 @@ }, { "cell_type": "code", - "execution_count": 5, - "metadata": { - "collapsed": true - }, + "execution_count": 6, + "metadata": {}, "outputs": [], "source": [ "tf.reset_default_graph()" @@ -148,10 +151,8 @@ }, { "cell_type": "code", - "execution_count": 6, - "metadata": { - "collapsed": true - }, + "execution_count": 7, + "metadata": {}, "outputs": [], "source": [ "np.random.seed(42)\n", @@ -174,7 +175,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -192,7 +193,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -217,7 +218,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -275,10 +276,8 @@ }, { "cell_type": "code", - "execution_count": 10, - "metadata": { - "collapsed": true - }, + "execution_count": 11, + "metadata": {}, "outputs": [], "source": [ "X = tf.placeholder(shape=[None, 28, 28, 1], dtype=tf.float32, name=\"X\")" @@ -300,10 +299,8 @@ }, { "cell_type": "code", - "execution_count": 11, - "metadata": { - "collapsed": true - }, + "execution_count": 12, + "metadata": {}, "outputs": [], "source": [ "caps1_n_maps = 32\n", @@ -320,10 +317,8 @@ }, { "cell_type": "code", - "execution_count": 12, - "metadata": { - "collapsed": true - }, + "execution_count": 13, + "metadata": {}, "outputs": [], "source": [ "conv1_params = {\n", @@ -345,10 +340,8 @@ }, { "cell_type": "code", - "execution_count": 13, - "metadata": { - "collapsed": true - }, + "execution_count": 14, + "metadata": {}, "outputs": [], "source": [ "conv1 = tf.layers.conv2d(X, name=\"conv1\", **conv1_params)\n", @@ -371,10 +364,8 @@ }, { "cell_type": "code", - "execution_count": 14, - "metadata": { - "collapsed": true - }, + "execution_count": 15, + "metadata": {}, "outputs": [], "source": [ "caps1_raw = tf.reshape(conv2, [-1, caps1_n_caps, caps1_n_dims],\n", @@ -396,10 +387,8 @@ }, { "cell_type": "code", - "execution_count": 15, - "metadata": { - "collapsed": true - }, + "execution_count": 16, + "metadata": {}, "outputs": [], "source": [ "def squash(s, axis=-1, epsilon=1e-7, name=None):\n", @@ -421,10 +410,8 @@ }, { "cell_type": "code", - "execution_count": 16, - "metadata": { - "collapsed": true - }, + "execution_count": 17, + "metadata": {}, "outputs": [], "source": [ "caps1_output = squash(caps1_raw, name=\"caps1_output\")" @@ -467,10 +454,8 @@ }, { "cell_type": "code", - "execution_count": 17, - "metadata": { - "collapsed": true - }, 
+ "execution_count": 18, + "metadata": {}, "outputs": [], "source": [ "caps2_n_caps = 10\n", @@ -510,6 +495,7 @@ "metadata": {}, "source": [ "We can apply this function to compute $\\hat{\\mathbf{u}}_{j|i}$ for every pair of capsules ($i$, $j$) like this (recall that there are 6×6×32=1152 capsules in the first layer, and 10 in the second layer):\n", + "\n", "$\n", "\\pmatrix{\n", " \\mathbf{W}_{1,1} & \\mathbf{W}_{1,2} & \\cdots & \\mathbf{W}_{1,10} \\\\\n", @@ -551,18 +537,16 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Okay, let's start by creating a trainable variable of shape (1, 1152, 10, 16, 8) that will hold all the transformation matrices. The first dimension of size 1 will make this array easy to tile. We initialize this variable randomly using a normal distribution with a standard deviation to 0.01." + "Okay, let's start by creating a trainable variable of shape (1, 1152, 10, 16, 8) that will hold all the transformation matrices. The first dimension of size 1 will make this array easy to tile. We initialize this variable randomly using a normal distribution with a standard deviation to 0.1." ] }, { "cell_type": "code", - "execution_count": 18, - "metadata": { - "collapsed": true - }, + "execution_count": 19, + "metadata": {}, "outputs": [], "source": [ - "init_sigma = 0.01\n", + "init_sigma = 0.1\n", "\n", "W_init = tf.random_normal(\n", " shape=(1, caps1_n_caps, caps2_n_caps, caps2_n_dims, caps1_n_dims),\n", @@ -579,10 +563,8 @@ }, { "cell_type": "code", - "execution_count": 19, - "metadata": { - "collapsed": true - }, + "execution_count": 20, + "metadata": {}, "outputs": [], "source": [ "batch_size = tf.shape(X)[0]\n", @@ -598,10 +580,8 @@ }, { "cell_type": "code", - "execution_count": 20, - "metadata": { - "collapsed": true - }, + "execution_count": 21, + "metadata": {}, "outputs": [], "source": [ "caps1_output_expanded = tf.expand_dims(caps1_output, -1,\n", @@ -621,7 +601,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 22, "metadata": {}, "outputs": [], "source": [ @@ -637,7 +617,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 23, "metadata": {}, "outputs": [], "source": [ @@ -653,10 +633,8 @@ }, { "cell_type": "code", - "execution_count": 23, - "metadata": { - "collapsed": true - }, + "execution_count": 24, + "metadata": {}, "outputs": [], "source": [ "caps2_predicted = tf.matmul(W_tiled, caps1_output_tiled,\n", @@ -672,7 +650,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 25, "metadata": {}, "outputs": [], "source": [ @@ -702,10 +680,8 @@ }, { "cell_type": "code", - "execution_count": 25, - "metadata": { - "collapsed": true - }, + "execution_count": 26, + "metadata": {}, "outputs": [], "source": [ "raw_weights = tf.zeros([batch_size, caps1_n_caps, caps2_n_caps, 1, 1],\n", @@ -735,10 +711,8 @@ }, { "cell_type": "code", - "execution_count": 26, - "metadata": { - "collapsed": true - }, + "execution_count": 27, + "metadata": {}, "outputs": [], "source": [ "routing_weights = tf.nn.softmax(raw_weights, dim=2, name=\"routing_weights\")" @@ -753,10 +727,8 @@ }, { "cell_type": "code", - "execution_count": 27, - "metadata": { - "collapsed": true - }, + "execution_count": 28, + "metadata": {}, "outputs": [], "source": [ "weighted_predictions = tf.multiply(routing_weights, caps2_predicted,\n", @@ -785,10 +757,8 @@ }, { "cell_type": "code", - "execution_count": 28, - "metadata": { - "collapsed": true - }, + "execution_count": 29, + "metadata": {}, "outputs": [], "source": [ 
"caps2_output_round_1 = squash(weighted_sum, axis=-2,\n", @@ -797,7 +767,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 30, "metadata": {}, "outputs": [], "source": [ @@ -841,7 +811,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 31, "metadata": {}, "outputs": [], "source": [ @@ -857,7 +827,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 32, "metadata": {}, "outputs": [], "source": [ @@ -873,10 +843,8 @@ }, { "cell_type": "code", - "execution_count": 32, - "metadata": { - "collapsed": true - }, + "execution_count": 33, + "metadata": {}, "outputs": [], "source": [ "caps2_output_round_1_tiled = tf.tile(\n", @@ -893,10 +861,8 @@ }, { "cell_type": "code", - "execution_count": 33, - "metadata": { - "collapsed": true - }, + "execution_count": 34, + "metadata": {}, "outputs": [], "source": [ "agreement = tf.matmul(caps2_predicted, caps2_output_round_1_tiled,\n", @@ -912,10 +878,8 @@ }, { "cell_type": "code", - "execution_count": 34, - "metadata": { - "collapsed": true - }, + "execution_count": 35, + "metadata": {}, "outputs": [], "source": [ "raw_weights_round_2 = tf.add(raw_weights, agreement,\n", @@ -931,10 +895,8 @@ }, { "cell_type": "code", - "execution_count": 35, - "metadata": { - "collapsed": true - }, + "execution_count": 36, + "metadata": {}, "outputs": [], "source": [ "routing_weights_round_2 = tf.nn.softmax(raw_weights_round_2,\n", @@ -960,10 +922,8 @@ }, { "cell_type": "code", - "execution_count": 36, - "metadata": { - "collapsed": true - }, + "execution_count": 37, + "metadata": {}, "outputs": [], "source": [ "caps2_output = caps2_output_round_2" @@ -991,7 +951,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 38, "metadata": {}, "outputs": [], "source": [ @@ -1031,7 +991,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 39, "metadata": {}, "outputs": [], "source": [ @@ -1061,10 +1021,8 @@ }, { "cell_type": "code", - "execution_count": 39, - "metadata": { - "collapsed": true - }, + "execution_count": 40, + "metadata": {}, "outputs": [], "source": [ "def safe_norm(s, axis=-1, epsilon=1e-7, keep_dims=False, name=None):\n", @@ -1076,10 +1034,8 @@ }, { "cell_type": "code", - "execution_count": 40, - "metadata": { - "collapsed": true - }, + "execution_count": 41, + "metadata": {}, "outputs": [], "source": [ "y_proba = safe_norm(caps2_output, axis=-2, name=\"y_proba\")" @@ -1094,10 +1050,8 @@ }, { "cell_type": "code", - "execution_count": 41, - "metadata": { - "collapsed": true - }, + "execution_count": 42, + "metadata": {}, "outputs": [], "source": [ "y_proba_argmax = tf.argmax(y_proba, axis=2, name=\"y_proba\")" @@ -1112,7 +1066,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 43, "metadata": {}, "outputs": [], "source": [ @@ -1128,10 +1082,8 @@ }, { "cell_type": "code", - "execution_count": 43, - "metadata": { - "collapsed": true - }, + "execution_count": 44, + "metadata": {}, "outputs": [], "source": [ "y_pred = tf.squeeze(y_proba_argmax, axis=[1,2], name=\"y_pred\")" @@ -1139,7 +1091,7 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 45, "metadata": {}, "outputs": [], "source": [ @@ -1169,10 +1121,8 @@ }, { "cell_type": "code", - "execution_count": 45, - "metadata": { - "collapsed": true - }, + "execution_count": 46, + "metadata": {}, "outputs": [], "source": [ "y = tf.placeholder(shape=[None], dtype=tf.int64, name=\"y\")" @@ -1191,7 +1141,7 @@ "source": [ "The paper uses a special margin 
loss to make it possible to detect two or more different digits in each image:\n", "\n", - "$ L_k = T_k \\max(0, m^{+} - \\|\\mathbf{v}_k\\|)^2 - \\lambda (1 - T_k) \\max(0, \\|\\mathbf{v}_k\\| - m^{-})^2$\n", + "$ L_k = T_k \\max(0, m^{+} - \\|\\mathbf{v}_k\\|)^2 + \\lambda (1 - T_k) \\max(0, \\|\\mathbf{v}_k\\| - m^{-})^2$\n", "\n", "* $T_k$ is equal to 1 if the digit of class $k$ is present, or 0 otherwise.\n", "* In the paper, $m^{+} = 0.9$, $m^{-} = 0.1$ and $\\lambda = 0.5$.\n", @@ -1200,10 +1150,8 @@ }, { "cell_type": "code", - "execution_count": 46, - "metadata": { - "collapsed": true - }, + "execution_count": 47, + "metadata": {}, "outputs": [], "source": [ "m_plus = 0.9\n", @@ -1220,10 +1168,8 @@ }, { "cell_type": "code", - "execution_count": 47, - "metadata": { - "collapsed": true - }, + "execution_count": 48, + "metadata": {}, "outputs": [], "source": [ "T = tf.one_hot(y, depth=caps2_n_caps, name=\"T\")" @@ -1238,7 +1184,7 @@ }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 49, "metadata": {}, "outputs": [], "source": [ @@ -1255,7 +1201,7 @@ }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 50, "metadata": {}, "outputs": [], "source": [ @@ -1271,10 +1217,8 @@ }, { "cell_type": "code", - "execution_count": 50, - "metadata": { - "collapsed": true - }, + "execution_count": 51, + "metadata": {}, "outputs": [], "source": [ "caps2_output_norm = safe_norm(caps2_output, axis=-2, keep_dims=True,\n", @@ -1290,10 +1234,8 @@ }, { "cell_type": "code", - "execution_count": 51, - "metadata": { - "collapsed": true - }, + "execution_count": 52, + "metadata": {}, "outputs": [], "source": [ "present_error_raw = tf.square(tf.maximum(0., m_plus - caps2_output_norm),\n", @@ -1311,10 +1253,8 @@ }, { "cell_type": "code", - "execution_count": 52, - "metadata": { - "collapsed": true - }, + "execution_count": 53, + "metadata": {}, "outputs": [], "source": [ "absent_error_raw = tf.square(tf.maximum(0., caps2_output_norm - m_minus),\n", @@ -1332,10 +1272,8 @@ }, { "cell_type": "code", - "execution_count": 53, - "metadata": { - "collapsed": true - }, + "execution_count": 54, + "metadata": {}, "outputs": [], "source": [ "L = tf.add(T * present_error, lambda_ * (1.0 - T) * absent_error,\n", @@ -1351,10 +1289,8 @@ }, { "cell_type": "code", - "execution_count": 54, - "metadata": { - "collapsed": true - }, + "execution_count": 55, + "metadata": {}, "outputs": [], "source": [ "margin_loss = tf.reduce_mean(tf.reduce_sum(L, axis=1), name=\"margin_loss\")" @@ -1397,10 +1333,8 @@ }, { "cell_type": "code", - "execution_count": 55, - "metadata": { - "collapsed": true - }, + "execution_count": 56, + "metadata": {}, "outputs": [], "source": [ "mask_with_labels = tf.placeholder_with_default(False, shape=(),\n", @@ -1416,10 +1350,8 @@ }, { "cell_type": "code", - "execution_count": 56, - "metadata": { - "collapsed": true - }, + "execution_count": 57, + "metadata": {}, "outputs": [], "source": [ "reconstruction_targets = tf.cond(mask_with_labels, # condition\n", @@ -1446,10 +1378,8 @@ }, { "cell_type": "code", - "execution_count": 57, - "metadata": { - "collapsed": true - }, + "execution_count": 58, + "metadata": {}, "outputs": [], "source": [ "reconstruction_mask = tf.one_hot(reconstruction_targets,\n", @@ -1466,7 +1396,7 @@ }, { "cell_type": "code", - "execution_count": 58, + "execution_count": 59, "metadata": {}, "outputs": [], "source": [ @@ -1482,7 +1412,7 @@ }, { "cell_type": "code", - "execution_count": 59, + "execution_count": 60, "metadata": {}, "outputs": [], "source": [ 
@@ -1498,10 +1428,8 @@ }, { "cell_type": "code", - "execution_count": 60, - "metadata": { - "collapsed": true - }, + "execution_count": 61, + "metadata": {}, "outputs": [], "source": [ "reconstruction_mask_reshaped = tf.reshape(\n", @@ -1518,10 +1446,8 @@ }, { "cell_type": "code", - "execution_count": 61, - "metadata": { - "collapsed": true - }, + "execution_count": 62, + "metadata": {}, "outputs": [], "source": [ "caps2_output_masked = tf.multiply(\n", @@ -1531,7 +1457,7 @@ }, { "cell_type": "code", - "execution_count": 62, + "execution_count": 63, "metadata": {}, "outputs": [], "source": [ @@ -1547,10 +1473,8 @@ }, { "cell_type": "code", - "execution_count": 63, - "metadata": { - "collapsed": true - }, + "execution_count": 64, + "metadata": {}, "outputs": [], "source": [ "decoder_input = tf.reshape(caps2_output_masked,\n", @@ -1567,7 +1491,7 @@ }, { "cell_type": "code", - "execution_count": 64, + "execution_count": 65, "metadata": {}, "outputs": [], "source": [ @@ -1590,10 +1514,8 @@ }, { "cell_type": "code", - "execution_count": 65, - "metadata": { - "collapsed": true - }, + "execution_count": 66, + "metadata": {}, "outputs": [], "source": [ "n_hidden1 = 512\n", @@ -1603,10 +1525,8 @@ }, { "cell_type": "code", - "execution_count": 66, - "metadata": { - "collapsed": true - }, + "execution_count": 67, + "metadata": {}, "outputs": [], "source": [ "with tf.name_scope(\"decoder\"):\n", @@ -1637,16 +1557,14 @@ }, { "cell_type": "code", - "execution_count": 67, - "metadata": { - "collapsed": true - }, + "execution_count": 68, + "metadata": {}, "outputs": [], "source": [ "X_flat = tf.reshape(X, [-1, n_output], name=\"X_flat\")\n", "squared_difference = tf.square(X_flat - decoder_output,\n", " name=\"squared_difference\")\n", - "reconstruction_loss = tf.reduce_sum(squared_difference,\n", + "reconstruction_loss = tf.reduce_mean(squared_difference,\n", " name=\"reconstruction_loss\")" ] }, @@ -1666,10 +1584,8 @@ }, { "cell_type": "code", - "execution_count": 68, - "metadata": { - "collapsed": true - }, + "execution_count": 69, + "metadata": {}, "outputs": [], "source": [ "alpha = 0.0005\n", @@ -1700,10 +1616,8 @@ }, { "cell_type": "code", - "execution_count": 69, - "metadata": { - "collapsed": true - }, + "execution_count": 70, + "metadata": {}, "outputs": [], "source": [ "correct = tf.equal(y, y_pred, name=\"correct\")\n", @@ -1726,10 +1640,8 @@ }, { "cell_type": "code", - "execution_count": 70, - "metadata": { - "collapsed": true - }, + "execution_count": 71, + "metadata": {}, "outputs": [], "source": [ "optimizer = tf.train.AdamOptimizer()\n", @@ -1752,10 +1664,8 @@ }, { "cell_type": "code", - "execution_count": 71, - "metadata": { - "collapsed": true - }, + "execution_count": 72, + "metadata": {}, "outputs": [], "source": [ "init = tf.global_variables_initializer()\n", @@ -1792,7 +1702,7 @@ }, { "cell_type": "code", - "execution_count": 72, + "execution_count": 73, "metadata": {}, "outputs": [], "source": [ @@ -1858,7 +1768,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Training is finished, we reached over 99.3% accuracy on the validation set after just 5 epochs, things are looking good. Now let's evaluate the model on the test set." + "Training is finished, we reached over 99.4% accuracy on the validation set after just 5 epochs, things are looking good. Now let's evaluate the model on the test set." 
] }, { @@ -1870,7 +1780,7 @@ }, { "cell_type": "code", - "execution_count": 73, + "execution_count": 74, "metadata": {}, "outputs": [], "source": [ @@ -1903,7 +1813,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We reach 99.43% accuracy on the test set. Pretty nice. :)" + "We reach 99.53% accuracy on the test set. Pretty nice. :)" ] }, { @@ -1922,7 +1832,7 @@ }, { "cell_type": "code", - "execution_count": 74, + "execution_count": 75, "metadata": {}, "outputs": [], "source": [ @@ -1954,7 +1864,7 @@ }, { "cell_type": "code", - "execution_count": 75, + "execution_count": 76, "metadata": {}, "outputs": [], "source": [ @@ -2010,7 +1920,7 @@ }, { "cell_type": "code", - "execution_count": 76, + "execution_count": 77, "metadata": {}, "outputs": [], "source": [ @@ -2026,10 +1936,8 @@ }, { "cell_type": "code", - "execution_count": 77, - "metadata": { - "collapsed": true - }, + "execution_count": 78, + "metadata": {}, "outputs": [], "source": [ "def tweak_pose_parameters(output_vectors, min=-0.5, max=0.5, n_steps=11):\n", @@ -2050,10 +1958,8 @@ }, { "cell_type": "code", - "execution_count": 78, - "metadata": { - "collapsed": true - }, + "execution_count": 79, + "metadata": {}, "outputs": [], "source": [ "n_steps = 11\n", @@ -2072,7 +1978,7 @@ }, { "cell_type": "code", - "execution_count": 79, + "execution_count": 80, "metadata": {}, "outputs": [], "source": [ @@ -2096,10 +2002,8 @@ }, { "cell_type": "code", - "execution_count": 80, - "metadata": { - "collapsed": true - }, + "execution_count": 81, + "metadata": {}, "outputs": [], "source": [ "tweak_reconstructions = decoder_output_value.reshape(\n", @@ -2115,7 +2019,7 @@ }, { "cell_type": "code", - "execution_count": 81, + "execution_count": 82, "metadata": {}, "outputs": [], "source": [ @@ -2149,9 +2053,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [] } @@ -2172,7 +2074,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.2" + "version": "3.6.3" } }, "nbformat": 4, diff --git a/tools_numpy.ipynb b/tools_numpy.ipynb index 5ec032d..ed2d81b 100644 --- a/tools_numpy.ipynb +++ b/tools_numpy.ipynb @@ -459,7 +459,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "NumPy first creates three `ndarrays` (one per dimension), each of shape `(3, 2, 10)`. Each array has values equal to the coordinate along a specific axis. For example, all elements in the `z` array are equal to their z-coordinate:\n", + "NumPy first creates three `ndarrays` (one per dimension), each of shape `(2, 10)`. Each array has values equal to the coordinate along a specific axis. For example, all elements in the `z` array are equal to their z-coordinate:\n", "\n", " [[[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n", " [ 0. 0. 0. 0. 0. 0. 0. 0. 0. 
0.]]\n", diff --git a/tools_pandas.ipynb b/tools_pandas.ipynb index 379443e..6580f20 100644 --- a/tools_pandas.ipynb +++ b/tools_pandas.ipynb @@ -23,9 +23,7 @@ { "cell_type": "code", "execution_count": 1, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "from __future__ import division, print_function, unicode_literals" @@ -41,9 +39,7 @@ { "cell_type": "code", "execution_count": 2, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "import pandas as pd" @@ -71,9 +67,7 @@ { "cell_type": "code", "execution_count": 3, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "s = pd.Series([2,-1,3,5])\n", @@ -91,9 +85,7 @@ { "cell_type": "code", "execution_count": 4, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", @@ -110,9 +102,7 @@ { "cell_type": "code", "execution_count": 5, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "s + [1000,2000,3000,4000]" @@ -128,9 +118,7 @@ { "cell_type": "code", "execution_count": 6, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "s + 1000" @@ -146,9 +134,7 @@ { "cell_type": "code", "execution_count": 7, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "s < 0" @@ -165,9 +151,7 @@ { "cell_type": "code", "execution_count": 8, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "s2 = pd.Series([68, 83, 112, 68], index=[\"alice\", \"bob\", \"charles\", \"darwin\"])\n", @@ -184,9 +168,7 @@ { "cell_type": "code", "execution_count": 9, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "s2[\"bob\"]" @@ -202,9 +184,7 @@ { "cell_type": "code", "execution_count": 10, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "s2[1]" @@ -220,9 +200,7 @@ { "cell_type": "code", "execution_count": 11, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "s2.loc[\"bob\"]" @@ -231,9 +209,7 @@ { "cell_type": "code", "execution_count": 12, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "s2.iloc[1]" @@ -249,9 +225,7 @@ { "cell_type": "code", "execution_count": 13, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "s2.iloc[1:3]" @@ -267,9 +241,7 @@ { "cell_type": "code", "execution_count": 14, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "surprise = pd.Series([1000, 1001, 1002, 1003])\n", @@ -279,9 +251,7 @@ { "cell_type": "code", "execution_count": 15, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "surprise_slice = surprise[2:]\n", @@ -298,9 +268,7 @@ { "cell_type": "code", "execution_count": 16, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "try:\n", @@ -319,9 +287,7 @@ { "cell_type": "code", "execution_count": 17, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "surprise_slice.iloc[0]" @@ -338,9 +304,7 @@ { "cell_type": "code", "execution_count": 18, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "weights = {\"alice\": 68, \"bob\": 83, \"colin\": 86, \"darwin\": 68}\n", @@ -358,9 +322,7 @@ { "cell_type": "code", "execution_count": 19, - "metadata": { - "collapsed": false - }, + "metadata": {}, 
"outputs": [], "source": [ "s4 = pd.Series(weights, index = [\"colin\", \"alice\"])\n", @@ -378,9 +340,7 @@ { "cell_type": "code", "execution_count": 20, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "print(s2.keys())\n", @@ -401,9 +361,7 @@ { "cell_type": "code", "execution_count": 21, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "s5 = pd.Series([1000,1000,1000,1000])\n", @@ -431,9 +389,7 @@ { "cell_type": "code", "execution_count": 22, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "meaning = pd.Series(42, [\"life\", \"universe\", \"everything\"])\n", @@ -451,9 +407,7 @@ { "cell_type": "code", "execution_count": 23, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "s6 = pd.Series([83, 68], index=[\"bob\", \"alice\"], name=\"weights\")\n", @@ -465,14 +419,13 @@ "metadata": {}, "source": [ "## Plotting a `Series`\n", - "Pandas makes it easy to plot `Series` data using matplotlib (for more details on matplotlib, check out the [matplotlib tutorial](tools_matplotlib.ipynb)). Just import matplotlib and call the `plot` method:" + "Pandas makes it easy to plot `Series` data using matplotlib (for more details on matplotlib, check out the [matplotlib tutorial](tools_matplotlib.ipynb)). Just import matplotlib and call the `plot()` method:" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { - "collapsed": false, "scrolled": true }, "outputs": [], @@ -504,15 +457,13 @@ "* it can handle timezones.\n", "\n", "## Time range\n", - "Let's start by creating a time series using `timerange`. This returns a `DatetimeIndex` containing one datetime per hour for 12 hours starting on October 29th 2016 at 5:30pm." + "Let's start by creating a time series using `pd.date_range()`. This returns a `DatetimeIndex` containing one datetime per hour for 12 hours starting on October 29th 2016 at 5:30pm." ] }, { "cell_type": "code", "execution_count": 25, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "dates = pd.date_range('2016/10/29 5:30pm', periods=12, freq='H')\n", @@ -529,9 +480,7 @@ { "cell_type": "code", "execution_count": 26, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "temp_series = pd.Series(temperatures, dates)\n", @@ -548,9 +497,7 @@ { "cell_type": "code", "execution_count": 27, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "temp_series.plot(kind=\"bar\")\n", @@ -564,15 +511,13 @@ "metadata": {}, "source": [ "## Resampling\n", - "Pandas let's us resample a time series very simply. Just call the `resample` method and specify a new frequency:" + "Pandas lets us resample a time series very simply. Just call the `resample()` method and specify a new frequency:" ] }, { "cell_type": "code", "execution_count": 28, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "temp_series_freq_2H = temp_series.resample(\"2H\")\n", @@ -583,15 +528,29 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Let's take a look at the result:" + "The resampling operation is actually a deferred operation, which is why we did not get a `Series` object, but a `DatetimeIndexResampler` object instead. 
To actually perform the resampling operation, we can simply call the `mean()` method: Pandas will compute the mean of every pair of consecutive hours:" ] }, { "cell_type": "code", "execution_count": 29, - "metadata": { - "collapsed": false - }, + "metadata": {}, + "outputs": [], + "source": [ + "temp_series_freq_2H = temp_series_freq_2H.mean()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's plot the result:" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, "outputs": [], "source": [ "temp_series_freq_2H.plot(kind=\"bar\")\n", @@ -602,18 +561,33 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Note how the values have automatically been aggregated into 2-hour periods. If we look at the 6-8pm period, for example, we had a value of `5.1` at 6:30pm, and `6.1` at 7:30pm. After resampling, we just have one value of `5.6`, which is the mean of `5.1` and `6.1`. Computing the mean is the default behavior, but it is also possible to use a different aggregation function, for example we can decide to keep the minimum value of each period:" + "Note how the values have automatically been aggregated into 2-hour periods. If we look at the 6-8pm period, for example, we had a value of `5.1` at 6:30pm, and `6.1` at 7:30pm. After resampling, we just have one value of `5.6`, which is the mean of `5.1` and `6.1`. Rather than computing the mean, we could have used any other aggregation function, for example we can decide to keep the minimum value of each period:" ] }, { "cell_type": "code", - "execution_count": 30, - "metadata": { - "collapsed": false - }, + "execution_count": 31, + "metadata": {}, "outputs": [], "source": [ - "temp_series_freq_2H = temp_series.resample(\"2H\", how=np.min)\n", + "temp_series_freq_2H = temp_series.resample(\"2H\").min()\n", + "temp_series_freq_2H" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Or, equivalently, we could use the `apply()` method instead:" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [], + "source": [ + "temp_series_freq_2H = temp_series.resample(\"2H\").apply(np.min)\n", "temp_series_freq_2H" ] }, @@ -627,13 +601,11 @@ }, { "cell_type": "code", - "execution_count": 31, - "metadata": { - "collapsed": false - }, + "execution_count": 33, + "metadata": {}, "outputs": [], "source": [ - "temp_series_freq_15min = temp_series.resample(\"15Min\")\n", + "temp_series_freq_15min = temp_series.resample(\"15Min\").mean()\n", "temp_series_freq_15min.head(n=10) # `head` displays the top n values" ] }, @@ -641,14 +613,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "One solution is to fill the gaps by interpolating. We just call the `interpolate` method. The default is to use linear interpolation, but we can also select another method, such as cubic interpolation:" + "One solution is to fill the gaps by interpolating. We just call the `interpolate()` method. 
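Putting the resampling pieces together in one self-contained sketch (same made-up readings as before), including a default `interpolate()` call on the upsampled series:

```python
import numpy as np
import pandas as pd

dates = pd.date_range('2016/10/29 5:30pm', periods=12, freq='H')
temp_series = pd.Series([4.4, 5.1, 6.1, 6.2, 6.1, 6.1, 5.7, 5.2, 4.7, 4.1, 3.9, 3.5],
                        dates)

# Three equivalent ways of aggregating each 2-hour bin:
print(temp_series.resample("2H").mean().head(3))
print(temp_series.resample("2H").min().head(3))
print(temp_series.resample("2H").apply(np.min).head(3))

# Upsampling to 15 minutes leaves NaN gaps, which interpolate() fills:
temp_15min = temp_series.resample("15Min").mean()
print(temp_15min.interpolate().head(6))
```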
The default is to use linear interpolation, but we can also select another method, such as cubic interpolation:" ] }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 34, "metadata": { - "collapsed": false, "scrolled": true }, "outputs": [], @@ -659,10 +630,8 @@ }, { "cell_type": "code", - "execution_count": 33, - "metadata": { - "collapsed": false - }, + "execution_count": 35, + "metadata": {}, "outputs": [], "source": [ "temp_series.plot(label=\"Period: 1 hour\")\n", @@ -676,15 +645,13 @@ "metadata": {}, "source": [ "## Timezones\n", - "By default datetimes are *naive*: they are not aware of timezones, so 2016-10-30 02:30 might mean October 30th 2016 at 2:30am in Paris or in New York. We can make datetimes timezone *aware* by calling the `tz_localize` method:" + "By default datetimes are *naive*: they are not aware of timezones, so 2016-10-30 02:30 might mean October 30th 2016 at 2:30am in Paris or in New York. We can make datetimes timezone *aware* by calling the `tz_localize()` method:" ] }, { "cell_type": "code", - "execution_count": 34, - "metadata": { - "collapsed": false - }, + "execution_count": 36, + "metadata": {}, "outputs": [], "source": [ "temp_series_ny = temp_series.tz_localize(\"America/New_York\")\n", @@ -702,10 +669,8 @@ }, { "cell_type": "code", - "execution_count": 35, - "metadata": { - "collapsed": false - }, + "execution_count": 37, + "metadata": {}, "outputs": [], "source": [ "temp_series_paris = temp_series_ny.tz_convert(\"Europe/Paris\")\n", @@ -721,10 +686,8 @@ }, { "cell_type": "code", - "execution_count": 36, - "metadata": { - "collapsed": false - }, + "execution_count": 38, + "metadata": {}, "outputs": [], "source": [ "temp_series_paris_naive = temp_series_paris.tz_localize(None)\n", @@ -740,10 +703,8 @@ }, { "cell_type": "code", - "execution_count": 37, - "metadata": { - "collapsed": false - }, + "execution_count": 39, + "metadata": {}, "outputs": [], "source": [ "try:\n", @@ -762,10 +723,8 @@ }, { "cell_type": "code", - "execution_count": 38, - "metadata": { - "collapsed": false - }, + "execution_count": 40, + "metadata": {}, "outputs": [], "source": [ "temp_series_paris_naive.tz_localize(\"Europe/Paris\", ambiguous=\"infer\")" @@ -776,15 +735,13 @@ "metadata": {}, "source": [ "## Periods\n", - "The `period_range` function returns a `PeriodIndex` instead of a `DatetimeIndex`. For example, let's get all quarters in 2016 and 2017:" + "The `pd.period_range()` function returns a `PeriodIndex` instead of a `DatetimeIndex`. For example, let's get all quarters in 2016 and 2017:" ] }, { "cell_type": "code", - "execution_count": 39, - "metadata": { - "collapsed": false - }, + "execution_count": 41, + "metadata": {}, "outputs": [], "source": [ "quarters = pd.period_range('2016Q1', periods=8, freq='Q')\n", @@ -800,10 +757,8 @@ }, { "cell_type": "code", - "execution_count": 40, - "metadata": { - "collapsed": false - }, + "execution_count": 42, + "metadata": {}, "outputs": [], "source": [ "quarters + 3" @@ -813,15 +768,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The `asfreq` method lets us change the frequency of the `PeriodIndex`. All periods are lengthened or shortened accordingly. For example, let's convert all the quarterly periods to monthly periods (zooming in):" + "The `asfreq()` method lets us change the frequency of the `PeriodIndex`. All periods are lengthened or shortened accordingly. 
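Before zooming the quarters in and out, here is the timezone round trip from the previous section as one runnable sketch (made-up times and temperatures):

```python
import pandas as pd

dates = pd.date_range('2016/10/29 5:30pm', periods=3, freq='H')
temps = pd.Series([4.4, 5.1, 6.1], dates)

temps_ny = temps.tz_localize("America/New_York")   # naive -> timezone-aware
temps_paris = temps_ny.tz_convert("Europe/Paris")  # same instants, Paris clock
print(temps_ny.index[0])     # 2016-10-29 17:30:00-04:00
print(temps_paris.index[0])  # 2016-10-29 23:30:00+02:00
```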
For example, let's convert all the quarterly periods to monthly periods (zooming in):" ] }, { "cell_type": "code", - "execution_count": 41, - "metadata": { - "collapsed": false - }, + "execution_count": 43, + "metadata": {}, "outputs": [], "source": [ "quarters.asfreq(\"M\")" @@ -836,10 +789,8 @@ }, { "cell_type": "code", - "execution_count": 42, - "metadata": { - "collapsed": false - }, + "execution_count": 44, + "metadata": {}, "outputs": [], "source": [ "quarters.asfreq(\"M\", how=\"start\")" @@ -854,10 +805,8 @@ }, { "cell_type": "code", - "execution_count": 43, - "metadata": { - "collapsed": false - }, + "execution_count": 45, + "metadata": {}, "outputs": [], "source": [ "quarters.asfreq(\"A\")" @@ -872,10 +821,8 @@ }, { "cell_type": "code", - "execution_count": 44, - "metadata": { - "collapsed": false - }, + "execution_count": 46, + "metadata": {}, "outputs": [], "source": [ "quarterly_revenue = pd.Series([300, 320, 290, 390, 320, 360, 310, 410], index = quarters)\n", @@ -884,10 +831,8 @@ }, { "cell_type": "code", - "execution_count": 45, - "metadata": { - "collapsed": false - }, + "execution_count": 47, + "metadata": {}, "outputs": [], "source": [ "quarterly_revenue.plot(kind=\"line\")\n", @@ -903,10 +848,8 @@ }, { "cell_type": "code", - "execution_count": 46, - "metadata": { - "collapsed": false - }, + "execution_count": 48, + "metadata": {}, "outputs": [], "source": [ "last_hours = quarterly_revenue.to_timestamp(how=\"end\", freq=\"H\")\n", @@ -922,10 +865,8 @@ }, { "cell_type": "code", - "execution_count": 47, - "metadata": { - "collapsed": false - }, + "execution_count": 49, + "metadata": {}, "outputs": [], "source": [ "last_hours.to_period()" @@ -940,10 +881,8 @@ }, { "cell_type": "code", - "execution_count": 48, - "metadata": { - "collapsed": false - }, + "execution_count": 50, + "metadata": {}, "outputs": [], "source": [ "months_2016 = pd.period_range(\"2016\", periods=12, freq=\"M\")\n", @@ -965,10 +904,8 @@ }, { "cell_type": "code", - "execution_count": 49, - "metadata": { - "collapsed": false - }, + "execution_count": 51, + "metadata": {}, "outputs": [], "source": [ "people_dict = {\n", @@ -1001,10 +938,8 @@ }, { "cell_type": "code", - "execution_count": 50, - "metadata": { - "collapsed": false - }, + "execution_count": 52, + "metadata": {}, "outputs": [], "source": [ "people[\"birthyear\"]" @@ -1019,10 +954,8 @@ }, { "cell_type": "code", - "execution_count": 51, - "metadata": { - "collapsed": false - }, + "execution_count": 53, + "metadata": {}, "outputs": [], "source": [ "people[[\"birthyear\", \"hobby\"]]" @@ -1037,10 +970,8 @@ }, { "cell_type": "code", - "execution_count": 52, - "metadata": { - "collapsed": false - }, + "execution_count": 54, + "metadata": {}, "outputs": [], "source": [ "d2 = pd.DataFrame(\n", @@ -1060,10 +991,8 @@ }, { "cell_type": "code", - "execution_count": 53, - "metadata": { - "collapsed": false - }, + "execution_count": 55, + "metadata": {}, "outputs": [], "source": [ "values = [\n", @@ -1088,16 +1017,14 @@ }, { "cell_type": "code", - "execution_count": 54, - "metadata": { - "collapsed": false - }, + "execution_count": 56, + "metadata": {}, "outputs": [], "source": [ "masked_array = np.ma.asarray(values, dtype=np.object)\n", "masked_array[(0, 2), (1, 2)] = np.ma.masked\n", "d3 = pd.DataFrame(\n", - " values,\n", + " masked_array,\n", " columns=[\"birthyear\", \"children\", \"hobby\", \"weight\"],\n", " index=[\"alice\", \"bob\", \"charles\"]\n", " )\n", @@ -1113,10 +1040,8 @@ }, { "cell_type": "code", - "execution_count": 55, - "metadata": { - 
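As an aside, the period manipulations from this section can be collected into one self-contained sketch (the revenue figures are the same toy numbers used here):

```python
import pandas as pd

quarters = pd.period_range('2016Q1', periods=8, freq='Q')

print(quarters.asfreq("M"))               # last month of each quarter
print(quarters.asfreq("M", how="start"))  # first month instead
print(quarters.asfreq("A"))               # the year containing each quarter

quarterly_revenue = pd.Series([300, 320, 290, 390, 320, 360, 310, 410], index=quarters)
last_hours = quarterly_revenue.to_timestamp(how="end", freq="H")
print(last_hours.index[:2])               # now a DatetimeIndex
print(last_hours.to_period().index[:2])   # and back to a PeriodIndex
```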
"collapsed": false - }, + "execution_count": 57, + "metadata": {}, "outputs": [], "source": [ "d4 = pd.DataFrame(\n", @@ -1136,10 +1061,8 @@ }, { "cell_type": "code", - "execution_count": 56, - "metadata": { - "collapsed": false - }, + "execution_count": 58, + "metadata": {}, "outputs": [], "source": [ "people = pd.DataFrame({\n", @@ -1161,10 +1084,8 @@ }, { "cell_type": "code", - "execution_count": 57, - "metadata": { - "collapsed": false - }, + "execution_count": 59, + "metadata": {}, "outputs": [], "source": [ "d5 = pd.DataFrame(\n", @@ -1191,10 +1112,8 @@ }, { "cell_type": "code", - "execution_count": 58, - "metadata": { - "collapsed": false - }, + "execution_count": 60, + "metadata": {}, "outputs": [], "source": [ "d5[\"public\"]" @@ -1202,13 +1121,11 @@ }, { "cell_type": "code", - "execution_count": 59, - "metadata": { - "collapsed": false - }, + "execution_count": 61, + "metadata": {}, "outputs": [], "source": [ - "d5[\"public\", \"hobby\"] # Same result as d4[\"public\"][\"hobby\"]" + "d5[\"public\", \"hobby\"] # Same result as d5[\"public\"][\"hobby\"]" ] }, { @@ -1221,10 +1138,8 @@ }, { "cell_type": "code", - "execution_count": 60, - "metadata": { - "collapsed": false - }, + "execution_count": 62, + "metadata": {}, "outputs": [], "source": [ "d5" @@ -1234,15 +1149,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "There are two levels of columns, and two levels of indices. We can drop a column level by calling `droplevel` (the same goes for indices):" + "There are two levels of columns, and two levels of indices. We can drop a column level by calling `droplevel()` (the same goes for indices):" ] }, { "cell_type": "code", - "execution_count": 61, - "metadata": { - "collapsed": false - }, + "execution_count": 63, + "metadata": {}, "outputs": [], "source": [ "d5.columns = d5.columns.droplevel(level = 0)\n", @@ -1259,10 +1172,8 @@ }, { "cell_type": "code", - "execution_count": 62, - "metadata": { - "collapsed": false - }, + "execution_count": 64, + "metadata": {}, "outputs": [], "source": [ "d6 = d5.T\n", @@ -1274,15 +1185,13 @@ "metadata": {}, "source": [ "## Stacking and unstacking levels\n", - "Calling the `stack` method will push the lowest column level after the lowest index:" + "Calling the `stack()` method will push the lowest column level after the lowest index:" ] }, { "cell_type": "code", - "execution_count": 63, - "metadata": { - "collapsed": false - }, + "execution_count": 65, + "metadata": {}, "outputs": [], "source": [ "d7 = d6.stack()\n", @@ -1295,15 +1204,13 @@ "source": [ "Note that many `NaN` values appeared. This makes sense because many new combinations did not exist before (eg. there was no `bob` in `London`).\n", "\n", - "Calling `unstack` will do the reverse, once again creating many `NaN` values." + "Calling `unstack()` will do the reverse, once again creating many `NaN` values." ] }, { "cell_type": "code", - "execution_count": 64, - "metadata": { - "collapsed": false - }, + "execution_count": 66, + "metadata": {}, "outputs": [], "source": [ "d8 = d7.unstack()\n", @@ -1319,10 +1226,8 @@ }, { "cell_type": "code", - "execution_count": 65, - "metadata": { - "collapsed": false - }, + "execution_count": 67, + "metadata": {}, "outputs": [], "source": [ "d9 = d8.unstack()\n", @@ -1333,14 +1238,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The `stack` and `unstack` methods let you select the `level` to stack/unstack. 
You can even stack/unstack multiple levels at once:" + "The `stack()` and `unstack()` methods let you select the `level` to stack/unstack. You can even stack/unstack multiple levels at once:" ] }, { "cell_type": "code", - "execution_count": 66, + "execution_count": 68, "metadata": { - "collapsed": false, "scrolled": true }, "outputs": [], @@ -1354,7 +1258,7 @@ "metadata": {}, "source": [ "## Most methods return modified copies\n", - "As you may have noticed, the `stack` and `unstack` methods do not modify the object they apply to. Instead, they work on a copy and return that copy. This is true of most methods in pandas." + "As you may have noticed, the `stack()` and `unstack()` methods do not modify the object they apply to. Instead, they work on a copy and return that copy. This is true of most methods in pandas." ] }, { @@ -1367,10 +1271,8 @@ }, { "cell_type": "code", - "execution_count": 67, - "metadata": { - "collapsed": false - }, + "execution_count": 69, + "metadata": {}, "outputs": [], "source": [ "people" @@ -1380,15 +1282,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The `loc` attribute lets you access rows instead of columns. The result is `Series` object in which the `DataFrame`'s column names are mapped to row index labels:" + "The `loc` attribute lets you access rows instead of columns. The result is a `Series` object in which the `DataFrame`'s column names are mapped to row index labels:" ] }, { "cell_type": "code", - "execution_count": 68, - "metadata": { - "collapsed": false - }, + "execution_count": 70, + "metadata": {}, "outputs": [], "source": [ "people.loc[\"charles\"]" @@ -1403,10 +1303,8 @@ }, { "cell_type": "code", - "execution_count": 69, - "metadata": { - "collapsed": false - }, + "execution_count": 71, + "metadata": {}, "outputs": [], "source": [ "people.iloc[2]" @@ -1421,10 +1319,8 @@ }, { "cell_type": "code", - "execution_count": 70, - "metadata": { - "collapsed": false - }, + "execution_count": 72, + "metadata": {}, "outputs": [], "source": [ "people.iloc[1:3]" @@ -1439,10 +1335,8 @@ }, { "cell_type": "code", - "execution_count": 71, - "metadata": { - "collapsed": false - }, + "execution_count": 73, + "metadata": {}, "outputs": [], "source": [ "people[np.array([True, False, True])]" @@ -1457,10 +1351,8 @@ }, { "cell_type": "code", - "execution_count": 72, - "metadata": { - "collapsed": false - }, + "execution_count": 74, + "metadata": {}, "outputs": [], "source": [ "people[people[\"birthyear\"] < 1990]" @@ -1476,10 +1368,8 @@ }, { "cell_type": "code", - "execution_count": 73, - "metadata": { - "collapsed": false - }, + "execution_count": 75, + "metadata": {}, "outputs": [], "source": [ "people" @@ -1487,13 +1377,11 @@ }, { "cell_type": "code", - "execution_count": 74, - "metadata": { - "collapsed": false - }, + "execution_count": 76, + "metadata": {}, "outputs": [], "source": [ - "people[\"age\"] = 2016 - people[\"birthyear\"] # adds a new column \"age\"\n", + "people[\"age\"] = 2018 - people[\"birthyear\"] # adds a new column \"age\"\n", "people[\"over 30\"] = people[\"age\"] > 30 # adds another column \"over 30\"\n", "birthyears = people.pop(\"birthyear\")\n", "del people[\"children\"]\n", @@ -1503,10 +1391,8 @@ }, { "cell_type": "code", - "execution_count": 75, - "metadata": { - "collapsed": false - }, + "execution_count": 77, + "metadata": {}, "outputs": [], "source": [ "birthyears" @@ -1521,10 +1407,8 @@ }, { "cell_type": "code", - "execution_count": 76, - "metadata": { - "collapsed": false - }, + "execution_count": 78, + "metadata": {}, 
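As an aside, here is a self-contained sketch of filtering rows and adding or removing columns, with made-up people but the same column names as above:

```python
import pandas as pd

people = pd.DataFrame({"birthyear": [1985, 1984, 1992],
                       "weight": [68, 83, 112]},
                      index=["alice", "bob", "charles"])

print(people[people["birthyear"] < 1990])   # boolean expressions filter rows

people["age"] = 2018 - people["birthyear"]  # new columns are added on the right
birthyears = people.pop("birthyear")        # pop() removes a column and returns it
del people["weight"]                        # del just removes it
print(people)
```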
"outputs": [], "source": [ "people[\"pets\"] = pd.Series({\"bob\": 0, \"charles\": 5, \"eugene\":1}) # alice is missing, eugene is ignored\n", @@ -1535,15 +1419,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "When adding a new column, it is added at the end (on the right) by default. You can also insert a column anywhere else using the `insert` method:" + "When adding a new column, it is added at the end (on the right) by default. You can also insert a column anywhere else using the `insert()` method:" ] }, { "cell_type": "code", - "execution_count": 77, - "metadata": { - "collapsed": false - }, + "execution_count": 79, + "metadata": {}, "outputs": [], "source": [ "people.insert(1, \"height\", [172, 181, 185])\n", @@ -1555,15 +1437,13 @@ "metadata": {}, "source": [ "## Assigning new columns\n", - "You can also create new columns by calling the `assign` method. Note that this returns a new `DataFrame` object, the original is not modified:" + "You can also create new columns by calling the `assign()` method. Note that this returns a new `DataFrame` object, the original is not modified:" ] }, { "cell_type": "code", - "execution_count": 78, - "metadata": { - "collapsed": false - }, + "execution_count": 80, + "metadata": {}, "outputs": [], "source": [ "people.assign(\n", @@ -1581,10 +1461,8 @@ }, { "cell_type": "code", - "execution_count": 79, - "metadata": { - "collapsed": false - }, + "execution_count": 81, + "metadata": {}, "outputs": [], "source": [ "try:\n", @@ -1605,10 +1483,8 @@ }, { "cell_type": "code", - "execution_count": 80, - "metadata": { - "collapsed": false - }, + "execution_count": 82, + "metadata": {}, "outputs": [], "source": [ "d6 = people.assign(body_mass_index = people[\"weight\"] / (people[\"height\"] / 100) ** 2)\n", @@ -1624,10 +1500,8 @@ }, { "cell_type": "code", - "execution_count": 81, - "metadata": { - "collapsed": false - }, + "execution_count": 83, + "metadata": {}, "outputs": [], "source": [ "try:\n", @@ -1643,15 +1517,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "But fear not, there is a simple solution. You can pass a function to the `assign` method (typically a `lambda` function), and this function will be called with the `DataFrame` as a parameter:" + "But fear not, there is a simple solution. You can pass a function to the `assign()` method (typically a `lambda` function), and this function will be called with the `DataFrame` as a parameter:" ] }, { "cell_type": "code", - "execution_count": 82, - "metadata": { - "collapsed": false - }, + "execution_count": 84, + "metadata": {}, "outputs": [], "source": [ "(people\n", @@ -1677,10 +1549,8 @@ }, { "cell_type": "code", - "execution_count": 83, - "metadata": { - "collapsed": false - }, + "execution_count": 85, + "metadata": {}, "outputs": [], "source": [ "people.eval(\"weight / (height/100) ** 2 > 25\")" @@ -1690,18 +1560,16 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Assignment expressions are also supported, and contrary to the `assign` method, this does not create a copy of the `DataFrame`, instead it directly modifies it:" + "Assignment expressions are also supported. 
Let's set `inplace=True` to directly modify the `DataFrame` rather than getting a modified copy:" ] }, { "cell_type": "code", - "execution_count": 84, - "metadata": { - "collapsed": false - }, + "execution_count": 86, + "metadata": {}, "outputs": [], "source": [ - "people.eval(\"body_mass_index = weight / (height/100) ** 2\")\n", + "people.eval(\"body_mass_index = weight / (height/100) ** 2\", inplace=True)\n", "people" ] }, @@ -1714,14 +1582,12 @@ }, { "cell_type": "code", - "execution_count": 85, - "metadata": { - "collapsed": false - }, + "execution_count": 87, + "metadata": {}, "outputs": [], "source": [ "overweight_threshold = 30\n", - "people.eval(\"overweight = body_mass_index > @overweight_threshold\")\n", + "people.eval(\"overweight = body_mass_index > @overweight_threshold\", inplace=True)\n", "people" ] }, @@ -1730,15 +1596,13 @@ "metadata": {}, "source": [ "## Querying a `DataFrame`\n", - "The `query` method lets you filter a `DataFrame` based on a query expression:" + "The `query()` method lets you filter a `DataFrame` based on a query expression:" ] }, { "cell_type": "code", - "execution_count": 86, - "metadata": { - "collapsed": false - }, + "execution_count": 88, + "metadata": {}, "outputs": [], "source": [ "people.query(\"age > 30 and pets == 0\")" @@ -1754,10 +1618,8 @@ }, { "cell_type": "code", - "execution_count": 87, - "metadata": { - "collapsed": false - }, + "execution_count": 89, + "metadata": {}, "outputs": [], "source": [ "people.sort_index(ascending=False)" @@ -1772,10 +1634,8 @@ }, { "cell_type": "code", - "execution_count": 88, - "metadata": { - "collapsed": false - }, + "execution_count": 90, + "metadata": {}, "outputs": [], "source": [ "people.sort_index(axis=1, inplace=True)\n", @@ -1791,10 +1651,8 @@ }, { "cell_type": "code", - "execution_count": 89, - "metadata": { - "collapsed": false - }, + "execution_count": 91, + "metadata": {}, "outputs": [], "source": [ "people.sort_values(by=\"age\", inplace=True)\n", @@ -1813,10 +1671,8 @@ }, { "cell_type": "code", - "execution_count": 90, - "metadata": { - "collapsed": false - }, + "execution_count": 92, + "metadata": {}, "outputs": [], "source": [ "people.plot(kind = \"line\", x = \"body_mass_index\", y = [\"height\", \"weight\"])\n", @@ -1827,14 +1683,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "You can pass extra arguments supported by matplotlib's functions. For example, we can create scatterplot and pass it a list of sizes using the `s` argument of matplotlib's `scatter` function:" + "You can pass extra arguments supported by matplotlib's functions. 
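For instance, line width, transparency and figure size go straight through to matplotlib; a minimal sketch, assuming a small `people` DataFrame like the one above:

```python
import matplotlib.pyplot as plt
import pandas as pd

people = pd.DataFrame({"height": [172, 181, 185], "weight": [68, 83, 112],
                       "body_mass_index": [22.99, 25.33, 32.72]},
                      index=["alice", "bob", "charles"])

# Keyword arguments that DataFrame.plot() does not recognise are passed
# on to matplotlib:
people.plot(kind="line", x="body_mass_index", y=["height", "weight"],
            figsize=(6, 3), linewidth=3, alpha=0.7)
plt.show()
```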
For example, we can create scatterplot and pass it a list of sizes using the `s` argument of matplotlib's `scatter()` function:" ] }, { "cell_type": "code", - "execution_count": 91, + "execution_count": 93, "metadata": { - "collapsed": false, "scrolled": true }, "outputs": [], @@ -1860,10 +1715,8 @@ }, { "cell_type": "code", - "execution_count": 92, - "metadata": { - "collapsed": false - }, + "execution_count": 94, + "metadata": {}, "outputs": [], "source": [ "grades_array = np.array([[8,8,9],[10,9,9],[4, 8, 2], [9, 10, 10]])\n", @@ -1880,10 +1733,8 @@ }, { "cell_type": "code", - "execution_count": 93, - "metadata": { - "collapsed": false - }, + "execution_count": 95, + "metadata": {}, "outputs": [], "source": [ "np.sqrt(grades)" @@ -1898,10 +1749,8 @@ }, { "cell_type": "code", - "execution_count": 94, - "metadata": { - "collapsed": false - }, + "execution_count": 96, + "metadata": {}, "outputs": [], "source": [ "grades + 1" @@ -1916,9 +1765,8 @@ }, { "cell_type": "code", - "execution_count": 95, + "execution_count": 97, "metadata": { - "collapsed": false, "scrolled": false }, "outputs": [], @@ -1935,10 +1783,8 @@ }, { "cell_type": "code", - "execution_count": 96, - "metadata": { - "collapsed": false - }, + "execution_count": 98, + "metadata": {}, "outputs": [], "source": [ "grades.mean()" @@ -1953,10 +1799,8 @@ }, { "cell_type": "code", - "execution_count": 97, - "metadata": { - "collapsed": false - }, + "execution_count": 99, + "metadata": {}, "outputs": [], "source": [ "(grades > 5).all()" @@ -1971,10 +1815,8 @@ }, { "cell_type": "code", - "execution_count": 98, - "metadata": { - "collapsed": false - }, + "execution_count": 100, + "metadata": {}, "outputs": [], "source": [ "(grades > 5).all(axis = 1)" @@ -1989,10 +1831,8 @@ }, { "cell_type": "code", - "execution_count": 99, - "metadata": { - "collapsed": false - }, + "execution_count": 101, + "metadata": {}, "outputs": [], "source": [ "(grades == 10).any(axis = 1)" @@ -2007,10 +1847,8 @@ }, { "cell_type": "code", - "execution_count": 100, - "metadata": { - "collapsed": false - }, + "execution_count": 102, + "metadata": {}, "outputs": [], "source": [ "grades - grades.mean() # equivalent to: grades - [7.75, 8.75, 7.50]" @@ -2025,10 +1863,8 @@ }, { "cell_type": "code", - "execution_count": 101, - "metadata": { - "collapsed": false - }, + "execution_count": 103, + "metadata": {}, "outputs": [], "source": [ "pd.DataFrame([[7.75, 8.75, 7.50]]*4, index=grades.index, columns=grades.columns)" @@ -2043,9 +1879,8 @@ }, { "cell_type": "code", - "execution_count": 102, + "execution_count": 104, "metadata": { - "collapsed": false, "scrolled": true }, "outputs": [], @@ -2063,10 +1898,8 @@ }, { "cell_type": "code", - "execution_count": 103, - "metadata": { - "collapsed": false - }, + "execution_count": 105, + "metadata": {}, "outputs": [], "source": [ "bonus_array = np.array([[0,np.nan,2],[np.nan,1,0],[0, 1, 0], [3, 3, 0]])\n", @@ -2076,9 +1909,8 @@ }, { "cell_type": "code", - "execution_count": 104, + "execution_count": 106, "metadata": { - "collapsed": false, "scrolled": true }, "outputs": [], @@ -2095,14 +1927,13 @@ "## Handling missing data\n", "Dealing with missing data is a frequent task when working with real life data. Pandas offers a few tools to handle missing data.\n", " \n", - "Let's try to fix the problem above. For example, we can decide that missing data should result in a zero, instead of `NaN`. We can replace all `NaN` values by a any value using the `fillna` method:" + "Let's try to fix the problem above. 
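As a self-contained reminder of that problem: adding two DataFrames aligns them on both axes and produces `NaN` wherever either side has no data (toy grades and bonus points, as in this section):

```python
import numpy as np
import pandas as pd

grades = pd.DataFrame([[8, 8, 9], [10, 9, 9], [4, 8, 2], [9, 10, 10]],
                      columns=["sep", "oct", "nov"],
                      index=["alice", "bob", "charles", "darwin"])
bonus_points = pd.DataFrame([[0, np.nan, 2], [np.nan, 1, 0], [0, 1, 0], [3, 3, 0]],
                            columns=["oct", "nov", "dec"],
                            index=["bob", "colin", "darwin", "charles"])

# Whole rows (alice, colin), whole columns (sep, dec) and single cells all
# end up as NaN wherever one side is missing:
print(grades + bonus_points)
```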
For example, we can decide that missing data should result in a zero, instead of `NaN`. We can replace all `NaN` values by a any value using the `fillna()` method:" ] }, { "cell_type": "code", - "execution_count": 105, + "execution_count": 107, "metadata": { - "collapsed": false, "scrolled": true }, "outputs": [], @@ -2119,10 +1950,8 @@ }, { "cell_type": "code", - "execution_count": 106, - "metadata": { - "collapsed": false - }, + "execution_count": 108, + "metadata": {}, "outputs": [], "source": [ "fixed_bonus_points = bonus_points.fillna(0)\n", @@ -2142,10 +1971,8 @@ }, { "cell_type": "code", - "execution_count": 107, - "metadata": { - "collapsed": false - }, + "execution_count": 109, + "metadata": {}, "outputs": [], "source": [ "bonus_points" @@ -2160,9 +1987,8 @@ }, { "cell_type": "code", - "execution_count": 108, + "execution_count": 110, "metadata": { - "collapsed": false, "scrolled": false }, "outputs": [], @@ -2179,10 +2005,8 @@ }, { "cell_type": "code", - "execution_count": 109, - "metadata": { - "collapsed": false - }, + "execution_count": 111, + "metadata": {}, "outputs": [], "source": [ "better_bonus_points = bonus_points.copy()\n", @@ -2201,10 +2025,8 @@ }, { "cell_type": "code", - "execution_count": 110, - "metadata": { - "collapsed": false - }, + "execution_count": 112, + "metadata": {}, "outputs": [], "source": [ "grades + better_bonus_points" @@ -2219,9 +2041,8 @@ }, { "cell_type": "code", - "execution_count": 111, + "execution_count": 113, "metadata": { - "collapsed": false, "scrolled": true }, "outputs": [], @@ -2235,15 +2056,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "There's not much we can do about December and Colin: it's bad enough that we are making up bonus points, but we can't reasonably make up grades (well I guess some teachers probably do). So let's call the `dropna` method to get rid of rows that are full of `NaN`s:" + "There's not much we can do about December and Colin: it's bad enough that we are making up bonus points, but we can't reasonably make up grades (well I guess some teachers probably do). 
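What we *can* do is drop them. `dropna()` has a few knobs worth knowing; here is a quick sketch on a toy DataFrame (the `thresh` argument is not used in the cells below, it is just shown for completeness):

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({"sep": [8.0, np.nan, np.nan],
                   "oct": [9.0, np.nan, 7.0],
                   "dec": [np.nan, np.nan, np.nan]},
                  index=["alice", "colin", "darwin"])

print(df.dropna(how="all"))          # drop rows where *every* value is NaN
print(df.dropna(axis=1, how="all"))  # same, but for columns
print(df.dropna(thresh=2))           # keep rows with at least 2 non-NaN values
```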
So let's call the `dropna()` method to get rid of rows that are full of `NaN`s:" ] }, { "cell_type": "code", - "execution_count": 112, - "metadata": { - "collapsed": false - }, + "execution_count": 114, + "metadata": {}, "outputs": [], "source": [ "final_grades_clean = final_grades.dropna(how=\"all\")\n", @@ -2259,10 +2078,8 @@ }, { "cell_type": "code", - "execution_count": 113, - "metadata": { - "collapsed": false - }, + "execution_count": 115, + "metadata": {}, "outputs": [], "source": [ "final_grades_clean = final_grades_clean.dropna(axis=1, how=\"all\")\n", @@ -2281,9 +2098,8 @@ }, { "cell_type": "code", - "execution_count": 114, + "execution_count": 116, "metadata": { - "collapsed": false, "scrolled": true }, "outputs": [], @@ -2301,10 +2117,8 @@ }, { "cell_type": "code", - "execution_count": 115, - "metadata": { - "collapsed": false - }, + "execution_count": 117, + "metadata": {}, "outputs": [], "source": [ "grouped_grades = final_grades.groupby(\"hobby\")\n", @@ -2320,10 +2134,8 @@ }, { "cell_type": "code", - "execution_count": 116, - "metadata": { - "collapsed": false - }, + "execution_count": 118, + "metadata": {}, "outputs": [], "source": [ "grouped_grades.mean()" @@ -2346,10 +2158,8 @@ }, { "cell_type": "code", - "execution_count": 117, - "metadata": { - "collapsed": false - }, + "execution_count": 119, + "metadata": {}, "outputs": [], "source": [ "bonus_points" @@ -2357,10 +2167,8 @@ }, { "cell_type": "code", - "execution_count": 118, - "metadata": { - "collapsed": false - }, + "execution_count": 120, + "metadata": {}, "outputs": [], "source": [ "more_grades = final_grades_clean.stack().reset_index()\n", @@ -2373,15 +2181,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Now we can call the `pivot_table` function for this `DataFrame`, asking to group by the `name` column. By default, `pivot_table` computes the `mean` of each numeric column:" + "Now we can call the `pd.pivot_table()` function for this `DataFrame`, asking to group by the `name` column. 
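If you want to run the pivot examples on their own, here is a small made-up stand-in for `more_grades`, plus the equivalent `groupby()` aggregation for comparison:

```python
import numpy as np
import pandas as pd

# A made-up long-format stand-in for more_grades
more_grades = pd.DataFrame({
    "name":  ["alice", "alice", "bob", "bob", "charles", "charles"],
    "month": ["sep",   "oct",   "sep", "oct", "sep",     "oct"],
    "grade": [8.0,     8.0,     10.0,  9.0,   4.0,       8.0],
    "bonus": [0.0,     np.nan,  np.nan, 1.0,  0.0,       1.0]})

# groupby() gives the same kind of per-name aggregation:
print(more_grades.groupby("name")[["grade", "bonus"]].mean())
```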
By default, `pivot_table()` computes the mean of each numeric column:" ] }, { "cell_type": "code", - "execution_count": 119, - "metadata": { - "collapsed": false - }, + "execution_count": 121, + "metadata": {}, "outputs": [], "source": [ "pd.pivot_table(more_grades, index=\"name\")" @@ -2391,15 +2197,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We can change the aggregation function by setting the `aggfunc` attribute, and we can also specify the list of columns whose values will be aggregated:" + "We can change the aggregation function by setting the `aggfunc` argument, and we can also specify the list of columns whose values will be aggregated:" ] }, { "cell_type": "code", - "execution_count": 120, - "metadata": { - "collapsed": false - }, + "execution_count": 122, + "metadata": {}, "outputs": [], "source": [ "pd.pivot_table(more_grades, index=\"name\", values=[\"grade\",\"bonus\"], aggfunc=np.max)" @@ -2414,10 +2218,8 @@ }, { "cell_type": "code", - "execution_count": 121, - "metadata": { - "collapsed": false - }, + "execution_count": 123, + "metadata": {}, "outputs": [], "source": [ "pd.pivot_table(more_grades, index=\"name\", values=\"grade\", columns=\"month\", margins=True)" @@ -2432,10 +2234,8 @@ }, { "cell_type": "code", - "execution_count": 122, - "metadata": { - "collapsed": false - }, + "execution_count": 124, + "metadata": {}, "outputs": [], "source": [ "pd.pivot_table(more_grades, index=(\"name\", \"month\"), margins=True)" @@ -2451,9 +2251,8 @@ }, { "cell_type": "code", - "execution_count": 123, + "execution_count": 125, "metadata": { - "collapsed": false, "scrolled": false }, "outputs": [], @@ -2469,14 +2268,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The `head` method returns the top 5 rows:" + "The `head()` method returns the top 5 rows:" ] }, { "cell_type": "code", - "execution_count": 124, + "execution_count": 126, "metadata": { - "collapsed": false, "scrolled": false }, "outputs": [], @@ -2488,15 +2286,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Of course there's also a `tail` function to view the bottom 5 rows. You can pass the number of rows you want:" + "Of course there's also a `tail()` function to view the bottom 5 rows. 
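(As an aside, before leaving pivot tables behind, here are the variations from that section gathered into one runnable sketch, using the made-up `more_grades` above.)

```python
import numpy as np
import pandas as pd

more_grades = pd.DataFrame({
    "name":  ["alice", "alice", "bob", "bob", "charles", "charles"],
    "month": ["sep",   "oct",   "sep", "oct", "sep",     "oct"],
    "grade": [8.0,     8.0,     10.0,  9.0,   4.0,       8.0],
    "bonus": [0.0,     np.nan,  np.nan, 1.0,  0.0,       1.0]})

# Default aggregation is the mean:
print(pd.pivot_table(more_grades, index="name", values=["grade", "bonus"]))
# Custom aggregation function:
print(pd.pivot_table(more_grades, index="name",
                     values=["grade", "bonus"], aggfunc=np.max))
# Spread one column across the top, with "All" margins:
print(pd.pivot_table(more_grades, index="name",
                     values="grade", columns="month", margins=True))
```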
You can pass the number of rows you want:" ] }, { "cell_type": "code", - "execution_count": 125, - "metadata": { - "collapsed": false - }, + "execution_count": 127, + "metadata": {}, "outputs": [], "source": [ "large_df.tail(n=2)" @@ -2506,14 +2302,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The `info` method prints out a summary of each columns contents:" + "The `info()` method prints out a summary of each columns contents:" ] }, { "cell_type": "code", - "execution_count": 126, + "execution_count": 128, "metadata": { - "collapsed": false, "scrolled": false }, "outputs": [], @@ -2525,7 +2320,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Finally, the `describe` method gives a nice overview of the main aggregated values over each column:\n", + "Finally, the `describe()` method gives a nice overview of the main aggregated values over each column:\n", "* `count`: number of non-null (not NaN) values\n", "* `mean`: mean of non-null values\n", "* `std`: [standard deviation](https://en.wikipedia.org/wiki/Standard_deviation) of non-null values\n", @@ -2536,9 +2331,8 @@ }, { "cell_type": "code", - "execution_count": 127, + "execution_count": 129, "metadata": { - "collapsed": false, "scrolled": false }, "outputs": [], @@ -2556,10 +2350,8 @@ }, { "cell_type": "code", - "execution_count": 128, - "metadata": { - "collapsed": false - }, + "execution_count": 130, + "metadata": {}, "outputs": [], "source": [ "my_df = pd.DataFrame(\n", @@ -2580,10 +2372,8 @@ }, { "cell_type": "code", - "execution_count": 129, - "metadata": { - "collapsed": true - }, + "execution_count": 131, + "metadata": {}, "outputs": [], "source": [ "my_df.to_csv(\"my_df.csv\")\n", @@ -2600,10 +2390,8 @@ }, { "cell_type": "code", - "execution_count": 130, - "metadata": { - "collapsed": false - }, + "execution_count": 132, + "metadata": {}, "outputs": [], "source": [ "for filename in (\"my_df.csv\", \"my_df.html\", \"my_df.json\"):\n", @@ -2624,10 +2412,8 @@ }, { "cell_type": "code", - "execution_count": 131, - "metadata": { - "collapsed": false - }, + "execution_count": 133, + "metadata": {}, "outputs": [], "source": [ "try:\n", @@ -2646,10 +2432,8 @@ }, { "cell_type": "code", - "execution_count": 132, - "metadata": { - "collapsed": false - }, + "execution_count": 134, + "metadata": {}, "outputs": [], "source": [ "my_df_loaded = pd.read_csv(\"my_df.csv\", index_col=0)\n", @@ -2665,10 +2449,8 @@ }, { "cell_type": "code", - "execution_count": 133, - "metadata": { - "collapsed": false - }, + "execution_count": 135, + "metadata": {}, "outputs": [], "source": [ "us_cities = None\n", @@ -2700,10 +2482,8 @@ }, { "cell_type": "code", - "execution_count": 134, - "metadata": { - "collapsed": false - }, + "execution_count": 136, + "metadata": {}, "outputs": [], "source": [ "city_loc = pd.DataFrame(\n", @@ -2719,10 +2499,8 @@ }, { "cell_type": "code", - "execution_count": 135, - "metadata": { - "collapsed": false - }, + "execution_count": 137, + "metadata": {}, "outputs": [], "source": [ "city_pop = pd.DataFrame(\n", @@ -2739,15 +2517,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Now let's join these `DataFrame`s using the `merge` function:" + "Now let's join these `DataFrame`s using the `merge()` function:" ] }, { "cell_type": "code", - "execution_count": 136, - "metadata": { - "collapsed": false - }, + "execution_count": 138, + "metadata": {}, "outputs": [], "source": [ "pd.merge(left=city_loc, right=city_pop, on=\"city\")" @@ -2764,10 +2540,8 @@ }, { "cell_type": "code", - "execution_count": 137, - 
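As an aside, the CSV round trip from the saving-and-loading section fits in a few self-contained lines (made-up data):

```python
import pandas as pd

my_df = pd.DataFrame({"hobby": ["Biking", "Dancing"], "weight": [68, 83]},
                     index=["alice", "bob"])

my_df.to_csv("my_df.csv")                        # the index is written as the first column...
loaded = pd.read_csv("my_df.csv", index_col=0)   # ...so read it back as the index
print(loaded)
```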
"metadata": { - "collapsed": false - }, + "execution_count": 139, + "metadata": {}, "outputs": [], "source": [ "all_cities = pd.merge(left=city_loc, right=city_pop, on=\"city\", how=\"outer\")\n", @@ -2783,10 +2557,8 @@ }, { "cell_type": "code", - "execution_count": 138, - "metadata": { - "collapsed": false - }, + "execution_count": 140, + "metadata": {}, "outputs": [], "source": [ "pd.merge(left=city_loc, right=city_pop, on=\"city\", how=\"right\")" @@ -2801,10 +2573,8 @@ }, { "cell_type": "code", - "execution_count": 139, - "metadata": { - "collapsed": false - }, + "execution_count": 141, + "metadata": {}, "outputs": [], "source": [ "city_pop2 = city_pop.copy()\n", @@ -2817,15 +2587,13 @@ "metadata": {}, "source": [ "## Concatenation\n", - "Rather than joining `DataFrame`s, we may just want to concatenate them. That's what `concat` is for:" + "Rather than joining `DataFrame`s, we may just want to concatenate them. That's what `concat()` is for:" ] }, { "cell_type": "code", - "execution_count": 140, - "metadata": { - "collapsed": false - }, + "execution_count": 142, + "metadata": {}, "outputs": [], "source": [ "result_concat = pd.concat([city_loc, city_pop])\n", @@ -2841,10 +2609,8 @@ }, { "cell_type": "code", - "execution_count": 141, - "metadata": { - "collapsed": false - }, + "execution_count": 143, + "metadata": {}, "outputs": [], "source": [ "result_concat.loc[3]" @@ -2859,10 +2625,8 @@ }, { "cell_type": "code", - "execution_count": 142, - "metadata": { - "collapsed": false - }, + "execution_count": 144, + "metadata": {}, "outputs": [], "source": [ "pd.concat([city_loc, city_pop], ignore_index=True)" @@ -2877,10 +2641,8 @@ }, { "cell_type": "code", - "execution_count": 143, - "metadata": { - "collapsed": false - }, + "execution_count": 145, + "metadata": {}, "outputs": [], "source": [ "pd.concat([city_loc, city_pop], join=\"inner\")" @@ -2895,9 +2657,8 @@ }, { "cell_type": "code", - "execution_count": 144, + "execution_count": 146, "metadata": { - "collapsed": false, "scrolled": true }, "outputs": [], @@ -2914,9 +2675,8 @@ }, { "cell_type": "code", - "execution_count": 145, + "execution_count": 147, "metadata": { - "collapsed": false, "scrolled": true }, "outputs": [], @@ -2935,15 +2695,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The `append` method is a useful shorthand for concatenating `DataFrame`s vertically:" + "The `append()` method is a useful shorthand for concatenating `DataFrame`s vertically:" ] }, { "cell_type": "code", - "execution_count": 146, - "metadata": { - "collapsed": false - }, + "execution_count": 148, + "metadata": {}, "outputs": [], "source": [ "city_loc.append(city_pop)" @@ -2953,7 +2711,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "As always in pandas, the `append` method does *not* actually modify `city_loc`: it works on a copy and returns the modified copy." + "As always in pandas, the `append()` method does *not* actually modify `city_loc`: it works on a copy and returns the modified copy." 
] }, { @@ -2966,10 +2724,8 @@ }, { "cell_type": "code", - "execution_count": 147, - "metadata": { - "collapsed": false - }, + "execution_count": 149, + "metadata": {}, "outputs": [], "source": [ "city_eco = city_pop.copy()\n", @@ -2986,10 +2742,8 @@ }, { "cell_type": "code", - "execution_count": 148, - "metadata": { - "collapsed": false - }, + "execution_count": 150, + "metadata": {}, "outputs": [], "source": [ "city_eco[\"economy\"] = city_eco[\"eco_code\"].astype('category')\n", @@ -3005,10 +2759,8 @@ }, { "cell_type": "code", - "execution_count": 149, - "metadata": { - "collapsed": false - }, + "execution_count": 151, + "metadata": {}, "outputs": [], "source": [ "city_eco[\"economy\"].cat.categories = [\"Finance\", \"Energy\", \"Tourism\"]\n", @@ -3024,10 +2776,8 @@ }, { "cell_type": "code", - "execution_count": 150, - "metadata": { - "collapsed": false - }, + "execution_count": 152, + "metadata": {}, "outputs": [], "source": [ "city_eco.sort_values(by=\"economy\", ascending=False)" @@ -3042,25 +2792,32 @@ "# What next?\n", "As you probably noticed by now, pandas is quite a large library with *many* features. Although we went through the most important features, there is still a lot to discover. Probably the best way to learn more is to get your hands dirty with some real-life data. It is also a good idea to go through pandas' excellent [documentation](http://pandas.pydata.org/pandas-docs/stable/index.html), in particular the [Cookbook](http://pandas.pydata.org/pandas-docs/stable/cookbook.html)." ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { "kernelspec": { - "display_name": "Python 2", + "display_name": "Python 3", "language": "python", - "name": "python2" + "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", - "version": 2 + "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.11" + "pygments_lexer": "ipython3", + "version": "3.6.3" }, "toc": { "toc_cell": false, @@ -3071,5 +2828,5 @@ } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 }
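Finally, the categorical example from the end of the pandas notebook as a self-contained sketch. The economy codes are made up, and recent pandas versions prefer `rename_categories()` over assigning to `cat.categories` directly:

```python
import pandas as pd

city_eco = pd.DataFrame({"city": ["San Francisco", "New York", "Miami"],
                         "eco_code": [17, 17, 9]})

city_eco["economy"] = city_eco["eco_code"].astype("category")
# Map the category codes (9, 17) to readable labels, in category order:
city_eco["economy"] = city_eco["economy"].cat.rename_categories(["Tourism", "Finance"])
print(city_eco.sort_values(by="economy", ascending=False))
```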