handson-ml/02_end_to_end_machine_learn...

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "**Chapter 2 – End-to-end Machine Learning project**\n",
    "\n",
    "*Welcome to Machine Learning Housing Corp.! Your task is to predict median house values in Californian districts, given a number of features from these districts.*\n",
    "\n",
    "*This notebook contains all the sample code and solutions to the exercises in chapter 2.*"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "# Setup"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "First, let's make sure this notebook works well in both Python 2 and 3, import a few common modules, ensure Matplotlib plots figures inline, and prepare a function to save the figures:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "# To support both python 2 and python 3\n",
    "from __future__ import division, print_function, unicode_literals\n",
    "\n",
    "# Common imports\n",
    "import numpy as np\n",
    "import os\n",
    "\n",
    "# to make this notebook's output stable across runs\n",
    "np.random.seed(42)\n",
    "\n",
    "# To plot pretty figures\n",
    "%matplotlib inline\n",
    "import matplotlib\n",
    "import matplotlib.pyplot as plt\n",
    "plt.rcParams['axes.labelsize'] = 14\n",
    "plt.rcParams['xtick.labelsize'] = 12\n",
    "plt.rcParams['ytick.labelsize'] = 12\n",
    "\n",
    "# Where to save the figures\n",
    "PROJECT_ROOT_DIR = \".\"\n",
    "CHAPTER_ID = \"end_to_end_project\"\n",
    "\n",
    "def save_fig(fig_id, tight_layout=True):\n",
    "    \"\"\"Save the current Matplotlib figure as a 300 dpi PNG file.\n",
    "\n",
    "    The file is written to PROJECT_ROOT_DIR/images/CHAPTER_ID/<fig_id>.png.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    fig_id : str\n",
    "        Base name of the image file, without the \".png\" extension.\n",
    "    tight_layout : bool, default True\n",
    "        When true, plt.tight_layout() is called before saving.\n",
    "    \"\"\"\n",
    "    images_dir = os.path.join(PROJECT_ROOT_DIR, \"images\", CHAPTER_ID)\n",
    "    # plt.savefig() does not create missing folders, so make sure the target\n",
    "    # directory exists (isdir/makedirs also works on Python 2).\n",
    "    if not os.path.isdir(images_dir):\n",
    "        os.makedirs(images_dir)\n",
    "    path = os.path.join(images_dir, fig_id + \".png\")\n",
    "    print(\"Saving figure\", fig_id)\n",
    "    if tight_layout:\n",
    "        plt.tight_layout()\n",
    "    plt.savefig(path, format='png', dpi=300)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "# Get the data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import tarfile\n",
    "from six.moves import urllib\n",
    "\n",
    "DOWNLOAD_ROOT = \"https://raw.githubusercontent.com/ageron/handson-ml/master/\"\n",
    "HOUSING_PATH = \"datasets/housing\"\n",
    "HOUSING_URL = DOWNLOAD_ROOT + HOUSING_PATH + \"/housing.tgz\"\n",
    "\n",
    "def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):\n",
    "    \"\"\"Download the housing archive and extract it into `housing_path`.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    housing_url : str\n",
    "        URL of the .tgz archive to download.\n",
    "    housing_path : str\n",
    "        Directory that receives housing.tgz and its extracted content;\n",
    "        it is created when it does not exist yet.\n",
    "    \"\"\"\n",
    "    if not os.path.isdir(housing_path):\n",
    "        os.makedirs(housing_path)\n",
    "    tgz_path = os.path.join(housing_path, \"housing.tgz\")\n",
    "    urllib.request.urlretrieve(housing_url, tgz_path)\n",
    "    # NOTE: extractall() trusts the member paths stored in the archive, so\n",
    "    # only point this function at archives from a trusted source.\n",
    "    housing_tgz = tarfile.open(tgz_path)\n",
    "    try:\n",
    "        housing_tgz.extractall(path=housing_path)\n",
    "    finally:\n",
    "        # release the file handle even when the extraction fails\n",
    "        housing_tgz.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "fetch_housing_data()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "def load_housing_data(housing_path=HOUSING_PATH):\n",
    "    \"\"\"Read housing.csv from `housing_path` and return it as a DataFrame.\"\"\"\n",
    "    return pd.read_csv(os.path.join(housing_path, \"housing.csv\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "housing = load_housing_data()\n",
    "housing.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "housing.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "housing[\"ocean_proximity\"].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "housing.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "%matplotlib inline\n",
    "import matplotlib.pyplot as plt\n",
    "housing.hist(bins=50, figsize=(20,15))\n",
    "save_fig(\"attribute_histogram_plots\")\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "# to make this notebook's output identical at every run\n",
    "np.random.seed(42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "\n",
    "def split_train_test(data, test_ratio):\n",
    "    \"\"\"Randomly split `data` into a (train, test) pair of DataFrames.\n",
    "\n",
    "    int(len(data) * test_ratio) rows, drawn through np.random.permutation,\n",
    "    form the test set; all remaining rows form the training set.\n",
    "    \"\"\"\n",
    "    n_rows = len(data)\n",
    "    n_test = int(n_rows * test_ratio)\n",
    "    order = np.random.permutation(n_rows)\n",
    "    test_rows, train_rows = order[:n_test], order[n_test:]\n",
    "    return data.iloc[train_rows], data.iloc[test_rows]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "train_set, test_set = split_train_test(housing, 0.2)\n",
    "print(len(train_set), \"train +\", len(test_set), \"test\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "import hashlib\n",
    "\n",
    "def test_set_check(identifier, test_ratio, hash):\n",
    "    \"\"\"Return True when the instance with this identifier goes to the test set.\n",
    "\n",
    "    The identifier is converted to np.int64 and hashed with `hash`; the\n",
    "    instance is assigned to the test set when the last byte of the digest\n",
    "    is lower than 256 * test_ratio.\n",
    "    \"\"\"\n",
    "    # digest() is a str on Python 2 and bytes on Python 3; going through a\n",
    "    # bytearray makes [-1] an int on both, so the comparison is numeric.\n",
    "    return bytearray(hash(np.int64(identifier)).digest())[-1] < 256 * test_ratio\n",
    "\n",
    "def split_train_test_by_id(data, test_ratio, id_column, hash=hashlib.md5):\n",
    "    \"\"\"Split `data` into (train, test) according to a hash of `id_column`.\n",
    "\n",
    "    Each row is routed by test_set_check() applied to its identifier, so a\n",
    "    given identifier always lands on the same side of the split.\n",
    "    \"\"\"\n",
    "    ids = data[id_column]\n",
    "    in_test_set = ids.apply(lambda id_: test_set_check(id_, test_ratio, hash))\n",
    "    return data.loc[~in_test_set], data.loc[in_test_set]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "# This version supports both Python 2 and Python 3, instead of just Python 3.\n",
    "def test_set_check(identifier, test_ratio, hash):\n",
    "    \"\"\"Tell whether the hashed identifier falls below the test-set threshold.\"\"\"\n",
    "    digest_bytes = bytearray(hash(np.int64(identifier)).digest())\n",
    "    threshold = 256 * test_ratio\n",
    "    # only the last byte of the digest takes part in the decision\n",
    "    return digest_bytes[-1] < threshold"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "housing_with_id = housing.reset_index()   # adds an `index` column\n",
    "train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, \"index\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "housing_with_id[\"id\"] = housing[\"longitude\"] * 1000 + housing[\"latitude\"]\n",
    "train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, \"id\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "test_set.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "test_set.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "housing[\"median_income\"].hist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "# Bucket the median income into categories: ceil(income / 1.5), with every\n",
    "# category from 5 upwards merged into category 5.\n",
    "income_cat = np.ceil(housing[\"median_income\"] / 1.5)\n",
    "# Assign the capped Series back instead of calling .where(..., inplace=True)\n",
    "# on housing[\"income_cat\"]: an in-place edit of a column selection is chained\n",
    "# assignment and is not guaranteed to reach `housing` itself.\n",
    "housing[\"income_cat\"] = income_cat.where(income_cat < 5, 5.0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "housing[\"income_cat\"].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "from sklearn.model_selection import StratifiedShuffleSplit\n",
    "\n",
    "# Single 80/20 split that preserves the income_cat proportions.\n",
    "split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)\n",
    "# split() yields positional row indices, so select the rows with .iloc;\n",
    "# .loc is only equivalent while `housing` keeps its default 0..n-1 index.\n",
    "for train_index, test_index in split.split(housing, housing[\"income_cat\"]):\n",
    "    strat_train_set = housing.iloc[train_index]\n",
    "    strat_test_set = housing.iloc[test_index]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "housing[\"income_cat\"].value_counts() / len(housing)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "def income_cat_proportions(data):\n",
    "    return data[\"income_cat\"].value_counts() / len(data)\n",
    "\n",
    "train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)\n",
    "\n",
    "compare_props = pd.DataFrame({\n",
    "    \"Overall\": income_cat_proportions(housing),\n",
    "    \"Stratified\": income_cat_proportions(strat_test_set),\n",
    "    \"Random\": income_cat_proportions(test_set),\n",
    "}).sort_index()\n",
    "compare_props[\"Rand. %error\"] = 100 * compare_props[\"Random\"] / compare_props[\"Overall\"] - 100\n",
    "compare_props[\"Strat. %error\"] = 100 * compare_props[\"Stratified\"] / compare_props[\"Overall\"] - 100"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "compare_props"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "for set_ in (strat_train_set, strat_test_set):\n",
    "    set_.drop(\"income_cat\", axis=1, inplace=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "# Discover and visualize the data to gain insights"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "housing = strat_train_set.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "housing.plot(kind=\"scatter\", x=\"longitude\", y=\"latitude\")\n",
    "save_fig(\"bad_visualization_plot\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "housing.plot(kind=\"scatter\", x=\"longitude\", y=\"latitude\", alpha=0.1)\n",
    "save_fig(\"better_visualization_plot\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "housing.plot(kind=\"scatter\", x=\"longitude\", y=\"latitude\", alpha=0.4,\n",
    "    s=housing[\"population\"]/100, label=\"population\", figsize=(10,7),\n",
    "    c=\"median_house_value\", cmap=plt.get_cmap(\"jet\"), colorbar=True,\n",
    ")\n",
    "plt.legend()\n",
    "save_fig(\"housing_prices_scatterplot\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "import matplotlib.image as mpimg\n",
    "california_img=mpimg.imread(PROJECT_ROOT_DIR + '/images/end_to_end_project/california.png')\n",
    "# Scatter plot of the districts: marker size follows the population and the\n",
    "# marker colour follows the median house value.\n",
    "ax = housing.plot(kind=\"scatter\", x=\"longitude\", y=\"latitude\", figsize=(10,7),\n",
    "                       s=housing['population']/100, label=\"Population\",\n",
    "                       c=\"median_house_value\", cmap=plt.get_cmap(\"jet\"),\n",
    "                       colorbar=False, alpha=0.4,\n",
    "                      )\n",
    "# Draw the map image underneath, stretched over the given lon/lat extent.\n",
    "plt.imshow(california_img, extent=[-124.55, -113.80, 32.45, 42.05], alpha=0.5)\n",
    "plt.ylabel(\"Latitude\", fontsize=14)\n",
    "plt.xlabel(\"Longitude\", fontsize=14)\n",
    "\n",
    "prices = housing[\"median_house_value\"]\n",
    "tick_values = np.linspace(prices.min(), prices.max(), 11)\n",
    "# Build the colorbar from the scatter plot (the first collection of the axes).\n",
    "# A bare plt.colorbar() would describe the most recent image instead, i.e.\n",
    "# the map drawn by imshow(), whose scale is unrelated to the house prices.\n",
    "# Passing ticks=tick_values also guarantees exactly one tick per label below.\n",
    "cbar = plt.colorbar(ax.collections[0], ax=ax, ticks=tick_values)\n",
    "cbar.ax.set_yticklabels([\"$%dk\"%(round(v/1000)) for v in tick_values], fontsize=14)\n",
    "cbar.set_label('Median House Value', fontsize=16)\n",
    "\n",
    "plt.legend(fontsize=16)\n",
    "save_fig(\"california_housing_prices_plot\")\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "# Restrict the correlation to numeric columns: `housing` still holds the text\n",
    "# column ocean_proximity, and recent pandas versions raise an error when\n",
    "# corr() meets a non-numeric column instead of silently skipping it.\n",
    "corr_matrix = housing.select_dtypes(include=[np.number]).corr()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "corr_matrix[\"median_house_value\"].sort_values(ascending=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "housing.plot(kind=\"scatter\", x=\"median_income\", y=\"median_house_value\",\n",
    "             alpha=0.1)\n",
    "plt.axis([0, 16, 0, 550000])\n",
    "save_fig(\"income_vs_house_value_scatterplot\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "# scatter_matrix moved from pandas.tools.plotting to pandas.plotting and the\n",
    "# old module was later removed, so try the current location first.\n",
    "try:\n",
    "    from pandas.plotting import scatter_matrix\n",
    "except ImportError:\n",
    "    from pandas.tools.plotting import scatter_matrix\n",
    "\n",
    "# Pairwise scatter plots of the target and three selected attributes.\n",
    "attributes = [\"median_house_value\", \"median_income\", \"total_rooms\",\n",
    "              \"housing_median_age\"]\n",
    "scatter_matrix(housing[attributes], figsize=(12, 8))\n",
    "save_fig(\"scatter_matrix_plot\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "housing[\"rooms_per_household\"] = housing[\"total_rooms\"]/housing[\"households\"]\n",
    "housing[\"bedrooms_per_room\"] = housing[\"total_bedrooms\"]/housing[\"total_rooms\"]\n",
    "housing[\"population_per_household\"]=housing[\"population\"]/housing[\"households\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "# Numeric columns only: recent pandas versions raise an error when corr()\n",
    "# meets the text column ocean_proximity instead of silently skipping it.\n",
    "corr_matrix = housing.select_dtypes(include=[np.number]).corr()\n",
    "corr_matrix[\"median_house_value\"].sort_values(ascending=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "housing.plot(kind=\"scatter\", x=\"rooms_per_household\", y=\"median_house_value\",\n",
    "             alpha=0.2)\n",
    "plt.axis([0, 5, 0, 520000])\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "housing.describe()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "# Prepare the data for Machine Learning algorithms"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "housing = strat_train_set.drop(\"median_house_value\", axis=1)\n",
    "housing_labels = strat_train_set[\"median_house_value\"].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "housing.iloc[21:24]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "housing_copy = housing.copy().iloc[21:24]\n",
    "housing_copy.dropna(subset=[\"total_bedrooms\"])    # option 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "housing_copy = housing.copy().iloc[21:24]\n",
    "housing_copy.drop(\"total_bedrooms\", axis=1)       # option 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "# Take the slice first and copy it, so housing_copy is an independent frame.\n",
    "housing_copy = housing.iloc[21:24].copy()\n",
    "median = housing_copy[\"total_bedrooms\"].median()\n",
    "# option 3: assign the filled column back; fillna(..., inplace=True) on a\n",
    "# column selection is chained assignment and may leave housing_copy untouched.\n",
    "housing_copy[\"total_bedrooms\"] = housing_copy[\"total_bedrooms\"].fillna(median)\n",
    "housing_copy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "housing_copy.drop(\"total_bedrooms\", axis=1)       # option 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "median = housing_copy[\"total_bedrooms\"].median()\n",
    "# option 3: assign the filled column back; fillna(..., inplace=True) on a\n",
    "# column selection is chained assignment and may leave housing_copy untouched.\n",
    "housing_copy[\"total_bedrooms\"] = housing_copy[\"total_bedrooms\"].fillna(median)\n",
    "housing_copy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "# Summary...\n",
    "housing_copy = housing.copy().iloc[21:24]\n",
    "housing_copy.dropna(subset=[\"total_bedrooms\"])    # option 1\n",
    "\n",
    "housing_copy = housing.copy().iloc[21:24]\n",
    "housing_copy.drop(\"total_bedrooms\", axis=1)       # option 2\n",
    "\n",
    "# option 3: work on an independent copy of the slice and assign the filled\n",
    "# column back; fillna(..., inplace=True) on a column selection is chained\n",
    "# assignment and may leave housing_copy untouched.\n",
    "housing_copy = housing.iloc[21:24].copy()\n",
    "median = housing_copy[\"total_bedrooms\"].median()\n",
    "housing_copy[\"total_bedrooms\"] = housing_copy[\"total_bedrooms\"].fillna(median)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "# sklearn.preprocessing.Imputer was replaced by sklearn.impute.SimpleImputer\n",
    "# and later removed; alias the new class so the rest of the notebook can keep\n",
    "# using the name `Imputer` with either scikit-learn generation.\n",
    "try:\n",
    "    from sklearn.impute import SimpleImputer as Imputer\n",
    "except ImportError:\n",
    "    from sklearn.preprocessing import Imputer\n",
    "\n",
    "imputer = Imputer(strategy=\"median\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "housing_num = housing.drop(\"ocean_proximity\", axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "imputer.fit(housing_num)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "imputer.statistics_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "housing_num.median().values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "X = imputer.transform(housing_num)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "housing_tr = pd.DataFrame(X, columns=housing_num.columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "housing_tr.iloc[21:24]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "imputer.strategy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "housing_tr = pd.DataFrame(X, columns=housing_num.columns)\n",
    "housing_tr.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import LabelEncoder\n",
    "\n",
    "encoder = LabelEncoder()\n",
    "housing_cat = housing[\"ocean_proximity\"]\n",
    "housing_cat_encoded = encoder.fit_transform(housing_cat)\n",
    "housing_cat_encoded"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "print(encoder.classes_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import OneHotEncoder\n",
    "\n",
    "encoder = OneHotEncoder()\n",
    "housing_cat_1hot = encoder.fit_transform(housing_cat_encoded.reshape(-1,1))\n",
    "housing_cat_1hot"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "housing_cat_1hot.toarray()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import LabelBinarizer\n",
    "\n",
    "encoder = LabelBinarizer()\n",
    "housing_cat_1hot = encoder.fit_transform(housing_cat)\n",
    "housing_cat_1hot"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "from sklearn.base import BaseEstimator, TransformerMixin\n",
    "\n",
    "# positions, within a row of X, of the columns combined by the transformer\n",
    "rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6\n",
    "\n",
    "class CombinedAttributesAdder(BaseEstimator, TransformerMixin):\n",
    "    \"\"\"Transformer that appends ratio attributes as extra columns of X.\n",
    "\n",
    "    It always adds rooms per household and population per household, and\n",
    "    adds bedrooms per room as well when `add_bedrooms_per_room` is true.\n",
    "    \"\"\"\n",
    "    def __init__(self, add_bedrooms_per_room = True): # no *args or **kargs\n",
    "        self.add_bedrooms_per_room = add_bedrooms_per_room\n",
    "\n",
    "    def fit(self, X, y=None):\n",
    "        # stateless transformer: there is nothing to learn from the data\n",
    "        return self\n",
    "\n",
    "    def transform(self, X, y=None):\n",
    "        households = X[:, household_ix]\n",
    "        rooms = X[:, rooms_ix]\n",
    "        extra_columns = [rooms / households, X[:, population_ix] / households]\n",
    "        if self.add_bedrooms_per_room:\n",
    "            extra_columns.append(X[:, bedrooms_ix] / rooms)\n",
    "        # np.c_[X, col1, col2, ...]: stack the new columns to the right of X\n",
    "        return np.c_[tuple([X] + extra_columns)]\n",
    "\n",
    "attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)\n",
    "housing_extra_attribs = attr_adder.transform(housing.values)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "housing_extra_attribs = pd.DataFrame(housing_extra_attribs, columns=list(housing.columns)+[\"rooms_per_household\", \"population_per_household\"])\n",
    "housing_extra_attribs.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "\n",
    "num_pipeline = Pipeline([\n",
    "        ('imputer', Imputer(strategy=\"median\")),\n",
    "        ('attribs_adder', CombinedAttributesAdder()),\n",
    "        ('std_scaler', StandardScaler()),\n",
    "    ])\n",
    "\n",
    "housing_num_tr = num_pipeline.fit_transform(housing_num)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "housing_num_tr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "from sklearn.base import BaseEstimator, TransformerMixin\n",
    "\n",
    "class DataFrameSelector(BaseEstimator, TransformerMixin):\n",
    "    \"\"\"Select the configured columns of a DataFrame and return their values.\"\"\"\n",
    "    def __init__(self, attribute_names):\n",
    "        self.attribute_names = attribute_names\n",
    "\n",
    "    def fit(self, X, y=None):\n",
    "        # stateless transformer: there is nothing to learn from the data\n",
    "        return self\n",
    "\n",
    "    def transform(self, X):\n",
    "        selected = X[self.attribute_names]\n",
    "        return selected.values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "num_attribs = list(housing_num)\n",
    "cat_attribs = [\"ocean_proximity\"]\n",
    "\n",
    "class PipelineLabelBinarizer(BaseEstimator, TransformerMixin):\n",
    "    \"\"\"LabelBinarizer wrapper exposing the fit(X, y) / transform(X) API.\n",
    "\n",
    "    Pipeline passes both X and y to every step, whereas LabelBinarizer only\n",
    "    accepts a single array; using it directly as a step therefore fails\n",
    "    with recent scikit-learn versions.\n",
    "    \"\"\"\n",
    "    def __init__(self, sparse_output=False):\n",
    "        self.sparse_output = sparse_output\n",
    "    def fit(self, X, y=None):\n",
    "        # y is accepted for API compatibility and ignored\n",
    "        self.binarizer_ = LabelBinarizer(sparse_output=self.sparse_output)\n",
    "        self.binarizer_.fit(X)\n",
    "        return self\n",
    "    def transform(self, X, y=None):\n",
    "        return self.binarizer_.transform(X)\n",
    "\n",
    "# numeric columns: select -> impute medians -> add ratio attributes -> scale\n",
    "num_pipeline = Pipeline([\n",
    "        ('selector', DataFrameSelector(num_attribs)),\n",
    "        ('imputer', Imputer(strategy=\"median\")),\n",
    "        ('attribs_adder', CombinedAttributesAdder()),\n",
    "        ('std_scaler', StandardScaler()),\n",
    "    ])\n",
    "\n",
    "# categorical column: select -> one-hot encode\n",
    "cat_pipeline = Pipeline([\n",
    "        ('selector', DataFrameSelector(cat_attribs)),\n",
    "        ('label_binarizer', PipelineLabelBinarizer()),\n",
    "    ])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "from sklearn.pipeline import FeatureUnion\n",
    "\n",
    "full_pipeline = FeatureUnion(transformer_list=[\n",
    "        (\"num_pipeline\", num_pipeline),\n",
    "        (\"cat_pipeline\", cat_pipeline),\n",
    "    ])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "housing_prepared = full_pipeline.fit_transform(housing)\n",
    "housing_prepared"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "housing_prepared.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "# Select and train a model "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "from sklearn.linear_model import LinearRegression\n",
    "\n",
    "lin_reg = LinearRegression()\n",
    "lin_reg.fit(housing_prepared, housing_labels)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "# let's try the full pipeline on a few training instances\n",
    "some_data = housing.iloc[:5]\n",
    "some_labels = housing_labels.iloc[:5]\n",
    "some_data_prepared = full_pipeline.transform(some_data)\n",
    "\n",
    "print(\"Predictions:\", lin_reg.predict(some_data_prepared))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "print(\"Labels:\", list(some_labels))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "some_data_prepared"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "from sklearn.metrics import mean_squared_error\n",
    "\n",
    "housing_predictions = lin_reg.predict(housing_prepared)\n",
    "lin_mse = mean_squared_error(housing_labels, housing_predictions)\n",
    "lin_rmse = np.sqrt(lin_mse)\n",
    "lin_rmse"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "from sklearn.metrics import mean_absolute_error\n",
    "\n",
    "lin_mae = mean_absolute_error(housing_labels, housing_predictions)\n",
    "lin_mae"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "from sklearn.tree import DecisionTreeRegressor\n",
    "\n",
    "tree_reg = DecisionTreeRegressor(random_state=42)\n",
    "tree_reg.fit(housing_prepared, housing_labels)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "housing_predictions = tree_reg.predict(housing_prepared)\n",
    "tree_mse = mean_squared_error(housing_labels, housing_predictions)\n",
    "tree_rmse = np.sqrt(tree_mse)\n",
    "tree_rmse"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "# Fine-tune your model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "from sklearn.model_selection import cross_val_score\n",
    "\n",
    "scores = cross_val_score(tree_reg, housing_prepared, housing_labels,\n",
    "                         scoring=\"neg_mean_squared_error\", cv=10)\n",
    "tree_rmse_scores = np.sqrt(-scores)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "def display_scores(scores):\n",
    "    \"\"\"Print the given scores, then their mean and standard deviation.\n",
    "\n",
    "    `scores` must provide `mean()` and `std()` methods (e.g. a NumPy array).\n",
    "    \"\"\"\n",
    "    print(\"Scores:\", scores)\n",
    "    summary_stats = ((\"Mean:\", scores.mean), (\"Standard deviation:\", scores.std))\n",
    "    for label, compute_stat in summary_stats:\n",
    "        print(label, compute_stat())\n",
    "\n",
    "display_scores(tree_rmse_scores)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "lin_scores = cross_val_score(lin_reg, housing_prepared, housing_labels,\n",
    "                             scoring=\"neg_mean_squared_error\", cv=10)\n",
    "lin_rmse_scores = np.sqrt(-lin_scores)\n",
    "display_scores(lin_rmse_scores)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "from sklearn.ensemble import RandomForestRegressor\n",
    "\n",
    "forest_reg = RandomForestRegressor(random_state=42)\n",
    "forest_reg.fit(housing_prepared, housing_labels)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "housing_predictions = forest_reg.predict(housing_prepared)\n",
    "forest_mse = mean_squared_error(housing_labels, housing_predictions)\n",
    "forest_rmse = np.sqrt(forest_mse)\n",
    "forest_rmse"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "from sklearn.model_selection import cross_val_score\n",
    "\n",
    "forest_scores = cross_val_score(forest_reg, housing_prepared, housing_labels,\n",
    "                                scoring=\"neg_mean_squared_error\", cv=10)\n",
    "forest_rmse_scores = np.sqrt(-forest_scores)\n",
    "display_scores(forest_rmse_scores)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "scores = cross_val_score(lin_reg, housing_prepared, housing_labels, scoring=\"neg_mean_squared_error\", cv=10)\n",
    "pd.Series(np.sqrt(-scores)).describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "from sklearn.svm import SVR\n",
    "\n",
    "svm_reg = SVR(kernel=\"linear\")\n",
    "svm_reg.fit(housing_prepared, housing_labels)\n",
    "housing_predictions = svm_reg.predict(housing_prepared)\n",
    "svm_mse = mean_squared_error(housing_labels, housing_predictions)\n",
    "svm_rmse = np.sqrt(svm_mse)\n",
    "svm_rmse"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "from sklearn.model_selection import GridSearchCV\n",
    "\n",
    "param_grid = [\n",
    "    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},\n",
    "    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},\n",
    "  ]\n",
    "\n",
    "forest_reg = RandomForestRegressor(random_state=42)\n",
    "grid_search = GridSearchCV(forest_reg, param_grid, cv=5,\n",
    "                           scoring='neg_mean_squared_error')\n",
    "grid_search.fit(housing_prepared, housing_labels)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "grid_search.best_params_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "grid_search.best_estimator_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 92,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "cvres = grid_search.cv_results_\n",
    "for mean_score, params in zip(cvres[\"mean_test_score\"], cvres[\"params\"]):\n",
    "    print(np.sqrt(-mean_score), params)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 93,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "pd.DataFrame(grid_search.cv_results_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 94,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "from sklearn.model_selection import RandomizedSearchCV\n",
    "from scipy.stats import randint\n",
    "\n",
    "param_distribs = {\n",
    "        'n_estimators': randint(low=1, high=200),\n",
    "        'max_features': randint(low=1, high=8),\n",
    "    }\n",
    "\n",
    "forest_reg = RandomForestRegressor(random_state=42)\n",
    "rnd_search = RandomizedSearchCV(forest_reg, param_distributions=param_distribs,\n",
    "                                n_iter=10, cv=5, scoring='neg_mean_squared_error', random_state=42)\n",
    "rnd_search.fit(housing_prepared, housing_labels)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 95,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "cvres = rnd_search.cv_results_\n",
    "for mean_score, params in zip(cvres[\"mean_test_score\"], cvres[\"params\"]):\n",
    "    print(np.sqrt(-mean_score), params)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 96,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "feature_importances = grid_search.best_estimator_.feature_importances_\n",
    "feature_importances"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "extra_attribs = [\"rooms_per_hhold\", \"pop_per_hhold\", \"bedrooms_per_room\"]\n",
    "cat_one_hot_attribs = list(encoder.classes_)\n",
    "attributes = num_attribs + extra_attribs + cat_one_hot_attribs\n",
    "sorted(zip(feature_importances, attributes), reverse=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 98,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "final_model = grid_search.best_estimator_\n",
    "\n",
    "X_test = strat_test_set.drop(\"median_house_value\", axis=1)\n",
    "y_test = strat_test_set[\"median_house_value\"].copy()\n",
    "\n",
    "X_test_prepared = full_pipeline.transform(X_test)\n",
    "final_predictions = final_model.predict(X_test_prepared)\n",
    "\n",
    "final_mse = mean_squared_error(y_test, final_predictions)\n",
    "final_rmse = np.sqrt(final_mse)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 99,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "final_rmse"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "# Extra material"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "## Label Binarizer hack\n",
    "`LabelBinarizer`'s `fit_transform()` method only accepts one parameter, `y` (because it was meant for labels, not predictors). As a result, it does not work in a pipeline whose final estimator is a supervised estimator: in that case the pipeline's `fit()` method takes two parameters, `X` and `y`, and passes both of them to each transformer's `fit_transform()` method.\n",
    "\n",
    "This hack creates a supervision-friendly `LabelBinarizer`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 100,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "class SupervisionFriendlyLabelBinarizer(LabelBinarizer):\n",
    "    def fit_transform(self, X, y=None):\n",
    "        return super(SupervisionFriendlyLabelBinarizer, self).fit_transform(X)\n",
    "\n",
    "# Replace the Labelbinarizer with a SupervisionFriendlyLabelBinarizer\n",
    "cat_pipeline.steps[1] = (\"label_binarizer\", SupervisionFriendlyLabelBinarizer())\n",
    "\n",
    "# Now you can create a full pipeline with a supervised predictor at the end.\n",
    "full_pipeline_with_predictor = Pipeline([\n",
    "        (\"preparation\", full_pipeline),\n",
    "        (\"linear\", LinearRegression())\n",
    "    ])\n",
    "\n",
    "full_pipeline_with_predictor.fit(housing, housing_labels)\n",
    "full_pipeline_with_predictor.predict(some_data)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "## Model persistence using joblib"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 101,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "my_model = full_pipeline_with_predictor"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 102,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in 0.23,\n",
    "# so prefer the standalone `joblib` package and only fall back to the bundled copy\n",
    "# on older installations where `joblib` is not installed separately.\n",
    "try:\n",
    "    import joblib\n",
    "except ImportError:\n",
    "    from sklearn.externals import joblib\n",
    "\n",
    "# Serialize the fitted pipeline to disk, then load it back\n",
    "joblib.dump(my_model, \"my_model.pkl\") # DIFF\n",
    "#...\n",
    "my_model_loaded = joblib.load(\"my_model.pkl\") # DIFF"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "## Example SciPy distributions for `RandomizedSearchCV`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 103,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "from scipy.stats import geom, expon\n",
    "geom_distrib=geom(0.5).rvs(10000, random_state=42)\n",
    "expon_distrib=expon(scale=1).rvs(10000, random_state=42)\n",
    "plt.hist(geom_distrib, bins=50)\n",
    "plt.show()\n",
    "plt.hist(expon_distrib, bins=50)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "# Exercise solutions"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "## 1."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "source": [
    "Question: Try a Support Vector Machine regressor (`sklearn.svm.SVR`), with various hyperparameters such as `kernel=\"linear\"` (with various values for the `C` hyperparameter) or `kernel=\"rbf\"` (with various values for the `C` and `gamma` hyperparameters). Don't worry about what these hyperparameters mean for now. How does the best `SVR` predictor perform?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 104,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "from sklearn.model_selection import GridSearchCV\n",
    "\n",
    "param_grid = [\n",
    "        {'kernel': ['linear'], 'C': [10., 30., 100., 300., 1000., 3000., 10000., 30000.0]},\n",
    "        {'kernel': ['rbf'], 'C': [1.0, 3.0, 10., 30., 100., 300., 1000.0],\n",
    "         'gamma': [0.01, 0.03, 0.1, 0.3, 1.0, 3.0]},\n",
    "    ]\n",
    "\n",
    "svm_reg = SVR()\n",
    "grid_search = GridSearchCV(svm_reg, param_grid, cv=5, scoring='neg_mean_squared_error', verbose=2, n_jobs=4)\n",
    "grid_search.fit(housing_prepared, housing_labels)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "The best model achieves the following score (evaluated using 5-fold cross validation):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 105,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "negative_mse = grid_search.best_score_\n",
    "rmse = np.sqrt(-negative_mse)\n",
    "rmse"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "That's much worse than the `RandomForestRegressor`. Let's check the best hyperparameters found:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 106,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "grid_search.best_params_"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "The linear kernel seems better than the RBF kernel. Notice that the value of `C` is the maximum tested value. When this happens you definitely want to launch the grid search again with higher values for `C` (removing the smallest values), because it is likely that higher values of `C` will be better."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "## 2."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "Question: Try replacing `GridSearchCV` with `RandomizedSearchCV`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 107,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "from sklearn.model_selection import RandomizedSearchCV\n",
    "from scipy.stats import expon, reciprocal\n",
    "\n",
    "# see https://docs.scipy.org/doc/scipy-0.19.0/reference/stats.html\n",
    "# for `expon()` and `reciprocal()` documentation and more probability distribution functions.\n",
    "\n",
    "# Note: gamma is ignored when kernel is \"linear\"\n",
    "param_distribs = {\n",
    "        'kernel': ['linear', 'rbf'],\n",
    "        'C': reciprocal(20, 200000),\n",
    "        'gamma': expon(scale=1.0),\n",
    "    }\n",
    "\n",
    "svm_reg = SVR()\n",
    "rnd_search = RandomizedSearchCV(svm_reg, param_distributions=param_distribs,\n",
    "                                n_iter=50, cv=5, scoring='neg_mean_squared_error',\n",
    "                                verbose=2, n_jobs=4, random_state=42)\n",
    "rnd_search.fit(housing_prepared, housing_labels)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "The best model achieves the following score (evaluated using 5-fold cross validation):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 108,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "negative_mse = rnd_search.best_score_\n",
    "rmse = np.sqrt(-negative_mse)\n",
    "rmse"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "Now this is much closer to the performance of the `RandomForestRegressor` (but not quite there yet). Let's check the best hyperparameters found:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 109,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "rnd_search.best_params_"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "This time the search found a good set of hyperparameters for the RBF kernel. Randomized search tends to find better hyperparameters than grid search in the same amount of time."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "Let's look at the exponential distribution we used, with `scale=1.0`. Note that some samples are much larger or smaller than 1.0, but when you look at the log of the distribution, you can see that most values are actually concentrated roughly in the range of exp(-2) to exp(+2), which is about 0.1 to 7.4."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 110,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "expon_distrib = expon(scale=1.)\n",
    "samples = expon_distrib.rvs(10000, random_state=42)\n",
    "plt.figure(figsize=(10, 4))\n",
    "plt.subplot(121)\n",
    "plt.title(\"Exponential distribution (scale=1.0)\")\n",
    "plt.hist(samples, bins=50)\n",
    "plt.subplot(122)\n",
    "plt.title(\"Log of this distribution\")\n",
    "plt.hist(np.log(samples), bins=50)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "The distribution we used for `C` looks quite different: the scale of the samples is picked from a uniform distribution within a given range, which is why the right graph, which represents the log of the samples, looks roughly constant. This distribution is useful when you don't have a clue of what the target scale is:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 111,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "# Sample the same reciprocal distribution that was used for `C` in the randomized search\n",
    "reciprocal_distrib = reciprocal(20, 200000)\n",
    "samples = reciprocal_distrib.rvs(10000, random_state=42)\n",
    "plt.figure(figsize=(10, 4))\n",
    "plt.subplot(121)\n",
    "# The title reports the bounds actually passed to reciprocal(); there is no scale=1.0 here\n",
    "plt.title(\"Reciprocal distribution (a=20, b=200000)\")\n",
    "plt.hist(samples, bins=50)\n",
    "plt.subplot(122)\n",
    "plt.title(\"Log of this distribution\")\n",
    "plt.hist(np.log(samples), bins=50)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "The reciprocal distribution is useful when you have no idea what the scale of the hyperparameter should be (indeed, as you can see on the figure on the right, all scales are equally likely, within the given range), whereas the exponential distribution is best when you know (more or less) what the scale of the hyperparameter should be."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "## 3."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "Question: Try adding a transformer in the preparation pipeline to select only the most important attributes."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 112,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "from sklearn.base import BaseEstimator, TransformerMixin\n",
    "\n",
    "def indices_of_top_k(arr, k):\n",
    "    \"\"\"Return the indices of the `k` largest values of `arr`, sorted in ascending order.\n",
    "\n",
    "    `k=0` yields an empty index array.\n",
    "    \"\"\"\n",
    "    if k == 0:\n",
    "        # Without this guard, the `[-0:]` slice below would select *every* index instead of none\n",
    "        return np.array([], dtype=np.intp)\n",
    "    # argpartition moves the k largest values to the last k positions (in arbitrary order),\n",
    "    # so the resulting indices are sorted to keep the original column order\n",
    "    return np.sort(np.argpartition(np.array(arr), -k)[-k:])\n",
    "\n",
    "class TopFeatureSelector(BaseEstimator, TransformerMixin):\n",
    "    \"\"\"Transformer that keeps only the columns with the `k` largest feature importances.\n",
    "\n",
    "    `feature_importances` holds one precomputed importance per input column.\n",
    "    \"\"\"\n",
    "    def __init__(self, feature_importances, k):\n",
    "        self.feature_importances = feature_importances\n",
    "        self.k = k\n",
    "    def fit(self, X, y=None):\n",
    "        # Recomputed on every fit, so a new value of `k` takes effect the next time fit() is called\n",
    "        self.feature_indices_ = indices_of_top_k(self.feature_importances, self.k)\n",
    "        return self\n",
    "    def transform(self, X):\n",
    "        # Column selection only: X must support 2D indexing (e.g. a NumPy array)\n",
    "        return X[:, self.feature_indices_]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "Note: this feature selector assumes that you have already computed the feature importances somehow (for example using a `RandomForestRegressor`). You may be tempted to compute them directly in the `TopFeatureSelector`'s `fit()` method, however this would likely slow down grid/randomized search since the feature importances would have to be computed for every hyperparameter combination (unless you implement some sort of cache)."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "Let's define the number of top features we want to keep:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 113,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "k = 5"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "Now let's look for the indices of the top k features:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 114,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "top_k_feature_indices = indices_of_top_k(feature_importances, k)\n",
    "top_k_feature_indices"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 115,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "np.array(attributes)[top_k_feature_indices]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "Let's double check that these are indeed the top k features:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 116,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "sorted(zip(feature_importances, attributes), reverse=True)[:k]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "Looking good... Now let's create a new pipeline that runs the previously defined preparation pipeline, and adds top k feature selection:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 117,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "preparation_and_feature_selection_pipeline = Pipeline([\n",
    "    ('preparation', full_pipeline),\n",
    "    ('feature_selection', TopFeatureSelector(feature_importances, k))\n",
    "])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 118,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "housing_prepared_top_k_features = preparation_and_feature_selection_pipeline.fit_transform(housing)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "Let's look at the features of the first 3 instances:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 119,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "housing_prepared_top_k_features[0:3]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "Now let's double check that these are indeed the top k features:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 120,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "housing_prepared[0:3, top_k_feature_indices]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "Works great!  :)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "## 4."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "Question: Try creating a single pipeline that does the full data preparation plus the final prediction."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 121,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "prepare_select_and_predict_pipeline = Pipeline([\n",
    "    ('preparation', full_pipeline),\n",
    "    ('feature_selection', TopFeatureSelector(feature_importances, k)),\n",
    "    ('svm_reg', SVR(**rnd_search.best_params_))\n",
    "])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 122,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "prepare_select_and_predict_pipeline.fit(housing, housing_labels)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "Let's try the full pipeline on a few instances:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 123,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "some_data = housing.iloc[:4]\n",
    "some_labels = housing_labels.iloc[:4]\n",
    "\n",
    "print(\"Predictions:\\t\", prepare_select_and_predict_pipeline.predict(some_data))\n",
    "print(\"Labels:\\t\\t\", list(some_labels))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "Well, the full pipeline seems to work fine. Of course, the predictions are not fantastic: they would be better if we used the best `RandomForestRegressor` that we found earlier, rather than the best `SVR`."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "## 5."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "Question: Automatically explore some preparation options using `GridSearchCV`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 124,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "param_grid = [\n",
    "        {'preparation__num_pipeline__imputer__strategy': ['mean', 'median', 'most_frequent'],\n",
    "         'feature_selection__k': [3, 4, 5, 6, 7]}\n",
    "]\n",
    "\n",
    "grid_search_prep = GridSearchCV(prepare_select_and_predict_pipeline, param_grid, cv=5,\n",
    "                                scoring='neg_mean_squared_error', verbose=2, n_jobs=4)\n",
    "grid_search_prep.fit(housing, housing_labels)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 125,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "grid_search_prep.best_params_"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "Great! It seems that we had the right imputer strategy (mean), and apparently only the top 7 features are useful (out of 9); the last 2 seem to just add some noise."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 126,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "housing.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "Congratulations! You already know quite a lot about Machine Learning. :)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.3"
  },
  "nav_menu": {
   "height": "279px",
   "width": "309px"
  },
  "toc": {
   "navigate_menu": true,
   "number_sections": true,
   "sideBar": true,
   "threshold": 6,
   "toc_cell": false,
   "toc_section_display": "block",
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								{
 								 "cells": [
 								  {
 								   "cell_type": "markdown",
-												Upgrade notebooks to TensorFlow 1.0.0

											
										
										
											2017-02-17 11:51:26 +01:00
+								   "metadata": {
 								    "deletable": true,
 								    "editable": true
 								   },
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "source": [
-												Add notebooks for chapters 5 to 14

											
										
										
											2016-09-27 23:31:21 +02:00
+								    "**Chapter 2 – End-to-end Machine Learning project**\n",
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								    "\n",
-												Update chapters 1, 2 and 4

											
										
										
											2016-09-27 16:39:16 +02:00
+								    "*Welcome to Machine Learning Housing Corp.! Your task is to predict median house values in Californian districts, given a number of features from these districts.*\n",
 								    "\n",
 								    "*This notebook contains all the sample code and solutions to the exercices in chapter 2.*"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
-												Upgrade notebooks to TensorFlow 1.0.0

											
										
										
											2017-02-17 11:51:26 +01:00
+								   "metadata": {
 								    "deletable": true,
 								    "editable": true
 								   },
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "source": [
 								    "# Setup"
 								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
-												Upgrade notebooks to TensorFlow 1.0.0

											
										
										
											2017-02-17 11:51:26 +01:00
+								   "metadata": {
 								    "deletable": true,
 								    "editable": true
 								   },
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "source": [
-												Update chapters 1, 2 and 4

											
										
										
											2016-09-27 16:39:16 +02:00
+								    "First, let's make sure this notebook works well in both Python 2 and 3, import a few common modules, ensure Matplotlib plots figures inline and prepare a function to save the figures:"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
 								   "execution_count": 1,
 								   "metadata": {
-												Upgrade notebooks to TensorFlow 1.0.0

											
										
										
											2017-02-17 11:51:26 +01:00
+								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   },
 								   "outputs": [],
 								   "source": [
-												Update chapters 1, 2 and 4

											
										
										
											2016-09-27 16:39:16 +02:00
+								    "# To support both Python 2 and Python 3\n",
-												Add fundamentals and training_linear_models notebooks

											
										
										
											2016-05-22 16:01:18 +02:00
+								    "from __future__ import division, print_function, unicode_literals\n",
 								    "\n",
-												Update chapters 1, 2 and 4

											
										
										
											2016-09-27 16:39:16 +02:00
+								    "# Common imports\n",
 								    "import numpy as np\n",
-												Add import os

											
										
										
											2016-05-22 18:07:41 +02:00
+								    "import os\n",
 								    "\n",
-												Update chapters 1, 2 and 4

											
										
										
											2016-09-27 16:39:16 +02:00
+								    "# to make this notebook's output stable across runs\n",
-												Make notebook for ch2 output the same result every time

											
										
										
											2017-06-06 13:21:19 +02:00
+								    "np.random.seed(42)\n",
-												Update chapters 1, 2 and 4

											
										
										
											2016-09-27 16:39:16 +02:00
+								    "\n",
 								    "# To plot pretty figures\n",
-												Add fundamentals and training_linear_models notebooks

											
										
										
											2016-05-22 16:01:18 +02:00
+								    "%matplotlib inline\n",
-												Update chapters 1, 2 and 4

											
										
										
											2016-09-27 16:39:16 +02:00
+								    "import matplotlib\n",
-												Add fundamentals and training_linear_models notebooks

											
										
										
											2016-05-22 16:01:18 +02:00
+								    "import matplotlib.pyplot as plt\n",
 								    "plt.rcParams['axes.labelsize'] = 14\n",
 								    "plt.rcParams['xtick.labelsize'] = 12\n",
 								    "plt.rcParams['ytick.labelsize'] = 12\n",
 								    "\n",
-												Update chapters 1, 2 and 4

											
										
										
											2016-09-27 16:39:16 +02:00
+								    "# Where to save the figures\n",
-												Add fundamentals and training_linear_models notebooks

											
										
										
											2016-05-22 16:01:18 +02:00
+								    "PROJECT_ROOT_DIR = \".\"\n",
 								    "CHAPTER_ID = \"end_to_end_project\"\n",
 								    "\n",
-												Update chapters 1, 2 and 4

											
										
										
											2016-09-27 16:39:16 +02:00
+								    "def save_fig(fig_id, tight_layout=True):\n",
-												Add fundamentals and training_linear_models notebooks

											
										
										
											2016-05-22 16:01:18 +02:00
+								    "    path = os.path.join(PROJECT_ROOT_DIR, \"images\", CHAPTER_ID, fig_id + \".png\")\n",
 								    "    print(\"Saving figure\", fig_id)\n",
-												Update chapters 1, 2 and 4

											
										
										
											2016-09-27 16:39:16 +02:00
+								    "    if tight_layout:\n",
 								    "        plt.tight_layout()\n",
-												Add fundamentals and training_linear_models notebooks

											
										
										
											2016-05-22 16:01:18 +02:00
+								    "    plt.savefig(path, format='png', dpi=300)"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
-												Upgrade notebooks to TensorFlow 1.0.0

											
										
										
											2017-02-17 11:51:26 +01:00
+								   "metadata": {
 								    "deletable": true,
 								    "editable": true
 								   },
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "source": [
 								    "# Get the data"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
 								   "execution_count": 2,
 								   "metadata": {
-												Upgrade notebooks to TensorFlow 1.0.0

											
										
										
											2017-02-17 11:51:26 +01:00
+								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   },
 								   "outputs": [],
 								   "source": [
 								    "import os\n",
 								    "import tarfile\n",
-												Upgrade chapter 2 to sklearn 0.18 and ensure python 2 and python 3 both work

											
										
										
											2016-11-03 23:47:11 +01:00
+								    "from six.moves import urllib\n",
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								    "\n",
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								    "DOWNLOAD_ROOT = \"https://raw.githubusercontent.com/ageron/handson-ml/master/\"\n",
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								    "HOUSING_PATH = \"datasets/housing\"\n",
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								    "HOUSING_URL = DOWNLOAD_ROOT + HOUSING_PATH + \"/housing.tgz\"\n",
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								    "\n",
 								    "def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):\n",
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								    "    if not os.path.isdir(housing_path):\n",
-												Upgrade chapter 2 to sklearn 0.18 and ensure python 2 and python 3 both work

											
										
										
											2016-11-03 23:47:11 +01:00
+								    "        os.makedirs(housing_path)\n",
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								    "    tgz_path = os.path.join(housing_path, \"housing.tgz\")\n",
 								    "    urllib.request.urlretrieve(housing_url, tgz_path)\n",
 								    "    housing_tgz = tarfile.open(tgz_path)\n",
 								    "    housing_tgz.extractall(path=housing_path)\n",
 								    "    housing_tgz.close()"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   "execution_count": 3,
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "metadata": {
-												Upgrade notebooks to TensorFlow 1.0.0

											
										
										
											2017-02-17 11:51:26 +01:00
+								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   },
 								   "outputs": [],
 								   "source": [
 								    "fetch_housing_data()"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   "execution_count": 4,
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "metadata": {
-												Upgrade notebooks to TensorFlow 1.0.0

											
										
										
											2017-02-17 11:51:26 +01:00
+								    "collapsed": true,
 								    "deletable": true,
 								    "editable": true
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   },
 								   "outputs": [],
 								   "source": [
 								    "import pandas as pd\n",
 								    "\n",
 								    "def load_housing_data(housing_path=HOUSING_PATH):\n",
 								    "    csv_path = os.path.join(housing_path, \"housing.csv\")\n",
 								    "    return pd.read_csv(csv_path)"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   "execution_count": 5,
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "metadata": {
-												Upgrade notebooks to TensorFlow 1.0.0

											
										
										
											2017-02-17 11:51:26 +01:00
+								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   },
 								   "outputs": [],
 								   "source": [
 								    "housing = load_housing_data()\n",
 								    "housing.head()"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   "execution_count": 6,
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "metadata": {
-												Upgrade notebooks to TensorFlow 1.0.0

											
										
										
											2017-02-17 11:51:26 +01:00
+								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   },
 								   "outputs": [],
 								   "source": [
 								    "housing.info()"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   "execution_count": 7,
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "metadata": {
-												Upgrade notebooks to TensorFlow 1.0.0

											
										
										
											2017-02-17 11:51:26 +01:00
+								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   },
 								   "outputs": [],
 								   "source": [
 								    "housing[\"ocean_proximity\"].value_counts()"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   "execution_count": 8,
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "metadata": {
-												Upgrade notebooks to TensorFlow 1.0.0

											
										
										
											2017-02-17 11:51:26 +01:00
+								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   },
 								   "outputs": [],
 								   "source": [
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								    "housing.describe()"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   "execution_count": 9,
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "metadata": {
-												Upgrade notebooks to TensorFlow 1.0.0

											
										
										
											2017-02-17 11:51:26 +01:00
+								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   },
 								   "outputs": [],
 								   "source": [
 								    "%matplotlib inline\n",
 								    "import matplotlib.pyplot as plt\n",
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								    "housing.hist(bins=50, figsize=(20,15))\n",
-												Add fundamentals and training_linear_models notebooks

											
										
										
											2016-05-22 16:01:18 +02:00
+								    "save_fig(\"attribute_histogram_plots\")\n",
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								    "plt.show()"
 								   ]
 								  },
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								  {
 								   "cell_type": "code",
 								   "execution_count": 10,
 								   "metadata": {
 								    "collapsed": true,
 								    "deletable": true,
 								    "editable": true
 								   },
 								   "outputs": [],
 								   "source": [
 								    "# to make this notebook's output identical at every run\n",
 								    "np.random.seed(42)"
 								   ]
 								  },
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								  {
 								   "cell_type": "code",
 								   "execution_count": 11,
 								   "metadata": {
-												Upgrade notebooks to TensorFlow 1.0.0

											
										
										
											2017-02-17 11:51:26 +01:00
+								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   },
 								   "outputs": [],
 								   "source": [
 								    "import numpy as np\n",
 								    "\n",
 								    "def split_train_test(data, test_ratio):\n",
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								    "    shuffled_indices = np.random.permutation(len(data))\n",
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								    "    test_set_size = int(len(data) * test_ratio)\n",
 								    "    test_indices = shuffled_indices[:test_set_size]\n",
 								    "    train_indices = shuffled_indices[test_set_size:]\n",
 								    "    return data.iloc[train_indices], data.iloc[test_indices]"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
 								   "execution_count": 12,
 								   "metadata": {
-												Upgrade notebooks to TensorFlow 1.0.0

											
										
										
											2017-02-17 11:51:26 +01:00
+								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   },
 								   "outputs": [],
 								   "source": [
 								    "train_set, test_set = split_train_test(housing, 0.2)\n",
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								    "print(len(train_set), \"train +\", len(test_set), \"test\")"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
 								   "execution_count": 13,
 								   "metadata": {
-												Upgrade notebooks to TensorFlow 1.0.0

											
										
										
											2017-02-17 11:51:26 +01:00
+								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   },
 								   "outputs": [],
 								   "source": [
 								    "import hashlib\n",
 								    "\n",
 								    "def test_set_check(identifier, test_ratio, hash):\n",
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								    "    return hash(np.int64(identifier)).digest()[-1] < 256 * test_ratio\n",
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								    "\n",
 								    "def split_train_test_by_id(data, test_ratio, id_column, hash=hashlib.md5):\n",
 								    "    ids = data[id_column]\n",
 								    "    in_test_set = ids.apply(lambda id_: test_set_check(id_, test_ratio, hash))\n",
 								    "    return data.loc[~in_test_set], data.loc[in_test_set]"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
 								   "execution_count": 14,
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   "metadata": {
 								    "collapsed": true,
 								    "deletable": true,
 								    "editable": true
 								   },
 								   "outputs": [],
 								   "source": [
 								    "# This version supports both Python 2 and Python 3, instead of just Python 3.\n",
 								    "def test_set_check(identifier, test_ratio, hash):\n",
 								    "    return bytearray(hash(np.int64(identifier)).digest())[-1] < 256 * test_ratio"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
 								   "execution_count": 15,
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "metadata": {
-												Upgrade notebooks to TensorFlow 1.0.0

											
										
										
											2017-02-17 11:51:26 +01:00
+								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   },
 								   "outputs": [],
 								   "source": [
 								    "housing_with_id = housing.reset_index()   # adds an `index` column\n",
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								    "train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, \"index\")"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
 								   "execution_count": 16,
 								   "metadata": {
 								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
 								   },
 								   "outputs": [],
 								   "source": [
 								    "housing_with_id[\"id\"] = housing[\"longitude\"] * 1000 + housing[\"latitude\"]\n",
 								    "train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, \"id\")"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
 								   "execution_count": 17,
 								   "metadata": {
 								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
 								   },
 								   "outputs": [],
 								   "source": [
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								    "test_set.head()"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   "execution_count": 18,
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "metadata": {
-												Upgrade notebooks to TensorFlow 1.0.0

											
										
										
											2017-02-17 11:51:26 +01:00
+								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   },
 								   "outputs": [],
 								   "source": [
-												Upgrade chapter 2 to sklearn 0.18 and ensure python 2 and python 3 both work

											
										
										
											2016-11-03 23:47:11 +01:00
+								    "from sklearn.model_selection import train_test_split\n",
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								    "\n",
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								    "train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
 								   "execution_count": 19,
 								   "metadata": {
 								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
 								   },
 								   "outputs": [],
 								   "source": [
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								    "test_set.head()"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   "execution_count": 20,
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "metadata": {
-												Upgrade notebooks to TensorFlow 1.0.0

											
										
										
											2017-02-17 11:51:26 +01:00
+								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   },
 								   "outputs": [],
 								   "source": [
 								    "housing[\"median_income\"].hist()"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   "execution_count": 21,
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "metadata": {
-												Upgrade notebooks to TensorFlow 1.0.0

											
										
										
											2017-02-17 11:51:26 +01:00
+								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   },
 								   "outputs": [],
 								   "source": [
 								    "housing[\"income_cat\"] = np.ceil(housing[\"median_income\"] / 1.5)\n",
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								    "housing[\"income_cat\"].where(housing[\"income_cat\"] < 5, 5.0, inplace=True)"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
 								   "execution_count": 22,
 								   "metadata": {
 								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
 								   },
 								   "outputs": [],
 								   "source": [
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								    "housing[\"income_cat\"].value_counts()"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   "execution_count": 23,
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "metadata": {
-												Upgrade notebooks to TensorFlow 1.0.0

											
										
										
											2017-02-17 11:51:26 +01:00
+								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   },
 								   "outputs": [],
 								   "source": [
-												Upgrade chapter 2 to sklearn 0.18 and ensure python 2 and python 3 both work

											
										
										
											2016-11-03 23:47:11 +01:00
+								    "from sklearn.model_selection import StratifiedShuffleSplit\n",
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								    "\n",
-												Upgrade chapter 2 to sklearn 0.18 and ensure python 2 and python 3 both work

											
										
										
											2016-11-03 23:47:11 +01:00
+								    "split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)\n",
 								    "for train_index, test_index in split.split(housing, housing[\"income_cat\"]):\n",
 								    "    strat_train_set = housing.loc[train_index]\n",
 								    "    strat_test_set = housing.loc[test_index]"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   "execution_count": 24,
 								   "metadata": {
 								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
 								   },
 								   "outputs": [],
 								   "source": [
 								    "housing[\"income_cat\"].value_counts() / len(housing)"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
 								   "execution_count": 25,
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "metadata": {
-												Upgrade notebooks to TensorFlow 1.0.0

											
										
										
											2017-02-17 11:51:26 +01:00
+								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   },
 								   "outputs": [],
 								   "source": [
 								    "def income_cat_proportions(data):\n",
 								    "    return data[\"income_cat\"].value_counts() / len(data)\n",
 								    "\n",
 								    "train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)\n",
 								    "\n",
 								    "compare_props = pd.DataFrame({\n",
 								    "    \"Overall\": income_cat_proportions(housing),\n",
 								    "    \"Stratified\": income_cat_proportions(strat_test_set),\n",
 								    "    \"Random\": income_cat_proportions(test_set),\n",
 								    "}).sort_index()\n",
 								    "compare_props[\"Rand. %error\"] = 100 * compare_props[\"Random\"] / compare_props[\"Overall\"] - 100\n",
 								    "compare_props[\"Strat. %error\"] = 100 * compare_props[\"Stratified\"] / compare_props[\"Overall\"] - 100"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   "execution_count": 26,
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "metadata": {
-												Upgrade notebooks to TensorFlow 1.0.0

											
										
										
											2017-02-17 11:51:26 +01:00
+								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   },
 								   "outputs": [],
 								   "source": [
 								    "compare_props"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   "execution_count": 27,
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "metadata": {
-												Upgrade notebooks to TensorFlow 1.0.0

											
										
										
											2017-02-17 11:51:26 +01:00
+								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   },
 								   "outputs": [],
 								   "source": [
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								    "for set_ in (strat_train_set, strat_test_set):\n",
 								    "    set_.drop(\"income_cat\", axis=1, inplace=True)"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
-												Upgrade notebooks to TensorFlow 1.0.0

											
										
										
											2017-02-17 11:51:26 +01:00
+								   "metadata": {
 								    "deletable": true,
 								    "editable": true
 								   },
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "source": [
 								    "# Discover and visualize the data to gain insights"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   "execution_count": 28,
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "metadata": {
-												Upgrade notebooks to TensorFlow 1.0.0

											
										
										
											2017-02-17 11:51:26 +01:00
+								    "collapsed": true,
 								    "deletable": true,
 								    "editable": true
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   },
 								   "outputs": [],
 								   "source": [
 								    "housing = strat_train_set.copy()"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   "execution_count": 29,
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "metadata": {
-												Upgrade notebooks to TensorFlow 1.0.0

											
										
										
											2017-02-17 11:51:26 +01:00
+								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   },
 								   "outputs": [],
 								   "source": [
-												Add fundamentals and training_linear_models notebooks

											
										
										
											2016-05-22 16:01:18 +02:00
+								    "housing.plot(kind=\"scatter\", x=\"longitude\", y=\"latitude\")\n",
-												Update chapters 1, 2 and 4

											
										
										
											2016-09-27 16:39:16 +02:00
+								    "save_fig(\"bad_visualization_plot\")"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   "execution_count": 30,
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "metadata": {
-												Upgrade notebooks to TensorFlow 1.0.0

											
										
										
											2017-02-17 11:51:26 +01:00
+								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   },
 								   "outputs": [],
 								   "source": [
-												Add fundamentals and training_linear_models notebooks

											
										
										
											2016-05-22 16:01:18 +02:00
+								    "housing.plot(kind=\"scatter\", x=\"longitude\", y=\"latitude\", alpha=0.1)\n",
-												Update chapters 1, 2 and 4

											
										
										
											2016-09-27 16:39:16 +02:00
+								    "save_fig(\"better_visualization_plot\")"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   "execution_count": 31,
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "metadata": {
-												Add fundamentals and training_linear_models notebooks

											
										
										
											2016-05-22 16:01:18 +02:00
+								    "collapsed": false,
-												Upgrade notebooks to TensorFlow 1.0.0

											
										
										
											2017-02-17 11:51:26 +01:00
+								    "deletable": true,
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								    "editable": true
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   },
 								   "outputs": [],
 								   "source": [
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								    "housing.plot(kind=\"scatter\", x=\"longitude\", y=\"latitude\", alpha=0.4,\n",
 								    "    s=housing[\"population\"]/100, label=\"population\", figsize=(10,7),\n",
 								    "    c=\"median_house_value\", cmap=plt.get_cmap(\"jet\"), colorbar=True,\n",
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								    ")\n",
-												Add fundamentals and training_linear_models notebooks

											
										
										
											2016-05-22 16:01:18 +02:00
+								    "plt.legend()\n",
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								    "save_fig(\"housing_prices_scatterplot\")"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   "execution_count": 32,
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "metadata": {
-												Upgrade notebooks to TensorFlow 1.0.0

											
										
										
											2017-02-17 11:51:26 +01:00
+								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   },
 								   "outputs": [],
-												Add fundamentals and training_linear_models notebooks

											
										
										
											2016-05-22 16:01:18 +02:00
+								   "source": [
 								    "import matplotlib.image as mpimg\n",
 								    "# Build the image path from the same pieces save_fig() uses, so it follows\n",
 								    "# CHAPTER_ID and the platform's path separator instead of a hard-coded string.\n",
 								    "california_img = mpimg.imread(os.path.join(PROJECT_ROOT_DIR, \"images\", CHAPTER_ID, \"california.png\"))\n",
 								    "ax = housing.plot(kind=\"scatter\", x=\"longitude\", y=\"latitude\", figsize=(10,7),\n",
 								    "                       s=housing['population']/100, label=\"Population\",\n",
 								    "                       c=\"median_house_value\", cmap=plt.get_cmap(\"jet\"),\n",
 								    "                       colorbar=False, alpha=0.4,\n",
 								    "                      )\n",
 								    "# Draw the map picture behind the points; extent is [lon_min, lon_max, lat_min, lat_max].\n",
 								    "plt.imshow(california_img, extent=[-124.55, -113.80, 32.45, 42.05], alpha=0.5)\n",
 								    "plt.ylabel(\"Latitude\", fontsize=14)\n",
 								    "plt.xlabel(\"Longitude\", fontsize=14)\n",
 								    "\n",
 								    "prices = housing[\"median_house_value\"]\n",
 								    "tick_values = np.linspace(prices.min(), prices.max(), 11)\n",
 								    "# Pin exactly one tick per label: without explicit ticks the 11 labels below are\n",
 								    "# applied to whatever default tick positions the colorbar picked.\n",
 								    "# NOTE(review): plt.colorbar() presumably attaches to the imshow() image (0-1 scale),\n",
 								    "# hence tick positions are given as fractions of the maximum price - confirm.\n",
 								    "cbar = plt.colorbar(ticks=tick_values/prices.max())\n",
 								    "cbar.ax.set_yticklabels([\"$%dk\"%(round(v/1000)) for v in tick_values], fontsize=14)\n",
 								    "cbar.set_label('Median House Value', fontsize=16)\n",
 								    "\n",
 								    "plt.legend(fontsize=16)\n",
-												Update chapters 1, 2 and 4

											
										
										
											2016-09-27 16:39:16 +02:00
+								    "save_fig(\"california_housing_prices_plot\")\n",
-												Add fundamentals and training_linear_models notebooks

											
										
										
											2016-05-22 16:01:18 +02:00
+								    "plt.show()"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   "execution_count": 33,
 								   "metadata": {
 								    "collapsed": true,
 								    "deletable": true,
 								    "editable": true
 								   },
 								   "outputs": [],
 								   "source": [
 								    "# Keep only numeric columns: housing still contains the text column ocean_proximity,\n",
 								    "# which older pandas dropped silently in corr() but recent pandas refuses to handle.\n",
 								    "corr_matrix = housing.select_dtypes(include=[np.number]).corr()"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
 								   "execution_count": 34,
-												Add fundamentals and training_linear_models notebooks

											
										
										
											2016-05-22 16:01:18 +02:00
+								   "metadata": {
-												Upgrade notebooks to TensorFlow 1.0.0

											
										
										
											2017-02-17 11:51:26 +01:00
+								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
-												Add fundamentals and training_linear_models notebooks

											
										
										
											2016-05-22 16:01:18 +02:00
+								   },
 								   "outputs": [],
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "source": [
 								    "corr_matrix[\"median_house_value\"].sort_values(ascending=False)"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   "execution_count": 35,
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "metadata": {
-												Upgrade notebooks to TensorFlow 1.0.0

											
										
										
											2017-02-17 11:51:26 +01:00
+								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   },
 								   "outputs": [],
 								   "source": [
 								    "housing.plot(kind=\"scatter\", x=\"median_income\", y=\"median_house_value\",\n",
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								    "             alpha=0.1)\n",
-												Add fundamentals and training_linear_models notebooks

											
										
										
											2016-05-22 16:01:18 +02:00
+								    "plt.axis([0, 16, 0, 550000])\n",
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								    "save_fig(\"income_vs_house_value_scatterplot\")"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   "execution_count": 36,
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "metadata": {
-												Upgrade notebooks to TensorFlow 1.0.0

											
										
										
											2017-02-17 11:51:26 +01:00
+								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   },
 								   "outputs": [],
 								   "source": [
 								    "# scatter_matrix moved to pandas.plotting (pandas 0.20); fall back for older versions.\n",
 								    "try:\n",
 								    "    from pandas.plotting import scatter_matrix\n",
 								    "except ImportError:\n",
 								    "    from pandas.tools.plotting import scatter_matrix\n",
 								    "\n",
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								    "attributes = [\"median_house_value\", \"median_income\", \"total_rooms\",\n",
 								    "              \"housing_median_age\"]\n",
 								    "scatter_matrix(housing[attributes], figsize=(12, 8))\n",
 								    "save_fig(\"scatter_matrix_plot\")"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   "execution_count": 37,
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "metadata": {
-												Upgrade notebooks to TensorFlow 1.0.0

											
										
										
											2017-02-17 11:51:26 +01:00
+								    "collapsed": true,
 								    "deletable": true,
 								    "editable": true
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   },
 								   "outputs": [],
 								   "source": [
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								    "housing[\"rooms_per_household\"] = housing[\"total_rooms\"]/housing[\"households\"]\n",
 								    "housing[\"bedrooms_per_room\"] = housing[\"total_bedrooms\"]/housing[\"total_rooms\"]\n",
 								    "housing[\"population_per_household\"]=housing[\"population\"]/housing[\"households\"]"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   "execution_count": 38,
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "metadata": {
-												Upgrade notebooks to TensorFlow 1.0.0

											
										
										
											2017-02-17 11:51:26 +01:00
+								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   },
 								   "outputs": [],
 								   "source": [
 								    "# Keep only numeric columns: housing still contains the text column ocean_proximity,\n",
 								    "# which older pandas dropped silently in corr() but recent pandas refuses to handle.\n",
 								    "corr_matrix = housing.select_dtypes(include=[np.number]).corr()\n",
 								    "corr_matrix[\"median_house_value\"].sort_values(ascending=False)"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   "execution_count": 39,
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "metadata": {
-												Upgrade notebooks to TensorFlow 1.0.0

											
										
										
											2017-02-17 11:51:26 +01:00
+								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   },
 								   "outputs": [],
 								   "source": [
 								    "housing.plot(kind=\"scatter\", x=\"rooms_per_household\", y=\"median_house_value\",\n",
 								    "             alpha=0.2)\n",
 								    "plt.axis([0, 5, 0, 520000])\n",
 								    "plt.show()"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   "execution_count": 40,
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "metadata": {
-												Upgrade notebooks to TensorFlow 1.0.0

											
										
										
											2017-02-17 11:51:26 +01:00
+								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   },
 								   "outputs": [],
 								   "source": [
 								    "housing.describe()"
 								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
-												Upgrade notebooks to TensorFlow 1.0.0

											
										
										
											2017-02-17 11:51:26 +01:00
+								   "metadata": {
 								    "deletable": true,
 								    "editable": true
 								   },
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "source": [
 								    "# Prepare the data for Machine Learning algorithms"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   "execution_count": 41,
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "metadata": {
-												Upgrade notebooks to TensorFlow 1.0.0

											
										
										
											2017-02-17 11:51:26 +01:00
+								    "collapsed": true,
 								    "deletable": true,
 								    "editable": true
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   },
 								   "outputs": [],
 								   "source": [
 								    "housing = strat_train_set.drop(\"median_house_value\", axis=1)\n",
 								    "housing_labels = strat_train_set[\"median_house_value\"].copy()"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   "execution_count": 42,
 								   "metadata": {
 								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
 								   },
 								   "outputs": [],
 								   "source": [
 								    "housing.iloc[21:24]"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
 								   "execution_count": 43,
 								   "metadata": {
 								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
 								   },
 								   "outputs": [],
 								   "source": [
 								    "housing_copy = housing.copy().iloc[21:24]\n",
 								    "housing_copy.dropna(subset=[\"total_bedrooms\"])    # option 1"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
 								   "execution_count": 44,
 								   "metadata": {
 								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
 								   },
 								   "outputs": [],
 								   "source": [
 								    "housing_copy = housing.copy().iloc[21:24]\n",
 								    "housing_copy.drop(\"total_bedrooms\", axis=1)       # option 2"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
 								   "execution_count": 45,
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "metadata": {
-												Upgrade notebooks to TensorFlow 1.0.0

											
										
										
											2017-02-17 11:51:26 +01:00
+								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   },
 								   "outputs": [],
 								   "source": [
 								    "housing_copy = housing.copy().iloc[21:24]\n",
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								    "median = housing_copy[\"total_bedrooms\"].median()\n",
 								    "# Rebind instead of an in-place fillna on a selected column (chained assignment).\n",
 								    "housing_copy = housing_copy.fillna({\"total_bedrooms\": median}) # option 3\n",
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								    "housing_copy"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   "execution_count": 46,
 								   "metadata": {
 								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
 								   },
 								   "outputs": [],
 								   "source": [
 								    "housing_copy.drop(\"total_bedrooms\", axis=1)       # option 2"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
 								   "execution_count": 47,
 								   "metadata": {
 								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
 								   },
 								   "outputs": [],
 								   "source": [
 								    "median = housing_copy[\"total_bedrooms\"].median()\n",
 								    "# Rebind instead of an in-place fillna on a selected column (chained assignment).\n",
 								    "housing_copy = housing_copy.fillna({\"total_bedrooms\": median}) # option 3\n",
 								    "housing_copy"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
 								   "execution_count": 48,
 								   "metadata": {
 								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
 								   },
 								   "outputs": [],
 								   "source": [
 								    "# Summary...\n",
 								    "# Slice first, then copy: only the three sample rows are duplicated and\n",
 								    "# housing_copy is an independent frame rather than a slice of a full copy.\n",
 								    "housing_copy = housing.iloc[21:24].copy()\n",
 								    "housing_copy.dropna(subset=[\"total_bedrooms\"])    # option 1\n",
 								    "\n",
 								    "housing_copy = housing.iloc[21:24].copy()\n",
 								    "housing_copy.drop(\"total_bedrooms\", axis=1)       # option 2\n",
 								    "\n",
 								    "housing_copy = housing.iloc[21:24].copy()\n",
 								    "median = housing_copy[\"total_bedrooms\"].median()\n",
 								    "# Rebind instead of an in-place fillna on a selected column (chained assignment).\n",
 								    "housing_copy = housing_copy.fillna({\"total_bedrooms\": median}) # option 3"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
 								   "execution_count": 49,
 								   "metadata": {
 								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
 								   },
 								   "outputs": [],
 								   "source": [
 								    "# sklearn.preprocessing.Imputer was superseded by sklearn.impute.SimpleImputer\n",
 								    "# (Scikit-Learn 0.20) and later removed. Import under the name Imputer either way\n",
 								    "# so any code that refers to Imputer keeps working.\n",
 								    "try:\n",
 								    "    from sklearn.impute import SimpleImputer as Imputer\n",
 								    "except ImportError:\n",
 								    "    from sklearn.preprocessing import Imputer\n",
 								    "\n",
 								    "imputer = Imputer(strategy=\"median\")"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
 								   "execution_count": 50,
 								   "metadata": {
 								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
 								   },
 								   "outputs": [],
 								   "source": [
 								    "housing_num = housing.drop(\"ocean_proximity\", axis=1)"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
 								   "execution_count": 51,
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "metadata": {
-												Upgrade notebooks to TensorFlow 1.0.0

											
										
										
											2017-02-17 11:51:26 +01:00
+								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   },
 								   "outputs": [],
 								   "source": [
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								    "imputer.fit(housing_num)"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   "execution_count": 52,
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "metadata": {
-												Upgrade notebooks to TensorFlow 1.0.0

											
										
										
											2017-02-17 11:51:26 +01:00
+								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   },
 								   "outputs": [],
 								   "source": [
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								    "imputer.statistics_"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   "execution_count": 53,
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "metadata": {
-												Upgrade notebooks to TensorFlow 1.0.0

											
										
										
											2017-02-17 11:51:26 +01:00
+								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   },
 								   "outputs": [],
 								   "source": [
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								    "housing_num.median().values"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   "execution_count": 54,
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "metadata": {
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								    "collapsed": true,
-												Upgrade notebooks to TensorFlow 1.0.0

											
										
										
											2017-02-17 11:51:26 +01:00
+								    "deletable": true,
 								    "editable": true
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   },
 								   "outputs": [],
 								   "source": [
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								    "X = imputer.transform(housing_num)"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   "execution_count": 55,
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "metadata": {
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								    "collapsed": true,
-												Upgrade notebooks to TensorFlow 1.0.0

											
										
										
											2017-02-17 11:51:26 +01:00
+								    "deletable": true,
 								    "editable": true
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   },
 								   "outputs": [],
 								   "source": [
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								    "housing_tr = pd.DataFrame(X, columns=housing_num.columns)"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   "execution_count": 56,
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "metadata": {
-												Upgrade notebooks to TensorFlow 1.0.0

											
										
										
											2017-02-17 11:51:26 +01:00
+								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   },
 								   "outputs": [],
 								   "source": [
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								    "housing_tr.iloc[21:24]"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   "execution_count": 57,
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "metadata": {
-												Upgrade notebooks to TensorFlow 1.0.0

											
										
										
											2017-02-17 11:51:26 +01:00
+								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   },
 								   "outputs": [],
 								   "source": [
 								    "imputer.strategy"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   "execution_count": 58,
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "metadata": {
-												Upgrade notebooks to TensorFlow 1.0.0

											
										
										
											2017-02-17 11:51:26 +01:00
+								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   },
 								   "outputs": [],
 								   "source": [
 								    "housing_tr = pd.DataFrame(X, columns=housing_num.columns)\n",
 								    "housing_tr.head()"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   "execution_count": 59,
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "metadata": {
-												Upgrade notebooks to TensorFlow 1.0.0

											
										
										
											2017-02-17 11:51:26 +01:00
+								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   },
 								   "outputs": [],
 								   "source": [
 								    "from sklearn.preprocessing import LabelEncoder\n",
 								    "\n",
 								    "encoder = LabelEncoder()\n",
 								    "housing_cat = housing[\"ocean_proximity\"]\n",
 								    "housing_cat_encoded = encoder.fit_transform(housing_cat)\n",
 								    "housing_cat_encoded"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   "execution_count": 60,
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "metadata": {
-												Upgrade notebooks to TensorFlow 1.0.0

											
										
										
											2017-02-17 11:51:26 +01:00
+								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   },
 								   "outputs": [],
 								   "source": [
 								    "print(encoder.classes_)"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   "execution_count": 61,
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "metadata": {
-												Upgrade notebooks to TensorFlow 1.0.0

											
										
										
											2017-02-17 11:51:26 +01:00
+								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   },
 								   "outputs": [],
 								   "source": [
 								    "from sklearn.preprocessing import OneHotEncoder\n",
 								    "\n",
 								    "encoder = OneHotEncoder()\n",
 								    "housing_cat_1hot = encoder.fit_transform(housing_cat_encoded.reshape(-1,1))\n",
 								    "housing_cat_1hot"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   "execution_count": 62,
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "metadata": {
-												Upgrade notebooks to TensorFlow 1.0.0

											
										
										
											2017-02-17 11:51:26 +01:00
+								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   },
 								   "outputs": [],
 								   "source": [
 								    "housing_cat_1hot.toarray()"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   "execution_count": 63,
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "metadata": {
-												Upgrade notebooks to TensorFlow 1.0.0

											
										
										
											2017-02-17 11:51:26 +01:00
+								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   },
 								   "outputs": [],
 								   "source": [
 								    "from sklearn.preprocessing import LabelBinarizer\n",
 								    "\n",
 								    "encoder = LabelBinarizer()\n",
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								    "housing_cat_1hot = encoder.fit_transform(housing_cat)\n",
 								    "housing_cat_1hot"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   "execution_count": 64,
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "metadata": {
-												Upgrade notebooks to TensorFlow 1.0.0

											
										
										
											2017-02-17 11:51:26 +01:00
+								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   },
 								   "outputs": [],
 								   "source": [
 								    "from sklearn.base import BaseEstimator, TransformerMixin\n",
 								    "\n",
 								    "rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6\n",
 								    "\n",
 								    "class CombinedAttributesAdder(BaseEstimator, TransformerMixin):\n",
 								    "    def __init__(self, add_bedrooms_per_room = True): # no *args or **kargs\n",
 								    "        self.add_bedrooms_per_room = add_bedrooms_per_room\n",
 								    "    def fit(self, X, y=None):\n",
 								    "        return self  # nothing else to do\n",
 								    "    def transform(self, X, y=None):\n",
 								    "        rooms_per_household = X[:, rooms_ix] / X[:, household_ix]\n",
 								    "        population_per_household = X[:, population_ix] / X[:, household_ix]\n",
 								    "        if self.add_bedrooms_per_room:\n",
 								    "            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]\n",
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								    "            return np.c_[X, rooms_per_household, population_per_household,\n",
 								    "                         bedrooms_per_room]\n",
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								    "        else:\n",
 								    "            return np.c_[X, rooms_per_household, population_per_household]\n",
 								    "\n",
 								    "attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)\n",
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								    "housing_extra_attribs = attr_adder.transform(housing.values)"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
 								   "execution_count": 65,
 								   "metadata": {
 								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
 								   },
 								   "outputs": [],
 								   "source": [
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								    "housing_extra_attribs = pd.DataFrame(housing_extra_attribs, columns=list(housing.columns)+[\"rooms_per_household\", \"population_per_household\"])\n",
 								    "housing_extra_attribs.head()"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   "execution_count": 66,
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "metadata": {
-												Upgrade notebooks to TensorFlow 1.0.0

											
										
										
											2017-02-17 11:51:26 +01:00
+								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   },
 								   "outputs": [],
 								   "source": [
 								    "from sklearn.pipeline import Pipeline\n",
 								    "from sklearn.preprocessing import StandardScaler\n",
 								    "\n",
 								    "num_pipeline = Pipeline([\n",
 								    "        ('imputer', Imputer(strategy=\"median\")),\n",
 								    "        ('attribs_adder', CombinedAttributesAdder()),\n",
 								    "        ('std_scaler', StandardScaler()),\n",
 								    "    ])\n",
 								    "\n",
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								    "housing_num_tr = num_pipeline.fit_transform(housing_num)"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   "execution_count": 67,
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "metadata": {
-												Upgrade notebooks to TensorFlow 1.0.0

											
										
										
											2017-02-17 11:51:26 +01:00
+								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   },
 								   "outputs": [],
 								   "source": [
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								    "housing_num_tr"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
 								   "execution_count": 68,
 								   "metadata": {
 								    "collapsed": true,
 								    "deletable": true,
 								    "editable": true
 								   },
 								   "outputs": [],
 								   "source": [
 								    "from sklearn.base import BaseEstimator, TransformerMixin\n",
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								    "\n",
 								    "class DataFrameSelector(BaseEstimator, TransformerMixin):\n",
 								    "    def __init__(self, attribute_names):\n",
 								    "        self.attribute_names = attribute_names\n",
 								    "    def fit(self, X, y=None):\n",
 								    "        return self\n",
 								    "    def transform(self, X):\n",
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								    "        return X[self.attribute_names].values"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Make notebook for ch2 output the same result every time

											
										
										
											2017-06-06 13:21:19 +02:00
+								   "execution_count": 69,
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   "metadata": {
-												Make notebook for ch2 output the same result every time

											
										
										
											2017-06-06 13:21:19 +02:00
+								    "collapsed": true,
 								    "deletable": true,
 								    "editable": true
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   },
 								   "outputs": [],
 								   "source": [
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								    "num_attribs = list(housing_num)\n",
 								    "cat_attribs = [\"ocean_proximity\"]\n",
 								    "\n",
 								    "num_pipeline = Pipeline([\n",
 								    "        ('selector', DataFrameSelector(num_attribs)),\n",
 								    "        ('imputer', Imputer(strategy=\"median\")),\n",
 								    "        ('attribs_adder', CombinedAttributesAdder()),\n",
 								    "        ('std_scaler', StandardScaler()),\n",
 								    "    ])\n",
 								    "\n",
 								    "cat_pipeline = Pipeline([\n",
 								    "        ('selector', DataFrameSelector(cat_attribs)),\n",
 								    "        ('label_binarizer', LabelBinarizer()),\n",
-												Make notebook code match book examples more closely in chapter 2

											
										
										
											2017-06-01 09:53:20 +02:00
+								    "    ])"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Make notebook for ch2 output the same result every time

											
										
										
											2017-06-06 13:21:19 +02:00
+								   "execution_count": 70,
-												Make notebook code match book examples more closely in chapter 2

											
										
										
											2017-06-01 09:53:20 +02:00
+								   "metadata": {
 								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
 								   },
 								   "outputs": [],
 								   "source": [
 								    "from sklearn.pipeline import FeatureUnion\n",
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								    "\n",
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								    "full_pipeline = FeatureUnion(transformer_list=[\n",
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								    "        (\"num_pipeline\", num_pipeline),\n",
 								    "        (\"cat_pipeline\", cat_pipeline),\n",
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								    "    ])"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Make notebook for ch2 output the same result every time

											
										
										
											2017-06-06 13:21:19 +02:00
+								   "execution_count": 71,
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "metadata": {
-												Upgrade notebooks to TensorFlow 1.0.0

											
										
										
											2017-02-17 11:51:26 +01:00
+								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   },
 								   "outputs": [],
 								   "source": [
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								    "housing_prepared = full_pipeline.fit_transform(housing)\n",
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								    "housing_prepared"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Make notebook for ch2 output the same result every time

											
										
										
											2017-06-06 13:21:19 +02:00
+								   "execution_count": 72,
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "metadata": {
-												Upgrade notebooks to TensorFlow 1.0.0

											
										
										
											2017-02-17 11:51:26 +01:00
+								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   },
 								   "outputs": [],
 								   "source": [
 								    "housing_prepared.shape"
 								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
-												Upgrade notebooks to TensorFlow 1.0.0

											
										
										
											2017-02-17 11:51:26 +01:00
+								   "metadata": {
 								    "deletable": true,
 								    "editable": true
 								   },
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "source": [
-												Make notebook code match book examples more closely in chapter 2

											
										
										
											2017-06-01 09:53:20 +02:00
+								    "# Select and train a model "
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Make notebook for ch2 output the same result every time

											
										
										
											2017-06-06 13:21:19 +02:00
+								   "execution_count": 73,
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "metadata": {
-												Upgrade notebooks to TensorFlow 1.0.0

											
										
										
											2017-02-17 11:51:26 +01:00
+								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   },
 								   "outputs": [],
 								   "source": [
 								    "from sklearn.linear_model import LinearRegression\n",
 								    "\n",
 								    "lin_reg = LinearRegression()\n",
 								    "lin_reg.fit(housing_prepared, housing_labels)"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Make notebook for ch2 output the same result every time

											
										
										
											2017-06-06 13:21:19 +02:00
+								   "execution_count": 74,
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "metadata": {
-												Upgrade notebooks to TensorFlow 1.0.0

											
										
										
											2017-02-17 11:51:26 +01:00
+								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   },
 								   "outputs": [],
 								   "source": [
 								    "# let's try the full pipeline on a few training instances\n",
 								    "some_data = housing.iloc[:5]\n",
 								    "some_labels = housing_labels.iloc[:5]\n",
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								    "some_data_prepared = full_pipeline.transform(some_data)\n",
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								    "\n",
-												Make notebook for ch2 output the same result every time

											
										
										
											2017-06-06 13:21:19 +02:00
+								    "print(\"Predictions:\", lin_reg.predict(some_data_prepared))"
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Make notebook for ch2 output the same result every time

											
										
										
											2017-06-06 13:21:19 +02:00
+								   "execution_count": 75,
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   "metadata": {
 								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
 								   },
 								   "outputs": [],
 								   "source": [
-												Make notebook for ch2 output the same result every time

											
										
										
											2017-06-06 13:21:19 +02:00
+								    "print(\"Labels:\", list(some_labels))"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Make notebook for ch2 output the same result every time

											
										
										
											2017-06-06 13:21:19 +02:00
+								   "execution_count": 76,
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   "metadata": {
 								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
 								   },
 								   "outputs": [],
 								   "source": [
 								    "some_data_prepared"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Make notebook for ch2 output the same result every time

											
										
										
											2017-06-06 13:21:19 +02:00
+								   "execution_count": 77,
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "metadata": {
-												Upgrade notebooks to TensorFlow 1.0.0

											
										
										
											2017-02-17 11:51:26 +01:00
+								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   },
 								   "outputs": [],
 								   "source": [
 								    "from sklearn.metrics import mean_squared_error\n",
 								    "\n",
 								    "housing_predictions = lin_reg.predict(housing_prepared)\n",
 								    "lin_mse = mean_squared_error(housing_labels, housing_predictions)\n",
 								    "lin_rmse = np.sqrt(lin_mse)\n",
 								    "lin_rmse"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Make notebook for ch2 output the same result every time

											
										
										
											2017-06-06 13:21:19 +02:00
+								   "execution_count": 78,
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "metadata": {
-												Upgrade notebooks to TensorFlow 1.0.0

											
										
										
											2017-02-17 11:51:26 +01:00
+								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   },
 								   "outputs": [],
 								   "source": [
 								    "from sklearn.metrics import mean_absolute_error\n",
 								    "\n",
 								    "lin_mae = mean_absolute_error(housing_labels, housing_predictions)\n",
 								    "lin_mae"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Make notebook for ch2 output the same result every time

											
										
										
											2017-06-06 13:21:19 +02:00
+								   "execution_count": 79,
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "metadata": {
-												Upgrade notebooks to TensorFlow 1.0.0

											
										
										
											2017-02-17 11:51:26 +01:00
+								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   },
 								   "outputs": [],
 								   "source": [
 								    "from sklearn.tree import DecisionTreeRegressor\n",
 								    "\n",
-												Make notebook for ch2 output the same result every time

											
										
										
											2017-06-06 13:21:19 +02:00
+								    "tree_reg = DecisionTreeRegressor(random_state=42)\n",
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								    "tree_reg.fit(housing_prepared, housing_labels)"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Make notebook for ch2 output the same result every time

											
										
										
											2017-06-06 13:21:19 +02:00
+								   "execution_count": 80,
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   "metadata": {
 								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
 								   },
 								   "outputs": [],
 								   "source": [
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								    "housing_predictions = tree_reg.predict(housing_prepared)\n",
 								    "tree_mse = mean_squared_error(housing_labels, housing_predictions)\n",
 								    "tree_rmse = np.sqrt(tree_mse)\n",
 								    "tree_rmse"
 								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
-												Upgrade notebooks to TensorFlow 1.0.0

											
										
										
											2017-02-17 11:51:26 +01:00
+								   "metadata": {
 								    "deletable": true,
 								    "editable": true
 								   },
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "source": [
 								    "# Fine-tune your model"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Make notebook for ch2 output the same result every time

											
										
										
											2017-06-06 13:21:19 +02:00
+								   "execution_count": 81,
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "metadata": {
-												Upgrade notebooks to TensorFlow 1.0.0

											
										
										
											2017-02-17 11:51:26 +01:00
+								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   },
 								   "outputs": [],
 								   "source": [
-												Upgrade chapter 2 to sklearn 0.18 and ensure python 2 and python 3 both work

											
										
										
											2016-11-03 23:47:11 +01:00
+								    "from sklearn.model_selection import cross_val_score\n",
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								    "\n",
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								    "scores = cross_val_score(tree_reg, housing_prepared, housing_labels,\n",
 								    "                         scoring=\"neg_mean_squared_error\", cv=10)\n",
 								    "tree_rmse_scores = np.sqrt(-scores)"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Make notebook for ch2 output the same result every time

											
										
										
											2017-06-06 13:21:19 +02:00
+								   "execution_count": 82,
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "metadata": {
-												Upgrade notebooks to TensorFlow 1.0.0

											
										
										
											2017-02-17 11:51:26 +01:00
+								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   },
 								   "outputs": [],
 								   "source": [
 								    "def display_scores(scores):\n",
 								    "    print(\"Scores:\", scores)\n",
 								    "    print(\"Mean:\", scores.mean())\n",
 								    "    print(\"Standard deviation:\", scores.std())\n",
 								    "\n",
 								    "display_scores(tree_rmse_scores)"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Make notebook for ch2 output the same result every time

											
										
										
											2017-06-06 13:21:19 +02:00
+								   "execution_count": 83,
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "metadata": {
-												Upgrade notebooks to TensorFlow 1.0.0

											
										
										
											2017-02-17 11:51:26 +01:00
+								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   },
 								   "outputs": [],
 								   "source": [
 								    "lin_scores = cross_val_score(lin_reg, housing_prepared, housing_labels,\n",
-												Upgrade chapter 2 to sklearn 0.18 and ensure python 2 and python 3 both work

											
										
										
											2016-11-03 23:47:11 +01:00
+								    "                             scoring=\"neg_mean_squared_error\", cv=10)\n",
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								    "lin_rmse_scores = np.sqrt(-lin_scores)\n",
 								    "display_scores(lin_rmse_scores)"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Make notebook for ch2 output the same result every time

											
										
										
											2017-06-06 13:21:19 +02:00
+								   "execution_count": 84,
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "metadata": {
-												Upgrade notebooks to TensorFlow 1.0.0

											
										
										
											2017-02-17 11:51:26 +01:00
+								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   },
 								   "outputs": [],
 								   "source": [
 								    "from sklearn.ensemble import RandomForestRegressor\n",
 								    "\n",
-												Make notebook for ch2 output the same result every time

											
										
										
											2017-06-06 13:21:19 +02:00
+								    "forest_reg = RandomForestRegressor(random_state=42)\n",
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								    "forest_reg.fit(housing_prepared, housing_labels)"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Make notebook for ch2 output the same result every time

											
										
										
											2017-06-06 13:21:19 +02:00
+								   "execution_count": 85,
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   "metadata": {
 								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
 								   },
 								   "outputs": [],
 								   "source": [
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								    "housing_predictions = forest_reg.predict(housing_prepared)\n",
 								    "forest_mse = mean_squared_error(housing_labels, housing_predictions)\n",
 								    "forest_rmse = np.sqrt(forest_mse)\n",
 								    "forest_rmse"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Make notebook for ch2 output the same result every time

											
										
										
											2017-06-06 13:21:19 +02:00
+								   "execution_count": 86,
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "metadata": {
-												Upgrade notebooks to TensorFlow 1.0.0

											
										
										
											2017-02-17 11:51:26 +01:00
+								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   },
 								   "outputs": [],
 								   "source": [
-												Upgrade chapter 2 to sklearn 0.18 and ensure python 2 and python 3 both work

											
										
										
											2016-11-03 23:47:11 +01:00
+								    "from sklearn.model_selection import cross_val_score\n",
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								    "\n",
 								    "forest_scores = cross_val_score(forest_reg, housing_prepared, housing_labels,\n",
-												Upgrade chapter 2 to sklearn 0.18 and ensure python 2 and python 3 both work

											
										
										
											2016-11-03 23:47:11 +01:00
+								    "                                scoring=\"neg_mean_squared_error\", cv=10)\n",
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								    "forest_rmse_scores = np.sqrt(-forest_scores)\n",
 								    "display_scores(forest_rmse_scores)"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Make notebook for ch2 output the same result every time

											
										
										
											2017-06-06 13:21:19 +02:00
+								   "execution_count": 87,
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "metadata": {
-												Upgrade notebooks to TensorFlow 1.0.0

											
										
										
											2017-02-17 11:51:26 +01:00
+								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   },
 								   "outputs": [],
 								   "source": [
-												Upgrade chapter 2 to sklearn 0.18 and ensure python 2 and python 3 both work

											
										
										
											2016-11-03 23:47:11 +01:00
+								    "scores = cross_val_score(lin_reg, housing_prepared, housing_labels, scoring=\"neg_mean_squared_error\", cv=10)\n",
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								    "pd.Series(np.sqrt(-scores)).describe()"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Make notebook for ch2 output the same result every time

											
										
										
											2017-06-06 13:21:19 +02:00
+								   "execution_count": 88,
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "metadata": {
-												Upgrade notebooks to TensorFlow 1.0.0

											
										
										
											2017-02-17 11:51:26 +01:00
+								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   },
 								   "outputs": [],
 								   "source": [
 								    "from sklearn.svm import SVR\n",
 								    "\n",
 								    "svm_reg = SVR(kernel=\"linear\")\n",
 								    "svm_reg.fit(housing_prepared, housing_labels)\n",
 								    "housing_predictions = svm_reg.predict(housing_prepared)\n",
 								    "svm_mse = mean_squared_error(housing_labels, housing_predictions)\n",
 								    "svm_rmse = np.sqrt(svm_mse)\n",
 								    "svm_rmse"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Make notebook for ch2 output the same result every time

											
										
										
											2017-06-06 13:21:19 +02:00
+								   "execution_count": 89,
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "metadata": {
-												Upgrade notebooks to TensorFlow 1.0.0

											
										
										
											2017-02-17 11:51:26 +01:00
+								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   },
 								   "outputs": [],
 								   "source": [
-												Upgrade chapter 2 to sklearn 0.18 and ensure python 2 and python 3 both work

											
										
										
											2016-11-03 23:47:11 +01:00
+								    "from sklearn.model_selection import GridSearchCV\n",
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								    "\n",
 								    "param_grid = [\n",
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								    "    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},\n",
 								    "    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},\n",
 								    "  ]\n",
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								    "\n",
-												Make notebook for ch2 output the same result every time

											
										
										
											2017-06-06 13:21:19 +02:00
+								    "forest_reg = RandomForestRegressor(random_state=42)\n",
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								    "grid_search = GridSearchCV(forest_reg, param_grid, cv=5,\n",
 								    "                           scoring='neg_mean_squared_error')\n",
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								    "grid_search.fit(housing_prepared, housing_labels)"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Make notebook for ch2 output the same result every time

											
										
										
											2017-06-06 13:21:19 +02:00
+								   "execution_count": 90,
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "metadata": {
-												Upgrade notebooks to TensorFlow 1.0.0

											
										
										
											2017-02-17 11:51:26 +01:00
+								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   },
 								   "outputs": [],
 								   "source": [
 								    "grid_search.best_params_"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Make notebook for ch2 output the same result every time

											
										
										
											2017-06-06 13:21:19 +02:00
+								   "execution_count": 91,
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "metadata": {
-												Upgrade notebooks to TensorFlow 1.0.0

											
										
										
											2017-02-17 11:51:26 +01:00
+								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   },
 								   "outputs": [],
 								   "source": [
 								    "grid_search.best_estimator_"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Make notebook for ch2 output the same result every time

											
										
										
											2017-06-06 13:21:19 +02:00
+								   "execution_count": 92,
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "metadata": {
-												Upgrade notebooks to TensorFlow 1.0.0

											
										
										
											2017-02-17 11:51:26 +01:00
+								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   },
 								   "outputs": [],
 								   "source": [
-												Upgrade chapter 2 to sklearn 0.18 and ensure python 2 and python 3 both work

											
										
										
											2016-11-03 23:47:11 +01:00
+								    "cvres = grid_search.cv_results_\n",
 								    "for mean_score, params in zip(cvres[\"mean_test_score\"], cvres[\"params\"]):\n",
 								    "    print(np.sqrt(-mean_score), params)"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Make notebook for ch2 output the same result every time

											
										
										
											2017-06-06 13:21:19 +02:00
+								   "execution_count": 93,
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "metadata": {
-												Upgrade notebooks to TensorFlow 1.0.0

											
										
										
											2017-02-17 11:51:26 +01:00
+								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   },
 								   "outputs": [],
 								   "source": [
-												Upgrade chapter 2 to sklearn 0.18 and ensure python 2 and python 3 both work

											
										
										
											2016-11-03 23:47:11 +01:00
+								    "pd.DataFrame(grid_search.cv_results_)"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Make notebook for ch2 output the same result every time

											
										
										
											2017-06-06 13:21:19 +02:00
+								   "execution_count": 94,
-												Upgrade chapter 2 to sklearn 0.18 and ensure python 2 and python 3 both work

											
										
										
											2016-11-03 23:47:11 +01:00
+								   "metadata": {
-												Upgrade notebooks to TensorFlow 1.0.0

											
										
										
											2017-02-17 11:51:26 +01:00
+								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
-												Upgrade chapter 2 to sklearn 0.18 and ensure python 2 and python 3 both work

											
										
										
											2016-11-03 23:47:11 +01:00
+								   },
 								   "outputs": [],
 								   "source": [
 								    "from sklearn.model_selection import RandomizedSearchCV\n",
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								    "from scipy.stats import randint\n",
 								    "\n",
 								    "param_distribs = {\n",
 								    "        'n_estimators': randint(low=1, high=200),\n",
 								    "        'max_features': randint(low=1, high=8),\n",
 								    "    }\n",
 								    "\n",
-												Make notebook for ch2 output the same result every time

											
										
										
											2017-06-06 13:21:19 +02:00
+								    "forest_reg = RandomForestRegressor(random_state=42)\n",
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								    "rnd_search = RandomizedSearchCV(forest_reg, param_distributions=param_distribs,\n",
-												Make notebook for ch2 output the same result every time

											
										
										
											2017-06-06 13:21:19 +02:00
+								    "                                n_iter=10, cv=5, scoring='neg_mean_squared_error', random_state=42)\n",
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								    "rnd_search.fit(housing_prepared, housing_labels)"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Make notebook for ch2 output the same result every time

											
										
										
											2017-06-06 13:21:19 +02:00
+								   "execution_count": 95,
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "metadata": {
-												Upgrade notebooks to TensorFlow 1.0.0

											
										
										
											2017-02-17 11:51:26 +01:00
+								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   },
 								   "outputs": [],
 								   "source": [
-												Upgrade chapter 2 to sklearn 0.18 and ensure python 2 and python 3 both work

											
										
										
											2016-11-03 23:47:11 +01:00
+								    "cvres = rnd_search.cv_results_\n",
 								    "for mean_score, params in zip(cvres[\"mean_test_score\"], cvres[\"params\"]):\n",
 								    "    print(np.sqrt(-mean_score), params)"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Make notebook for ch2 output the same result every time

											
										
										
											2017-06-06 13:21:19 +02:00
+								   "execution_count": 96,
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "metadata": {
-												Upgrade notebooks to TensorFlow 1.0.0

											
										
										
											2017-02-17 11:51:26 +01:00
+								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   },
 								   "outputs": [],
 								   "source": [
 								    "feature_importances = grid_search.best_estimator_.feature_importances_\n",
 								    "feature_importances"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Make notebook for ch2 output the same result every time

											
										
										
											2017-06-06 13:21:19 +02:00
+								   "execution_count": 97,
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "metadata": {
-												Upgrade notebooks to TensorFlow 1.0.0

											
										
										
											2017-02-17 11:51:26 +01:00
+								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   },
 								   "outputs": [],
 								   "source": [
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								    "extra_attribs = [\"rooms_per_hhold\", \"pop_per_hhold\", \"bedrooms_per_room\"]\n",
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								    "cat_one_hot_attribs = list(encoder.classes_)\n",
 								    "attributes = num_attribs + extra_attribs + cat_one_hot_attribs\n",
 								    "sorted(zip(feature_importances, attributes), reverse=True)"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Make notebook for ch2 output the same result every time

											
										
										
											2017-06-06 13:21:19 +02:00
+								   "execution_count": 98,
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "metadata": {
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								    "collapsed": true,
-												Upgrade notebooks to TensorFlow 1.0.0

											
										
										
											2017-02-17 11:51:26 +01:00
+								    "deletable": true,
 								    "editable": true
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   },
 								   "outputs": [],
 								   "source": [
 								    "final_model = grid_search.best_estimator_\n",
 								    "\n",
 								    "X_test = strat_test_set.drop(\"median_house_value\", axis=1)\n",
 								    "y_test = strat_test_set[\"median_house_value\"].copy()\n",
 								    "\n",
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								    "X_test_prepared = full_pipeline.transform(X_test)\n",
 								    "final_predictions = final_model.predict(X_test_prepared)\n",
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								    "\n",
 								    "final_mse = mean_squared_error(y_test, final_predictions)\n",
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								    "final_rmse = np.sqrt(final_mse)"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Make notebook for ch2 output the same result every time

											
										
										
											2017-06-06 13:21:19 +02:00
+								   "execution_count": 99,
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   "metadata": {
 								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
 								   },
 								   "outputs": [],
 								   "source": [
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								    "final_rmse"
 								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
-												Upgrade notebooks to TensorFlow 1.0.0

											
										
										
											2017-02-17 11:51:26 +01:00
+								   "metadata": {
 								    "deletable": true,
 								    "editable": true
 								   },
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "source": [
 								    "# Extra material"
 								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
-												Upgrade notebooks to TensorFlow 1.0.0

											
										
										
											2017-02-17 11:51:26 +01:00
+								   "metadata": {
 								    "deletable": true,
 								    "editable": true
 								   },
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "source": [
 								    "## Label Binarizer hack\n",
 								    "`LabelBinarizer`'s `fit_transform()` method accepts only one parameter, `y` (it was designed for labels, not predictors). It therefore fails inside a pipeline whose final estimator is supervised: in that case the pipeline's `fit()` method takes two parameters, `X` and `y`, and passes both to each transformer's `fit_transform()`.\n",
 								    "\n",
 								    "This hack creates a supervision-friendly `LabelBinarizer`."
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Make notebook for ch2 output the same result every time

											
										
										
											2017-06-06 13:21:19 +02:00
+								   "execution_count": 100,
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "metadata": {
-												Upgrade notebooks to TensorFlow 1.0.0

											
										
										
											2017-02-17 11:51:26 +01:00
+								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   },
 								   "outputs": [],
 								   "source": [
 								    "class SupervisionFriendlyLabelBinarizer(LabelBinarizer):\n",
 								    "    def fit_transform(self, X, y=None):\n",
 								    "        return super(SupervisionFriendlyLabelBinarizer, self).fit_transform(X)\n",
 								    "\n",
 								    "# Replace the LabelBinarizer with a SupervisionFriendlyLabelBinarizer\n",
 								    "cat_pipeline.steps[1] = (\"label_binarizer\", SupervisionFriendlyLabelBinarizer())\n",
 								    "\n",
 								    "# Now you can create a full pipeline with a supervised predictor at the end.\n",
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								    "full_pipeline_with_predictor = Pipeline([\n",
 								    "        (\"preparation\", full_pipeline),\n",
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								    "        (\"linear\", LinearRegression())\n",
 								    "    ])\n",
 								    "\n",
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								    "full_pipeline_with_predictor.fit(housing, housing_labels)\n",
 								    "full_pipeline_with_predictor.predict(some_data)"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
-												Upgrade notebooks to TensorFlow 1.0.0

											
										
										
											2017-02-17 11:51:26 +01:00
+								   "metadata": {
 								    "deletable": true,
 								    "editable": true
 								   },
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "source": [
 								    "## Model persistence using joblib"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Make notebook for ch2 output the same result every time

											
										
										
											2017-06-06 13:21:19 +02:00
+								   "execution_count": 101,
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "metadata": {
-												Upgrade notebooks to TensorFlow 1.0.0

											
										
										
											2017-02-17 11:51:26 +01:00
+								    "collapsed": true,
 								    "deletable": true,
 								    "editable": true
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   },
 								   "outputs": [],
 								   "source": [
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								    "my_model = full_pipeline_with_predictor"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Make notebook for ch2 output the same result every time

											
										
										
											2017-06-06 13:21:19 +02:00
+								   "execution_count": 102,
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "metadata": {
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								    "collapsed": true,
-												Upgrade notebooks to TensorFlow 1.0.0

											
										
										
											2017-02-17 11:51:26 +01:00
+								    "deletable": true,
 								    "editable": true
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   },
 								   "outputs": [],
 								   "source": [
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								    "from sklearn.externals import joblib\n",
 								    "joblib.dump(my_model, \"my_model.pkl\") # DIFF\n",
 								    "#...\n",
 								    "my_model_loaded = joblib.load(\"my_model.pkl\") # DIFF"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
-												Upgrade notebooks to TensorFlow 1.0.0

											
										
										
											2017-02-17 11:51:26 +01:00
+								   "metadata": {
 								    "deletable": true,
 								    "editable": true
 								   },
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "source": [
 								    "## Example SciPy distributions for `RandomizedSearchCV`"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Make notebook for ch2 output the same result every time

											
										
										
											2017-06-06 13:21:19 +02:00
+								   "execution_count": 103,
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "metadata": {
-												Upgrade notebooks to TensorFlow 1.0.0

											
										
										
											2017-02-17 11:51:26 +01:00
+								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   },
 								   "outputs": [],
 								   "source": [
 								    "from scipy.stats import geom, expon\n",
-												Make notebook for ch2 output the same result every time

											
										
										
											2017-06-06 13:21:19 +02:00
+								    "geom_distrib=geom(0.5).rvs(10000, random_state=42)\n",
 								    "expon_distrib=expon(scale=1).rvs(10000, random_state=42)\n",
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								    "plt.hist(geom_distrib, bins=50)\n",
 								    "plt.show()\n",
 								    "plt.hist(expon_distrib, bins=50)\n",
 								    "plt.show()"
 								   ]
-												Update chapters 1, 2 and 4

											
										
										
											2016-09-27 16:39:16 +02:00
+								  },
 								  {
 								   "cell_type": "markdown",
-												Upgrade notebooks to TensorFlow 1.0.0

											
										
										
											2017-02-17 11:51:26 +01:00
+								   "metadata": {
 								    "deletable": true,
 								    "editable": true
 								   },
-												Update chapters 1, 2 and 4

											
										
										
											2016-09-27 16:39:16 +02:00
+								   "source": [
 								    "# Exercise solutions"
 								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
-												Upgrade notebooks to TensorFlow 1.0.0

											
										
										
											2017-02-17 11:51:26 +01:00
+								   "metadata": {
 								    "deletable": true,
 								    "editable": true
 								   },
-												Update chapters 1, 2 and 4

											
										
										
											2016-09-27 16:39:16 +02:00
+								   "source": [
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								    "## 1."
 								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
 								   "metadata": {
 								    "collapsed": true,
 								    "deletable": true,
 								    "editable": true
 								   },
 								   "source": [
 								    "Question: Try a Support Vector Machine regressor (`sklearn.svm.SVR`), with various hyperparameters such as `kernel=\"linear\"` (with various values for the `C` hyperparameter) or `kernel=\"rbf\"` (with various values for the `C` and `gamma` hyperparameters). Don't worry about what these hyperparameters mean for now. How does the best `SVR` predictor perform?"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Make notebook for ch2 output the same result every time

											
										
										
											2017-06-06 13:21:19 +02:00
+								   "execution_count": 104,
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   "metadata": {
 								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
 								   },
 								   "outputs": [],
 								   "source": [
 								    "from sklearn.model_selection import GridSearchCV\n",
 								    "\n",
 								    "param_grid = [\n",
 								    "        {'kernel': ['linear'], 'C': [10., 30., 100., 300., 1000., 3000., 10000., 30000.0]},\n",
 								    "        {'kernel': ['rbf'], 'C': [1.0, 3.0, 10., 30., 100., 300., 1000.0],\n",
 								    "         'gamma': [0.01, 0.03, 0.1, 0.3, 1.0, 3.0]},\n",
 								    "    ]\n",
 								    "\n",
 								    "svm_reg = SVR()\n",
-												Make notebook for ch2 output the same result every time

											
										
										
											2017-06-06 13:21:19 +02:00
+								    "grid_search = GridSearchCV(svm_reg, param_grid, cv=5, scoring='neg_mean_squared_error', verbose=2, n_jobs=4)\n",
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								    "grid_search.fit(housing_prepared, housing_labels)"
 								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
 								   "metadata": {
 								    "deletable": true,
 								    "editable": true
 								   },
 								   "source": [
 								    "The best model achieves the following score (evaluated using 5-fold cross validation):"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Make notebook for ch2 output the same result every time

											
										
										
											2017-06-06 13:21:19 +02:00
+								   "execution_count": 105,
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   "metadata": {
 								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
 								   },
 								   "outputs": [],
 								   "source": [
 								    "negative_mse = grid_search.best_score_\n",
 								    "rmse = np.sqrt(-negative_mse)\n",
 								    "rmse"
 								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
 								   "metadata": {
 								    "deletable": true,
 								    "editable": true
 								   },
 								   "source": [
 								    "That's much worse than the `RandomForestRegressor`. Let's check the best hyperparameters found:"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Make notebook for ch2 output the same result every time

											
										
										
											2017-06-06 13:21:19 +02:00
+								   "execution_count": 106,
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   "metadata": {
 								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
 								   },
 								   "outputs": [],
 								   "source": [
 								    "grid_search.best_params_"
 								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
 								   "metadata": {
 								    "deletable": true,
 								    "editable": true
 								   },
 								   "source": [
 								    "The linear kernel seems better than the RBF kernel. Notice that the value of `C` is the maximum tested value. When this happens, you definitely want to launch the grid search again with higher values for `C` (removing the smallest values), because it is likely that higher values of `C` will be better."
 								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
 								   "metadata": {
 								    "deletable": true,
 								    "editable": true
 								   },
 								   "source": [
 								    "## 2."
 								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
 								   "metadata": {
 								    "deletable": true,
 								    "editable": true
 								   },
 								   "source": [
 								    "Question: Try replacing `GridSearchCV` with `RandomizedSearchCV`."
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Make notebook for ch2 output the same result every time

											
										
										
											2017-06-06 13:21:19 +02:00
+								   "execution_count": 107,
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   "metadata": {
 								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
 								   },
 								   "outputs": [],
 								   "source": [
 								    "from sklearn.model_selection import RandomizedSearchCV\n",
-												Explain the expon() and reciprocal() distributions

											
										
										
											2017-05-03 19:46:23 +02:00
+								    "from scipy.stats import expon, reciprocal\n",
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								    "\n",
 								    "# see https://docs.scipy.org/doc/scipy-0.19.0/reference/stats.html\n",
-												Explain the expon() and reciprocal() distributions

											
										
										
											2017-05-03 19:46:23 +02:00
+								    "# for `expon()` and `reciprocal()` documentation and more probability distribution functions.\n",
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								    "\n",
 								    "# Note: gamma is ignored when kernel is \"linear\"\n",
 								    "param_distribs = {\n",
 								    "        'kernel': ['linear', 'rbf'],\n",
-												Explain the expon() and reciprocal() distributions

											
										
										
											2017-05-03 19:46:23 +02:00
+								    "        'C': reciprocal(20, 200000),\n",
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								    "        'gamma': expon(scale=1.0),\n",
 								    "    }\n",
 								    "\n",
 								    "svm_reg = SVR()\n",
 								    "rnd_search = RandomizedSearchCV(svm_reg, param_distributions=param_distribs,\n",
-												Make notebook for ch2 output the same result every time

											
										
										
											2017-06-06 13:21:19 +02:00
+								    "                                n_iter=50, cv=5, scoring='neg_mean_squared_error',\n",
 								    "                                verbose=2, n_jobs=4, random_state=42)\n",
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								    "rnd_search.fit(housing_prepared, housing_labels)"
 								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
 								   "metadata": {
 								    "deletable": true,
 								    "editable": true
 								   },
 								   "source": [
 								    "The best model achieves the following score (evaluated using 5-fold cross validation):"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Make notebook for ch2 output the same result every time

											
										
										
											2017-06-06 13:21:19 +02:00
+								   "execution_count": 108,
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   "metadata": {
 								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
 								   },
 								   "outputs": [],
 								   "source": [
 								    "negative_mse = rnd_search.best_score_\n",
 								    "rmse = np.sqrt(-negative_mse)\n",
 								    "rmse"
 								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
 								   "metadata": {
 								    "deletable": true,
 								    "editable": true
 								   },
 								   "source": [
 								    "Now this is much closer to the performance of the `RandomForestRegressor` (but not quite there yet). Let's check the best hyperparameters found:"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Make notebook for ch2 output the same result every time

											
										
										
											2017-06-06 13:21:19 +02:00
+								   "execution_count": 109,
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   "metadata": {
 								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
 								   },
 								   "outputs": [],
 								   "source": [
 								    "rnd_search.best_params_"
 								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
 								   "metadata": {
 								    "deletable": true,
 								    "editable": true
 								   },
 								   "source": [
 								    "This time the search found a good set of hyperparameters for the RBF kernel. Randomized search tends to find better hyperparameters than grid search in the same amount of time."
 								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
 								   "metadata": {
 								    "deletable": true,
 								    "editable": true
 								   },
 								   "source": [
 								    "Let's look at the exponential distribution we used, with `scale=1.0`. Note that some samples are much larger or smaller than 1.0, but when you look at the log of the distribution, you can see that most values are actually concentrated roughly in the range of exp(-2) to exp(+2), which is about 0.1 to 7.4."
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Make notebook for ch2 output the same result every time

											
										
										
											2017-06-06 13:21:19 +02:00
+								   "execution_count": 110,
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   "metadata": {
 								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
 								   },
 								   "outputs": [],
 								   "source": [
 								    "expon_distrib = expon(scale=1.)\n",
-												Make notebook for ch2 output the same result every time

											
										
										
											2017-06-06 13:21:19 +02:00
+								    "samples = expon_distrib.rvs(10000, random_state=42)\n",
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								    "plt.figure(figsize=(10, 4))\n",
 								    "plt.subplot(121)\n",
 								    "plt.title(\"Exponential distribution (scale=1.0)\")\n",
 								    "plt.hist(samples, bins=50)\n",
 								    "plt.subplot(122)\n",
 								    "plt.title(\"Log of this distribution\")\n",
 								    "plt.hist(np.log(samples), bins=50)\n",
 								    "plt.show()"
 								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
 								   "metadata": {
 								    "deletable": true,
 								    "editable": true
 								   },
 								   "source": [
 								    "The distribution we used for `C` looks quite different: the log of each sample is drawn from a uniform distribution over the given range (here from log(20) to log(200000)), which is why the right graph, which represents the log of the samples, looks roughly flat. This distribution is useful when you don't have a clue of what the target scale is:"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Make notebook for ch2 output the same result every time

											
										
										
											2017-06-06 13:21:19 +02:00
+								   "execution_count": 111,
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   "metadata": {
 								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
 								   },
 								   "outputs": [],
 								   "source": [
-												Explain the expon() and reciprocal() distributions

											
										
										
											2017-05-03 19:46:23 +02:00
+								    "reciprocal_distrib = reciprocal(20, 200000)\n",
-												Make notebook for ch2 output the same result every time

											
										
										
											2017-06-06 13:21:19 +02:00
+								    "samples = reciprocal_distrib.rvs(10000, random_state=42)\n",
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								    "plt.figure(figsize=(10, 4))\n",
 								    "plt.subplot(121)\n",
-												Explain the expon() and reciprocal() distributions

											
										
										
											2017-05-03 19:46:23 +02:00
+								    "plt.title(\"Reciprocal distribution (scale=1.0)\")\n",
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								    "plt.hist(samples, bins=50)\n",
 								    "plt.subplot(122)\n",
 								    "plt.title(\"Log of this distribution\")\n",
 								    "plt.hist(np.log(samples), bins=50)\n",
 								    "plt.show()"
 								   ]
 								  },
-												Explain the expon() and reciprocal() distributions

											
										
										
											2017-05-03 19:46:23 +02:00
+								  {
 								   "cell_type": "markdown",
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   "metadata": {
 								    "deletable": true,
 								    "editable": true
 								   },
-												Explain the expon() and reciprocal() distributions

											
										
										
											2017-05-03 19:46:23 +02:00
+								   "source": [
 								    "The reciprocal distribution is useful when you have no idea what the scale of the hyperparameter should be (indeed, as you can see on the figure on the right, all scales are equally likely, within the given range), whereas the exponential distribution is best when you know (more or less) what the scale of the hyperparameter should be."
 								   ]
 								  },
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								  {
 								   "cell_type": "markdown",
 								   "metadata": {
 								    "deletable": true,
 								    "editable": true
 								   },
 								   "source": [
 								    "## 3."
 								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
 								   "metadata": {
 								    "deletable": true,
 								    "editable": true
 								   },
 								   "source": [
 								    "Question: Try adding a transformer in the preparation pipeline to select only the most important attributes."
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Make notebook for ch2 output the same result every time

											
										
										
											2017-06-06 13:21:19 +02:00
+								   "execution_count": 112,
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   "metadata": {
 								    "collapsed": true,
 								    "deletable": true,
 								    "editable": true
 								   },
 								   "outputs": [],
 								   "source": [
 								    "from sklearn.base import BaseEstimator, TransformerMixin\n",
 								    "\n",
 								    "def indices_of_top_k(arr, k):\n",
 								    "    return np.sort(np.argpartition(np.array(arr), -k)[-k:])\n",
 								    "\n",
 								    "class TopFeatureSelector(BaseEstimator, TransformerMixin):\n",
 								    "    def __init__(self, feature_importances, k):\n",
 								    "        self.feature_importances = feature_importances\n",
 								    "        self.k = k\n",
 								    "    def fit(self, X, y=None):\n",
 								    "        self.feature_indices_ = indices_of_top_k(self.feature_importances, self.k)\n",
 								    "        return self\n",
 								    "    def transform(self, X):\n",
 								    "        return X[:, self.feature_indices_]"
 								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
 								   "metadata": {
 								    "deletable": true,
 								    "editable": true
 								   },
 								   "source": [
 								    "Note: this feature selector assumes that you have already computed the feature importances somehow (for example using a `RandomForestRegressor`). You may be tempted to compute them directly in the `TopFeatureSelector`'s `fit()` method; however, this would likely slow down grid/randomized search, since the feature importances would have to be recomputed for every hyperparameter combination (unless you implement some sort of cache)."
 								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
 								   "metadata": {
 								    "deletable": true,
 								    "editable": true
 								   },
 								   "source": [
 								    "Let's define the number of top features we want to keep:"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Make notebook for ch2 output the same result every time

											
										
										
											2017-06-06 13:21:19 +02:00
+								   "execution_count": 113,
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   "metadata": {
 								    "collapsed": true,
 								    "deletable": true,
 								    "editable": true
 								   },
 								   "outputs": [],
 								   "source": [
 								    "k = 5"
 								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
 								   "metadata": {
 								    "deletable": true,
 								    "editable": true
 								   },
 								   "source": [
 								    "Now let's look for the indices of the top k features:"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Make notebook for ch2 output the same result every time

											
										
										
											2017-06-06 13:21:19 +02:00
+								   "execution_count": 114,
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   "metadata": {
 								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
 								   },
 								   "outputs": [],
 								   "source": [
 								    "top_k_feature_indices = indices_of_top_k(feature_importances, k)\n",
 								    "top_k_feature_indices"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Make notebook for ch2 output the same result every time

											
										
										
											2017-06-06 13:21:19 +02:00
+								   "execution_count": 115,
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   "metadata": {
 								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
 								   },
 								   "outputs": [],
 								   "source": [
 								    "np.array(attributes)[top_k_feature_indices]"
 								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
 								   "metadata": {
 								    "deletable": true,
 								    "editable": true
 								   },
 								   "source": [
 								    "Let's double check that these are indeed the top k features:"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Make notebook for ch2 output the same result every time

											
										
										
											2017-06-06 13:21:19 +02:00
+								   "execution_count": 116,
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   "metadata": {
 								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
 								   },
 								   "outputs": [],
 								   "source": [
 								    "sorted(zip(feature_importances, attributes), reverse=True)[:k]"
 								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
 								   "metadata": {
 								    "deletable": true,
 								    "editable": true
 								   },
 								   "source": [
 								    "Looking good... Now let's create a new pipeline that runs the previously defined preparation pipeline, and adds top k feature selection:"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Make notebook for ch2 output the same result every time

											
										
										
											2017-06-06 13:21:19 +02:00
+								   "execution_count": 117,
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   "metadata": {
 								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
 								   },
 								   "outputs": [],
 								   "source": [
 								    "preparation_and_feature_selection_pipeline = Pipeline([\n",
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								    "    ('preparation', full_pipeline),\n",
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								    "    ('feature_selection', TopFeatureSelector(feature_importances, k))\n",
 								    "])"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Make notebook for ch2 output the same result every time

											
										
										
											2017-06-06 13:21:19 +02:00
+								   "execution_count": 118,
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   "metadata": {
 								    "collapsed": true,
 								    "deletable": true,
 								    "editable": true
 								   },
 								   "outputs": [],
 								   "source": [
 								    "housing_prepared_top_k_features = preparation_and_feature_selection_pipeline.fit_transform(housing)"
 								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
 								   "metadata": {
 								    "deletable": true,
 								    "editable": true
 								   },
 								   "source": [
 								    "Let's look at the features of the first 3 instances:"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Make notebook for ch2 output the same result every time

											
										
										
											2017-06-06 13:21:19 +02:00
+								   "execution_count": 119,
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   "metadata": {
 								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
 								   },
 								   "outputs": [],
 								   "source": [
 								    "housing_prepared_top_k_features[0:3]"
 								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
 								   "metadata": {
 								    "deletable": true,
 								    "editable": true
 								   },
 								   "source": [
 								    "Now let's double check that these are indeed the top k features:"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Make notebook for ch2 output the same result every time

											
										
										
											2017-06-06 13:21:19 +02:00
+								   "execution_count": 120,
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   "metadata": {
 								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
 								   },
 								   "outputs": [],
 								   "source": [
 								    "housing_prepared[0:3, top_k_feature_indices]"
 								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
 								   "metadata": {
 								    "deletable": true,
 								    "editable": true
 								   },
 								   "source": [
 								    "Works great!  :)"
 								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
 								   "metadata": {
 								    "deletable": true,
 								    "editable": true
 								   },
 								   "source": [
 								    "## 4."
 								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
 								   "metadata": {
 								    "deletable": true,
 								    "editable": true
 								   },
 								   "source": [
 								    "Question: Try creating a single pipeline that does the full data preparation plus the final prediction."
-												Update chapters 1, 2 and 4

											
										
										
											2016-09-27 16:39:16 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   "execution_count": 121,
-												Update chapters 1, 2 and 4

											
										
										
											2016-09-27 16:39:16 +02:00
+								   "metadata": {
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								    "collapsed": false,
-												Upgrade notebooks to TensorFlow 1.0.0

											
										
										
											2017-02-17 11:51:26 +01:00
+								    "deletable": true,
 								    "editable": true
-												Update chapters 1, 2 and 4

											
										
										
											2016-09-27 16:39:16 +02:00
+								   },
 								   "outputs": [],
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   "source": [
 								    "prepare_select_and_predict_pipeline = Pipeline([\n",
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								    "    ('preparation', full_pipeline),\n",
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								    "    ('feature_selection', TopFeatureSelector(feature_importances, k)),\n",
 								    "    ('svm_reg', SVR(**rnd_search.best_params_))\n",
 								    "])"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   "execution_count": 122,
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   "metadata": {
 								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
 								   },
 								   "outputs": [],
 								   "source": [
 								    "prepare_select_and_predict_pipeline.fit(housing, housing_labels)"
 								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
 								   "metadata": {
 								    "deletable": true,
 								    "editable": true
 								   },
 								   "source": [
 								    "Let's try the full pipeline on a few instances:"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   "execution_count": 123,
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   "metadata": {
 								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
 								   },
 								   "outputs": [],
 								   "source": [
 								    "some_data = housing.iloc[:4]\n",
 								    "some_labels = housing_labels.iloc[:4]\n",
 								    "\n",
 								    "print(\"Predictions:\\t\", prepare_select_and_predict_pipeline.predict(some_data))\n",
 								    "print(\"Labels:\\t\\t\", list(some_labels))"
 								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
 								   "metadata": {
 								    "deletable": true,
 								    "editable": true
 								   },
 								   "source": [
 								    "Well, the full pipeline seems to work fine. Of course, the predictions are not fantastic: they would be better if we used the best `RandomForestRegressor` that we found earlier, rather than the best `SVR`."
 								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
 								   "metadata": {
 								    "deletable": true,
 								    "editable": true
 								   },
 								   "source": [
 								    "## 5."
 								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
 								   "metadata": {
 								    "deletable": true,
 								    "editable": true
 								   },
 								   "source": [
 								    "Question: Automatically explore some preparation options using `GridSearchCV`."
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   "execution_count": 124,
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   "metadata": {
 								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
 								   },
 								   "outputs": [],
 								   "source": [
 								    "param_grid = [\n",
 								    "        {'preparation__num_pipeline__imputer__strategy': ['mean', 'median', 'most_frequent'],\n",
 								    "         'feature_selection__k': [3, 4, 5, 6, 7]}\n",
 								    "]\n",
 								    "\n",
 								    "grid_search_prep = GridSearchCV(prepare_select_and_predict_pipeline, param_grid, cv=5,\n",
 								    "                                scoring='neg_mean_squared_error', verbose=2, n_jobs=4)\n",
 								    "grid_search_prep.fit(housing, housing_labels)"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   "execution_count": 125,
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   "metadata": {
 								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
 								   },
 								   "outputs": [],
 								   "source": [
 								    "grid_search_prep.best_params_"
 								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
 								   "metadata": {
 								    "deletable": true,
 								    "editable": true
 								   },
 								   "source": [
 								    "Great! It seems that we had the right imputer strategy (mean), and apparently only the top 7 features are useful (out of 9); the last 2 seem to just add some noise."
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   "execution_count": 126,
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   "metadata": {
 								    "collapsed": false,
 								    "deletable": true,
 								    "editable": true
 								   },
 								   "outputs": [],
 								   "source": [
 								    "housing.shape"
 								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
 								   "metadata": {
 								    "deletable": true,
 								    "editable": true
 								   },
 								   "source": [
 								    "Congratulations! You already know quite a lot about Machine Learning. :)"
 								   ]
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								  }
 								 ],
 								 "metadata": {
 								  "kernelspec": {
 								   "display_name": "Python 3",
 								   "language": "python",
 								   "name": "python3"
 								  },
 								  "language_info": {
 								   "codemirror_mode": {
 								    "name": "ipython",
 								    "version": 3
 								   },
 								   "file_extension": ".py",
 								   "mimetype": "text/x-python",
 								   "name": "python",
 								   "nbconvert_exporter": "python",
 								   "pygments_lexer": "ipython3",
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   "version": "3.5.3"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								  },
-												Update chapters 1, 2 and 4

											
										
										
											2016-09-27 16:39:16 +02:00
+								  "nav_menu": {
 								   "height": "279px",
 								   "width": "309px"
 								  },
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								  "toc": {
-												Update chapters 1, 2 and 4

											
										
										
											2016-09-27 16:39:16 +02:00
+								   "navigate_menu": true,
 								   "number_sections": true,
 								   "sideBar": true,
 								   "threshold": 6,
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "toc_cell": false,
-												Update chapters 1, 2 and 4

											
										
										
											2016-09-27 16:39:16 +02:00
+								   "toc_section_display": "block",
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "toc_window_display": false
 								  }
 								 },
 								 "nbformat": 4,
 								 "nbformat_minor": 0
 								}