handson-ml/02_end_to_end_machine_learn...

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Chapter 2 – End-to-end Machine Learning project**"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "*This notebook contains all the sample code and solutions to the exercises in chapter 2.*"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<table align=\"left\">\n",
    "  <td>\n",
    "    <a href=\"https://colab.research.google.com/github/ageron/handson-ml3/blob/main/02_end_to_end_machine_learning_project.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>\n",
    "  </td>\n",
    "  <td>\n",
    "    <a target=\"_blank\" href=\"https://kaggle.com/kernels/welcome?src=https://github.com/ageron/handson-ml3/blob/main/02_end_to_end_machine_learning_project.ipynb\"><img src=\"https://kaggle.com/static/images/open-in-kaggle.svg\" /></a>\n",
    "  </td>\n",
    "</table>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Welcome to Machine Learning!\n"
     ]
    }
   ],
   "source": [
    "print(\"Welcome to Machine Learning!\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This project requires Python 3.7 or above:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "\n",
    "assert sys.version_info >= (3, 7)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "It also requires Scikit-Learn ≥ 1.0.1:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "\n",
    "import sklearn\n",
    "\n",
    "# Compare the numeric release components as a tuple of ints: comparing the\n",
    "# raw version strings is lexicographic (as text, \"1.10\" sorts before \"1.2\").\n",
    "# Only the leading dotted-number part is used, so suffixes such as \"rc1\" or\n",
    "# \".dev0\" are ignored.\n",
    "release = re.match(r\"\\d+(?:\\.\\d+)*\", sklearn.__version__).group()\n",
    "assert tuple(int(part) for part in release.split(\".\")) >= (1, 0, 1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Get the Data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "*Welcome to Machine Learning Housing Corp.! Your task is to predict median house values in Californian districts, given a number of features from these districts.*"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Download the Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pathlib import Path\n",
    "import pandas as pd\n",
    "import tarfile\n",
    "import urllib.request\n",
    "\n",
    "def load_housing_data():\n",
    "    \"\"\"Return the housing dataset as a pandas DataFrame.\n",
    "\n",
    "    The data is read from datasets/housing/housing.csv. If that file is\n",
    "    missing, datasets/housing.tgz is downloaded (unless it is already there)\n",
    "    and extracted into datasets/ first. Extraction is tied to the CSV being\n",
    "    absent rather than to the download, so a run that was interrupted between\n",
    "    download and extraction recovers on the next call.\n",
    "    \"\"\"\n",
    "    datasets_dir = Path(\"datasets\")\n",
    "    tarball_path = datasets_dir / \"housing.tgz\"\n",
    "    csv_path = datasets_dir / \"housing\" / \"housing.csv\"\n",
    "    if not csv_path.is_file():\n",
    "        if not tarball_path.is_file():\n",
    "            datasets_dir.mkdir(parents=True, exist_ok=True)\n",
    "            url = \"https://github.com/ageron/data/raw/main/housing.tgz\"\n",
    "            urllib.request.urlretrieve(url, tarball_path)\n",
    "        # extract whenever the CSV is absent, not only right after a download\n",
    "        with tarfile.open(tarball_path) as housing_tarball:\n",
    "            housing_tarball.extractall(path=datasets_dir)\n",
    "    return pd.read_csv(csv_path)\n",
    "\n",
    "housing = load_housing_data()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Take a Quick Look at the Data Structure"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>longitude</th>\n",
       "      <th>latitude</th>\n",
       "      <th>housing_median_age</th>\n",
       "      <th>total_rooms</th>\n",
       "      <th>total_bedrooms</th>\n",
       "      <th>population</th>\n",
       "      <th>households</th>\n",
       "      <th>median_income</th>\n",
       "      <th>median_house_value</th>\n",
       "      <th>ocean_proximity</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>-122.23</td>\n",
       "      <td>37.88</td>\n",
       "      <td>41.0</td>\n",
       "      <td>880.0</td>\n",
       "      <td>129.0</td>\n",
       "      <td>322.0</td>\n",
       "      <td>126.0</td>\n",
       "      <td>8.3252</td>\n",
       "      <td>452600.0</td>\n",
       "      <td>NEAR BAY</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>-122.22</td>\n",
       "      <td>37.86</td>\n",
       "      <td>21.0</td>\n",
       "      <td>7099.0</td>\n",
       "      <td>1106.0</td>\n",
       "      <td>2401.0</td>\n",
       "      <td>1138.0</td>\n",
       "      <td>8.3014</td>\n",
       "      <td>358500.0</td>\n",
       "      <td>NEAR BAY</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>-122.24</td>\n",
       "      <td>37.85</td>\n",
       "      <td>52.0</td>\n",
       "      <td>1467.0</td>\n",
       "      <td>190.0</td>\n",
       "      <td>496.0</td>\n",
       "      <td>177.0</td>\n",
       "      <td>7.2574</td>\n",
       "      <td>352100.0</td>\n",
       "      <td>NEAR BAY</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>-122.25</td>\n",
       "      <td>37.85</td>\n",
       "      <td>52.0</td>\n",
       "      <td>1274.0</td>\n",
       "      <td>235.0</td>\n",
       "      <td>558.0</td>\n",
       "      <td>219.0</td>\n",
       "      <td>5.6431</td>\n",
       "      <td>341300.0</td>\n",
       "      <td>NEAR BAY</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>-122.25</td>\n",
       "      <td>37.85</td>\n",
       "      <td>52.0</td>\n",
       "      <td>1627.0</td>\n",
       "      <td>280.0</td>\n",
       "      <td>565.0</td>\n",
       "      <td>259.0</td>\n",
       "      <td>3.8462</td>\n",
       "      <td>342200.0</td>\n",
       "      <td>NEAR BAY</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \\\n",
       "0    -122.23     37.88                41.0        880.0           129.0   \n",
       "1    -122.22     37.86                21.0       7099.0          1106.0   \n",
       "2    -122.24     37.85                52.0       1467.0           190.0   \n",
       "3    -122.25     37.85                52.0       1274.0           235.0   \n",
       "4    -122.25     37.85                52.0       1627.0           280.0   \n",
       "\n",
       "   population  households  median_income  median_house_value ocean_proximity  \n",
       "0       322.0       126.0         8.3252            452600.0        NEAR BAY  \n",
       "1      2401.0      1138.0         8.3014            358500.0        NEAR BAY  \n",
       "2       496.0       177.0         7.2574            352100.0        NEAR BAY  \n",
       "3       558.0       219.0         5.6431            341300.0        NEAR BAY  \n",
       "4       565.0       259.0         3.8462            342200.0        NEAR BAY  "
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "housing.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 20640 entries, 0 to 20639\n",
      "Data columns (total 10 columns):\n",
      " #   Column              Non-Null Count  Dtype  \n",
      "---  ------              --------------  -----  \n",
      " 0   longitude           20640 non-null  float64\n",
      " 1   latitude            20640 non-null  float64\n",
      " 2   housing_median_age  20640 non-null  float64\n",
      " 3   total_rooms         20640 non-null  float64\n",
      " 4   total_bedrooms      20433 non-null  float64\n",
      " 5   population          20640 non-null  float64\n",
      " 6   households          20640 non-null  float64\n",
      " 7   median_income       20640 non-null  float64\n",
      " 8   median_house_value  20640 non-null  float64\n",
      " 9   ocean_proximity     20640 non-null  object \n",
      "dtypes: float64(9), object(1)\n",
      "memory usage: 1.6+ MB\n"
     ]
    }
   ],
   "source": [
    "housing.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<1H OCEAN     9136\n",
       "INLAND        6551\n",
       "NEAR OCEAN    2658\n",
       "NEAR BAY      2290\n",
       "ISLAND           5\n",
       "Name: ocean_proximity, dtype: int64"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "housing[\"ocean_proximity\"].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>longitude</th>\n",
       "      <th>latitude</th>\n",
       "      <th>housing_median_age</th>\n",
       "      <th>total_rooms</th>\n",
       "      <th>total_bedrooms</th>\n",
       "      <th>population</th>\n",
       "      <th>households</th>\n",
       "      <th>median_income</th>\n",
       "      <th>median_house_value</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>20640.000000</td>\n",
       "      <td>20640.000000</td>\n",
       "      <td>20640.000000</td>\n",
       "      <td>20640.000000</td>\n",
       "      <td>20433.000000</td>\n",
       "      <td>20640.000000</td>\n",
       "      <td>20640.000000</td>\n",
       "      <td>20640.000000</td>\n",
       "      <td>20640.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>-119.569704</td>\n",
       "      <td>35.631861</td>\n",
       "      <td>28.639486</td>\n",
       "      <td>2635.763081</td>\n",
       "      <td>537.870553</td>\n",
       "      <td>1425.476744</td>\n",
       "      <td>499.539680</td>\n",
       "      <td>3.870671</td>\n",
       "      <td>206855.816909</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>2.003532</td>\n",
       "      <td>2.135952</td>\n",
       "      <td>12.585558</td>\n",
       "      <td>2181.615252</td>\n",
       "      <td>421.385070</td>\n",
       "      <td>1132.462122</td>\n",
       "      <td>382.329753</td>\n",
       "      <td>1.899822</td>\n",
       "      <td>115395.615874</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>-124.350000</td>\n",
       "      <td>32.540000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>2.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>3.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.499900</td>\n",
       "      <td>14999.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>-121.800000</td>\n",
       "      <td>33.930000</td>\n",
       "      <td>18.000000</td>\n",
       "      <td>1447.750000</td>\n",
       "      <td>296.000000</td>\n",
       "      <td>787.000000</td>\n",
       "      <td>280.000000</td>\n",
       "      <td>2.563400</td>\n",
       "      <td>119600.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>-118.490000</td>\n",
       "      <td>34.260000</td>\n",
       "      <td>29.000000</td>\n",
       "      <td>2127.000000</td>\n",
       "      <td>435.000000</td>\n",
       "      <td>1166.000000</td>\n",
       "      <td>409.000000</td>\n",
       "      <td>3.534800</td>\n",
       "      <td>179700.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>-118.010000</td>\n",
       "      <td>37.710000</td>\n",
       "      <td>37.000000</td>\n",
       "      <td>3148.000000</td>\n",
       "      <td>647.000000</td>\n",
       "      <td>1725.000000</td>\n",
       "      <td>605.000000</td>\n",
       "      <td>4.743250</td>\n",
       "      <td>264725.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>-114.310000</td>\n",
       "      <td>41.950000</td>\n",
       "      <td>52.000000</td>\n",
       "      <td>39320.000000</td>\n",
       "      <td>6445.000000</td>\n",
       "      <td>35682.000000</td>\n",
       "      <td>6082.000000</td>\n",
       "      <td>15.000100</td>\n",
       "      <td>500001.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          longitude      latitude  housing_median_age   total_rooms  \\\n",
       "count  20640.000000  20640.000000        20640.000000  20640.000000   \n",
       "mean    -119.569704     35.631861           28.639486   2635.763081   \n",
       "std        2.003532      2.135952           12.585558   2181.615252   \n",
       "min     -124.350000     32.540000            1.000000      2.000000   \n",
       "25%     -121.800000     33.930000           18.000000   1447.750000   \n",
       "50%     -118.490000     34.260000           29.000000   2127.000000   \n",
       "75%     -118.010000     37.710000           37.000000   3148.000000   \n",
       "max     -114.310000     41.950000           52.000000  39320.000000   \n",
       "\n",
       "       total_bedrooms    population    households  median_income  \\\n",
       "count    20433.000000  20640.000000  20640.000000   20640.000000   \n",
       "mean       537.870553   1425.476744    499.539680       3.870671   \n",
       "std        421.385070   1132.462122    382.329753       1.899822   \n",
       "min          1.000000      3.000000      1.000000       0.499900   \n",
       "25%        296.000000    787.000000    280.000000       2.563400   \n",
       "50%        435.000000   1166.000000    409.000000       3.534800   \n",
       "75%        647.000000   1725.000000    605.000000       4.743250   \n",
       "max       6445.000000  35682.000000   6082.000000      15.000100   \n",
       "\n",
       "       median_house_value  \n",
       "count        20640.000000  \n",
       "mean        206855.816909  \n",
       "std         115395.615874  \n",
       "min          14999.000000  \n",
       "25%         119600.000000  \n",
       "50%         179700.000000  \n",
       "75%         264725.000000  \n",
       "max         500001.000000  "
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "housing.describe()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The following cell is not shown in the book either. It creates the `images/end_to_end_project` folder (if it doesn't already exist), and it defines the `save_fig()` function, which is used throughout this notebook to save the figures in high resolution for the book."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# extra code – code to save the figures as high-res PNGs for the book\n",
    "\n",
    "IMAGES_PATH = Path() / \"images\" / \"end_to_end_project\"\n",
    "IMAGES_PATH.mkdir(parents=True, exist_ok=True)\n",
    "\n",
    "def save_fig(fig_id, tight_layout=True, fig_extension=\"png\", resolution=300):\n",
    "    \"\"\"Save the current Matplotlib figure as IMAGES_PATH/<fig_id>.<fig_extension>.\n",
    "\n",
    "    If tight_layout is true, plt.tight_layout() is applied before saving.\n",
    "    fig_extension is also passed as the output format, and resolution as the dpi.\n",
    "    \"\"\"\n",
    "    if tight_layout:\n",
    "        plt.tight_layout()\n",
    "    target = IMAGES_PATH / f\"{fig_id}.{fig_extension}\"\n",
    "    plt.savefig(target, format=fig_extension, dpi=resolution)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAA04AAAIwCAYAAACx/zuEAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/MnkTPAAAACXBIWXMAAAsTAAALEwEAmpwYAACBGklEQVR4nOzde7xcVX3//9ebO4IoFDkNCTWoQblEQVIKxepRRKJQwQt+4w8FFBu1WNHGSqKt19JGK6hopUZFQuUWL5SUixiQU0vLRUAkXEQCpBASCSpCghpJ+Pz+WGvIzmTOmTlnbntm3s/HYx5n9tq3z9pnZs1ee6+1tiICMzMzMzMzG90W3Q7AzMzMzMys7FxxMjMzMzMzq8MVJzMzMzMzszpccTIzMzMzM6vDFSczMzMzM7M6XHEyMzMzMzOrwxWnPifpHEmXdnifJ0pa28btr5V0Yru2b2at04oySNInJN3eqpiqtr2rpJA03I7tm/USSSOSvtzlGJZL+lA3Y2in6nOYXP68uYsh2Ti44mTtcBHwvMpEO096zKx/SJqaTyJmVM36HPCKwnIdvyBkZh3zp8BXuh1EB00C/rPbQVhjtup2ANZ/IuJ3wO+6HYeZ9YeIWAu07S62mZVHRDzS7Rg6KSJ+0e0YrHG+4zRAJG0r6QuSHpb0e0nXS3pZYf5wvtp7mKQbJP1W0k2SXlq1nXdKeiDP/09Jfy0pCvOfbqqXb0d/HNg3bzsqt6hr3Z6uvkUv6QW56cDvJd0t6aga+Zos6UJJj+bXZZKmteaomVmrSJop6b/z9/TXkq6UtHdhkfvz3x/n8mEkr/f0XWtJnwBOAI4slCnDo92tqi5nJP2ppJtzmfIT4M9qxLlPLkfWSFot6QJJf9zSg2FWXltI+idJv8yf/89J2gJA0s6SFubv8O8kXSVp38qKtZrqF84tds3Tz5L073nbv5d0n6QPFJavPg8ISbMlfVvSE3n5t1Xt488k3VL5Xkt6XaNNcAvxvTaXDb/L5dQUSa+Q9FOl5nWXSvqjqnXfIenOvN+fS/pg5Vjl+Y2cw1SXUfPzsr/Lx+KzkrYrzP+EpNslzZJ0by6n/qNyfBvI759K+kH+/z4u6VpJh1Qts5ek/yrE/Tpt3sRwIM+9XHEaLJ8F/h/wTuAAYCnwfUmTqpb7Z2Au8FLgV8B5kgSQv1xfB/4V2B9YDHxyjH1eBJwO3E26HT0pp9WVC5+LSZ/TQ3LcnwC2LSzzDOAa4PekpjyHAKuAq/I8MyuPHYAvAAcBw8BjwH9K2ibPPyj/nUkqK95YYxufAxYBV7GxTPnfRnYuaQfgMuA+YAapnPtc1TKTgB8Bt+d4Xg3sCCwunhCZ9bHjgPXAnwPvAz5AOncAOId0seFo0vfjt6TziO3Hsf1/BKYDRwEvIv22P1RnnY8BlwAvIZ1DnC3puQCSdgQuBX4GHAh8GPiXccRT8UlSXv8M2Dnv52PAbFJ5tS/pHIS8378C/ikvszcwBzgV+Os8v+45zCieyMvunbc1C/ho1TJTSf+TNwCvIZ3TndZgPp8J/DvwF6T/4a3A5YWKbSXu9cDBwImkC+A+9wKICL/6+EUq5C4lnbD8ATi+MG9L4F7gH/P0MBDAEYVlDs1pU/L0BcD3q/axIH2Unp4+EVhbmP4EcHuN2AJ4c1XacuBD+f1rgA3AnxTmvyyvd2KefidwD6CqfP0KeEu3j79ffg36q1IGjTJvh/wdf1menpq/3zOqltukDKm1zTHWfbqcIZ0A/QbYsTD/bXmZ4Tz9KeDqqm3snJc5qNvH0y+/2vkCRoDrqtKWkC6YTsvfg5cX5j2LdAHkXXl6k9//nFY5t9g1Ty8GvjlGDE+fB+TpAP65ML0VqcL2tjz9buDXwPaFZf6/4ve6Tp4r8RXPfd6X015aSKsuhx4A3l61rQ8Ad+b3dc9hCvl78xjxvQdYVhXH74FnFdI+WlxmnP9zkSo9leN5BKnSNLmwzJ/j
cy8iwn2cBsjzga2B/6kkRMQGSdcB+1Qte1vh/cr8dzdgBenqUHUnxhuAv2pptMnewEMR8UDVvp4qTB8I7AmsyTfFKp5ByrOZlYSk5wOfJl3RfQ7pSuwWwJ90KIS9gdsi9ZmquK5qmQOBl1c3N8qeD9zYruDMSuK2qumVpHOAvUm/v09/ZyLiMUlL2fw8YixnAd9R6gawBPjPiPivRmOKiPWSHskxQTovuT1S/+qKG8YRz2b7AB7Of5dWpe0GIOk5wB7AVyWdVVhmK1JFBBo7h9lMbrb3AeAFpLvdW+ZX0f9FxGOF6cr/qC5Ju5HK4VcCQ3nb27OxHH4RsDIiincBf4zPvQAPDjFIKp/sqDGvOu3JGvMqTVQ0yjYmIgpxVWxdeF89r5YtSLeZZ9WY9+uJhWVmbfKfpCY5785/1wN3AtuMtVKDKj/qT5cbkrauWqbRMuUyoNZwyA/XSDPrN09WTQfpezHW96dyXvBUjeU2+R5GxBW5md1rgcOAyyR9OyLeMYGYoHXnJZud+0REdVpln5W/72H0psKNlDebriAdDFxIajb4QdId8tdT1aSYsY9HPQtJFaYPku7urQOuZmM53MjxHNhzL1ecBscyUlO9l5Ha9yNpS1K71PPHsZ272NgPoaJ6utof2PxqCcAjpP4J5HiGitOkE6rJkvaIiAcL+yoWDrcAbwV+GRG/qRu9mXVF7lS9N3ByRFyT017Kpr9Df8h/a5UXRbXKlMpIXMUyZP+qZe4ETpC0Q0Q8kdMOrlrmFuAtpCu61ScnZoPsTjb21/kRgKSdSP2VvpmXeQR4hqSdIuLxnLZ/9YYi4pekfjb/LukK4AJJ74mIdROI6y7geEnbF+461TsvaUpEPCzpIeD5EXHuKIs1cg5T7VDSXapPVxIqfbla6GXA+yPisrz96nOvu0hx7x4RlVZHM/C5F+DBIQZGPkk4C5ifR0fZO08PMb7nJZwJvEbS30maJukkUufEsSwHnivppUoPm6x0MPwhcLKkGZIOIPVb+H1hvatInT3PlbR/Hpji86Sr1BXnka4CX5JHv9lT0sslnT4Io7uY9ZBHgV8Cf5VHmnoF8G9s+n1eTXqUwRGShiQ9a5RtLQf2k/TCXKZsnU+YrgdOlbSvpD9n86u05+f9nZ2XOZzNO13/K6nfxkVKI3U9T9KrJS2Q9MyJZ9+st0XEPaQBGr4q6S8kTQe+BTzOxguwN5AGN/jn/D1/E3mwhApJn5J0TD6H2Js0CMx9E6w0QToP2AB8TWlEzFcDH6mEPcFtNuITwIeVRtJ7oaT9JB0vaV6e38g5TLWfkyotx+Wy572kCkor/Rx4Wz5Wf0q6w/WHwvwlpAG9Fkp6Sb4LdkaOu3I8B/bcyxWnwXIqaTSqb5Jusb4YmBkRqxrdQERcR+rP9H5Se+BjgM+waYWn2neBy0m3gh9hYyEwh3T3awT4Dqnz6erCvp4iVcq2IBXG55JG41lXWOa3wMvzdr5NKqQWkjpzP9povsysvfL3+f+Ryp3bSRWUf2DT7/N6UtnyLlKb/UtG2dzXSFdFbyKVKYfm9Hfmvz8Gvgr8fVUMa0kjeU0jXTH9HKlcLC6zMm/vKeD7wB051nXFWM0G1DtI/fwW57/PIJ1H/A4gIn5NGpXvcFL/oNmk73nROtIIcD8l9bt+JvCXEw0of6//kjTq3U9II+p9Is8e69ykKRHxdVKZ83ZSXv6blN/78/y65zA1tvmfpPi/QDrHOpw0al8rvZPUd+pmUqXpbNLFqEoMlbi3Jf2PF5L+X0E+noN87qU8EobZhEn6PPDqiJje7VjMzMxssEk6mjSk9m65WaA1QdJLSBfcZ0TEzV0Op6vcx8nGTdLfkW7lriU94+Q9bLwtbmZmZtYxkk4g3f14ENiPdMfmP11pmhhJbyA1ubyH9KiHM0h31W7pYlil4IqTTcQM0ohTzyLdkp4HfLGrEZmZmdmgGiKNRDcJ+AVpZMxTAST9G+l5bbV8KyLe05EIO2SURylU
vDYi/ruBzTyT1A1jD1LTuxHgg+Fmam6qZ2ZmZmb9KT+3aKdRZj8eEatHmdeTJL1gjNkPVT3vysbJFSczMzMzM7M6PKqemZm
      "text/plain": [
       "<Figure size 864x576 with 9 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "\n",
    "# extra code – the next 5 lines define the default font sizes\n",
    "plt.rc('font', size=14)\n",
    "plt.rc('axes', labelsize=14, titlesize=14)\n",
    "plt.rc('legend', fontsize=14)\n",
    "plt.rc('xtick', labelsize=10)\n",
    "plt.rc('ytick', labelsize=10)\n",
    "\n",
    "housing.hist(bins=50, figsize=(12, 8))\n",
    "save_fig(\"attribute_histogram_plots\")  # extra code\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Create a Test Set"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "\n",
    "def shuffle_and_split_data(data, test_ratio):\n",
    "    \"\"\"Randomly split data into a (train, test) pair of row subsets.\n",
    "\n",
    "    The rows are permuted with NumPy's global random generator; the first\n",
    "    int(len(data) * test_ratio) permuted rows form the test set and the\n",
    "    remaining rows form the training set.\n",
    "    \"\"\"\n",
    "    permuted = np.random.permutation(len(data))\n",
    "    n_test = int(len(data) * test_ratio)\n",
    "    test_part, train_part = permuted[:n_test], permuted[n_test:]\n",
    "    return data.iloc[train_part], data.iloc[test_part]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "16512"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_set, test_set = shuffle_and_split_data(housing, 0.2)\n",
    "len(train_set)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "4128"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(test_set)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "To ensure that this notebook's outputs remain the same every time we run it, we need to set the random seed:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.random.seed(42)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Sadly, this won't guarantee that this notebook will output exactly the same results as in the book, since there are other possible sources of variation. The most important is the fact that algorithms get tweaked over time when libraries evolve. So please tolerate some minor differences: hopefully, most of the outputs should be the same, or at least in the right ballpark."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Note: another source of randomness is the order of Python sets: it is based on Python's `hash()` function, which is randomly \"salted\" when Python starts up (this started in Python 3.3, to prevent some denial-of-service attacks). To remove this randomness, the solution is to set the `PYTHONHASHSEED` environment variable to `\"0\"` _before_ Python even starts up. Nothing will happen if you do it after that. Luckily, if you're running this notebook on Colab, the variable is already set for you."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "from zlib import crc32\n",
    "\n",
    "def is_id_in_test_set(identifier, test_ratio):\n",
    "    \"\"\"Return True if the CRC32 of the identifier (as an int64) is below test_ratio * 2**32.\"\"\"\n",
    "    checksum = crc32(np.int64(identifier))\n",
    "    threshold = test_ratio * 2**32\n",
    "    return checksum < threshold\n",
    "\n",
    "def split_data_with_id_hash(data, test_ratio, id_column):\n",
    "    \"\"\"Split data into (train, test) according to the hash of each row's identifier.\n",
    "\n",
    "    A row goes to the test set when is_id_in_test_set() is true for its\n",
    "    id_column value, so the assignment depends only on that value.\n",
    "    \"\"\"\n",
    "    def belongs_to_test(id_):\n",
    "        return is_id_in_test_set(id_, test_ratio)\n",
    "\n",
    "    test_mask = data[id_column].apply(belongs_to_test)\n",
    "    return data.loc[~test_mask], data.loc[test_mask]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "housing_with_id = housing.reset_index()  # adds an `index` column\n",
    "train_set, test_set = split_data_with_id_hash(housing_with_id, 0.2, \"index\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "housing_with_id[\"id\"] = housing[\"longitude\"] * 1000 + housing[\"latitude\"]\n",
    "train_set, test_set = split_data_with_id_hash(housing_with_id, 0.2, \"id\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "44"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test_set[\"total_bedrooms\"].isnull().sum()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "To find the probability that a random sample of 1,000 people contains less than 48.5% female or more than 53.5% female when the population's female ratio is 51.1%, we use the [binomial distribution](https://en.wikipedia.org/wiki/Binomial_distribution). The `cdf()` method of the binomial distribution gives us the probability that the number of females will be less than or equal to the given value. Since that includes the given value itself, the probability of getting strictly fewer than 485 females is `cdf(485 - 1)`, and the probability of getting more than 535 females is `1 - cdf(535)`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.10736798530929946\n"
     ]
    }
   ],
   "source": [
    "# extra code – shows how to compute the 10.7% proba of getting a bad sample\n",
    "\n",
    "from scipy.stats import binom\n",
    "\n",
    "sample_size = 1000\n",
    "ratio_female = 0.511\n",
    "# freeze the distribution once and reuse it for both tails\n",
    "female_count = binom(sample_size, ratio_female)\n",
    "proba_too_small = female_count.cdf(485 - 1)  # P(X < 485)\n",
    "proba_too_large = 1 - female_count.cdf(535)  # P(X > 535)\n",
    "print(proba_too_small + proba_too_large)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "If you prefer simulations over maths, here's how you could get roughly the same result:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.1071"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# extra code – shows another way to estimate the probability of bad sample\n",
    "\n",
    "np.random.seed(42)\n",
    "\n",
    "samples = (np.random.rand(100_000, sample_size) < ratio_female).sum(axis=1)\n",
    "((samples < 485) | (samples > 535)).mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "housing[\"income_cat\"] = pd.cut(housing[\"median_income\"],\n",
    "                               bins=[0., 1.5, 3.0, 4.5, 6., np.inf],\n",
    "                               labels=[1, 2, 3, 4, 5])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAaAAAAEQCAYAAAD2/KAsAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/MnkTPAAAACXBIWXMAAAsTAAALEwEAmpwYAAAiEUlEQVR4nO3de5gV1Znv8e9PvBEVL1F7EFA4CXFEeDTSQ3AcTauZiNETnOQYyTERo4aJMUYnZBSSY66HSC460ZnRCcYEGTUMJ8YDx0sSRFsnEy8BxRDxAioqQsC7tBdM43v+qNVh2+7urrb37tp79+/zPPvZVauqVr17KbxU1aq1FBGYmZn1t22KDsDMzAYmJyAzMyuEE5CZmRXCCcjMzArhBGRmZoVwAjIzs0JsW3QA/WXPPfeMkSNHFh0GAK+88go77bRT0WHUHLdLeW6Xrrltyquldlm2bNmzEbFXuW0DJgGNHDmSpUuXFh0GAK2trbS0tBQdRs1xu5Tnduma26a8WmoXSU90tc234MzMrBBOQGZmVggnIDMzK4QTkJmZFcIJyMzMCuEEZGZmhXACMjOzQjgBmZlZIZyAzMysEANmJASzIoyccWOf65g+rp1TK1DPmtnH9bkOs0ryFZCZmRXCCcjMzArRbwlI0v6Slpd8XpZ0rqQ9JC2WtCp9715yzExJqyU9LOmYkvLxklakbZdKUn/9DjMzq4x+S0AR8XBEHBwRBwPjgVeB64EZwJKIGA0sSetIGgNMAQ4EJgGXSRqUqrscmAaMTp9J/fU7zMysMoq6BXc08GhEPAFMBq5K5VcBJ6TlycD8iNgcEY8Dq4EJkoYCQyLizogIYF7JMWZmVieKSkBTgJ+l5aaIWA+QvvdO5cOAp0qOWZvKhqXlzuVmZlZH+r0btqTtgY8CM3vatUxZdFNe7lzTyG7V0dTURGtra/5Aq6itra1mYqkljdgu08e197mOpsGVqafR2hYa8/+ZSqiXdiniPaBjgXsjYkNa3yBpaESsT7fXNqbytcCIkuOGA+tS+fAy5W8TEXOAOQDNzc1RKzME1tJshbWkEdulEu/vTB/XzkUr+v5Hdc3JLX2uo9Y04v8zlVAv7VLELbhPsvX2G8AiYGpangosLCmfImkHSaPIOhvck27TbZI0MfV+O6XkGDMzqxP9egUk6V3A3wJ/X1I8G1gg6XTgSeBEgIh4QNICYCXQDpwVEVvSMWcCc4HBwM3pY2ZmdaRfE1BEvAq8u1PZc2S94srtPwuYVaZ8KTC2GjGamVn/8EgIZmZWCCcgMzMrhBOQmZkVwgnIzMwK4QRkZmaFcAIyM7NCOAGZmVkhPCW3VYSnnjaz3vIVkJmZFcIJyMzMCuEEZGZmhXACMjOzQjgBmZlZIZyAzMysEE5AZmZWCCcgMzMrhBOQmZkVwgnIzMwK4QRkZmaFcAIyM7NC9GsCkrSbpJ9LekjSg5IOlbSHpMWSVqXv3Uv2nylptaSHJR1TUj5e0oq07VJJ6s/fYWZmfdffV0CXAL+MiL8EDgIeBGYASyJiNLAkrSNpDDAFOBCYBFwmaVCq53JgGjA6fSb1548wM7O+e8cJSNJ7Je3Yi/2HAEcAVwJExBsR8SIwGbgq7XYVcEJangzMj4jNEfE4sBqYIGkoMCQi7oyIAOaVHGNmZnUiVwKS9B1JU9OyJC0GHgHWS/pAznP9N+AZ4KeS7pP0Y0k7AU0RsR4gfe+d9h8GPFVy/NpUNiwtdy43M7M6kndCupOBk9LyscDBwMRUPhs4Mue5DgHOjoi7JV1Cut3WhXLPdaKb8rdXIE0ju1VHU1MTra2tOcKsvra2tpqJpVKmj2vvcx1NgytTTy21rduluhrxz1Il1Eu75E1ATWy96vgIsCAi7pH0PLA0Zx1rgbURcXda/zlZAtogaWhErE+31zaW7D+i5PjhwLpUPrxM+dtExBxgDkBzc3O0tLTkDLW6WltbqZVYKqUSM5lOH9fO
RSv6PknvmpNb+lxHpbhdqqsR/yxVQr20S95nQM8B+6XlDwO3puVtKX9F8jYR8UfgKUn7p6KjgZXAImBqKpsKLEzLi4ApknaQNIqss8E96TbdJkkTU++3U0qOMTOzOpH3n1XXAddKegTYA/hlKj+YrHNAXmcD10jaHngM+AxZElwg6XTgSeBEgIh4QNICsiTVDpwVEVtSPWcCc4HBwM3pY2ZmdSRvAvoS8ASwL3BeRLySyoeSdYnOJSKWA81lNh3dxf6zgFllypcCY/Oe18zMak/eBLQP8E8R8Wan8h/y1uc0ZmZmueR9BvQ4sGeZ8j3SNjMzs17Jm4BE+a7OOwOvVy4cMzMbKLq9BSfp0rQYwIWSXi3ZPAiYACyvTmhmZtbIenoGNC59CzgAeKNk2xvAvcAPqhCXmZk1uG4TUEQcCSDpp8A5EfFyv0RlZmYNL+8zoK8AQzoXShouqamyIZmZ2UCQNwHNIxsDrrNjgH+vXDhmZjZQ5E1AfwXcUab8Pyn/YqmZmVm38iagbYEdypTv2EW5mZlZt/ImoLvJxl/r7Czgd5ULx8zMBoq8Q/F8FbhV0kFk02YDHAW8H/hQNQIzM7PGlusKKCLuAg4lG8H6Y8DHyYbgOTQiflu98MzMrFHlnuUqIu4HPlXFWMzMbADpMgFJ2iMinu9Y7q6Sjv3MzMzy6u4K6Jk0VfZG4FnKD0baMUjpoGoEZ2Zmjau7BHQU0HFlc2Q/xGJmZgNIlwkoIm4HkLQtcCDwfyNiXX8FZmZmja3HXnAR0Q58H9iu+uGYmdlAkfdF1LuA8dUMxMzMBpa83bCvAH4gaV9gGfBK6caIuDdPJZLWAJuALUB7RDSnHnb/AYwE1gCfiIgX0v4zgdPT/l+MiF+l8vHAXGAwcBPZVBHlOkmYmVmNypuArk3fF5fZ1ttecEdGxLMl6zOAJRExW9KMtH6+pDHAFLLnT/sAt0h6X0RsAS4HppFdmd0ETAJu7kUMZmZWsLwJaFQVY5gMtKTlq4BW4PxUPj8iNgOPS1oNTEhXUUMi4k4ASfOAE3ACMjOrK3mfAe0HPB0RT5R+gKfTtrwC+LWkZZKmpbKmiFgPkL73TuXDgKdKjl2byoal5c7lZmZWR/JeAd0GDAU2dirfNW3LewvusIhYJ2lvYLGkh7rZV2XKopvyt1eQJblpAE1NTbS2tuYMs7ra2tpqJpZKmT6uvc91NA2uTD211LZul+pqxD9LlVAv7ZI3AXWMeNDZu+nUIaE7He8RRcRGSdcDE4ANacSF9ZJKk9xaYETJ4cOBdal8eJnycuebA8wBaG5ujpaWlryhVlVrayu1EkulnDrjxj7XMX1cOxetyD08YZfWnNzS5zoqxe1SXY34Z6kS6qVduv2/WtKitBjA1ZI2l2weBIwFco2GLWknYJuI2JSWPwx8C1gETAVmp++F6ZBFwLWSLibrhDAauCcitkjaJGki2TxFpwD/nCcGMzOrHT39s+q59C3gBeC1km1vAL8h66KdRxNwvaSO814bEb+U9DtggaTTgSeBEwEi4gFJC4CVQDtwVuoBB9nkeHPJumHfjDsgmJnVnW4TUER8Bv78/s4PIiL37bYydT0GHFSm/Dng6C6OmQXMKlO+lOzqy8zM6lTeXnDfpuTqR9JfSDpD0l9XJywzM2t0eRPQjcDZAJJ2BpaSjQ93u6RTqhSbmZk1sLwJaDxwa1r+GPAy2fs6nwW+XIW4zMysweVNQLsAL6blDwPXR8SfyJLSe6oQl5mZNbi8CehJ4LDUffoYYHEq3wN4tRqBmZlZY8v7dtvFwL8DbcATwB2p/AhgRRXiMjOzBpcrAUXEjyQtIxuZYHFEvJk2PQpcUK3gzMysceUe3yO9e7O0U1nfxxkxM7MBqcsEJOlLwGUR8Xpa7lJElJsnyMzMrEvdXQGdTTY/z+tpuStB+YnqzMzMutRlAoqIUeWWzczMKiFvN2wzM7OK6u4Z0NfyVhIR36pMOGZmNlB09wzoxE7r+wHv
Yuvkb/uQvYS6hmxeHzMzs9y6ewY0rmNZ0mfIJn6bGhFPprJ9gZ8C11Q7SDMzazx5nwF9DTi3I/kApOXpwNerEZiZmTW2vAm
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Bar chart of the number of rows (districts) in each income category, with\n",
    "# the categories sorted by label along the x-axis\n",
    "housing[\"income_cat\"].value_counts().sort_index().plot.bar(rot=0, grid=True)\n",
    "plt.xlabel(\"Income category\")\n",
    "plt.ylabel(\"Number of districts\")\n",
    "save_fig(\"housing_income_cat_bar_plot\")  # extra code\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import StratifiedShuffleSplit\n",
    "\n",
    "# Generate 10 stratified 80/20 train/test splits, using the income category\n",
    "# as the stratification variable\n",
    "splitter = StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=42)\n",
    "strat_splits = []\n",
    "for train_index, test_index in splitter.split(housing, housing[\"income_cat\"]):\n",
    "    # split() yields integer row positions, not index labels, so the rows are\n",
    "    # selected with iloc (loc would interpret the values as index labels)\n",
    "    strat_train_set_n = housing.iloc[train_index]\n",
    "    strat_test_set_n = housing.iloc[test_index]\n",
    "    strat_splits.append([strat_train_set_n, strat_test_set_n])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep the first of the generated splits as the train/test pair\n",
    "strat_train_set, strat_test_set = strat_splits[0]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "It's much shorter to get a single stratified split:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One 80/20 split stratified on income_cat, done in a single call; this\n",
    "# rebinds strat_train_set and strat_test_set\n",
    "strat_train_set, strat_test_set = train_test_split(\n",
    "    housing, test_size=0.2, stratify=housing[\"income_cat\"], random_state=42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "3    0.350533\n",
       "2    0.318798\n",
       "4    0.176357\n",
       "5    0.114341\n",
       "1    0.039971\n",
       "Name: income_cat, dtype: float64"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Fraction of the stratified test set that falls in each income category\n",
    "strat_test_set[\"income_cat\"].value_counts() / len(strat_test_set)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Overall %</th>\n",
       "      <th>Stratified %</th>\n",
       "      <th>Random %</th>\n",
       "      <th>Strat. Error %</th>\n",
       "      <th>Rand. Error %</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Income Category</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>3.98</td>\n",
       "      <td>4.00</td>\n",
       "      <td>4.24</td>\n",
       "      <td>0.36</td>\n",
       "      <td>6.45</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>31.88</td>\n",
       "      <td>31.88</td>\n",
       "      <td>30.74</td>\n",
       "      <td>-0.02</td>\n",
       "      <td>-3.59</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>35.06</td>\n",
       "      <td>35.05</td>\n",
       "      <td>34.52</td>\n",
       "      <td>-0.01</td>\n",
       "      <td>-1.53</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>17.63</td>\n",
       "      <td>17.64</td>\n",
       "      <td>18.41</td>\n",
       "      <td>0.03</td>\n",
       "      <td>4.42</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>11.44</td>\n",
       "      <td>11.43</td>\n",
       "      <td>12.09</td>\n",
       "      <td>-0.08</td>\n",
       "      <td>5.63</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                 Overall %  Stratified %  Random %  Strat. Error %  \\\n",
       "Income Category                                                      \n",
       "1                     3.98          4.00      4.24            0.36   \n",
       "2                    31.88         31.88     30.74           -0.02   \n",
       "3                    35.06         35.05     34.52           -0.01   \n",
       "4                    17.63         17.64     18.41            0.03   \n",
       "5                    11.44         11.43     12.09           -0.08   \n",
       "\n",
       "                 Rand. Error %  \n",
       "Income Category                 \n",
       "1                         6.45  \n",
       "2                        -3.59  \n",
       "3                        -1.53  \n",
       "4                         4.42  \n",
       "5                         5.63  "
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# extra code – computes the data for Figure 2–10\n",
    "\n",
    "def income_cat_proportions(data):\n",
    "    \"\"\"Return, per income category, the fraction of the rows of `data` in it.\"\"\"\n",
    "    return data[\"income_cat\"].value_counts() / len(data)\n",
    "\n",
    "# purely random 80/20 split (no stratification), used for comparison\n",
    "train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)\n",
    "\n",
    "# despite the \"%\" in the column names, the values stay fractions until the\n",
    "# final line multiplies everything by 100\n",
    "compare_props = pd.DataFrame({\n",
    "    \"Overall %\": income_cat_proportions(housing),\n",
    "    \"Stratified %\": income_cat_proportions(strat_test_set),\n",
    "    \"Random %\": income_cat_proportions(test_set),\n",
    "}).sort_index()\n",
    "compare_props.index.name = \"Income Category\"\n",
    "# relative deviation of each test set's proportions from the full dataset's\n",
    "compare_props[\"Strat. Error %\"] = (compare_props[\"Stratified %\"] /\n",
    "                                   compare_props[\"Overall %\"] - 1)\n",
    "compare_props[\"Rand. Error %\"] = (compare_props[\"Random %\"] /\n",
    "                                  compare_props[\"Overall %\"] - 1)\n",
    "(compare_props * 100).round(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Remove the income_cat column from both sets, modifying them in place\n",
    "# (presumably it was only needed for the stratified split -- confirm).\n",
    "# NOTE(review): not re-runnable as is; dropping the column a second time is\n",
    "# expected to raise KeyError, so restart and run all instead.\n",
    "for set_ in (strat_train_set, strat_test_set):\n",
    "    set_.drop(\"income_cat\", axis=1, inplace=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Discover and Visualize the Data to Gain Insights"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Explore a copy so that the training set itself is left untouched; note\n",
    "# that this rebinds `housing` to the training rows only\n",
    "housing = strat_train_set.copy()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Visualizing Geographical Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAaAAAAEQCAYAAAD2/KAsAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/MnkTPAAAACXBIWXMAAAsTAAALEwEAmpwYAABo/ElEQVR4nO2deXwV1d24nzNzl4QEAgYNSwDBiAgoUWkRQWRR64LYvlLaSmtbq7Z9tVZFoa2lirb9qWi1KrW1vl14a4uIb2WxiwsggoiiBiQUMYJCQFEiW0K4y8z5/TF3LneZu+YuITnP5xPlznLmzNzkfOe7CyklCoVCoVAUGq3YE1AoFApF50QJIIVCoVAUBSWAFAqFQlEUlABSKBQKRVFQAkihUCgURUEJIIVCoVAUBVexJ5ANPXv2lCeeeGJBrtXS0kJZWVlBrtVe6ezPoLPfP6hnAOoZvPnmm3ullMfncsxjUgCdeOKJrF+/viDXWrlyJePHjy/Itdornf0ZdPb7B/UMQD0DIcSHuR5TmeAUCoVCURSUAFIoFApFUVACSKFQKBRFoeACSAihCyHeFkIsC32eK4TYIoTYKIT4uxCie6HnpFAoFIrCUwwN6IfAfyI+vwAMl1KeDmwFflyEOSkUCoWiwBRUAAkhqoFLgSfsbVLK56WUwdDH14DqQs4pkqZmHxt27qep2VesKbSJY33+CoWic1HoMOyHgJlA1wT7rwaeKthsInjytQ+Zs2wzHl0QNCX3XXE6U2r7FmMqWbG4bhezntmIW9MImOYxN3+FQtH5EIXqBySEmAxcIqX8byHEeOBWKeXkiP23AyOB/5IOkxJCXAdcB1BVVXXWggULcja3z1r87NrfGrVNE4IhvbrSeriF8vLyqH2GKfEbJh5dQ9dE1tfN5ThbPj6EGfHY7Pm3ZVyb5uZmSruU5WSuxyLNzc1xvwOdDfUM1DOYMGHCm1LKkbkcs5Aa0BhgihDiEqAE6CaE+IuU8utCiG8Ck4FJTsIHQEr5OPA4wMiRI2VbEsIa9hyibud+avt1p0eZh9H/7yX8RvSjKPPq/PXcM9j3fl1U8llbNI2mZh+N+1qp7lHK6oa9OdNYNuzcz29fXschXzC8ravXxbyRw6go9VDdo5TKcm9WYwMs/ucLXL/iyDGrXUU+92yeQ2dPQAT1DEA9g3xQMAEkpfwxoQCDCA3o60KIi4BZwHlSysP5nsfPnn2H+a/tCH8+e2APAka8zAsYkuoepeyL2NbU7GPWMxs5EjA5ggnAzGc2MqamZ8qFLVJw+Q0TwzQJmmQ8jhPVPUoJmGbUtiNBg2vnr8ej620SGvbifSSgpzXXtiz2bRUUTmT7whA5F4VCkR/aQymeRwEv8IIQAuA1KeX38nGhhj2HooQPwGvb9zkee+uFg+MWwcZ9rbg1LbwQA7g1jcZ9rUkXTCfBFUs64ySistzLfVeczq1PbwDAlBLTlAQk+IKWVpStgGvc10qswS3RXNuiHebDh5XtC0PsXO49pz38mSgUHY+iJKJKKVfa/h8pZY2Usp+Usjb0kxfhA1C3c3/axw7pFR8n4aRpBEwz5VuyLbiSkc44yVj/wWf4DYnfkARNiFXqbKGRKdU9SonVD53mGrnYH/IFORIwmfnMxrQi8tpybjKcnnuq5+A0l8Z9rSqyUKHIA52qEkJtv+5pH3uwNRi3zdY0StwaXb0uStwa911xekqtwklwuXWB1yUyGicRTppdLNkKuMpyL9U9SlPeczaLfS7OTUaZR8cXNKK2pXoOTnMRoe0KhSK3dCrbQk1VV64a3Z/5a5Mv1gDdSp0fzZTavoyp6ZmRr6Ky3MvsS4cyZ2k9bl3DkFaYd6bjJCIdzW72pUOzvkZFqZs1s8YlnWu22mFbz02EbUbTNAGGxKsLhCZSCnqnucjQdoVCkVs6lQYEcNflp/HizeO487JTEx7j1gXD+lREbYtM8qws9zKiX/e0F/TFdbu4+7nN
eFwaAVMye/JQptT2zXicRKTS7Mq8OsP7ViQ9JhWp5pqtdpjpuekk20b5fgKWMJFCsOyGsSn9Sk5zyWVQhEKhOEqn0oBsaqq6UlPVlePKvMxYWEfQtN5y3ZpA0wRzp0Yvfm0NvbYXQ5u7l23momG9MlrUkkWIpdLs/MG2aRPpzisb7dAmnXPT/R6cgkW8ukaL34g7Np25vLN+bdr3oVAo0qdTCiCbyIWmzKPT4jfiFj/DlFmHXkP2kXOR2AuvLgQBw+SOy4Yx/ewBUcfcdflpXHX2ify7/mPuf35rVOBAZGpVLkOdEwmEbMetLPcmPDeTiLZcmPSSzUWhUOSGTi2AIPVC4zfMNgmQti6GThrU7c9uAgHTR0ULoZqqrrT4DX778raopNRSt4vGfa38a9PHzFm6GV0DU8LcqdmHOrclJyobMhHkthltZoxwTHde+chHUigU8XR6AeRE5ALk0bU2CZC2LoaN+1rRRXzpmzlLnc14iQTeum1N/PKfW6wNIUvUjKc3ZC0wcqHZZUKmgtzWbut3HwAEw/p0S+s6TlpdemcqFIpMUQIoBqckxEgB4jdMrh9fE3desrfmtvhGqnuUEjDik1fdukj77X/25KHcuWRT3BgBQ/J8/cd8LUaTSkcDyHXkWqprZiPIMy13lEirmzehJKt7UigUyVECKAKnBahxXytfPbcna2ZN5Ml1O5i34j0eX7WNeSsbwgtaOs7xTHwKsYvxHZcNs8xuERimTPn2b49Rv/sgGgLiUkrhx3/fxH8+Pshdl58GpK8BtFWziyTd4IJMBHk2JsJEWp3f4QVAoVC0HSWAInBagOwkxOoepfxmZQO+oIwqbzO0d7ec+kKcFuPpZw8AYZnd3LrACLWLSDa+LfAW1+1i5qKN+Bzq3dnMX7uDq84+kR5lnow0gLZodjaZCop0BXk2JsJEWp1H73TZCgpFQVACKIJkSYiJFrS6nftz5gtJthhPHzWAi4b1SrjYO5mw7PF8wdRv8HcsrmfmxUMy1gDaGi2WL19SNibCRFqdvv+9rOehUCgSowRQBPYCdNuiDehCw5Bm1ILutKDV9uueM19IqsU40WKfyITlNF4i1mxrIhA0Cq4BpCMosolKy9ZE6KTVrVypBJBCkQ+UbSEGy1AlLNtbRB3oRNn6NVVds64AEEs2b+3JCnk6jZeMD5oOO95LPhvQpaqCsLhuF2PuXc7Xn1jHmHuXs6RuV9pjT6nty5pZE/nLNaNYM2ti2iHnuapQoVAokqM0oBBNzT7qdx+0/CURJiu7EnJluTehzyPR9kzf3LN5a0+mNY3o1z0ugs8fNB1CESxq+3WnpqprwTWAZM+vrf41lVCqULRflAAionAlwtFfUr/7IOMGHw8kXtBit2dbvidTx34qrSm22sPFD7/i2IDvqtH9qanqmvQe85mg6XTNfPiHVJKpQtF+6PQCyKnSQCSmlFw7f31GVQPa+uaezlt75EKaSmuyx9uwcz8lLp2AcbRKgtel8cCXT2fyiL4Jx4f0BWouF/hc5xrlo+mdQqHInk4vgBr3tSLNxCHKAL6gmVHVAKfqBbmsEuC0kK6ZNTGr5FGQjD6pZ9Lxf3m2zk9eSi5QG/Yc4qEXt/LPTR9T4tYwJG1e4HOZa1To0kEKhSI1nT4IocyjJ82RsQkYMlTWJTWbdh2Iq7yczpt7pq0GIoMOgISOc3tcsISCK+JbNyWsadibdPzdB47g0pwFKsDPnn2H8x9cxbJ3PsaQ0OI329zV1J7zmJqeWQUSxJKvpncKhSJ7Or0GtOXjg2kf69QlNZamZh93P7c5bvst5w9O+qadyjxkm7YOtPoz8ovEjjt78lB0TSMY0oQChozSBBIl4/pjhLQtUJN1Y9UEWWl9+TCV5aPpnUKhaBudWgNaXLeLG/5Wl/bxibqkRuL0pg1w/wtbo0KII7WdZKHU9jztUORr56/nSBptppuafaza+ikzF0WPO2fp5qTaTKLQ7TsuG0qJW6PMq+Nxacye
bHVYTdaN1W8kLheUiFTPIlva0jBPoVDkh06rATXsOcRNC+oyOqdPRXbtpcFqCmdrGrFFMq8fX5NQqwGYuWgDvqAM73dpVvC
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Plain scatter plot of the coordinates, one point per row\n",
    "housing.plot(kind=\"scatter\", x=\"longitude\", y=\"latitude\", grid=True)\n",
    "save_fig(\"bad_visualization_plot\")  # extra code\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAaAAAAEQCAYAAAD2/KAsAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/MnkTPAAAACXBIWXMAAAsTAAALEwEAmpwYAACxSklEQVR4nOz9eZhcV3qfCb7n7rEvua/YNxIkQBIsssgSiyxKllylvV2S3S23ZLul8TqaGXsky26Pxt3W2B7bbftx99M9ctv9qNttl6WyZEmlKtUqFmshixtAAiRA7LmvkbFH3P3MHzcjkZnITGQCiaWI+z6iChmZcePEjczznW/7fUJKSUxMTExMzL1Gud8LiImJiYl5OIkNUExMTEzMfSE2QDExMTEx94XYAMXExMTE3BdiAxQTExMTc1+IDVBMTExMzH1Bu98LuB26u7vl3r1778lrNZtNUqnUPXmtB5WH/R487O8f4nsA8T14++23F6WUPbt5ze9LA7R3717eeuute/Jar7zyCi+++OI9ea0HlYf9Hjzs7x/iewDxPRBCjO32NeMQXExMTEzMfSE2QDExMTEx94XYAMXExMTE3BfuuQESQqhCiNNCiC8sf/1PhBAXhBDvCSF+TwiRv9driomJiYm599wPD+iXgfOrvv4qcFxK+ThwEfi1+7CmmJiYmJh7zD01QEKIYeAzwP/aeUxK+RUppb/85evA8L1c02r8IKTl+vhBeL+WcEd8v68/Jibm4eJel2H/C+BXgMwm3/+LwH+8Z6tZxWLd5t3JKgCmrvDYYJ5cUr8fS7ktKi2Xc9NVgkCiquL7bv0xMTEPH+JezQMSQvwo8Gkp5V8VQrwI/C0p5Y+u+v7fBU4BPy03WJQQ4peAXwLo6+t76nOf+9yurc0PJeWWiwAUITBUBQSkTY1Go0E6nb7pOaEERdz5a+/WdRqOjyIEQoCUEEpJ2tyd80XnHuzWWr/f2Ox34GEivgfxPXjppZfellKe2s1r3ksD9A+BPw/4gAVkgd+VUv6cEOLngb8MvCylbN3qWqdOnZJ30ohquz4N1ydtaGiqwisX57ky1yCXNAilJJCSwVyCjx/o4o3vfntN89mdeBp+EOIGIYaq0HD8XfNYWq7Pm9eWKKbMlceWmg5PjObRVAVDVdDU24+2fv0bf4I5evz71rtafd9v5z487A2IEN8DiO+BEGLXDdA9C8FJKX+N5QKDVR7QzwkhfgT4VeCT2zE+d8rYYoMvvT+L54fomsKz+wpcn29wZb6BoSskTZ20qTKYS0Se0Cr8IOTcdBVLUzEtFccPODtd4dl9Xbfc2FYbLgG0/YCulLnj62yEoSqoqsDxA0wtup7tB5yeqIDkjoyGH4S0vYDcNt/znWz2d2ooNuJ2Dwyr1xITE3N3eBCkeP5HwAS+KoQAeF1K+ZfvxgvZrs+X3p8lbWqkMjofzlT4259/j5rj44eS7pRJd9qkO2Pys0+P3LQJukFIEEhMSwXA1FSajo8bhFtumOsNV9V2uTLVpO9gYkfX2QxNjXJW74wtMdFuYigKQhWkDG3FIN2ugXOXCxpM7dbv+U68w7uRw7rdA8P6tQRhPLY+JuZucF8MkJTyFeCV5X8fvFev23B9PD8kldFpex5vjJcptz0E4PshczUbQxMc7EuT0NWbnr+Rp6Gq4pan5PWGK21oSCFpOB65hLHt62xFpeXwzniZiXLkRHZnTJ7f34OWUu7IwHXWdKv3fCfe4Z08dytu58Cw0VoqXoB/m4eDmJiYzXmo/qLShoauKTRdj0bbp1R3MVQVIQRZy0BRo/+tND388OZS5o6nYfsBS00H2w94bDB/y41pteGCqOjhUG+GQModXWczbNfnj87O0HB8Rgop8pbGW9fL/PH707w7WaHUdG7bwGmqQkJXb/meVzb7VZ5SEMgVD2or7uS5W6EAXhiVpgPbMvQbraXzeExMzO7yIITg7hmWofGZ44P80blpyi0H
pKSYMqjZPq4fIEOJpigM5C00ZeNNKpfUeXZf145yFZqqcKw/y+mJMgoCQ1d4Zl8XKVPdlZxHw/WxvQBdVVGFoOGEFBIGoRQ0bY8LszU++9TNIcXtoirilu/5dr3DO33uZnTCaK4nOT1fZrSQJJvUb2noN1pL5/GYmJjd5aEyQAAjXUl+4eN7qbQ9+rIJXrkwT6XtIyXsKSY5MZLj2GCOpLH21qxPkO9kM6+0XM7P1hAIJHCsP7uS39iNsE7a0LB0lbrtYIuowMHUFfZ1J3h8JIfjSRLGzSHFnXCr99zxDs9OV2g6/koeZ7sGervP3U6hwuowWq5o0JXWaTgBp0YLWMbWv/IbrSWhq3H4LSbmLvDQGSCIPKF+Q+O/eHKEkWKSqaUmCw2Xvd1p+jImJ0YKazacOy29XtkMrSjfc362tuP8xlYbr2Vo/NjjQ/yn0xOMl5osNR36MwkUoXBxrsFwPnnXTvCr13U73mGH7Tx3u5/D+txP0tCxvZDtBtHWr+Xbkw9h81NMzD3goTRAHXJJnZeO9OIGIQoQwoab350kyG+3cm41N8JJISGSJ0YKdKXNNT8z0pXkL79wgMWmw/eulCi3PTRFIZQSyY0qrt0sdd7MINzudbfysnZSqLAbIb2derkxMTE756E2QHDrjSaU3JEBudPNsLPxen7IdMWm7flcnm/w2adGbjJClqFRBLozFgd6s3hBiK4qVNsubhBSabmcHi/jh5KUoXFipHDbpc53q3JtM3ZiyO8kHNh5b7vdjxQTE3MzD70B2ojVG5AiuCMDcqeboRuEuF5kfHRVIWVaLDZszkxU+OThnk1P/34YYuk31luzXf7D98ZZqLuoimAgZ2H7IZ862ntbm+xueHY7YaeGvBNG61TAJbeZA9vIq4uJibk7xAZoHRs1IT6xyoAI4FD/zVqqW52a7yQ3YqgKIZK255MyLdwgwDJulAZv5/R/rD/L22NLLNRdulImoZRMVdvUWi4nR7L0ZpPbfi+r17WblWu3es3bMeQ7lTvazKuLiYm5O8QGaBWbNSGmTJVn93WxUHe4OF/n4mydK2pjZUPbTnJ8JzmF9ZvxEyMFLs83WGzYWIbKaD6Fqm1eGrze4LVcn5bjI4Sg7fqMl1tcW2ygqgKE4L96Zi8jXZER2q4HcKee3Wq2W1ywE0N+OyHCzby6WAghJubuEBugVWy0AXUeN1SFK4uNm+RtTo0WdjUXstFm3JU2+exTI5yZqACgatxys+8YvErL5b2JChOVNo7rM11xmC7bqEIwXEiw1HT5z6cn+cUX9qOpyo48gDvx7Drs1FBs15DfTohwM69OxkVwMTF3hTjDuor1igWrmxA369ZvuP6udfGv3oyLKRNLUzk7XcEPQrrSJp883MPHD3Tx7L6umzyEjYbRda6XNDVOjXYxVExQbXvYfoiuqkwu2Vyca/Ddq4t8OFvf9D1u5QFoqkJyWVX8drhbKggbfZa3ChFupnQRExNzd4g9oFV0NqAzk2WWGg6moa5pQtzodJw2tF3Lhdzq1L7Z6X+zENbq65kaPLe/m2vzTZKGQ7nlk9BUvDAkZRh86/IiB3pS99wD2E4u6Xaq0m43RLgbXl1MTMz2iP+61iGRCAlSgFh18t/sdGwZ2m3pw23E7Zzat/Ka1l/Pk5K9PSmO9mVp2tHIBkNTeXK0iJQSOwjvuQdwK329Ssvl9Wsl3ry2xOvXSlRb3rav3TEmT+8rbug1brWmO/HqYmJitkfsAS3TCWG9N1EhaWoUUiaOHzC7Sgl5s9PxZo/v9OR+O6f2rbym5LJxXF3Bt6cryULNZrQrgSYEw8UUaUul5S1L+hjaPfcAtrp/d5pfixtKY2IeXGIDxI0QVsv2ubTQ4MmRIqbGipJAy/XJJgxg8w1t/eO3K9+z0xDQrUJYq6+nAK9eWqRq+zw5WuTsZIVK26XhWPzY44MrOmmbvce72aC50WvejV6juMk0JubB4aE3QJ1Ttq4oZBMGvif5ygdTHB/I
sdjyKHghb18v8+Se4m3pv93OyX07p/bVG+mtvKbO9Vquj6kpPDlSwAtDPn6gyFzN5hOHuimmrE2vD9s3qLu5we92r9HdGHo
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Same scatter plot with alpha=0.2: points are mostly transparent, so areas\n",
    "# where many points overlap show up darker\n",
    "housing.plot(kind=\"scatter\", x=\"longitude\", y=\"latitude\", grid=True, alpha=0.2)\n",
    "save_fig(\"better_visualization_plot\")  # extra code\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAArMAAAHoCAYAAABaRmeyAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/MnkTPAAAACXBIWXMAAAsTAAALEwEAmpwYAAEAAElEQVR4nOzdd3hURRfA4d9szaaHklBCr9KkCSJIVcACKjZsoGIXsRfsoljAggUV/bBSFStYEJQqSFOQJr0mISEkpG3fne+P2ZCEJKSQbALM+zxrNnfvnTt7E+LZuWfOCCklmqZpmqZpmnYqMlR1BzRN0zRN0zStvHQwq2mapmmapp2ydDCraZqmaZqmnbJ0MKtpmqZpmqadsnQwq2mapmmapp2ydDCraZqmaZqmnbJ0MKtpmqZpmnaaEELsFUJsFEKsF0KsDWyrIYRYIITYEfgak2//sUKInUKIbUKIQfm2dwm0s1MI8Y4QQgS2W4UQswPbVwkhGuc7ZmTgHDuEECOD9Z51MKtpmqZpmnZ66Sel7Cil7Br4/gngdyllC+D3wPcIIdoAw4G2wGDgfSGEMXDMB8AdQIvAY3Bg+yggXUrZHHgLeC3QVg3gOaA70A14Ln/QXJl0MKtpmqZpmnZ6uwz4PPD8c+DyfNtnSSldUso9wE6gmxCiLhAppVwp1epaXxx3TG5bc4ABgVHbQcACKWWalDIdWEBeAFypdDCraZqmaZp2+pDAb0KIdUKIOwLb4qSUSQCBr7GB7fWBA/mOPRjYVj/w/PjtBY6RUnqBDKDmCdqqdKZgnKSi1apVSzZu3Liqu1GsnJwcwsLCqrobZzT9M6ha+vpXLX39q5a+/lUrJyeH//77L1VKWbuq+tBcCGmvhHaTYDPgzLfpIynlR8ft1lNKmSiEiAUWCCH+O0GTooht8gTby3tMpTolg9nGjRuzdu3aqu5GsRYvXkzfvn2ruhtnNP0zqFr6+lctff2rlr7+VWvx4sX069dvX1X2wQ7cWQntPg/OfHmwRZJSJga+pgghvkPlryYLIepKKZMCKQQpgd0PAg3yHR4PJAa2xxexPf8xB4UQJiAKSAts73vcMYvL+BbLRacZaJqmaZqmVSCBGi2s6EeJ5xUiTAgRkfscGAhsAn4EcqsLjAR+CDz/ERgeqFDQBDXRa3UgFSFLCHFuIB92xHHH5LZ1FfBHIK92PjBQCBETmPg1MLCt0p2SI7OapmmapmnVlQDMVXPqOOC7QBUtEzBDSvmrEGIN8JUQYhSwH7gaQEq5WQjxFbAF8AL3Sil9gbbuBj4DbMAvgQfAVOBLIcRO1Ijs8EBbaUKIF4E1gf3GSSnTKvPN5tLBrKZpmqZp2mlASrkbOLuI7UeAAcUcMx4YX8T2tUC7IrY7CQTDRbz2CfBJ2Xp98nQwq2mapmmaVoFy0wy04NA5s5qmaZqmadopS39w0DRN0055mZmZpKSk4PF4qrorREVFsXXr1qruxmnLbDYTGxtLZGRkVXelWFWYM3tG0sGspmmadkrLzMwkOTmZ+vXrY7PZCEx+qTJZWVlERERUaR9OV1JKHA4HCQkJANU6oNWCRwezmqZp2iktJSWF+vXrExoaWtVd0SqZEILQ0FDq169PYmJitQ1mdc5scOlrrWmapp3SPB4PNputqruhBZHNZqsWKSXF0WkGwaUngGmapmmnvKpOLdCCS/+8tfz0yKymaZqmaVoF0mkGwaVHZjVN0zTtDPbZZ58RHh5+0u0sXrwYIQSpqakV0CtNK72gB7NCCKMQ4h8hxLzA9xOFEP8JIf4VQnwnhIgOdp80TdM0DUC63Ui3u6q7Ue01btyY119/vcC28847j6SkJGrWrFlFvao+cnNmK/qhFa0qRmbvB/IX4FsAtJNSdgC2A2OroE+apmnaGUpKiX3GDJLbtSPRZiPRZiO5TRvs06Yhpazq7p0yLBYLderU0fms5KUZVPRDK1pQg1khRDxwCfC/3G1S
yt+klN7At38B8cHsk6ZpmnbmklKSfuutHL3jDrybN4PfD34/3q1bOXrXXaSPGFFpAW3fvn256667uP/++4mJiSEmJoZHH30Uv98PQHp6OiNHjiQmJgabzcYFF1zA5s2bjx2fmx4wd+5cWrZsSUhICP369WP37t3H9nn++edp165dgfOWlFawa9cuLrvsMurUqUNYWBidO3dm3rx5Bfq9b98+Hn30UYQQx4LXotIMvv32W9q3b4/VaqVBgwaMHz++wPVs3LgxL730EnfeeSeRkZHEx8czceLEcl5R7UwV7JHZScBjgL+Y128Ffglab8opIR2mLIZ560F/aC/a0aM+7PbifsyapmnVg3POHJxff43MySn0mszJwfnddzhmz66080+fPh2/38/KlSuZMmUKH330EZMmTQLg5ptvZtWqVfzwww+sXr2a0NBQBg8ejMPhOHa8y+XihRde4NNPP2XlypX4fD6uuOKKkwrAs7Ozueiii1iwYAEbNmzgyiuvZNiwYfz333+AClDj4+N59tlnSUpKIikpqch21q1bx9VXX82wYcPYuHEjr776Kq+88grvvfdegf3eeust2rdvz99//83jjz/OY489xsqVK8vd/+pApxkEV9BGrYUQlwIpUsp1Qoi+Rbz+FOAFphdz/B3AHQBxcXEsXry40vp6Ih4fbE5UQeyO/TBtNzSoUXCf7OzsKutfdXDokI/ERDXY3ry5mcjI4GeznOk/g6qmr3/VOtOuf1RUFFlZWeU6Nvull4oMZHPJnBwyXnoJ3yWXlLpNn89Xqv74fD7i4uIYP348Qgjq16/PmDFjeOONN+jXrx8//vgjv/zyC506dQLg/fffp23btkydOpWRI0fidDrxer288sordOjQAYAPPviADh06MHfuXPr164fL5cLv9xfoj9PpBDi27fjvmzZtStOmTY/tP2bMGL7//numT5/OY489htlsxmAwYLFYCAsLO3as3W4H1O+f1Wrltddeo1evXjzyyCMADB06lE2bNvHqq69y8803q+srJf369WPkyJGACuAnTZrEzz//XGhE+XhOp7PI3/Ps7OwSr712eglmCkZPYKgQ4mIgBIgUQkyTUt4ohBgJXAoMkMV8nJRSfgR8BNC1a1fZt2/fIHW7oBtehhnbAKP63mIE18cF91m8eDFV1b+q5vdLrNadeAOJI+3aWdi4sVHQ+3Em/wyqA339q9aZdv23bt1aruVjpZRk5rttXxz/1q2Eh4UhDKX7YF7a5WyNRiPnnXdegVWs+vbty0svvcSBAwcwGAwMGDAAs1mNyUVERNC+fXt2795NREQEISEhGAwG+vbte2yftm3bUq9ePfbu3UtERARWqxWDwVCgPyEhIcfaK+r7nJwcXnjhBebNm0dSUhIejwen00mnTp2O7SOEwGq1Fmg3dwW28PBwIiIi2LlzJ5dcckmBfQYMGMCrr76KlJLIyEiEEHTp0qXAPvHx8WRkZJR4DUNCQo4F+vlVhw9yetGE4ApaMCulHEtgcldgZPaRQCA7GHgc6COltAerP+WRlgZffQGcw7FgNrocqyd6fWAyVmTPqg+DQVCzppGUFB8mEzRqVLUp63a7D78fwsNP0wuuaVpwBHlS04nSBMoywcpgMBRqq6SVsx555BF+/fVXXn/9dVq0aEFoaCgjRozAXcYqD1LKYvuaf3tuIJ7/tdy84VOZnrAVPNWhzux7QASwQAixXgjxYVV3qDgeDxj2AQcACSY/fHNv2doY/T+wXAetxsCR8t0VK7WjRyXPPefh0Ufd7NsXvD8Mf/xRn4svDmX48Ag++6xO0M57vFGjdhAV9RcxMX9x+eVbcbtP/T+OmqZVHCEElm7dStzP3KVLpc3QX7VqVYFg86+//qJevXq0adPmWC5trszMTDZu3EibNm2ObfP7/axZs+bY9/v37ycxMZGzzjoLgNq1a5OcnFzgHOvXrz9hn5YvX86IESO48sor6dChA/Hx8ezatavAPhaLBZ/P
d8J22rRpw/Llywu1HR8fX66RdE0rTpUEs1LKxVLKSwPPm0spG0gpOwYed1VFn/L7axN0Ggltroe5+f4dxsXBtbcDh8G4Ea5
      "text/plain": [
       "<Figure size 720x504 with 2 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Marker size (s) is proportional to the population and marker color (c)\n",
    "# encodes the median house value through the \"jet\" colormap\n",
    "housing.plot(kind=\"scatter\", x=\"longitude\", y=\"latitude\", grid=True,\n",
    "             s=housing[\"population\"] / 100, label=\"population\",\n",
    "             c=\"median_house_value\", cmap=\"jet\", colorbar=True,\n",
    "             legend=True, sharex=False, figsize=(10, 7))\n",
    "save_fig(\"housing_prices_scatterplot\")  # extra code\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The argument `sharex=False` fixes a display bug: without it, the x-axis values and label are not displayed (see: https://github.com/pandas-dev/pandas/issues/10611)."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The next cell generates the first figure in the chapter (this code is not in the book). It's just a beautified version of the previous figure, with an image of California added in the background, nicer label names and no grid."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAosAAAHoCAYAAAAhYqV+AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/MnkTPAAAACXBIWXMAAAsTAAALEwEAmpwYAAEAAElEQVR4nOyddZgkxdnAf9Xd47Kut3u754YczuHuwV1CIB8WQiBIgISgIYFAcA6HBNfg7hJcDs5d1313fKa7vj9m1nXWbqV/z7PP7s50VVdVV1e//dYrQkqJiYmJiYmJiYmJSVcom7sBJiYmJiYmJiYmIxdTWDQxMTExMTExMekWU1g0MTExMTExMTHpFlNYNDExMTExMTEx6RZTWDQxMTExMTExMekWU1g0MTExMTExMTHpFlNYNDExMTExMTHpA0KIdUKIhUKIBUKI7xOfpQsh3hdCrEz8Tmtz/JVCiFVCiOVCiAPbfL5dop5VQoi7hBAi8blNCPFc4vNvhBDFbcqcnjjHSiHE6cPYbVNYNDExMTExMTFJgr2llHOllNsn/r8C+FBKOQ34MPE/QojZwInAHOAgYL4QQk2UuQ84G5iW+Dko8flvgTop5VTgduDmRF3pwDXATsCOwDVthdKhZtiFRSGEKoT4SQjxRuL/W4QQy4QQvwghXhZCpA53m0xMTExMTExM+skRwH8Sf/8HOLLN589KKcNSyrXAKmBHIUQe4JVSfiXjmVEe71Cmua4XgX0TWscDgfellLVSyjrgfVoFzCFnc2gWLwSWtvn/fWALKeVWwArgys3QJhMTExMTExOT3pDAe0KIH4QQZyc+y5FSlgEkfmcnPp8AbGxTdlPiswmJvzt+3q6MlDIGNAAZPdQ1LGjDdSIAIUQBcChwI3AxgJTyvTaHfA0c21s9ntQMmZlX2OnzaFTHThivTR+cBndANyQIUOOmBUNOVDewqCPfUiBqSDQBYpjGZSQS0Q2so+BaDSdjdUzGWr+klNTX+7HbrTgc1s3dnHHD5p5HMUOi9mPd3lBaRXVd42Zf7KcKIQNDUG8ZLAZCbT56UEr5YJv/d5VSlgohsoH3hRDLeqiuq3GSPXze3zJDzrAKi8AdwJ8ATzffnwk811slmXmFXPvoe11+t2hRKXnWBvLdEQ6c0oBdG5yxNKSktDFEnseOqgzPfbKpIUiux442TOfrL+VNYVIdGnZN7f3gMcq6ugBFqY5xLTC3RUrJ+vogxWnOzd2UQWdjfZB87/CtA0NNOBzl1Te+Z+aMfLbaomhzN2dcMBLujwpfmBR78uv2ridcMUQtSo4AcM4Q1HsthNrYInZCSlma+F0phHiZuP1ghRAiT0pZlthirkwcvgloq9kqAEoTnxd08XnbMpuEEBqQAtQmPt+rQ5lP+tHFfjFswqIQ4jCgUkr5gxBiry6+/wsQA57qpvzZxI1Bycgp6OoQAObMyUPKPOqCEZ5fuYFDJlaQ6WqvaZQSHlgwhY825LBHQSW/23YVva37AshwWqkJRMh221rromtxf6DU1DRRVlaP36Z127ac7BS83vaLjZSSNWsrMQwjqfOVVtpZsjqFuTPryEyLJFW2KRyjzqIOWKidWJiJzWYZUB2bg2F7tTMx2YwM1VpnMnoYSWudYPi1XUIIF6BIKZsSfx8AXA+8BpwO3JT4/WqiyGvA00KI24B84o4s30opdSFEkxBiZ+Ab4NfA3W3KnA58RXyn9SMppRRCvAv8vY1TywEMo9necI71rsDhQohDADvgFUI8KaU8NeECfhiwb8LYsxMJNfCDAJNmze12zgohEAKcLhthbRJPLteY0fghVtmqVX6lYh5Plm1BWFp5f20WP6+o4cTcz3rtgCHjW9EWdeiXzJgU6BKqejhm7YYaVNF5KCKGSOqmLqnwcvlt+yGQKIrkjiveIC0lmHSbB8r6jdWj9mEUjhms1sbO1uRgMFbHJKwbLFeVUTtXO2Ikltz166uoqmrc
zK0ZP2zu+yOqS1RF9Koo6UhT0/A/G7pCAJtBtZADvJzYQdKAp6WU7wghvgOeF0L8FtgAHAcgpVwshHgeWEJcGXa+lLJZe3Ue8G/AAbyd+AF4BHhCCLGKuEbxxERdtUKIG4DvEsddL6WsHcrOtmXYhEUp5ZUkpOCEZvHShKB4EHA5sKeUg2uCYLNp1IRsZEwqwmONAaDrgpf+txdhZ9w2J2xYWRDemotnlgzmqdsRaPKhaho2h73PZXKL88iZmNvjMRtXbKC2vKb9hwK23GVrlCRsYe68W8WQGpGIwOOWuKf8iiOPHRq7z55Y8eMygr7BXYjqqmpIyUxDEWNPaOkri79eiNNppbAgY9jPHdENDAn2MSg0jiUyM1OIGgYOy/g1JTHpmpWrytA0lUnFcZ8NbRzPESnlGmDrLj6vAfbtpsyNxP00On7+PbBFF5+HSAibXXz3KPBocq0eHIZbi9sV9wA24oaiAF9LKc8dzBOkTZ3NnGIVIQTvve/Gv94WVwZroCHZd65g7p7b9rm+aAwsvY1cYs9G13Vqyipxely4U7wD6UYnJs4oYuKMgdsY7TJPoqpgsUgMCdttKzeL7d2UrWeiaYN73jWLllM0cwqqNhKm+vAjpWTpd4vxeBxsOWfisJ+/MRxDNyRpjtFnXjCeCER0/FGdLJfp4GLSnnXrqnA4rC3rh8M+MubI5tiGHs9sltd9KeUnUsrDEn9PlVIWJgJczh1sQRFgTXmrlszpNLAslrACRINkW2+AfxxV3qd6DAOO/FcR9tO3YN7VUwhGehBsEl8FfQF0Xe9VUGy7bSwlvPhfF9den85339u6LTNY7LiD5IN3I/ztuhj/+yzClCmDb5nSU42rVxvM2qIJu7uJ7LwmPv4kNujnT7ZNQ1Own3VsbkOhzX3+8cR4Gevx0k8TkzHCmBXM/UHBC2+4WL1uCrOzaxCiCYBddwlwym/qefr9VPJcUf5xWTl2S99Wrl822PlwsRsQLNlk44NFbn61bVP3bWhsIhIKY7P3LvC1FTv/eWsqN/49nUBQcOttqXz4Xgk77RjuUxv7y847SXbeaei2nnvSFx50qJ+16yRSQk2t5MijA6xc5iY7e2jfZfqtw2xTsN9G/8kU6mdDB+15nMT5x5oTxLD3ZwQPns9wUEcWMTn6tiHNedk3NKGTRhVuZWTYJfbEZrJZHLeMWWHxzkdSWLPBQkwX/P5OJwXZ69lvez93vZjBkwvTiGQIVgdt/OrPxZx5cB13XljWa535aVEEEiEkuhRMyW7vNSylJOQPUFsZtyO0Ox1YbVY8aSlJtf3Jpz34A3FBKRiE199w9SgsDsbC0VUdlZWClasUtpijk5JcF/pMU5Nkw8a4oNiMqsLiJcaQC4uDwWAv2IYhCQbB5Rp4zYLhV+CMpQcyjL3+9Bef4aBWyadgQj52m8UMETUGkVISCkfZVGIBo3RUCIwmw8fIfxr3k1Xr44IiQDCs8OkCF2U1GlfcmUuwWkGPCiSCQEjl0bfS+W6Zo9c6s1N0Pr9mDVcfXck7V6xldkGrACelpKGmDl+jj5yJ+eRMzCc9J7NXQfGXhYLDj7Rw/EkWNmyIf7bVlhGs1njoG6dTMmdOz6FsBmPZ7ljH198oTJ3p4rAjHEyb6WLt2qF5OLhcYO1gAhOJwoT8vp0vFpOsX28QCIz+fa2mJsn02THSsmLccdfwOxiZmHRHHVkUTMjHYbeaguIYRQiBw26lYEI+dWRt7ub0SrPN4mD/mHTNmBUWiUCLusqAhV/bmTJ5BpHvBCwCfqQlRnsoInj18745n2xdFOLqoyvZbUar47ah61RuKkNRFLLyc9A0DU3Tel1UQyHYez8rb76t8PIrCvsdFJea7runkiOP8DNjRoTLLqnjxON9SXZ+4FxzrQ2/X9DYKKirF9x979AYNSuK4Il/23E6IcULTidccpGV6dN73+ratMlg2iwfW8z1kTOhiXfeHbitY22tZK99/HjSGjnwED9NTcMnhC74
WVJRAboODz+SXJzMjsy/38I++zv45NPRt2VoMvKISRX7KIyBapI8dptlVJgaNG9DD/aPSdeMSUHaMMDYCGQJsALVgje+9SJ
      "text/plain": [
       "<Figure size 720x504 with 2 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "# extra code – this cell generates the first figure in the chapter\n",
    "\n",
    "# Download the California image\n",
    "# (skipped when the file is already present in IMAGES_PATH)\n",
    "filename = \"california.png\"\n",
    "if not (IMAGES_PATH / filename).is_file():\n",
    "    homl3_root = \"https://github.com/ageron/handson-ml3/raw/main/\"\n",
    "    url = homl3_root + \"images/end_to_end_project/\" + filename\n",
    "    print(\"Downloading\", filename)\n",
    "    urllib.request.urlretrieve(url, IMAGES_PATH / filename)\n",
    "\n",
    "# Work on a renamed copy of the columns used by the plot, so the plot is\n",
    "# built from the capitalized, human-readable names\n",
    "housing_renamed = housing.rename(columns={\n",
    "    \"latitude\": \"Latitude\", \"longitude\": \"Longitude\",\n",
    "    \"population\": \"Population\",\n",
    "    \"median_house_value\": \"Median house value (ᴜsᴅ)\"})\n",
    "housing_renamed.plot(\n",
    "             kind=\"scatter\", x=\"Longitude\", y=\"Latitude\",\n",
    "             s=housing_renamed[\"Population\"] / 100, label=\"Population\",\n",
    "             c=\"Median house value (ᴜsᴅ)\", cmap=\"jet\", colorbar=True,\n",
    "             legend=True, sharex=False, figsize=(10, 7))\n",
    "\n",
    "california_img = plt.imread(IMAGES_PATH / filename)\n",
    "# Plot limits as (xmin, xmax, ymin, ymax), i.e. longitude bounds then latitude\n",
    "# bounds; the same rectangle is passed as the image extent so that the map\n",
    "# is drawn over exactly the plotted area\n",
    "axis = -124.55, -113.95, 32.45, 42.05\n",
    "plt.axis(axis)\n",
    "plt.imshow(california_img, extent=axis)\n",
    "\n",
    "save_fig(\"california_housing_prices_plot\")\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Looking for Correlations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pairwise correlations between the numeric columns only. Recent pandas\n",
    "# versions raise an error when corr() meets a non-numeric column instead of\n",
    "# silently skipping it; selecting the numeric columns first behaves the same\n",
    "# on old and new pandas versions.\n",
    "corr_matrix = housing.select_dtypes(include=\"number\").corr()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "median_house_value    1.000000\n",
       "median_income         0.688380\n",
       "total_rooms           0.137455\n",
       "housing_median_age    0.102175\n",
       "households            0.071426\n",
       "total_bedrooms        0.054635\n",
       "population           -0.020153\n",
       "longitude            -0.050859\n",
       "latitude             -0.139584\n",
       "Name: median_house_value, dtype: float64"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Correlation of each attribute with median_house_value, from the most\n",
    "# positive to the most negative\n",
    "corr_matrix[\"median_house_value\"].sort_values(ascending=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAA1AAAAJECAYAAAAYK8UIAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/MnkTPAAAACXBIWXMAAAsTAAALEwEAmpwYAAEAAElEQVR4nOz9d7yl11mfjV9P3b2d3svMmd6lUZdlWZLlhowLprhhIEAIEAh5CeH9ERJIIQkhL4GEgCmmuGPjXmTJ6nU0vffT+z67t6eu3x9rnzPnzIzkGUuyZfm5Pp+Z2bPLs5/d1lr3uu/7+1WEEAQEBAQEBAQEBAQEBAR8Z9Tv9wkEBAQEBAQEBAQEBAT8oBAEUAEBAQEBAQEBAQEBAddIEEAFBAQEBAQEBAQEBARcI0EAFRAQEBAQEBAQEBAQcI0EAVRAQEBAQEBAQEBAQMA1EgRQAQEBAQEBAQEBAQEB10gQQAUEBAQEBAQEBAQEBFwjQQAVEBAQEBAQEBAQEBBwjQQBVEBAQEBAQEBAQEBAwDUSBFABAQEBAQEBAQEBAQHXyHUHUIqi7FAU5X8rivINRVG6m9e9S1GUPa/86QUEBAQEBAQEBAQEBLx2uK4ASlGU+4EXgF7gHiDSvGk98O9f2VMLCAgICAgICAgICAh4bXG9Gaj/CPyGEOLdgL3q+seAm1+pkwoICAgICAgICAgICHgtcr0B1Dbg61e5Pge0vPzTCQgICAgICAgICAgIeO1yvQFUHlm+dzk3AFMv/3QCAgICAgICAgICAgJeu1xvAPVJ4A8VRekDBKArivJG4H8Af/9Kn1xAQEBAQEBAQEBAQMBrCUUIce13VhQD+FvgJwEF8Jv/fhL4iBDCexXOMSAgICAgICAgICAg4DXBdQVQKw9SlPXAHmQG65AQ4twrfWIBAQEBAQEBAQEBAQGvNb6rACogICAgICAgICAgIOCHEf167qwoyp+81O1CiH/58k4nICAgICAgICAgICDgtct1BVDAjsv+bwCbm8c5+IqcUUBAQEBAQEBAQEBAwGuU6wqghBBvuvw6RVHCwF8DT75SJxUQEBAQEBAQEBAQEPBa5BXpgVIUZSvwoBCi/+WfUkBAQEBAQEBAQEBAwGuT6/WBejHagfgrdKyAgICAgICAgICAgIDXJNcrIvEbl18FdAMfAL7+Sp1UQEBAQEBAQEBAQEDAa5HrNdIdvewqH1gEHgH+QAhRfgXPLSAgICAgICAgICAg4DXFD7UPVFtbmxgaGvp+n0ZAwMviwIEDWSFE+yt93OD3EfB6IPh9BAS8OMHvIyDgxXmp38f1ypi/rhgaGmL//v3f79MICHhZKIoy/mocd/n3IYTgwRNzTOXr3L2pg5GOOA8en+OLh6cZao3yc3eu48tHZ/jqkRkyMYN/dd8mtvemABhfqvLZ/ZMcGi9QqtuULZftvSl+5U0b+INvnOLAeJ54SOPGwQxPnM3iC0EibKAqgnzFpuG/Gq/s9U1Ig7ChUnyF3jxTUxhuj7GlM8EzF5dYLNuoCnQmTXrTMRYrFqW6TcPx8YVAAAoKg20RNnQksBzBbLHObKFO3fGIh3USYR3bFagKREyNquXiC9jWk2KwNcJUvkGhZrOpM8EDu3u5cTDDx54a5aFTcxTrLoMtUbb0JOlMhHnn7h7OL1T4kT99auWcf+ue9fzS/ZuB783v46b/+C2yNffVeJqXRFMgrKu4vsAX0Boz6UyFKNQd9vRn+Lk7hvnPXz/FdKFOa9ykLR4iW7GZyNVoi5nsGUpzbq7CQqlBueHSlQrzxz+5my3dqZd1Xo7n89WjM+SrDm/Z3kVvOrJyW932+MqRGRquR0vUZK7U4KahFnb1p1/muxHw3fBq/z4u58hkgRfGcmzsTLClO8nXj80SMTR2DSR44E+ewfEE92/tYKFsc2KmyEBLlD/5qT386SPnSYR1/tmdg3zob/ZTtVz+xd0jnJgp8vxojhsGMrx5
awd/+OBZEiGdP/2p3fzqpw5Ttlx+8y0b+cwLU5yaLbG7P82Hbhvijx86S2vc5NfvWcfP/N1BHF/w6/eN8I/7p5nK19jWk2K4LcqXDs8Q0jV+8Y4B/r/HZBHW7etaeGEsj+MLUmENTVXJ1RwAfvHWfv7iuUkA3r2riy8dncMXkArraJpCrirvd9tgnGfHKwBs64pxYq4KSGGCsKFSc+T4vaND59iCHFs2ZlTO5uX1hgoo4HjyfR1JqZwvytv2dJkcmrMBiBoqluvjNXMl2ztjHJ+Xz/XGoQiPj9VXnnf1jBHRod4c0oZjMFq9+uccUsFqPvDW/ijPTdbk598SYbbYwPIEhgojHTFONV/j+2/q4ZMvzADQmwoxX7ZwfdBVGGyJciErj/HLd/bzf56S7+XewRQHxosI5Lh3/7YOvnl8AV1T+LOf2sWvf/YYri/4zTdv5H88dJaG69MSM7hzfRtfOzZL2ND41/eN8PtfPwPAj+3pxkPl+HSRd+7q4Y8eOrvymjYD3/yv7wBe+vfxHTNQ38k8dzU/aEa6e/fuFUEAFfCDjqIoB4QQe1/p4y7/PpYqFn//rBxDetMR3ntjH7/1uaNM5msYmsK79/TxyOl5Dk8WiId03rKti9+4fxMAXzg0xVePzHB6rsxS1cZQVVIRg/fc0MvfPztGqeGiqQoq4AmxMoj6Qv4JeG0QNzUipsZS1V75XHQFQoaKAtRsf83kqwCGBgOtMeq2R6G2HGCBrioIZGDm+IKQpmJ7HoamETU1hlpjzJcbeL6gOxXh3i2d/Pwbhvnpj+1jYqlGqeHQmQgx0BJjW2+Kt2zrYt/oEv/hKyfXnPPYpQnwVf19TC5VecMfPvZKH/67QlUgamooQH8mykhnjP1jBfI1G1NXiZsauZqDLwQKCj3pCMW6Q6kur4uaGu+/ZYDffvvW7/hctutjaAqKolxx2/hSlX86OA3Apq4Eb9/RvXLbiZki3zoxj+cLpvI1BltjcmH8hnWv2PsQcO282r+Py/mrJy9SbsiV+fa+JMenSgAcmsjxxLklAEKagusLPCG/0z975yDHpmSHSCaq88jpRQA6k2HKDRfP91EUheG2GBcXZVCyvTfF8ekiACMdCU7OlhBCoKoKt69v48ycfN6WmMmJGXm5LaaTrcpzUxUFQ1OoNwMZDfBe6Tcp4LsmpClYzaiwNWaSr8mAMRNRWapdmo0ihrryGYY0sJofogZ0pcMAJEIGp+fXdiBdy/xxLSp8O67xz/ZrOFZAQMAPGKmIQVcqjKLAhs44mqqwuz+Npip0pyLsGUwz3BYjbGjEwwY3D7esPHZDR4LOZJhk2KA1amLqKt3pMLeta6UzKY9paiobuxIoyMlS11SihlwEBlw/KmC+UvqqzeMlIjrrO2JEDG3l+lhIoyMZJmzqhHQFTZU7gyryc8xETdoTITqTYTLRECFdRVdl0JWK6JiGRjykk4zoxEwdU1Poa4nQl4nQlQyTjBh0p8Js6oqjayrbe1MkwzrJsEFLPER/a5R4SKc3E2FjV2LNOf/I1le8IulF6UqFCevfv2+r0vyjAhFTpSVmEjI0WuMmD+zsIRkxCBsamYhBezJMezyEpqokIwbr26OkowbRkIahKSTCBndt7PiOz3lipsifPXaejz83juVeuazsSITJRA00VWFDx1qB3r6M/NzChsquvjQAGzsTVxwj4PXJ8me9rj3G5s4khqYQD+l85PYh1ObPaENHHEOTg5iqKNw10oGuKiTDOu+/ZYCQoaEoCm/Y0Mb65vdruC3GvVs6UBSFiKnz4VsHCDfv96bN7XQlQwgBvakId21oQ1EUkhGDD986iK7KjYB7tnSRihgAdKVC9GVk5lRR4I716ZXX0JcOrVxWFbnpt8zNg8mVy1s6wiuXDZU1c1rKvHQ5zFpWD9+RVZdDvDirb0usqi27fGRa9bQkjZc44HdBfNWJR3Rl5bkVZCCzzLqW8Jr7rSYZvjTH7OiKrlzORNYWzG1qjvmq
Aj9zW7/ciFUU3r6zb+U+hgJbuhMoity4e8eqjZwdPXGGWmMA3DiYvq7XucwPdQ9UkIEKeD3wvdhBFELg+mJlUgNoOB6mpqK
      "text/plain": [
       "<Figure size 864x576 with 16 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "from pandas.plotting import scatter_matrix\n",
    "\n",
    "attributes = [\"median_house_value\", \"median_income\", \"total_rooms\",\n",
    "              \"housing_median_age\"]\n",
    "scatter_matrix(housing[attributes], figsize=(12, 8))\n",
    "save_fig(\"scatter_matrix_plot\")  # extra code\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAaAAAAEQCAYAAAD2/KAsAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/MnkTPAAAACXBIWXMAAAsTAAALEwEAmpwYAAEAAElEQVR4nOz9ebBtWZ7Xh33WWns+853ffWPONWV3U109IMA0AjcKS2FwWAjCstU2ROCwHQxGCgOehJFbQiGDHBIh2dhGAuxGYDkUaoeEUTNU0E031V1zZWVVzm+683DGPe+1lv9Y+5y87+XLzPeysiozq+4v4sV9Z5+z91lnD+u3fr/f9/f9Cmstl3Zpl3Zpl3ZpP2iTH/UALu3SLu3SLu1H0y4d0KVd2qVd2qV9JHbpgC7t0i7t0i7tI7FLB3Rpl3Zpl3ZpH4ldOqBLu7RLu7RL+0js0gFd2qVd2qVd2kdi3kc9gI+zbWxs2Fu3bq1ep2lKp9P56Ab0BPZJGesnZZzwyRnrJ2Wc8MkZ6+U4P7h95StfObXWbj7yTWvt5b93+feTP/mT9qL943/8j+0nxT4pY/2kjNPaT85YPynjtPaTM9bLcX5wA75s32WOvUzBXdqlXdqlXdpHYpcO6NIu7dIu7dI+Ert0QJd2aZd2aZf2kdilA7q0S7u0S7u0j8R+oA5ICHFbCPEtIcTXhRBfbretCSF+RQjxWvt3dOHzf14I8boQ4hUhxO+/sP0n2+O8LoT4D4QQot0eCiH+Trv9S0KIWxf2+YX2O14TQvzCD/BnX9qlXdqlXdoj7KOAYf8ea+3phdd/DviH1tq/JIT4c+3rPyuE+AzwR4DPArvAPxBCPG+t1cB/DPxx4J8B/zXwLwB/D/hjwNha+6wQ4o8A/y7wh4UQa8C/CXwBsMBXhBC/bK0dfz9+oDEWbS1KCKQU73iv1gYAX8kH3l/uZ7WlwRJIiee9+xph+XlhQbes5koIam0w1tI0ZrX/8nubxlAbgxSCJPDwPElVaeZ1jbKCyFM0uGN6nkRYKBpNow2V0VS1oeN7KE+S1TV53hBHHoFSbiy4E1w2DWleozwBBpCC0JNUlUH5glB6gCWvGn7rjQOSyIfGMjc1gZH0uyG+UighqIymLDReIKgqQ+ArfCXJ8hoRCGLhEYYeXd8HAeOsYJ5XRIHC9z1sY8iqBuUJBAIpBLHyiCIPqy2lNUhryWpNIAUNYIyhqBuwgq1eTNFoylpzOF6QNZqm1gS+wvMUoZDMqoqibOiGPkiBNoZGG+rG4PkSD0HZaCyQBB7GQt1od59INyZPKXqBT2UNTW1IIg+jYV6XFIVmrRPi+x5CQKMNZa3RGE6mGed5xVYvYq0bkVcNX7t7RCAVCoHyJdoYjIZBHGCVQJea3GiaRiOEYJREDKOQTDdMFgW1MfieRFqB8iShp7DtfaSEIPDdtSlqTakbrIHIV1SNAQHDOCQOPLS1GGPJiprTNKexhq4fsNaJEEBRa37ttft0A59+FBKHPpFUzMuKs0WO8iWjMAQpEBaEEpRVwzgrqLSmo3yG/QgfyaKukQjWuhFSCLKqwQqLsILIVygp8ZX7Z4V7VoyxZFVDpTVaW3xPEiqFlO7ZVUK4Z9JCXjbuOTIGYyyekkSeotaGrGnwhSQJPIRyY7WCB/7W2lBrg3LrZbS1q/HU2qCte+atcOd5+f3L/XwlCX212vfh594+ONU8cs4Rlgfmlnebqy5u/0evvcWXvrPgZz7d5fe98PT7zn9PYh+HPqA/APxc+/+/AXwR+LPt9v/MWlsCbwkhXgd+WghxG+hba38DQAjxN4E/iHNAfwD4C+2x/nPgr7bR0e8HfsVae97u8ys4p/W3P+wfU9Saw2mBsRYpBDuDiKi9aYpac+c05XheArDVD7m53iHy1Wq/aV7x+vGCfuQTB4rPXR0wTIJ3/Z68bjicFJhWVSOrG2Z5Q5LX/JPXTvj8zRGRr7hzlvLm8Zzv
HMyYFTWjTsCntvvc2ujwm7fPuX2aUjWabuix1g1pjGWzGzLLa04WBW8epxzMcnwp8aQg9iX70wLj7muGnYBQQmWhrjVH84JGa/LaoiR4ErSFQLlbrhd7TLKSP/pszZ/4R19d/S4DKCD0YJD4CAF53WC1JW/cdiHAGhAStIZuKNke9bg2CMmqhteOU9KyRhtDJ/Ao2om/ri2BAt/3GHZC1js+jQGJZW+aE0nFtCgJPMWiqtEaIl/iKUGgPP4Ht3L+d3/1V/GVpGg0nlCMEo+ytiyqCm0AAbGvqKuG0rrfLCxYC8aAVKCkmzAabd33S7etH/sEnsCTCgVowBOSg1mGFBJfSm5udhHWMM8b5lXFybQmM2/fFyHwJ3+s4U/+R19GAr4H0oKWEHkKX0E38piXmqJqqGoIfNjpx+yOYuZZw96sIC8rhABfCpLQI/B9d+6NJPAlg9hDSMEsazhZlPhCUFs3SXZCj6c3uvzU02sMopDvHE34te8ec3+WUVaGtU7IM1tdxouK37u24C//w28ggGEs2elHKCU4npXMsgYEhJ5g1IlAWhptmKQVaQUNEAnodhQhYKXAV4r1Tsgw8Sgay3laMYg8OqHHs1tdro5itvsJNzcSam24fZrynb0prx7P0drSS3yu9COujmI2+zEKMFaQZxX/r9+8zXhRcp43VJVmaxDRDRXTrGGSVYS+4qn1hM9eH1A30Is85kVDN1QcTQuO5gWTtKZoGgQC33P3TzcIKI1u7xFDL/ZpNCSBW9SdpyVpqRnEAS/sdOmFPsZaDqYFFosFFG6uqRpDUetHzjlpWTHOGja6IXGgeHarS1Gbd8xVF+ewP/NLv8nXD3IA/m//DH7mxh5/53/+u77XaXJlP2gHZIH/Rghhgf+rtfavAdvW2gMAa+2BEGKr/exVXISztPvttrr9/8Pbl/vca4/VCCGmwPrF7Y/Y50MzYyyH0wJfudVsow2H04IbawkA+5OcSV7Ri9xpH6cVgZJcHyUcTguEtdw9y4h9t9oMlOClvSk/+9T6A5HQ8nuUgHnRkFYNQrjV0CsHc3b6EZ4UpGXD1++NudqPOUkLDmcFi1ITKIWUkjsnKb/11glR4LPZC3nzZMHBdMG1UnNrs8M37o4x1pDmNQeTjMYYtLQUecl50TAIFUhFXjZMi4J+6COs4ThtaGpoLFSAwDkVA/iioRvCndOGBjcxNw+fR6BuIJ/VqxvU4C58qd1NZIGgPbaxBjWeMZ4rZlmDH0p0YyhrmJQNYbu/BfIGurphbAwnk5TNXsh5VuFLy34JUsBJahC4h8PUhsxCx9MYY5ikBokh9KGyDfO8oTHOwXoS0gbGeUPQjle1f5cmjBsHWFQ7JqPdb6mqGiEhVNCNAxZZRVpDP4QGTVlpvrs/phcqqkYzyyF/SM6rBBddLV+3J1dpKGuNAk4WmkBCYcAHdAPHk5y985ygdfBF5a6fLy15XaNkTeAJun6AZwSTzE1QRW2Ifcl5pqkbQxwq1rsBb52l5GXFp68O+NIbpxxMM3RjCZRglpZ85c2SRQX//MhdwwY4yw1VlVFb0AZCH7IaUm1ZlDndEGblg/dLYaFYaHxgEIEKLW+dlMQ+dOMIJeBsoWmM5s6Zi8IbbenFiv1pzluHc44XJdZYKm2Y5hVlpd3njMFagZQwsJa7pxmLoqZsDL3Y43CaM1tUSE/w9GaPaV7z1mmKtpbPXR1w9yxjqxdw5zzjdFYwLxskgqNpAUJwcz3hJK149WjBM5tdAik5npeczkteuNLnaJZzPCsIPcXOMKJuDF+5M+azu318T5KWjYtarHNCi3YeeNScE3uSN08q9yxZiy/hn75+yhdujuiE3mquujaMV3PYr715b+V8lvaluzP+wStvfmiR0A/aAf0Oa+1+62R+RQjx3ff47KMCSvse2z/oPg9+qRB/HJfeY3t7my9+8Yur9xaLxQOvH3XAujHvCGXfap1H1Ri0sSzfNhamUvCWkjRtiKxrjZCCwlrO
pERbyz+573Exk7f8HiHcMU2bIrAWbjQafywxukQcvswMyJVEG0uv1jxvLcKATN0BO9rgVQLVCHqeoelZPCGIp4pPCQ0CdGL
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "housing.plot(kind=\"scatter\", x=\"median_income\", y=\"median_house_value\",\n",
    "             alpha=0.1, grid=True)\n",
    "save_fig(\"income_vs_house_value_scatterplot\")  # extra code\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Experimenting with Attribute Combinations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "housing[\"rooms_per_house\"] = housing[\"total_rooms\"] / housing[\"households\"]\n",
    "housing[\"bedrooms_ratio\"] = housing[\"total_bedrooms\"] / housing[\"total_rooms\"]\n",
    "housing[\"people_per_house\"] = housing[\"population\"] / housing[\"households\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "median_house_value    1.000000\n",
       "median_income         0.688380\n",
       "rooms_per_house       0.143663\n",
       "total_rooms           0.137455\n",
       "housing_median_age    0.102175\n",
       "households            0.071426\n",
       "total_bedrooms        0.054635\n",
       "population           -0.020153\n",
       "people_per_house     -0.038224\n",
       "longitude            -0.050859\n",
       "latitude             -0.139584\n",
       "bedrooms_ratio       -0.256397\n",
       "Name: median_house_value, dtype: float64"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Restrict to numeric columns first: pandas >= 2.0 raises on the text column\n",
    "# `ocean_proximity` instead of silently dropping it in `corr()`\n",
    "corr_matrix = housing.select_dtypes(include=\"number\").corr()\n",
    "corr_matrix[\"median_house_value\"].sort_values(ascending=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Prepare the Data for Machine Learning Algorithms"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let's revert to the original training set and separate the target (note that `strat_train_set.drop()` creates a copy of `strat_train_set` without the column, it doesn't actually modify `strat_train_set` itself, unless you pass `inplace=True`):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [],
   "source": [
    "housing = strat_train_set.drop(\"median_house_value\", axis=1)\n",
    "housing_labels = strat_train_set[\"median_house_value\"].copy()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Data Cleaning"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The book lists three options for handling the NaN values:\n",
    "\n",
    "```python\n",
    "housing.dropna(subset=[\"total_bedrooms\"], inplace=True)    # option 1\n",
    "\n",
    "housing.drop(\"total_bedrooms\", axis=1)       # option 2\n",
    "\n",
    "median = housing[\"total_bedrooms\"].median()  # option 3\n",
    "housing[\"total_bedrooms\"].fillna(median, inplace=True)\n",
    "```\n",
    "\n",
    "For each option, we'll create a copy of `housing` and work on that copy to avoid breaking `housing`. We'll also show the output of each option, filtered to the rows that originally contained a NaN value."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>longitude</th>\n",
       "      <th>latitude</th>\n",
       "      <th>housing_median_age</th>\n",
       "      <th>total_rooms</th>\n",
       "      <th>total_bedrooms</th>\n",
       "      <th>population</th>\n",
       "      <th>households</th>\n",
       "      <th>median_income</th>\n",
       "      <th>ocean_proximity</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>14452</th>\n",
       "      <td>-120.67</td>\n",
       "      <td>40.50</td>\n",
       "      <td>15.0</td>\n",
       "      <td>5343.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2503.0</td>\n",
       "      <td>902.0</td>\n",
       "      <td>3.5962</td>\n",
       "      <td>INLAND</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18217</th>\n",
       "      <td>-117.96</td>\n",
       "      <td>34.03</td>\n",
       "      <td>35.0</td>\n",
       "      <td>2093.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1755.0</td>\n",
       "      <td>403.0</td>\n",
       "      <td>3.4115</td>\n",
       "      <td>&lt;1H OCEAN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11889</th>\n",
       "      <td>-118.05</td>\n",
       "      <td>34.04</td>\n",
       "      <td>33.0</td>\n",
       "      <td>1348.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1098.0</td>\n",
       "      <td>257.0</td>\n",
       "      <td>4.2917</td>\n",
       "      <td>&lt;1H OCEAN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20325</th>\n",
       "      <td>-118.88</td>\n",
       "      <td>34.17</td>\n",
       "      <td>15.0</td>\n",
       "      <td>4260.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1701.0</td>\n",
       "      <td>669.0</td>\n",
       "      <td>5.1033</td>\n",
       "      <td>&lt;1H OCEAN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14360</th>\n",
       "      <td>-117.87</td>\n",
       "      <td>33.62</td>\n",
       "      <td>8.0</td>\n",
       "      <td>1266.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>375.0</td>\n",
       "      <td>183.0</td>\n",
       "      <td>9.8020</td>\n",
       "      <td>&lt;1H OCEAN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \\\n",
       "14452    -120.67     40.50                15.0       5343.0             NaN   \n",
       "18217    -117.96     34.03                35.0       2093.0             NaN   \n",
       "11889    -118.05     34.04                33.0       1348.0             NaN   \n",
       "20325    -118.88     34.17                15.0       4260.0             NaN   \n",
       "14360    -117.87     33.62                 8.0       1266.0             NaN   \n",
       "\n",
       "       population  households  median_income ocean_proximity  \n",
       "14452      2503.0       902.0         3.5962          INLAND  \n",
       "18217      1755.0       403.0         3.4115       <1H OCEAN  \n",
       "11889      1098.0       257.0         4.2917       <1H OCEAN  \n",
       "20325      1701.0       669.0         5.1033       <1H OCEAN  \n",
       "14360       375.0       183.0         9.8020       <1H OCEAN  "
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "null_rows_idx = housing.isnull().any(axis=1)\n",
    "housing.loc[null_rows_idx].head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>longitude</th>\n",
       "      <th>latitude</th>\n",
       "      <th>housing_median_age</th>\n",
       "      <th>total_rooms</th>\n",
       "      <th>total_bedrooms</th>\n",
       "      <th>population</th>\n",
       "      <th>households</th>\n",
       "      <th>median_income</th>\n",
       "      <th>ocean_proximity</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "Empty DataFrame\n",
       "Columns: [longitude, latitude, housing_median_age, total_rooms, total_bedrooms, population, households, median_income, ocean_proximity]\n",
       "Index: []"
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "housing_option1 = housing.copy()\n",
    "\n",
    "housing_option1.dropna(subset=[\"total_bedrooms\"], inplace=True)  # option 1\n",
    "\n",
    "housing_option1.loc[null_rows_idx].head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>longitude</th>\n",
       "      <th>latitude</th>\n",
       "      <th>housing_median_age</th>\n",
       "      <th>total_rooms</th>\n",
       "      <th>population</th>\n",
       "      <th>households</th>\n",
       "      <th>median_income</th>\n",
       "      <th>ocean_proximity</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>14452</th>\n",
       "      <td>-120.67</td>\n",
       "      <td>40.50</td>\n",
       "      <td>15.0</td>\n",
       "      <td>5343.0</td>\n",
       "      <td>2503.0</td>\n",
       "      <td>902.0</td>\n",
       "      <td>3.5962</td>\n",
       "      <td>INLAND</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18217</th>\n",
       "      <td>-117.96</td>\n",
       "      <td>34.03</td>\n",
       "      <td>35.0</td>\n",
       "      <td>2093.0</td>\n",
       "      <td>1755.0</td>\n",
       "      <td>403.0</td>\n",
       "      <td>3.4115</td>\n",
       "      <td>&lt;1H OCEAN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11889</th>\n",
       "      <td>-118.05</td>\n",
       "      <td>34.04</td>\n",
       "      <td>33.0</td>\n",
       "      <td>1348.0</td>\n",
       "      <td>1098.0</td>\n",
       "      <td>257.0</td>\n",
       "      <td>4.2917</td>\n",
       "      <td>&lt;1H OCEAN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20325</th>\n",
       "      <td>-118.88</td>\n",
       "      <td>34.17</td>\n",
       "      <td>15.0</td>\n",
       "      <td>4260.0</td>\n",
       "      <td>1701.0</td>\n",
       "      <td>669.0</td>\n",
       "      <td>5.1033</td>\n",
       "      <td>&lt;1H OCEAN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14360</th>\n",
       "      <td>-117.87</td>\n",
       "      <td>33.62</td>\n",
       "      <td>8.0</td>\n",
       "      <td>1266.0</td>\n",
       "      <td>375.0</td>\n",
       "      <td>183.0</td>\n",
       "      <td>9.8020</td>\n",
       "      <td>&lt;1H OCEAN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       longitude  latitude  housing_median_age  total_rooms  population  \\\n",
       "14452    -120.67     40.50                15.0       5343.0      2503.0   \n",
       "18217    -117.96     34.03                35.0       2093.0      1755.0   \n",
       "11889    -118.05     34.04                33.0       1348.0      1098.0   \n",
       "20325    -118.88     34.17                15.0       4260.0      1701.0   \n",
       "14360    -117.87     33.62                 8.0       1266.0       375.0   \n",
       "\n",
       "       households  median_income ocean_proximity  \n",
       "14452       902.0         3.5962          INLAND  \n",
       "18217       403.0         3.4115       <1H OCEAN  \n",
       "11889       257.0         4.2917       <1H OCEAN  \n",
       "20325       669.0         5.1033       <1H OCEAN  \n",
       "14360       183.0         9.8020       <1H OCEAN  "
      ]
     },
     "execution_count": 44,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "housing_option2 = housing.copy()\n",
    "\n",
    "housing_option2.drop(\"total_bedrooms\", axis=1, inplace=True)  # option 2\n",
    "\n",
    "housing_option2.loc[null_rows_idx].head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>longitude</th>\n",
       "      <th>latitude</th>\n",
       "      <th>housing_median_age</th>\n",
       "      <th>total_rooms</th>\n",
       "      <th>total_bedrooms</th>\n",
       "      <th>population</th>\n",
       "      <th>households</th>\n",
       "      <th>median_income</th>\n",
       "      <th>ocean_proximity</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>14452</th>\n",
       "      <td>-120.67</td>\n",
       "      <td>40.50</td>\n",
       "      <td>15.0</td>\n",
       "      <td>5343.0</td>\n",
       "      <td>434.0</td>\n",
       "      <td>2503.0</td>\n",
       "      <td>902.0</td>\n",
       "      <td>3.5962</td>\n",
       "      <td>INLAND</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18217</th>\n",
       "      <td>-117.96</td>\n",
       "      <td>34.03</td>\n",
       "      <td>35.0</td>\n",
       "      <td>2093.0</td>\n",
       "      <td>434.0</td>\n",
       "      <td>1755.0</td>\n",
       "      <td>403.0</td>\n",
       "      <td>3.4115</td>\n",
       "      <td>&lt;1H OCEAN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11889</th>\n",
       "      <td>-118.05</td>\n",
       "      <td>34.04</td>\n",
       "      <td>33.0</td>\n",
       "      <td>1348.0</td>\n",
       "      <td>434.0</td>\n",
       "      <td>1098.0</td>\n",
       "      <td>257.0</td>\n",
       "      <td>4.2917</td>\n",
       "      <td>&lt;1H OCEAN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20325</th>\n",
       "      <td>-118.88</td>\n",
       "      <td>34.17</td>\n",
       "      <td>15.0</td>\n",
       "      <td>4260.0</td>\n",
       "      <td>434.0</td>\n",
       "      <td>1701.0</td>\n",
       "      <td>669.0</td>\n",
       "      <td>5.1033</td>\n",
       "      <td>&lt;1H OCEAN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14360</th>\n",
       "      <td>-117.87</td>\n",
       "      <td>33.62</td>\n",
       "      <td>8.0</td>\n",
       "      <td>1266.0</td>\n",
       "      <td>434.0</td>\n",
       "      <td>375.0</td>\n",
       "      <td>183.0</td>\n",
       "      <td>9.8020</td>\n",
       "      <td>&lt;1H OCEAN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \\\n",
       "14452    -120.67     40.50                15.0       5343.0           434.0   \n",
       "18217    -117.96     34.03                35.0       2093.0           434.0   \n",
       "11889    -118.05     34.04                33.0       1348.0           434.0   \n",
       "20325    -118.88     34.17                15.0       4260.0           434.0   \n",
       "14360    -117.87     33.62                 8.0       1266.0           434.0   \n",
       "\n",
       "       population  households  median_income ocean_proximity  \n",
       "14452      2503.0       902.0         3.5962          INLAND  \n",
       "18217      1755.0       403.0         3.4115       <1H OCEAN  \n",
       "11889      1098.0       257.0         4.2917       <1H OCEAN  \n",
       "20325      1701.0       669.0         5.1033       <1H OCEAN  \n",
       "14360       375.0       183.0         9.8020       <1H OCEAN  "
      ]
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "housing_option3 = housing.copy()\n",
    "\n",
    "median = housing[\"total_bedrooms\"].median()\n",
    "# option 3 -- assign the filled column back instead of calling\n",
    "# `fillna(inplace=True)` on a column selection, which does not update\n",
    "# the frame when pandas Copy-on-Write is active\n",
    "housing_option3[\"total_bedrooms\"] = (\n",
    "    housing_option3[\"total_bedrooms\"].fillna(median))\n",
    "\n",
    "housing_option3.loc[null_rows_idx].head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.impute import SimpleImputer\n",
    "\n",
    "imputer = SimpleImputer(strategy=\"median\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Separate out the numerical attributes to use the `\"median\"` strategy (the median cannot be computed on text attributes like `ocean_proximity`):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [],
   "source": [
    "housing_num = housing.select_dtypes(include=[np.number])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "SimpleImputer(strategy='median')"
      ]
     },
     "execution_count": 48,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "imputer.fit(housing_num)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([-118.51  ,   34.26  ,   29.    , 2125.    ,  434.    , 1167.    ,\n",
       "        408.    ,    3.5385])"
      ]
     },
     "execution_count": 49,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "imputer.statistics_"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Check that this is the same as manually computing the median of each attribute:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([-118.51  ,   34.26  ,   29.    , 2125.    ,  434.    , 1167.    ,\n",
       "        408.    ,    3.5385])"
      ]
     },
     "execution_count": 50,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "housing_num.median().values"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Transform the training set:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [],
   "source": [
    "X = imputer.transform(housing_num)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(['longitude', 'latitude', 'housing_median_age', 'total_rooms',\n",
       "       'total_bedrooms', 'population', 'households', 'median_income'],\n",
       "      dtype=object)"
      ]
     },
     "execution_count": 52,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "imputer.feature_names_in_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [],
   "source": [
    "housing_tr = pd.DataFrame(X, columns=housing_num.columns,\n",
    "                          index=housing_num.index)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>longitude</th>\n",
       "      <th>latitude</th>\n",
       "      <th>housing_median_age</th>\n",
       "      <th>total_rooms</th>\n",
       "      <th>total_bedrooms</th>\n",
       "      <th>population</th>\n",
       "      <th>households</th>\n",
       "      <th>median_income</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>14452</th>\n",
       "      <td>-120.67</td>\n",
       "      <td>40.50</td>\n",
       "      <td>15.0</td>\n",
       "      <td>5343.0</td>\n",
       "      <td>434.0</td>\n",
       "      <td>2503.0</td>\n",
       "      <td>902.0</td>\n",
       "      <td>3.5962</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18217</th>\n",
       "      <td>-117.96</td>\n",
       "      <td>34.03</td>\n",
       "      <td>35.0</td>\n",
       "      <td>2093.0</td>\n",
       "      <td>434.0</td>\n",
       "      <td>1755.0</td>\n",
       "      <td>403.0</td>\n",
       "      <td>3.4115</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11889</th>\n",
       "      <td>-118.05</td>\n",
       "      <td>34.04</td>\n",
       "      <td>33.0</td>\n",
       "      <td>1348.0</td>\n",
       "      <td>434.0</td>\n",
       "      <td>1098.0</td>\n",
       "      <td>257.0</td>\n",
       "      <td>4.2917</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20325</th>\n",
       "      <td>-118.88</td>\n",
       "      <td>34.17</td>\n",
       "      <td>15.0</td>\n",
       "      <td>4260.0</td>\n",
       "      <td>434.0</td>\n",
       "      <td>1701.0</td>\n",
       "      <td>669.0</td>\n",
       "      <td>5.1033</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14360</th>\n",
       "      <td>-117.87</td>\n",
       "      <td>33.62</td>\n",
       "      <td>8.0</td>\n",
       "      <td>1266.0</td>\n",
       "      <td>434.0</td>\n",
       "      <td>375.0</td>\n",
       "      <td>183.0</td>\n",
       "      <td>9.8020</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \\\n",
       "14452    -120.67     40.50                15.0       5343.0           434.0   \n",
       "18217    -117.96     34.03                35.0       2093.0           434.0   \n",
       "11889    -118.05     34.04                33.0       1348.0           434.0   \n",
       "20325    -118.88     34.17                15.0       4260.0           434.0   \n",
       "14360    -117.87     33.62                 8.0       1266.0           434.0   \n",
       "\n",
       "       population  households  median_income  \n",
       "14452      2503.0       902.0         3.5962  \n",
       "18217      1755.0       403.0         3.4115  \n",
       "11889      1098.0       257.0         4.2917  \n",
       "20325      1701.0       669.0         5.1033  \n",
       "14360       375.0       183.0         9.8020  "
      ]
     },
     "execution_count": 54,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "housing_tr.loc[null_rows_idx].head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'median'"
      ]
     },
     "execution_count": 55,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "imputer.strategy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [],
   "source": [
    "housing_tr = pd.DataFrame(X, columns=housing_num.columns,\n",
    "                          index=housing_num.index)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>longitude</th>\n",
       "      <th>latitude</th>\n",
       "      <th>housing_median_age</th>\n",
       "      <th>total_rooms</th>\n",
       "      <th>total_bedrooms</th>\n",
       "      <th>population</th>\n",
       "      <th>households</th>\n",
       "      <th>median_income</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>14452</th>\n",
       "      <td>-120.67</td>\n",
       "      <td>40.50</td>\n",
       "      <td>15.0</td>\n",
       "      <td>5343.0</td>\n",
       "      <td>434.0</td>\n",
       "      <td>2503.0</td>\n",
       "      <td>902.0</td>\n",
       "      <td>3.5962</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18217</th>\n",
       "      <td>-117.96</td>\n",
       "      <td>34.03</td>\n",
       "      <td>35.0</td>\n",
       "      <td>2093.0</td>\n",
       "      <td>434.0</td>\n",
       "      <td>1755.0</td>\n",
       "      <td>403.0</td>\n",
       "      <td>3.4115</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11889</th>\n",
       "      <td>-118.05</td>\n",
       "      <td>34.04</td>\n",
       "      <td>33.0</td>\n",
       "      <td>1348.0</td>\n",
       "      <td>434.0</td>\n",
       "      <td>1098.0</td>\n",
       "      <td>257.0</td>\n",
       "      <td>4.2917</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20325</th>\n",
       "      <td>-118.88</td>\n",
       "      <td>34.17</td>\n",
       "      <td>15.0</td>\n",
       "      <td>4260.0</td>\n",
       "      <td>434.0</td>\n",
       "      <td>1701.0</td>\n",
       "      <td>669.0</td>\n",
       "      <td>5.1033</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14360</th>\n",
       "      <td>-117.87</td>\n",
       "      <td>33.62</td>\n",
       "      <td>8.0</td>\n",
       "      <td>1266.0</td>\n",
       "      <td>434.0</td>\n",
       "      <td>375.0</td>\n",
       "      <td>183.0</td>\n",
       "      <td>9.8020</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \\\n",
       "14452    -120.67     40.50                15.0       5343.0           434.0   \n",
       "18217    -117.96     34.03                35.0       2093.0           434.0   \n",
       "11889    -118.05     34.04                33.0       1348.0           434.0   \n",
       "20325    -118.88     34.17                15.0       4260.0           434.0   \n",
       "14360    -117.87     33.62                 8.0       1266.0           434.0   \n",
       "\n",
       "       population  households  median_income  \n",
       "14452      2503.0       902.0         3.5962  \n",
       "18217      1755.0       403.0         3.4115  \n",
       "11889      1098.0       257.0         4.2917  \n",
       "20325      1701.0       669.0         5.1033  \n",
       "14360       375.0       183.0         9.8020  "
      ]
     },
     "execution_count": 57,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "housing_tr.loc[null_rows_idx].head()  # not shown in the book"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [],
   "source": [
    "#from sklearn import set_config\n",
    "#\n",
    "# set_config(pandas_in_out=True)  # not available yet – see SLEP014"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now let's drop some outliers:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.ensemble import IsolationForest\n",
    "\n",
    "isolation_forest = IsolationForest(random_state=42)\n",
    "outlier_pred = isolation_forest.fit_predict(X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([-1,  1,  1, ...,  1,  1,  1])"
      ]
     },
     "execution_count": 60,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "outlier_pred"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [],
   "source": [
    "#housing = housing.iloc[outlier_pred == 1]\n",
    "#housing_labels = housing_labels.iloc[outlier_pred == 1]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Handling Text and Categorical Attributes"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now let's preprocess the categorical input feature, `ocean_proximity`:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ocean_proximity</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>13096</th>\n",
       "      <td>NEAR BAY</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14973</th>\n",
       "      <td>&lt;1H OCEAN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3785</th>\n",
       "      <td>INLAND</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14689</th>\n",
       "      <td>INLAND</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20507</th>\n",
       "      <td>NEAR OCEAN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1286</th>\n",
       "      <td>INLAND</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18078</th>\n",
       "      <td>&lt;1H OCEAN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4396</th>\n",
       "      <td>NEAR BAY</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      ocean_proximity\n",
       "13096        NEAR BAY\n",
       "14973       <1H OCEAN\n",
       "3785           INLAND\n",
       "14689          INLAND\n",
       "20507      NEAR OCEAN\n",
       "1286           INLAND\n",
       "18078       <1H OCEAN\n",
       "4396         NEAR BAY"
      ]
     },
     "execution_count": 62,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "housing_cat = housing[[\"ocean_proximity\"]]\n",
    "housing_cat.head(8)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import OrdinalEncoder\n",
    "\n",
    "ordinal_encoder = OrdinalEncoder()\n",
    "housing_cat_encoded = ordinal_encoder.fit_transform(housing_cat)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[3.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [4.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [3.]])"
      ]
     },
     "execution_count": 64,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "housing_cat_encoded[:8]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[array(['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'],\n",
       "       dtype=object)]"
      ]
     },
     "execution_count": 65,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ordinal_encoder.categories_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import OneHotEncoder\n",
    "\n",
    "cat_encoder = OneHotEncoder()\n",
    "housing_cat_1hot = cat_encoder.fit_transform(housing_cat)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<16512x5 sparse matrix of type '<class 'numpy.float64'>'\n",
       "\twith 16512 stored elements in Compressed Sparse Row format>"
      ]
     },
     "execution_count": 67,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "housing_cat_1hot"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "By default, the `OneHotEncoder` class returns a sparse array, but we can convert it to a dense array if needed by calling the `toarray()` method:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[0., 0., 0., 1., 0.],\n",
       "       [1., 0., 0., 0., 0.],\n",
       "       [0., 1., 0., 0., 0.],\n",
       "       ...,\n",
       "       [0., 0., 0., 0., 1.],\n",
       "       [1., 0., 0., 0., 0.],\n",
       "       [0., 0., 0., 0., 1.]])"
      ]
     },
     "execution_count": 68,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "housing_cat_1hot.toarray()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Alternatively, you can set `sparse=False` when creating the `OneHotEncoder`:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[0., 0., 0., 1., 0.],\n",
       "       [1., 0., 0., 0., 0.],\n",
       "       [0., 1., 0., 0., 0.],\n",
       "       ...,\n",
       "       [0., 0., 0., 0., 1.],\n",
       "       [1., 0., 0., 0., 0.],\n",
       "       [0., 0., 0., 0., 1.]])"
      ]
     },
     "execution_count": 69,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cat_encoder = OneHotEncoder(sparse=False)\n",
    "housing_cat_1hot = cat_encoder.fit_transform(housing_cat)\n",
    "housing_cat_1hot"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[array(['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'],\n",
       "       dtype=object)]"
      ]
     },
     "execution_count": 70,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cat_encoder.categories_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ocean_proximity_INLAND</th>\n",
       "      <th>ocean_proximity_NEAR BAY</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   ocean_proximity_INLAND  ocean_proximity_NEAR BAY\n",
       "0                       1                         0\n",
       "1                       0                         1"
      ]
     },
     "execution_count": 71,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_test = pd.DataFrame({\"ocean_proximity\": [\"INLAND\", \"NEAR BAY\"]})\n",
    "pd.get_dummies(df_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[0., 1., 0., 0., 0.],\n",
       "       [0., 0., 0., 1., 0.]])"
      ]
     },
     "execution_count": 72,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cat_encoder.transform(df_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ocean_proximity_&lt;2H OCEAN</th>\n",
       "      <th>ocean_proximity_ISLAND</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   ocean_proximity_<2H OCEAN  ocean_proximity_ISLAND\n",
       "0                          1                       0\n",
       "1                          0                       1"
      ]
     },
     "execution_count": 73,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_test_unknown = pd.DataFrame({\"ocean_proximity\": [\"<2H OCEAN\", \"ISLAND\"]})\n",
    "pd.get_dummies(df_test_unknown)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[0., 0., 0., 0., 0.],\n",
       "       [0., 0., 1., 0., 0.]])"
      ]
     },
     "execution_count": 74,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cat_encoder.handle_unknown = \"ignore\"\n",
    "cat_encoder.transform(df_test_unknown)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(['ocean_proximity'], dtype=object)"
      ]
     },
     "execution_count": 75,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cat_encoder.feature_names_in_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(['ocean_proximity_<1H OCEAN', 'ocean_proximity_INLAND',\n",
       "       'ocean_proximity_ISLAND', 'ocean_proximity_NEAR BAY',\n",
       "       'ocean_proximity_NEAR OCEAN'], dtype=object)"
      ]
     },
     "execution_count": 76,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cat_encoder.get_feature_names_out()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_output = pd.DataFrame(cat_encoder.transform(df_test_unknown),\n",
    "                         columns=cat_encoder.get_feature_names_out(),\n",
    "                         index=df_test_unknown.index)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ocean_proximity_&lt;1H OCEAN</th>\n",
       "      <th>ocean_proximity_INLAND</th>\n",
       "      <th>ocean_proximity_ISLAND</th>\n",
       "      <th>ocean_proximity_NEAR BAY</th>\n",
       "      <th>ocean_proximity_NEAR OCEAN</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   ocean_proximity_<1H OCEAN  ocean_proximity_INLAND  ocean_proximity_ISLAND  \\\n",
       "0                        0.0                     0.0                     0.0   \n",
       "1                        0.0                     0.0                     1.0   \n",
       "\n",
       "   ocean_proximity_NEAR BAY  ocean_proximity_NEAR OCEAN  \n",
       "0                       0.0                         0.0  \n",
       "1                       0.0                         0.0  "
      ]
     },
     "execution_count": 78,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_output"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Feature Scaling"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import MinMaxScaler\n",
    "\n",
    "min_max_scaler = MinMaxScaler(feature_range=(-1, 1))\n",
    "housing_num_min_max_scaled = min_max_scaler.fit_transform(housing_num)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import StandardScaler\n",
    "\n",
    "std_scaler = StandardScaler()\n",
    "housing_num_std_scaled = std_scaler.fit_transform(housing_num)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAi8AAADICAYAAAAgCgFXAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/MnkTPAAAACXBIWXMAAAsTAAALEwEAmpwYAAAgMElEQVR4nO3debgcVbnv8e+PgATCIIjkhjAENXqZrkFiDIIYQCWAHpAjj/igDILcI6hR8SqIQzzKIQoyqfAAigFlkCOjIB4jsEU0DIEDBMIJBAgQiAREkEQIBN77x1oNnU4Ptffuce/f53n66epV1VVvrXRV3r1qVS1FBGZmZma9YrVOB2BmZmbWH05ezMzMrKc4eTEzM7Oe4uTFzMzMeoqTFzMzM+spTl7MzMysp6ze6QDaZaONNopx48Y1XG7ZsmWMGjWq9QF1OddD4npI2l0Pt99++9MR8eZmrKvosd9KQ+V35P3oLkNxP4oe+8MmeRk3bhxz5sxpuFxfXx9TpkxpfUBdzvWQuB6SdteDpEeata6ix34rDZXfkfejuwzF/Sh67PuykZmZmfUUJy9mZmbWU5y8mJmZWU9x8mJmZmY9xcmLmZmZ9ZRhc7dRM4075pqVPi+csXeHIjEzMxt+3PJiZmZmPWXAyYukt0ka2cxgzMzMzBoplLxI+g9JB+dpSZoF3A8slvSeVgZoZmZmVq5oy8uBwPw8vScwAZgMnA/MaH5YZmZmZtUV7bA7GliUp/cCLomIWyU9A3T2udtmZmY2rBRtefkbsEWe/hBwfZ5eHVCzgzIzMzOrpWjLy6XAhZLuBzYEfpfLJwALWhCXmZmZWVVFk5cvA48AmwNfjYhluXwMcGYrAjMzMzOrpmjysglwSkS8WlF+KrBZUyMyMzMzq6Non5eHgY2qlG+Y55mZmZm1RdHkRUBUKV8HeLF54ZiZmZnVV/eykaTT82QAJ0j6Z9nsEcAk4M7WhGZmZma2qkZ9XrbL7wK2Al4qm/cScAdwUgviMjMzM6uqbvISEbsCSPo5MC0i/tGWqHpM5SjT4JGmzczMWqVon5evA+tVFkraVNLo5oZkZmZmVlvR5OV80phGlfYAftG8cMzMzMzqK5q8vBu4sUr5n4CJzQvHzMzMrL6iycvqwJpVykfWKDczMzNriaLJyy3AZ6uUHwXcVmQFkkZKulXSXZLulfSdXL6hpFmSHsjvG5R951hJCyTNl7RHWfkOkubmeadL8uCQZmZmw0TR4QGOA66X9E7guly2G7A98IGC61gO7BYRSyWtAdwk6VpgP+C6iJgh6RjgGOBrkrYGDgC2IQ1P8AdJb4+IV0jjKR0B3Az8FpgKXFswDjMzM+thhVpeIuJmYEfgIVKy8a+kYQF2jIi/FFxHRMTS/HGN/ApgH+C8XH4esG+e3ge4OCKWR8TDpNGrJ0kaA6wXEbMjIkidiUvfMTMzsyGuaMsLEXEX8MnBbEzSCOB24G3ATyLiFkmjI2Jx3sZiSRvnxceSWlZKFuWyl/N0ZbmZmZkNAzWTF0kbRsQzpel6Kykt10i+5DNB0huByyVtW2fxav1Yok75qiuQjiBdXmL06NH09fU1jHHp0qUNlzt6uxUN11NkW92sSD0MB66HpNfqYSDHfiv1Wv3V4v3oLsN5P+q1vDwlaUxELAGepnqCUBqwcUR/NhoRz0rqI/VVeTJvZ3G+JLQkL7YI2Kzsa5sCT+TyTauUV9vO2cDZABMnTowpU6Y0jK2vr49Gyx1S5Ym6lRYe2Hhb3axIPQwHroek1+phIMd+K/Va/dXi/eguw3k/6iUvuwGlFpVdBxjTayS9GXg5Jy5rkTr6fh+4CjgYmJHfr8xfuQq4UNLJpA6744FbI+IVSc9Lmky6C+og4EeDjc/MzMx6Q83kJSL+CCBpddIdP1dERNUWjoLGAOflfi+rAZdExNWSZgOXSDoMeBTYP2//XkmXAPOAFcBR+bITpNu2ZwJrke4y8p1G
ZmZmw0TDDrsRsULSiUDjayX113M36dbqyvK/AbvX+M7xwPFVyucA9frLmJmZ2RBV9CF1NwM7tDIQMzMzsyKK3ip9DnCSpM1JtzovK58ZEXc0OzAzMzOzaoomLxfm95OrzOv33UZmZmZmA1U0edmypVGYmZmZFVQ0edkC+EtErPR0tnwn0nuBR5odmJmZmVk1RTvs3gBUe8ru+nmemZmZWVsUTV5KT9Kt9CYqOu+amZmZtVLdy0aSrsqTAfxS0vKy2SNIz1opNKq0mZmZWTM06vPyt/wu4O/AC2XzXgJuIt1GbWZmZtYWdZOXiDgUQNJC4KSI8CUiMzMz66iifV6+S1mri6T/JelwSe9tTVhmZmZm1RVNXq4BPg8gaR1gDnAi8EdJB7UoNjMzM7NVFE1edgCuz9P7Af8ANgY+A3ylBXGZmZmZVVU0eVkXeDZPfwi4PCJeJiU0b21BXGZmZmZVFU1eHgV2kjQK2AOYlcs3BP7ZisDMzMzMqik6PMDJwC+ApaShAG7M5bsAc1sQl5mZmVlVhZKXiDhL0u3AZsCsiHg1z3oQ+GargjMzMzOrVLTlhYiYQ7rLqLzsmqZHZGZmZlZHzeRF0peBMyLixTxdU0Sc3PTIzMzMzKqo1/LyeeA84MU8XUuQ+sSYmZmZtVzN5CUitqw2bWZmZtZJRW+VNjMzM+sK9fq8fKvoSiLi35sTztAx7piV+zIvnLF3hyIxMzMbWur1edm/4vMWwNrAE/nzJqQH1C0EnLyYmZlZW9Tr87JdaVrSocBBwMER8Wgu2xz4OXBBq4M0MzMzKyna5+VbwBdLiQtAnj4a+HYrAjMzMzOrpmjyMhpYq0r5SGCj5oVjZmZmVl/R5GUWcI6kyZJG5Ndk4CxeH6TRzMzMrOWKJi+HA48BfyE9tO5F4M/A48BnWhOamZmZ2aqKDsz4FLCXpPHAVoCA+yLi/lYGZ2ZmZlap8MCMABHxAPBAi2IxMzMza6htT9iVtJmkGyTdJ+leSdNy+YaSZkl6IL9vUPadYyUtkDRf0h5l5TtImpvnnS5J7doPMzMz66x2Dg+wAjg6IrYCJgNHSdoaOAa4LiLGA9flz+R5BwDbAFOBMySNyOs6EzgCGJ9fU9u4H2ZmZtZBbUteImJxRNyRp58H7gPGAvuQRq8mv++bp/cBLo6I5RHxMLAAmCRpDLBeRMyOiADOL/uOmZmZDXE1kxdJ50paN0/vIqlf/WPqkTQO2B64BRgdEYshJTjAxnmxsaQ7nEoW5bKxebqy3MzMzIaBegnJJ4GvA88DNwBjgCWD3aCkdYBLSU/s/Ued7irVZkSd8mrbOoJ0eYnRo0fT19fXML6lS5c2XO7o7VY0XE+lItvuJkXqYThwPSS9Vg8DOfZbqdfqrxbvR3cZzvtRL3lZCHxe0u9JCcOOkv5ebcGIuLHIxiStQUpcLoiIy3Lxk5LGRMTifEmolCAtAjYr+/qmpEEhF+XpyvJqcZ0NnA0wceLEmDJlSsMY+/r6qFyucoToft6kBcDCAxtvu5tUq4fhyPWQ9Fo9DOTYb6Veq79avB/dZTjvR73/hf8fcA5wLKll4/IaywUwosa81+Q7gn5Gej7MyWWzrgIOBmbk9yvLyi+UdDJpBOvxwK0R8Yqk5/MTfm8hDRj5o0bbNzMza4fKP3gXzti7Q5EMXfVGlb4SuFLSG4FnSHf9DOay0U7Ap4C5ku7MZV8nJS2XSDoMeBTYP2//XkmXAPNIdyodFRGv5O99FphJGm/p2vwyMzOzYaDh9Y+IeFbSrsADEdH/zh6vr+cmqvdXAdi9xneOB46vUj4H2HagsZiZmVnvKjo8wB8lrSnpIGBr0qWiecCFEbG8lQGamZmZlSuUvOQHxl0LrA/MzcWfAaZLmhoR97UoPjMzs57mPjDNV/QhdacBdwKbR8T7IuJ9wObAXcCprQnNzMzMbFVF7/ndCXh3RPyjVJCf0XIccHNLIjMzM+sBqz5Ow1qtaMvLi8Abq5Svn+eZ
mZmZtUXR5OU3wDmSdpI0Ir92Bs4iPY/FzMzMrC2KJi/TgAeAP5FaWl4E/gjcD3yxJZGZmZmZVVH0VulngX0kvQ3YivS8lnk
      "text/plain": [
       "<Figure size 576x216 with 2 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "# extra code – this cell generates Figure 2–17\n",
    "fig, axs = plt.subplots(1, 2, figsize=(8, 3), sharey=True)\n",
    "housing[\"population\"].hist(ax=axs[0], bins=50)\n",
    "housing[\"population\"].apply(np.log).hist(ax=axs[1], bins=50)\n",
    "axs[0].set_xlabel(\"Population\")\n",
    "axs[1].set_xlabel(\"Log of population\")\n",
    "axs[0].set_ylabel(\"Number of districts\")\n",
    "save_fig(\"long_tail_plot\")\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "What if we replace each value with its percentile?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYkAAAEMCAYAAAAxoErWAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/MnkTPAAAACXBIWXMAAAsTAAALEwEAmpwYAAAiNElEQVR4nO3deZhdVZnv8e+PgBITZRAtw5iosRWJgpRpFLWrhJY4xgmNrTRRMPYVEezYdNCronSuXgVEW/AaxAZELHNVJAIOGCiQlsEEkUCYggmQgAloGAohGnj7j72KnBzOrtpVdaY6+/d5nvOcs9ce1rtOVe239rSWIgIzM7Natml1AGZm1r6cJMzMLJeThJmZ5XKSMDOzXE4SZmaWy0nCzMxyNS1JSNpe0rWSfi/pJkmfT+UnSFon6fr0elPFOsdLWiXpVkmHNCtWMzPLqFnPSUgSMCkiBiRtB1wJHAPMAgYi4qSq5fcGvg/MBHYFfgW8KCIez6tjl112ialTpxaO6ZFHHmHSpEkjbcq4V8Z2l7HNUM52l7HNMLZ2L1++/P6IeE6teduOKaoRiCwbDaTJ7dJrqAw1G+iLiE3AakmryBLGVXkrTJ06lWXLlhWOqb+/n56ensLLd4oytruMbYZytruMbYaxtVvSnXnzmnpNQtIESdcDG4BLIuKaNOtjkm6Q9B1JO6Wy3YC7K1Zfm8rMzKxJmna6aatKpR2B84GjgfuA+8mOKk4EpkTEhySdBlwVEeemdc4ELo6IH1Vtax4wD6Crq2v/vr6+wnEMDAwwefLksTdonClju8vYZihnu8vYZhhbu3t7e5dHRHeteU073VQpIh6Q1A/MqrwWIekM4MI0uRbYo2K13YF7amxrEbAIoLu7O0ZyuOXD0vIoY5uhnO0uY5uhce1u5t1Nz0lHEEiaCBwM3CJpSsVi7wBuTJ+XAHMkPV3SNGA6cG2z4jUzs+YeSUwBzpY0gSw5LY6ICyV9V9K+ZKeb1gAfAYiImyQtBlYCm4GjhrqzyczM6q+ZdzfdAOxXo/ywIdZZCCxsZFxmZpbPT1ybmVkuJwkzM8vlJGFmZrlacgusjc3UBRfVLF/zpTc3ORKrB/88rZ05SZRY3s4JWreD8g7TrL04SZiNUrsltFrxzJ+xmZ7mh2IdxEmiQqP/6Ee6/aH+0283rdphFv2O5s/YzNwFF404nvH0MxhP2u1vrV6acXSeV8dZsxrT862TRAGN3lE0e/uDO8x6bKveyzdau/33Pxr1+hm0Y8KsRx3zZ2wmb9fWyt/HdvtbKMpJwoz6/gGPl53BeInTWsu3wJqZWS4fSVhN7fZfZrvFY1YWThIdxDvSzuKfp7UDn24yM7NcThJmZpbLScLMzHI5SZiZWS4nCTMzy+UkYWZmuZwkzMwsl5OEmZnlcpIwM7NcThJmZparaUlC0vaSrpX0e0k3Sfp8Kt9Z0iWSbk/vO1Wsc7ykVZJulXRIs2I1M7NMM48kNgGvj4iXA/sCsyQdACwAlkbEdGBpmkbS3sAc4KXALOB0SROaGK+ZWek1LUlEZiBNbpdeAcwGzk7lZwNvT59nA30RsSkiVgOrgJnNitfMzEAR0bzKsiOB5cALgdMi4t8lPRARO1YsszEidpL0DeDqiDg3lZ8J/Cwifli1zXnAPICurq79+/r6CsczMDDA5MmTn5xese7BUbdtPOmaCOsfbXUUzVXGNkM5213GNgNM22HCVvuzkejt7V0eEd215jW1q/CIeBzYV9KOwPmS9hlicdXaRI1tLgIWAXR3d0dPT0/hePr7+6lcfrRDeo4382ds5uQV5eolvoxthnK2u4xthmyM65Hs/4pqyd1NEfEA0E92rWG9pCkA6X1DWmwtsEfFarsD9zQvSjMza+bdTc9JRxBImggcDNwCLAEOT4sdDlyQPi8B5kh6uqRpwHTg2mbFa2ZmzT3dNAU4O12X2AZYHBEXSroKWCzpCOAu4FCAiLhJ0mJg
JbAZOCqdrjIzsyZpWpKIiBuA/WqU/wk4KGedhcDCBodmZmY5/MS1mZnlcpIwM7NcThJmZpbLScLMzHKNOklIeqGk7esZjJmZtZdCSULS/5F0ePosSZcAtwH3Svr7RgZoZmatU/RI4v3ArenzG8l6cT0AOAf4Uv3DMjOzdlD0OYkusm4yAN5E9iDctZL+DCxrSGRmZtZyRY8k/gTslT6/Abg0fd6W2h3xmZlZByh6JPEj4DxJtwE7Az9P5fuSjfNgZmYdqGiS+FfgTmBP4LiIeCSVTwG+2YjAzMys9YomiV2Br0bEE1Xlp7J1d95mZtZBil6TWA3sUqN85zTPzMw6UNEkIWqMCgdMBh6rXzhmZtZOhjzdJOnr6WMAX5T0l4rZE4CZwPWNCc3MzFptuGsSM9K7gJcAf62Y91fgOuCkBsRlZmZtYMgkERG9AJL+CzgmIh5qSlRmZtYWil6T+BTwrOpCSbtL6qpvSGZm1i6KJolzyPpsqnYI8N36hWNmZu2kaJJ4JXBFjfJfA931C8fMzNpJ0SSxLfD0GuXb55SbmVkHKJokrgH+V43yo4Df1i8cMzNrJ0W75fg0cKmklwNLU9nrgf2Ag4tsQNIeZNc2ngc8ASyKiK9JOgH4MHBfWvRTEXFxWud44AjgceDjEfGLgvGamVkdFEoSEXG1pFcB/wa8k+y5ieuAj0bE7wvWtRmYHxHXSXomsDyNcAdZv1BbPW8haW9gDvBSsr6jfiXpRRHxeMH6zMxsjIoeSZCSwQdGW1FE3Avcmz4/LOlmYLchVpkN9EXEJmC1pFVkT3hfNdoYzMxsZBRRq0smkLRzRPx58PNQGxlcrnCl0lSyu6X2IeuGfC7wENkod/MjYqOkbwBXR8S5aZ0zgZ9FxA+rtjUPmAfQ1dW1f19fX+E4BgYGmDx58pPTK9Y9OJJmjFtdE2H9o62OornK2GYoZ7vL2GaAaTtM2Gp/NhK9vb3LI6LmnapDHUncJ2lKRGwA7qd2B3+DHf9NKBqMpMlkgxgdGxEPSfomcGLazonAycCHqD3i3VNiiIhFwCKA7u7u6OnpKRoK/f39VC4/d8FFhdcdz+bP2MzJKwofRHaEMrYZytnuMrYZ4KxZkxjJ/q+oob7J1wODRwi99ahM0nZkCeJ7EfFjgIhYXzH/DODCNLmWrceq2B24px5xmJlZMblJIiIuB5C0LdnF459ExKh30pIEnAncHBGnVJRPSdcrAN4B3Jg+LyEbMvUUsgvX04FrR1u/mZmN3LDHZBGxWdJXgLGeizkQOAxYIen6VPYp4H2S9iU7lbQG+Eiq9yZJi4GVZHdGHeU7m8zMmqvoiburgf3JxrkelYi4ktrXGS4eYp2FwMLR1mlmZmNTNEmcAZwkaU9gOfBI5cyIuK7egZmZWesVTRLnpfdTaswb0d1NZmY2fhRNEtMaGoWZmbWlokliL+A3EbG5sjDd+fRqxnCtwszM2lfRXmAvA2o9db1DmmdmZh2oaJIYfLK62rOpuohtZmadY8jTTZKWpI8BnCtpU8XsCWR9L/2mQbGZmVmLDXdN4k/pXcBGoLLbrL8CV5LdHmtmZh1oyCQRER8EkLQGOCkifGrJzKxEil6TOJGKowhJz5N0pKRXNyYsMzNrB0WTxEXA0fBkV9/LgK8Al0v65wbFZmZmLVY0SewPXJo+v5NsgKDnko1N/ckGxGVmZm2gaJJ4JvBA+vwG4PyI+BtZ4nhBA+IyM7M2UDRJ3AUcKGkScAhwSSrfGfhLIwIzM7PWK9otxynAd4EBsi44rkjlrwNWNCAuMzNrA4WSRER8S9JysuFEL4mIJ9KsO4DPNCo4MzNrrcKjhUfEMrK7mirLxjpanZmZtbHcJCHpX4HTI+Kx9DlX5ZjVZmbWOYY6kjgaOBt4LH3OE9QejMjMzMa53CQREdNqfTYzs/IoegusmZmV0FDXJD5bdCMR8YXhlpG0B3AO8DzgCWBRRHxN0s7AD4CpwBrgPRGxMa1zPHAE8Djw
8Yj4RdGYzMxs7Ia6JnFo1fRewDOAe9L0rmQP0q0Bhk0SwGZgfkRcJ+mZwHJJlwBzgaUR8SVJC4AFwL9L2huYA7w01fUrSS+
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "# extra code – just shows that we get a uniform distribution\n",
    "percentiles = [np.percentile(housing[\"median_income\"], p)\n",
    "               for p in range(1, 100)]\n",
    "flattened_median_income = pd.cut(housing[\"median_income\"],\n",
    "                                 bins=[-np.inf] + percentiles + [np.inf],\n",
    "                                 labels=range(1, 100 + 1))\n",
    "flattened_median_income.hist(bins=50)\n",
    "plt.xlabel(\"Median income percentile\")\n",
    "plt.ylabel(\"Number of districts\")\n",
    "plt.show()\n",
    "# Note: incomes below the 1st percentile are labeled 1, and incomes above the\n",
    "# 99th percentile are labeled 100. This is why the distribution below ranges\n",
    "# from 1 to 100 (not 0 to 100)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.metrics.pairwise import rbf_kernel\n",
    "\n",
    "age_simil_35 = rbf_kernel(housing[[\"housing_median_age\"]], [[35]], gamma=0.1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAaAAAAEQCAYAAAD2/KAsAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/MnkTPAAAACXBIWXMAAAsTAAALEwEAmpwYAABSr0lEQVR4nO2deXxcVfXAv2eyJ933faOlUKAUKGVT9h0EXEAQhSKLKCLuFNGfgqLFhUVBsAIKCALKVgFBZBGQtrR0AdpSW9rSJl3SLWmzZ5Lz++O+aSaTyeRNZiYzk5zv5/M+M3PfffedN5m88869ZxFVxTAMwzC6mkC6BTAMwzB6JqaADMMwjLRgCsgwDMNIC6aADMMwjLRgCsgwDMNIC6aADMMwjLSQm24BuopAIKBFRUXpFsMwDCNp1NTUqKpmrSHRYxRQUVER1dXV6RbDMAwjaYhIbbplSISs1ZyGYRhGdmMKyDAMw0gLpoAMwzCMtGAKyDAMw0gLpoAMwzC6KSI8IEK5CB+0s19E+K0Ia0R4T4RDu1K+HuMF1xG7d++mvLycxsbGdItidFNKSkoYNWoUgYA99xldxp+Bu4CH2tl/BjDJ244A7vFeuwRTQDjls3XrVkaOHElRUREikm6RjG5Gc3MzZWVlbN++nSFDhqRbHKOHoMobIoyL0eVc4CFVFJgvQj8RhquyuSvks0cxoLy8nJEjR1JcXGzKx0gJgUCAoUOHUllZmW5R2qAKjz0GRx8N550HK1emWyKjCxkJbAz7XOq1dQlmAQGNjY1YlgQj1eTl5REMBtMtRiuqquBLX4Jnnmlp69cP/vznNAlkxEuuiCwK+zxHVefEcXy0J+4uq1JqCsjDLB8j1WTab0wVzj8fXnrJfX70UXj6aff6s5+5/aNHp1fG7sa4Wc9HbV8/+6zODhlU1emdFshZPOF/5VHApgTGiwubgjOMHooIXHYZDB0Khx8OF10Ev/wlNDXBddfBxInw7rvpltJIMXOBSzxvuCOByq5a/wFTQIbRI1FvkmXiRNiyBa66yn0eNw5OPhkWLoT+/eHqq51CMrITEf4KzAMmi1AqwuUiXC3C1V6XF4C1wBrgj8DXulI+U0BGj+DJJ2HKFCgocK9PP93xMdddB9OnQ2GhuzFH4/334bjjoKgIRo6Em29uublnMhddBHfe6dZ+cnLg059u2Xf++bBxI3znO7BoETz8cNrENBJElYtUGa5KniqjVLlflXtVudfbr6pco8o+qhykyqKOxkwmXaaAROQBESkXkQ/C2gaIyMsistp77R+27wYRWSMiq0TktLD2w0TkfW/fbyXTJtaNjGPePPj85+Hii2HpUvd6/vmwYEHs45qb4dJL4ZJLou/fvRtOOcVNYS1cCL/9LfzqV3DbbUm/hKSyeDE8/jjU1MCrr7rpt4EDW/affLJ7LSiAadPg5z+HDPOdMLoJXWkB/Rk4PaJtFvCKqk4CXvE+IyJTgAuBA7xjfi8iOd4x9wBX0RI8FTlmj6G62t0ce/VyN8Ff/ALOPhtmzmzp85e/uBtM794wZIi78ZaVtex//XW3FvDPf8Jhh7kn+U9+EkpL4T//gYMPduOffTbs2NFy3MyZru3WW2HYMOjbF2bNcjftn/zEnWvYMLc/nNtug6lToaTEWQxXXAEVFSn7igC44w444QS48UbYf3/3evzxrj0Wv/sdXHst7Ltv9P2PPOJu4g8+CAceCJ/9LFx/vbvGTLaCfvEL9/eaORPeeQeOPbb1/nHjYPx4eO01+L//gw0bnCVkGMmmyxSQqr4B7IxoPhd40Hv/IHBeWPtjqlqvqutw85MzRGQ40EdV56mq4qJ7z6OH8p3vOCXx9NPuSXbZMnjzzdZ9Ghrgppvcvueeg+3b3fRLJD/+sbshL1gAu3Y5i+Hmm2HOHKekli93iiWcN96Adevc/nvvdQvYZ54J9fXw1luu/6xZrReyAwF3nuXLnbfVO++4m3wsrr7a
KcFY24YN7R8/bx6cemrrttNOg7ffjn3ejpg3zynrcA/+006DTZtg/frExk4VGzfCU0/BV78KH34IjY1tFRDAiSe6v+vZZ7sHliOP7HJRjR5Aut2wh6rqZgBV3SwioRDxkcD8sH6h4KhG731ke1L55jfdVE1XMm1ax0/k4VRVwQMPwEMPuWkggPvvh1GjWvf78pdb3k+YAPfc46yA0tLWfX/6U3czBXfDv/ZapzgO9TJDXXop/P3vrcfu2xfuvtutIey3H/zmN+7m++KLbv+++8Ls2e5J+rDDXNs3v9ly/LhxTmmde66zItrLUHPzzfDd78b+PkaMaH/fli3OQgxn6FDXnghbtrT9vkPn2bLFWRGZxgMPOOvsqqvcdy4Cn/hE234nnuh+T8uXu98mOGWVl9el4hrdnHQroPZoLzgqrqApEbkKN11Hfn5+ciTLED76yN0QZsxoaSspcVNB4Sxe7CygpUth586WqaENG1rfPKdObXkfuokedFDrtvLy1mNPmeKUT3iffv1a94k87tVX3RTQypVQWek8rBoa3A27PSUyZIjbEiFypVC1bVuyxo3Wnimcfrqz2MaPdxbc1KnuQSKS6V5kyeLFrs9JJ7nfw29/27XyGt2bdCugrSIy3LN+hgOhW1V7wVGl3vvI9qh4EcFzAEpKSnzPysdjiaQLPze66mo3JXTyyc6TacgQNwX3yU+6m3444U+2oTEj25qb2z8m1CdaW+i4jz+Gs86CK690Vs3Age4Gd9FFbeUJ5+qr3VpWLFasgDFjou8bNqyttVNe3tYqipf2xoXEx04VRxzhNlVYssRNsUVj4kQ3tblkibOiBw6EJ56A229v/dBhGImQbjfsucCl3vtLgWfD2i8UkQIRGY9zNnjHm67bIyJHet5vl4Qd06OYONHd7N95p6WtpgY+CEu6/uGHTuH8/Odunn+//dpaMV3JokVO0dx+Oxx1lJui2+Qj5vrmm50FF2uLNQV31FHw8sut215+2eU+S4SjjnJrbnV1rccdMaJ9t+108vTTTuGDU5zbtjknk2gEAm5fqP+FF8LWrW7N0TCSRVe6YYcFREmpiFwOzAZOEZHVwCneZ1R1OfAEsAJ4EbhGVUPhcF8F7sM5JnwE/LOrriGT6NXLPZlefz288oqzAK64wlkbIQtmzBjnSnvXXbB2LTz/PPzoR+mTedIkJ98ddzjnhb/+1Z+1OWSIU7ixttwYtvx117VM/X34oXt97bXW61F33eUUdDhr1jjltmmTU5whZRey1r7wBSgudt5kH3zgFvdnz4ZvfzvzpuCCQWdJhrwSly1zr6H1nWgceqjr19TknEtKSpz7tmEkiy6bglPVKL5XAJzUTv9bgFuitC8CDmx7RM/j179202znnOMU0re+5Z5SCwvd/sGD3ULzD37gnAWmTnUuwqenyXF96lQX/HjrrfDDHzoL5Ne/dh53qeToo1225x/+0Hn77bOPu5EeEVb1ZPt2WLWq9XFXXNH6if+QQ9zrunXOwunb11k811zj1kz693eeid/+dmqvpzO88Yazfi+4wH0OOdmEr/1Fcsgh7ve1Zg1Mnux+Z3//u1PW5oxgJAPRTA5YSCIlJSVaXV0ddd/KlSvZf//9u1ii5FNfD2PHwve+526ERuaRrt9aaB2tvNxZbRdd5JwQYrmLL1nirKDHH3eK6+23nVfcl77U8pBjxEeyk5GKSI2qliQiUzpJtxOCkQBLljhvshkzYM8eZ1ns2ZN6i8LILlRh7lw44wynfMClEIpl/YCzekTctCU4SzLRdTPDCCfdTghGgtx2m5sqOfFEN/32xhttY1OMns2GDS5t0Jlnus9NTbB6tYsHi0VxsbOoQwoIYPNmlwHCMJKBWUBZzCGHWIoUo2PGjnVplELu8OvXO0eKyZM7Pna//VoroCefdEHKM2Y4pxLDSASzgAyjB1BQ0JIyKORs4VcBrVrVorxCVtQ/e6TvqZFsTAEZRjemstKlQQqlR4L4FVBNjUvd
BC6d0+TJ8MILyZfV6HmYAjKMbsy//+2CSUvC/KRWrYIBA2DQoI6PD8VGhU/DnXGGS1QaHoBrGJ3BFJBhdGNeecXFiIVns16
      "text/plain": [
       "<Figure size 432x288 with 2 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "# extra code – this cell generates Figure 2–18\n",
    "\n",
    "# column vector of 500 evenly spaced ages spanning the observed range\n",
    "ages = np.linspace(housing[\"housing_median_age\"].min(),\n",
    "                   housing[\"housing_median_age\"].max(),\n",
    "                   num=500)[:, np.newaxis]\n",
    "\n",
    "# RBF similarity between every age and 35, for two values of gamma\n",
    "gamma1, gamma2 = 0.1, 0.03\n",
    "rbf1 = rbf_kernel(ages, [[35]], gamma=gamma1)\n",
    "rbf2 = rbf_kernel(ages, [[35]], gamma=gamma2)\n",
    "\n",
    "# left axis: histogram of the housing_median_age column\n",
    "fig, ax1 = plt.subplots()\n",
    "ax1.hist(housing[\"housing_median_age\"], bins=50)\n",
    "ax1.set(xlabel=\"Housing median age\", ylabel=\"Number of districts\")\n",
    "\n",
    "# right axis: the two similarity curves, on a twin axis sharing the x-axis\n",
    "color = \"blue\"\n",
    "ax2 = ax1.twinx()\n",
    "ax2.set_ylabel(\"Age similarity\", color=color)\n",
    "ax2.tick_params(axis=\"y\", labelcolor=color)\n",
    "ax2.plot(ages, rbf1, color=color, label=\"gamma = 0.10\")\n",
    "ax2.plot(ages, rbf2, color=color, label=\"gamma = 0.03\", linestyle=\"--\")\n",
    "\n",
    "ax2.legend(loc=\"upper left\")\n",
    "save_fig(\"age_similarity_plot\")\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.linear_model import LinearRegression\n",
    "\n",
    "# scale the labels by hand; to_frame() turns them into a single-column\n",
    "# DataFrame before they are passed to the scaler\n",
    "target_scaler = StandardScaler()\n",
    "scaled_labels = target_scaler.fit_transform(housing_labels.to_frame())\n",
    "\n",
    "# train on the scaled labels, using median_income as the only feature\n",
    "model = LinearRegression()\n",
    "model.fit(housing[[\"median_income\"]], scaled_labels)\n",
    "some_new_data = housing[[\"median_income\"]].iloc[:5]  # pretend this is new data\n",
    "\n",
    "# the model predicts scaled values: map them back with the same scaler\n",
    "scaled_predictions = model.predict(some_new_data)\n",
    "predictions = target_scaler.inverse_transform(scaled_predictions)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[131997.15275877],\n",
       "       [299359.35844434],\n",
       "       [146023.37185694],\n",
       "       [138840.33653057],\n",
       "       [192016.61557639]])"
      ]
     },
     "execution_count": 86,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "predictions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.compose import TransformedTargetRegressor\n",
    "\n",
    "model = TransformedTargetRegressor(LinearRegression(),\n",
    "                                   transformer=StandardScaler())\n",
    "model.fit(housing[[\"median_income\"]], housing_labels)\n",
    "predictions = model.predict(some_new_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([131997.15275877, 299359.35844434, 146023.37185694, 138840.33653057,\n",
       "       192016.61557639])"
      ]
     },
     "execution_count": 88,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "predictions"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Custom Transformers"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "To create simple transformers:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import FunctionTransformer\n",
    "\n",
    "log_transformer = FunctionTransformer(np.log, inverse_func=np.exp)\n",
    "log_pop = log_transformer.transform(housing[[\"population\"]])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "metadata": {},
   "outputs": [],
   "source": [
    "rbf_transformer = FunctionTransformer(rbf_kernel,\n",
    "                                      kw_args=dict(Y=[[35.]], gamma=0.1))\n",
    "age_simil_35 = rbf_transformer.transform(housing[[\"housing_median_age\"]])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[2.81118530e-13],\n",
       "       [8.20849986e-02],\n",
       "       [6.70320046e-01],\n",
       "       ...,\n",
       "       [9.55316054e-22],\n",
       "       [6.70320046e-01],\n",
       "       [3.03539138e-04]])"
      ]
     },
     "execution_count": 91,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "age_simil_35"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 92,
   "metadata": {},
   "outputs": [],
   "source": [
    "sf_coords = 37.7749, -122.41\n",
    "sf_transformer = FunctionTransformer(rbf_kernel,\n",
    "                                     kw_args=dict(Y=[sf_coords], gamma=0.1))\n",
    "sf_simil = sf_transformer.transform(housing[[\"latitude\", \"longitude\"]])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 93,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[0.999927  ],\n",
       "       [0.05258419],\n",
       "       [0.94864161],\n",
       "       ...,\n",
       "       [0.00388525],\n",
       "       [0.05038518],\n",
       "       [0.99868067]])"
      ]
     },
     "execution_count": 93,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sf_simil"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 94,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[0.5 ],\n",
       "       [0.75]])"
      ]
     },
     "execution_count": 94,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ratio_transformer = FunctionTransformer(lambda X: X[:, [0]] / X[:, [1]])\n",
    "ratio_transformer.transform(np.array([[1., 2.], [3., 4.]]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 95,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.base import BaseEstimator, TransformerMixin\n",
    "from sklearn.utils.validation import check_array, check_is_fitted\n",
    "\n",
    "class StandardScalerClone(BaseEstimator, TransformerMixin):\n",
    "    \"\"\"Minimal standardizing transformer: subtract the mean, divide by the std.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    with_mean : bool, default=True\n",
    "        If True, subtract the training mean of each feature before scaling.\n",
    "\n",
    "    Attributes learned by fit()\n",
    "    ---------------------------\n",
    "    mean_ : per-feature mean of the training data\n",
    "    scale_ : per-feature standard deviation of the training data, with\n",
    "        (near-)zero values replaced by 1.0\n",
    "    n_features_in_ : number of features seen during fit()\n",
    "    \"\"\"\n",
    "    def __init__(self, with_mean=True):  # no *args or **kwargs!\n",
    "        self.with_mean = with_mean\n",
    "\n",
    "    def fit(self, X, y=None):  # y is required even though we don't use it\n",
    "        X = check_array(X)  # validates X and returns it as an array\n",
    "        self.mean_ = X.mean(axis=0)\n",
    "        scale = X.std(axis=0)\n",
    "        # a constant feature has a zero std: dividing by it would fill the\n",
    "        # whole column with NaN/inf, so such features are divided by 1 instead\n",
    "        scale[scale < 10 * np.finfo(scale.dtype).eps] = 1.0\n",
    "        self.scale_ = scale\n",
    "        self.n_features_in_ = X.shape[1]  # every estimator stores this in fit()\n",
    "        return self  # always return self!\n",
    "\n",
    "    def transform(self, X):\n",
    "        check_is_fitted(self)  # looks for learned attributes (with trailing _)\n",
    "        X = check_array(X)\n",
    "        # explicit check rather than an assert: asserts are removed by\n",
    "        # `python -O`, and a 1-column X would then broadcast silently\n",
    "        if X.shape[1] != self.n_features_in_:\n",
    "            raise ValueError(\n",
    "                f\"X has {X.shape[1]} features, but {type(self).__name__} \"\n",
    "                f\"was fitted with {self.n_features_in_} features\")\n",
    "        if self.with_mean:\n",
    "            X = X - self.mean_\n",
    "        return X / self.scale_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 96,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.cluster import KMeans\n",
    "\n",
    "class ClusterSimilarity(BaseEstimator, TransformerMixin):\n",
    "    \"\"\"Replace each sample by its RBF similarity to every k-means center.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    n_clusters : int, default=10\n",
    "        Number of clusters passed to KMeans; also the number of names\n",
    "        returned by get_feature_names_out().\n",
    "    gamma : float, default=1.0\n",
    "        Passed to rbf_kernel() as its gamma argument.\n",
    "    random_state : default=None\n",
    "        Passed to KMeans as its random_state argument.\n",
    "\n",
    "    Attributes learned by fit()\n",
    "    ---------------------------\n",
    "    kmeans_ : the fitted KMeans instance\n",
    "    \"\"\"\n",
    "    def __init__(self, n_clusters=10, gamma=1.0, random_state=None):\n",
    "        self.n_clusters = n_clusters\n",
    "        self.gamma = gamma\n",
    "        self.random_state = random_state\n",
    "\n",
    "    def fit(self, X, y=None, sample_weight=None):\n",
    "        # n_init is pinned to 10: its default value differs between\n",
    "        # Scikit-Learn versions, which would change the clusters found\n",
    "        self.kmeans_ = KMeans(n_clusters=self.n_clusters, n_init=10,\n",
    "                              random_state=self.random_state)\n",
    "        self.kmeans_.fit(X, sample_weight=sample_weight)\n",
    "        return self  # always return self!\n",
    "\n",
    "    def transform(self, X):\n",
    "        check_is_fitted(self)  # fails with a clear error if fit() was not called\n",
    "        return rbf_kernel(X, self.kmeans_.cluster_centers_, gamma=self.gamma)\n",
    "    \n",
    "    def get_feature_names_out(self, names=None):\n",
    "        # the input names are ignored: one output column per cluster\n",
    "        return [f\"Cluster {i} similarity\" for i in range(self.n_clusters)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "metadata": {},
   "outputs": [],
   "source": [
    "cluster_simil = ClusterSimilarity(n_clusters=10, gamma=1., random_state=42)\n",
    "similarities = cluster_simil.fit_transform(housing[[\"latitude\", \"longitude\"]],\n",
    "                                           sample_weight=housing_labels)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 98,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[0.  , 0.14, 0.  , 0.  , 0.  , 0.08, 0.  , 0.99, 0.  , 0.6 ],\n",
       "       [0.63, 0.  , 0.99, 0.  , 0.  , 0.  , 0.04, 0.  , 0.11, 0.  ],\n",
       "       [0.  , 0.29, 0.  , 0.  , 0.01, 0.44, 0.  , 0.7 , 0.  , 0.3 ]])"
      ]
     },
     "execution_count": 98,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "similarities[:3].round(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 99,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAp0AAAHoCAYAAAAL0lTRAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/MnkTPAAAACXBIWXMAAAsTAAALEwEAmpwYAAEAAElEQVR4nOydd3hURReH39m+m4T0BAgl9N4RBUFBpAkWULB/iF3sigUrKp8VO5bPBlbELoKCKCCgKL33FloIJQkp2/fO98fdNNJ7wHmfZ59k986dOXd2N/ndM3POEVJKFAqFQqFQKBSK6sRQ2wYoFAqFQqFQKE5/lOhUKBQKhUKhUFQ7SnQqFAqFQqFQKKodJToVCoVCoVAoFNWOEp0KhUKhUCgUimpHiU6FQqFQKBQKRbVjqm0DKkJMTIxMTEysbTOKJTs7m5CQkNo241+Neg9qFzX/tYua/9pFzX/tkp2dzdatW49JKWNry4aWQkhnNfSbDPOklEOroesa4ZQUnYmJiaxcubK2zSiWRYsW0b9//9o241+Neg9qFzX/tYua/9pFzX/tsmjRIgYMGJBUmzY4gVuqod9JEFMN3dYYp6ToVCgUCoVCoairCJTAKgo1JwqFQqFQKBRViADMtW1EHUQFEikUCoVCoVAoqh3l6VQoFAqFQqGoQtTyetEoT6dCoVAoFAqFotpRQlyhUCgUlSYjI4MjR47g8/lq25RaJzw8nC1bttS2GactZrOZuLg46tWrV9umFIva01k0SnQqFAqFolJkZGSQkpJCQkICdrsdIURtm1SrZGZmEhYWVttmnJZIKXG5XBw8eBCgTgtPRWHU8rpCoVAoKsWRI0dISEjA4XCUS3AuXLiQxMREFi5cWCXtFKc/QggcDgcJCQkcOXKkts0plpw9nVX9ONVRolOhUCgUlcLn82G328t1zsKFCxkxYgRJSUmMGDGCRYsWlamdEp4KALvdXqe3cuQsr1f141RHiU6FQqFQVJryejhHjBiB06kXCnQ6nQwfPryQ8CyqnRKeCijf501Rd1CiU6FQKBQ1xslCMoeThWdJ7ZTwVNR11PJ60SjRqVAoFIoaoTghmUOO8HzxxRdLbVeTwjMxMZEpU6bUyFgKxelMjYtOIYRRCLFGCDE7+PwlIcRWIcR6IcT3QoiImrZJoVAoFNXPuHHjihWSOTidTp566qkytRs3blylbUpJSeHuu++mRYsWWK1WEhISGDZsGD///HOl+y6O6667jhEjRlRb/zXB3r17EUKwcuXK2jalTqL2dBZNbXg67wbyJzCbD3SUUnYGtgMTa8EmhUKhUFQz06ZNw+FwlNquNMEJ4HA4mD59eqXs2bt3L927d2fevHk899xzrF+/nt9++43hw4dz6623VqrvmsDv9yOlrG0zKo3X661tE6octbxeNDUqOoUQjYDhwAc5r0kpf5VS+oNP/wYa1aRNCoVCoagZBgwYwOzZs8skPEvC4XAwZ84c+vfvX6l+xo8fj5SSlStXMmbMGNq0aUO7du244447WLduXbHnCSH45ptvCrx28hL8//73P1q3bo3NZiM2NpYhQ4bg9/uZNGkSH3/8MXPmzEEIgRAidx/rwYMHueKKK4iMjCQyMpLhw4ezY8eO3D4nTZpEx44dmT59eq5nNjs7u0gbt27dykUXXUR4eDihoaH07t2bDRs25B6fNm0a7du3x2az0bp1a1599VU0TStwje+99x6jR48mJCSE5s2b89lnn+Ueb9asGQBnnHEGQogC70VZ+n7rrbcYNWoUISEhPPLII/h8Pu666y4aNmyI1WqlcePGPPzww8W+B4pTk5oWzq8BDwLFZc29HphZY9ZUEI8nwJo1R6lf30FiokpMq1AoFGUlR3iWtGezJKpKcKampjJ37lwmT55MaGhooeORkZEV7nv16tXcfvvtfPzxx/Tt25f09HQWLFgAwIQJE9iyZQupqal8+umnAERFReF0OhkwYAB9
+vThjz/+wGKxMGXKFM4//3y2bNmSK9T37NnDF198wddff43FYsFmsxUa/9ChQ/Tt25ezzz6b+fPnExERwfLlywkEAgC8//77PPHEE7z55pv06NGDjRs3ctNNN2E2m7njjjty+3n66ad5/vnnee655/jwww+5/vrr6devH02bNmX58uX06tWLuXPn0qVLFywWS7n6fuqpp3j22WeZMmUKQgjeeOMNvv/+e7788ksSExM5cOAA27Ztq/B7UNuoikRFU2OiUwgxAjgipVwlhOhfxPFHAT/weTHn3wzcDBAfH19sTrfqRtMkmzen4fPpX97GjUOJiSmYny4rK6vW7KsLpKf7SEpyYjQKWrQIwW431rgN//b3oLZR81+71PT8h4eHk5mZWeb2PXv25KuvvmL06NG4XK4yn2e32/n666/p0aNHucYrinXr1iGlJDExsdS+pJR4PJ4C7VwuV4Hn+dvs27ePkJAQBgwYQFhYGFFRUTRv3jz3Wk0mE0ajkZCQEAA8Hg+ffvopgUCAN954Izcd0JQpU2jevDlff/01o0aNwuPx4PV6eeedd4iLi8u142ReffVVHA4HH330Ua4YvPjiiwG9WtLTTz/NU089xZAhQwDo378/99xzD1OnTmXs2LG5/Vx++eW55z344IO8/vrr/Prrr1xxxRW5eVltNlvudZSn75EjR3L55ZfnPt+xYwctWrSga9euCCGIjIykU6dOJb43bre7yM95VlZWsecoapea9HSeDVwkhLgAsAH1hBCfSSmvEUKMBUYAA2UxG1SklO8B7wH07NlTVvYut6I89NCfTJlykJyVAofDRHb2bQXaLFq0qNJ34acqUkpCQmbhcumivFcvA//807/G7fg3vwd1ATX/tUtNz/+WLVvKXfZx+PDhTJo0qUxBQ6B7OCdNmsQFF1xQUTML9Qe6kC3NdiEEVqu1QLuTz8vfZuDAgTRt2pTOnTszZMgQBg8ezKhRo3Lbm81mTCZTgfM3bdpEUlISDRs2LDC20+nk0KFDhIWFYbVaadSoES1atCjR3k2bNtGvXz+io6MLHTt69CgHDhzgnnvu4b777st9PWd/aH6bevbsWeB5bGxsbonPHO9wSEhIbpvy9N2nT58Cz2+++WYGDRpEjx49GDx4MBdccAHDhg3DYCh+F6DNZqNbt26FXq8LN7zK01k0NSY6pZQTCQYJBT2dE4KCcyjwEHCulLL8ay01SFaWl9dfX0u+rSkYDCpB7cnk5OwVAoxGNT8KhaIwCxcuLLPgBF18TZo0iTPOOKNKBHWrVq0QQrBlyxZGjhxZrnOFEIUCePJXxwkLC2P16tUsXryY+fPn89xzz/HII4+wYsWKQqIyB03T6Nq1K19++WWhY1FRUbm/53gVS6Kk4KKcvZXvvvsuffr0KbEfs7mgbBJCFNibWZm+T76O7t27s3fvXubOncuCBQsYO3YsXbp0Yf78+SUKT8WpRV14J6ei7/GcL4RYK4R4t7YNKo60NE+B5wYDTJ16brn6+PTTPbRqNYexY//B5yv+y1sVaJpk3rx0fvghFbe7esfKQd9gfyaNGtlo1y6MDz/sXiPjFsX06Vto3vxjmjSZxiuvrDktojwVitOB0vJ1FkdxlYsqQlRUFEOGDGHq1KlFLsemp6cXe25sbCzJycm5z1NSUgo8B30J/bzzzsuNis/Ozmb27NkAWCyW3P2VOXTv3p2dO3cSExNDy5YtCzzyi86y0L17d5YuXVpkVHh8fDwJCQns2rWr0DgtW7Ys8xg5y/b5r6OyfYeFhTF69Gjeeecd5syZw4IFC9i5c2eZbaprqOj1wtTKNUgpFwGLgr+X/VNeQ6SkOHnxxVV4PAHuuacrLVtGANCoUSjnntuIBQuS0TTBvfd24ppr2pS53xMnvNx440q8Xo1Dh1ycd14cY8c2q6argKuu2smcOekAtGpl459/OmA2V/99xrBh9dm/v2qWwCrKiRNebr99EU6nnhjh8cf/Jjzcwg03dKhV
uxSKfzsVFZw55AjPqggmevvtt+nTpw89e/bkmWeeoXPnzkgpWbhwIc899xz79u0r8rzzzjuPt956iz59+mA0GnnkkUcKBPT
      "text/plain": [
       "<Figure size 720x504 with 2 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "# extra code – this cell generates Figure 2–19\n",
    "\n",
    "# work on a renamed copy: the new column names are the ones referenced\n",
    "# by name in the plot call below\n",
    "housing_renamed = housing.rename(columns={\n",
    "    \"latitude\": \"Latitude\", \"longitude\": \"Longitude\",\n",
    "    \"population\": \"Population\",\n",
    "    \"median_house_value\": \"Median house value (ᴜsᴅ)\"})\n",
    "# for each row, keep the largest of its cluster similarities\n",
    "housing_renamed[\"Max cluster similarity\"] = similarities.max(axis=1)\n",
    "\n",
    "# marker size follows the population, color follows the max similarity\n",
    "housing_renamed.plot(kind=\"scatter\", x=\"Longitude\", y=\"Latitude\", grid=True,\n",
    "                     s=housing_renamed[\"Population\"] / 100, label=\"Population\",\n",
    "                     c=\"Max cluster similarity\",\n",
    "                     cmap=\"jet\", colorbar=True,\n",
    "                     legend=True, sharex=False, figsize=(10, 7))\n",
    "# cluster_simil was fitted on [latitude, longitude] (in that order), so\n",
    "# column 1 of the centers goes on the x-axis and column 0 on the y-axis\n",
    "plt.plot(cluster_simil.kmeans_.cluster_centers_[:, 1],\n",
    "         cluster_simil.kmeans_.cluster_centers_[:, 0],\n",
    "         linestyle=\"\", color=\"black\", marker=\"X\", markersize=20,\n",
    "         label=\"Cluster centers\")\n",
    "plt.legend(loc=\"upper right\")\n",
    "save_fig(\"district_cluster_plot\")\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Transformation Pipelines"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now let's build a pipeline to preprocess the numerical attributes:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 100,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.pipeline import Pipeline\n",
    "\n",
    "num_pipeline = Pipeline([\n",
    "    (\"impute\", SimpleImputer(strategy=\"median\")),\n",
    "    (\"standardize\", StandardScaler()),\n",
    "])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 101,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.pipeline import make_pipeline\n",
    "\n",
    "num_pipeline = make_pipeline(SimpleImputer(strategy=\"median\"), StandardScaler())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 102,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='median')),\n",
       "                ('standardscaler', StandardScaler())])"
      ]
     },
     "execution_count": 102,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn import set_config\n",
    "\n",
    "set_config(display='diagram')\n",
    "\n",
    "num_pipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 103,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[-1.42,  1.01,  1.86,  0.31,  1.37,  0.14,  1.39, -0.94],\n",
       "       [ 0.6 , -0.7 ,  0.91, -0.31, -0.44, -0.69, -0.37,  1.17]])"
      ]
     },
     "execution_count": 103,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "housing_num_prepared = num_pipeline.fit_transform(housing_num)\n",
    "housing_num_prepared[:2].round(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 104,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Monkey-patching SimpleImputer.get_feature_names_out()\n",
      "Monkey-patching FunctionTransformer.get_feature_names_out()\n"
     ]
    }
   ],
   "source": [
    "def monkey_patch_get_signature_names_out():\n",
    "    \"\"\"Add get_feature_names_out() to the classes that lack it.\n",
    "\n",
    "    Some Scikit-Learn classes did not handle get_feature_names_out()\n",
    "    correctly in 1.0.0. For each class below, nothing is done if the class\n",
    "    already has the method; otherwise a message is printed and:\n",
    "\n",
    "    * SimpleImputer gets StandardScaler's get_feature_names_out();\n",
    "    * FunctionTransformer gets a keyword-only `feature_names_out` constructor\n",
    "      argument (default None) and a get_feature_names_out() method using it:\n",
    "      None falls back to StandardScaler's implementation, a callable is\n",
    "      called with the input names, and any other value is returned as is.\n",
    "    \"\"\"\n",
    "    from inspect import Parameter, Signature, signature\n",
    "    from sklearn.impute import SimpleImputer\n",
    "    from sklearn.preprocessing import FunctionTransformer, StandardScaler\n",
    "\n",
    "    default_get_feature_names_out = StandardScaler.get_feature_names_out\n",
    "\n",
    "    if not hasattr(SimpleImputer, \"get_feature_names_out\"):\n",
    "        print(\"Monkey-patching SimpleImputer.get_feature_names_out()\")\n",
    "        SimpleImputer.get_feature_names_out = default_get_feature_names_out\n",
    "\n",
    "    if not hasattr(FunctionTransformer, \"get_feature_names_out\"):\n",
    "        print(\"Monkey-patching FunctionTransformer.get_feature_names_out()\")\n",
    "        orig_init = FunctionTransformer.__init__\n",
    "        orig_sig = signature(orig_init)\n",
    "\n",
    "        def __init__(*args, feature_names_out=None, **kwargs):\n",
    "            # bind() is only called for validation: it raises a TypeError\n",
    "            # if the arguments do not match the original signature\n",
    "            orig_sig.bind(*args, **kwargs)\n",
    "            orig_init(*args, **kwargs)\n",
    "            args[0].feature_names_out = feature_names_out  # args[0] is self\n",
    "\n",
    "        # advertise the extra argument in the signature, with its default:\n",
    "        # without a default it would be reported as a required argument\n",
    "        __init__.__signature__ = Signature(\n",
    "            list(orig_sig.parameters.values()) + [\n",
    "                Parameter(\"feature_names_out\", Parameter.KEYWORD_ONLY,\n",
    "                          default=None)])\n",
    "\n",
    "        def get_feature_names_out(self, names=None):\n",
    "            if self.feature_names_out is None:\n",
    "                return default_get_feature_names_out(self, names)\n",
    "            elif callable(self.feature_names_out):\n",
    "                return self.feature_names_out(names)\n",
    "            else:\n",
    "                return self.feature_names_out\n",
    "\n",
    "        FunctionTransformer.__init__ = __init__\n",
    "        FunctionTransformer.get_feature_names_out = get_feature_names_out\n",
    "\n",
    "monkey_patch_get_signature_names_out()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 105,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_housing_num_prepared = pd.DataFrame(\n",
    "    housing_num_prepared, columns=num_pipeline.get_feature_names_out(),\n",
    "    index=housing_num.index)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 106,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>longitude</th>\n",
       "      <th>latitude</th>\n",
       "      <th>housing_median_age</th>\n",
       "      <th>total_rooms</th>\n",
       "      <th>total_bedrooms</th>\n",
       "      <th>population</th>\n",
       "      <th>households</th>\n",
       "      <th>median_income</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>13096</th>\n",
       "      <td>-1.423037</td>\n",
       "      <td>1.013606</td>\n",
       "      <td>1.861119</td>\n",
       "      <td>0.311912</td>\n",
       "      <td>1.368167</td>\n",
       "      <td>0.137460</td>\n",
       "      <td>1.394812</td>\n",
       "      <td>-0.936491</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14973</th>\n",
       "      <td>0.596394</td>\n",
       "      <td>-0.702103</td>\n",
       "      <td>0.907630</td>\n",
       "      <td>-0.308620</td>\n",
       "      <td>-0.435925</td>\n",
       "      <td>-0.693771</td>\n",
       "      <td>-0.373485</td>\n",
       "      <td>1.171942</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \\\n",
       "13096  -1.423037  1.013606            1.861119     0.311912        1.368167   \n",
       "14973   0.596394 -0.702103            0.907630    -0.308620       -0.435925   \n",
       "\n",
       "       population  households  median_income  \n",
       "13096    0.137460    1.394812      -0.936491  \n",
       "14973   -0.693771   -0.373485       1.171942  "
      ]
     },
     "execution_count": 106,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_housing_num_prepared.head(2)  # extra code"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 107,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('simpleimputer', SimpleImputer(strategy='median')),\n",
       " ('standardscaler', StandardScaler())]"
      ]
     },
     "execution_count": 107,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "num_pipeline.steps"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 108,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "StandardScaler()"
      ]
     },
     "execution_count": 108,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "num_pipeline[1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 109,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='median'))])"
      ]
     },
     "execution_count": 109,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "num_pipeline[:-1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 110,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "SimpleImputer(strategy='median')"
      ]
     },
     "execution_count": 110,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "num_pipeline.named_steps[\"simpleimputer\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 111,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='median')),\n",
       "                ('standardscaler', StandardScaler())])"
      ]
     },
     "execution_count": 111,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "num_pipeline.set_params(simpleimputer__strategy=\"median\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 112,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.compose import ColumnTransformer\n",
    "\n",
    "# columns handled by num_pipeline (defined earlier) and by cat_pipeline\n",
    "num_attribs = [\"longitude\", \"latitude\", \"housing_median_age\", \"total_rooms\",\n",
    "               \"total_bedrooms\", \"population\", \"households\", \"median_income\"]\n",
    "cat_attribs = [\"ocean_proximity\"]\n",
    "\n",
    "# categorical columns: impute with the most frequent value, then one-hot\n",
    "# encode\n",
    "cat_pipeline = make_pipeline(\n",
    "    SimpleImputer(strategy=\"most_frequent\"),\n",
    "    OneHotEncoder(handle_unknown=\"ignore\"))\n",
    "\n",
    "# each (name, transformer, columns) entry applies one pipeline to its columns\n",
    "preprocessing = ColumnTransformer([\n",
    "    (\"num\", num_pipeline, num_attribs),\n",
    "    (\"cat\", cat_pipeline, cat_attribs),\n",
    "])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 113,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.compose import make_column_selector, make_column_transformer\n",
    "\n",
    "preprocessing = make_column_transformer(\n",
    "    (num_pipeline, make_column_selector(dtype_include=np.number)),\n",
    "    (cat_pipeline, make_column_selector(dtype_include=object)),\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 114,
   "metadata": {},
   "outputs": [],
   "source": [
    "housing_prepared = preprocessing.fit_transform(housing)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 115,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pipeline-1__longitude</th>\n",
       "      <th>pipeline-1__latitude</th>\n",
       "      <th>pipeline-1__housing_median_age</th>\n",
       "      <th>pipeline-1__total_rooms</th>\n",
       "      <th>pipeline-1__total_bedrooms</th>\n",
       "      <th>pipeline-1__population</th>\n",
       "      <th>pipeline-1__households</th>\n",
       "      <th>pipeline-1__median_income</th>\n",
       "      <th>pipeline-2__ocean_proximity_&lt;1H OCEAN</th>\n",
       "      <th>pipeline-2__ocean_proximity_INLAND</th>\n",
       "      <th>pipeline-2__ocean_proximity_ISLAND</th>\n",
       "      <th>pipeline-2__ocean_proximity_NEAR BAY</th>\n",
       "      <th>pipeline-2__ocean_proximity_NEAR OCEAN</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>13096</th>\n",
       "      <td>-1.423037</td>\n",
       "      <td>1.013606</td>\n",
       "      <td>1.861119</td>\n",
       "      <td>0.311912</td>\n",
       "      <td>1.368167</td>\n",
       "      <td>0.137460</td>\n",
       "      <td>1.394812</td>\n",
       "      <td>-0.936491</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14973</th>\n",
       "      <td>0.596394</td>\n",
       "      <td>-0.702103</td>\n",
       "      <td>0.907630</td>\n",
       "      <td>-0.308620</td>\n",
       "      <td>-0.435925</td>\n",
       "      <td>-0.693771</td>\n",
       "      <td>-0.373485</td>\n",
       "      <td>1.171942</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       pipeline-1__longitude  pipeline-1__latitude  \\\n",
       "13096              -1.423037              1.013606   \n",
       "14973               0.596394             -0.702103   \n",
       "\n",
       "       pipeline-1__housing_median_age  pipeline-1__total_rooms  \\\n",
       "13096                        1.861119                 0.311912   \n",
       "14973                        0.907630                -0.308620   \n",
       "\n",
       "       pipeline-1__total_bedrooms  pipeline-1__population  \\\n",
       "13096                    1.368167                0.137460   \n",
       "14973                   -0.435925               -0.693771   \n",
       "\n",
       "       pipeline-1__households  pipeline-1__median_income  \\\n",
       "13096                1.394812                  -0.936491   \n",
       "14973               -0.373485                   1.171942   \n",
       "\n",
       "       pipeline-2__ocean_proximity_<1H OCEAN  \\\n",
       "13096                                    0.0   \n",
       "14973                                    1.0   \n",
       "\n",
       "       pipeline-2__ocean_proximity_INLAND  pipeline-2__ocean_proximity_ISLAND  \\\n",
       "13096                                 0.0                                 0.0   \n",
       "14973                                 0.0                                 0.0   \n",
       "\n",
       "       pipeline-2__ocean_proximity_NEAR BAY  \\\n",
       "13096                                   1.0   \n",
       "14973                                   0.0   \n",
       "\n",
       "       pipeline-2__ocean_proximity_NEAR OCEAN  \n",
       "13096                                     0.0  \n",
       "14973                                     0.0  "
      ]
     },
     "execution_count": 115,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# extra code – shows that we can get a DataFrame out if we want\n",
    "housing_prepared_fr = pd.DataFrame(\n",
    "    housing_prepared,\n",
    "    columns=preprocessing.get_feature_names_out(),\n",
    "    index=housing.index)\n",
    "housing_prepared_fr.head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 116,
   "metadata": {},
   "outputs": [],
   "source": [
    "def column_ratio(X):\n",
    "    \"\"\"Divide column 0 of X by column 1, keeping a 2D single-column result.\"\"\"\n",
    "    return X[:, [0]] / X[:, [1]]\n",
    "\n",
    "class _FixedFeatureNames:\n",
    "    \"\"\"Callable that always reports the same output feature names.\n",
    "\n",
    "    Scikit-Learn >= 1.1 only accepts a callable (or \"one-to-one\") for\n",
    "    FunctionTransformer's feature_names_out, and calls it with the\n",
    "    transformer and the input names, whereas the monkey-patched version\n",
    "    defined earlier calls it with the input names only: accepting *args\n",
    "    supports both. An instance of a module-level class is used rather than\n",
    "    a lambda so that the pipeline can still be pickled.\n",
    "    \"\"\"\n",
    "    def __init__(self, names):\n",
    "        self.names = list(names)\n",
    "\n",
    "    def __call__(self, *args):\n",
    "        return list(self.names)\n",
    "\n",
    "    def __repr__(self):\n",
    "        return f\"_FixedFeatureNames({self.names!r})\"\n",
    "\n",
    "def _same_feature_names(*args):\n",
    "    \"\"\"Report the input feature names unchanged (always the last argument).\"\"\"\n",
    "    return args[-1]\n",
    "\n",
    "def ratio_pipeline(name=None):\n",
    "    \"\"\"Build an impute -> column_ratio -> standardize pipeline.\n",
    "\n",
    "    `name` is the name reported for the single output feature (\"ratio\" when\n",
    "    it is None). The pipeline must be applied to exactly two columns,\n",
    "    numerator first.\n",
    "    \"\"\"\n",
    "    feature_name = \"ratio\" if name is None else name\n",
    "    return make_pipeline(\n",
    "        SimpleImputer(strategy=\"median\"),\n",
    "        FunctionTransformer(column_ratio,\n",
    "                            feature_names_out=_FixedFeatureNames([feature_name])),\n",
    "        StandardScaler())\n",
    "\n",
    "# the log keeps one output column per input column, so the names are\n",
    "# passed through unchanged\n",
    "log_pipeline = make_pipeline(\n",
    "    SimpleImputer(strategy=\"median\"),\n",
    "    FunctionTransformer(np.log, feature_names_out=_same_feature_names),\n",
    "    StandardScaler())\n",
    "cluster_simil = ClusterSimilarity(n_clusters=10, gamma=1., random_state=42)\n",
    "default_num_pipeline = make_pipeline(SimpleImputer(strategy=\"median\"),\n",
    "                                     StandardScaler())\n",
    "preprocessing = ColumnTransformer([\n",
    "        (\"bedrooms_ratio\", ratio_pipeline(\"bedrooms_ratio\"),\n",
    "                           [\"total_bedrooms\", \"total_rooms\"]),\n",
    "        (\"rooms_per_house\", ratio_pipeline(\"rooms_per_house\"),\n",
    "                            [\"total_rooms\", \"households\"]),\n",
    "        (\"people_per_house\", ratio_pipeline(\"people_per_house\"),\n",
    "                             [\"population\", \"households\"]),\n",
    "        (\"log\", log_pipeline, [\"total_bedrooms\", \"total_rooms\",\n",
    "                               \"population\", \"households\", \"median_income\"]),\n",
    "        (\"geo\", cluster_simil, [\"latitude\", \"longitude\"]),\n",
    "        (\"cat\", cat_pipeline, make_column_selector(dtype_include=object)),\n",
    "    ],\n",
    "    remainder=default_num_pipeline)  # one column remaining: housing_median_age"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 117,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(16512, 24)"
      ]
     },
     "execution_count": 117,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "housing_prepared = preprocessing.fit_transform(housing)\n",
    "housing_prepared.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 118,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(['bedrooms_ratio__bedrooms_ratio',\n",
       "       'rooms_per_house__rooms_per_house',\n",
       "       'people_per_house__people_per_house', 'log__total_bedrooms',\n",
       "       'log__total_rooms', 'log__population', 'log__households',\n",
       "       'log__median_income', 'geo__Cluster 0 similarity',\n",
       "       'geo__Cluster 1 similarity', 'geo__Cluster 2 similarity',\n",
       "       'geo__Cluster 3 similarity', 'geo__Cluster 4 similarity',\n",
       "       'geo__Cluster 5 similarity', 'geo__Cluster 6 similarity',\n",
       "       'geo__Cluster 7 similarity', 'geo__Cluster 8 similarity',\n",
       "       'geo__Cluster 9 similarity', 'cat__ocean_proximity_<1H OCEAN',\n",
       "       'cat__ocean_proximity_INLAND', 'cat__ocean_proximity_ISLAND',\n",
       "       'cat__ocean_proximity_NEAR BAY', 'cat__ocean_proximity_NEAR OCEAN',\n",
       "       'remainder__housing_median_age'], dtype=object)"
      ]
     },
     "execution_count": 118,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "preprocessing.get_feature_names_out()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Select and Train a Model"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Training and Evaluating on the Training Set"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 119,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Pipeline(steps=[('columntransformer',\n",
       "                 ColumnTransformer(remainder=Pipeline(steps=[('simpleimputer',\n",
       "                                                              SimpleImputer(strategy='median')),\n",
       "                                                             ('standardscaler',\n",
       "                                                              StandardScaler())]),\n",
       "                                   transformers=[('bedrooms_ratio',\n",
       "                                                  Pipeline(steps=[('simpleimputer',\n",
       "                                                                   SimpleImputer(strategy='median')),\n",
       "                                                                  ('functiontransformer',\n",
       "                                                                   FunctionTransformer(feature_names_out=['bedrooms_ratio'],\n",
       "                                                                                       f...\n",
       "                                                   'median_income']),\n",
       "                                                 ('geo',\n",
       "                                                  ClusterSimilarity(random_state=42),\n",
       "                                                  ['latitude', 'longitude']),\n",
       "                                                 ('cat',\n",
       "                                                  Pipeline(steps=[('simpleimputer',\n",
       "                                                                   SimpleImputer(strategy='most_frequent')),\n",
       "                                                                  ('onehotencoder',\n",
       "                                                                   OneHotEncoder(handle_unknown='ignore'))]),\n",
       "                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7f9b50613dc0>)])),\n",
       "                ('linearregression', LinearRegression())])"
      ]
     },
     "execution_count": 119,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.linear_model import LinearRegression\n",
    "\n",
    "lin_reg = make_pipeline(preprocessing, LinearRegression())\n",
    "lin_reg.fit(housing, housing_labels)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let's try the full preprocessing pipeline on a few training instances:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 120,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([243700., 372400., 128800.,  94400., 328300.])"
      ]
     },
     "execution_count": 120,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "housing_predictions = lin_reg.predict(housing)\n",
    "housing_predictions[:5].round(-2)  # -2 = rounded to the nearest hundred"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Compare against the actual values:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 121,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([458300., 483800., 101700.,  96100., 361800.])"
      ]
     },
     "execution_count": 121,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "housing_labels.iloc[:5].values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 122,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "-46.8%, -23.0%, 26.6%, -1.8%, -9.3%\n"
     ]
    }
   ],
   "source": [
    "# extra code – computes the error ratios discussed in the book\n",
    "error_ratios = housing_predictions[:5].round(-2) / housing_labels.iloc[:5].values - 1\n",
    "print(\", \".join([f\"{100 * ratio:.1f}%\" for ratio in error_ratios]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 123,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "68687.89176589991"
      ]
     },
     "execution_count": 123,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.metrics import mean_squared_error\n",
    "\n",
    "lin_rmse = mean_squared_error(housing_labels, housing_predictions,\n",
    "                              squared=False)\n",
    "lin_rmse"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 124,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Pipeline(steps=[('columntransformer',\n",
       "                 ColumnTransformer(remainder=Pipeline(steps=[('simpleimputer',\n",
       "                                                              SimpleImputer(strategy='median')),\n",
       "                                                             ('standardscaler',\n",
       "                                                              StandardScaler())]),\n",
       "                                   transformers=[('bedrooms_ratio',\n",
       "                                                  Pipeline(steps=[('simpleimputer',\n",
       "                                                                   SimpleImputer(strategy='median')),\n",
       "                                                                  ('functiontransformer',\n",
       "                                                                   FunctionTransformer(feature_names_out=['bedrooms_ratio'],\n",
       "                                                                                       f...\n",
       "                                                  ClusterSimilarity(random_state=42),\n",
       "                                                  ['latitude', 'longitude']),\n",
       "                                                 ('cat',\n",
       "                                                  Pipeline(steps=[('simpleimputer',\n",
       "                                                                   SimpleImputer(strategy='most_frequent')),\n",
       "                                                                  ('onehotencoder',\n",
       "                                                                   OneHotEncoder(handle_unknown='ignore'))]),\n",
       "                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7f9b50613dc0>)])),\n",
       "                ('decisiontreeregressor',\n",
       "                 DecisionTreeRegressor(random_state=42))])"
      ]
     },
     "execution_count": 124,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.tree import DecisionTreeRegressor\n",
    "\n",
    "tree_reg = make_pipeline(preprocessing, DecisionTreeRegressor(random_state=42))\n",
    "tree_reg.fit(housing, housing_labels)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 125,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.0"
      ]
     },
     "execution_count": 125,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "housing_predictions = tree_reg.predict(housing)\n",
    "tree_rmse = mean_squared_error(housing_labels, housing_predictions,\n",
    "                              squared=False)\n",
    "tree_rmse"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Better Evaluation Using Cross-Validation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 126,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import cross_val_score\n",
    "\n",
    "tree_rmses = -cross_val_score(tree_reg, housing, housing_labels,\n",
    "                              scoring=\"neg_root_mean_squared_error\", cv=10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 127,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "count       10.000000\n",
       "mean     66868.027288\n",
       "std       2060.966425\n",
       "min      63649.536493\n",
       "25%      65338.078316\n",
       "50%      66801.953094\n",
       "75%      68229.934454\n",
       "max      70094.778246\n",
       "dtype: float64"
      ]
     },
     "execution_count": 127,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.Series(tree_rmses).describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 128,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "count       10.000000\n",
       "mean     69858.018195\n",
       "std       4182.205077\n",
       "min      65397.780144\n",
       "25%      68070.536263\n",
       "50%      68619.737842\n",
       "75%      69810.076342\n",
       "max      80959.348171\n",
       "dtype: float64"
      ]
     },
     "execution_count": 128,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# extra code – computes the error stats for the linear model\n",
    "lin_rmses = -cross_val_score(lin_reg, housing, housing_labels,\n",
    "                              scoring=\"neg_root_mean_squared_error\", cv=10)\n",
    "pd.Series(lin_rmses).describe()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Warning:** the following cell may take a few minutes to run:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 129,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.ensemble import RandomForestRegressor\n",
    "\n",
    "forest_reg = make_pipeline(preprocessing,\n",
    "                           RandomForestRegressor(random_state=42))\n",
    "forest_rmses = -cross_val_score(forest_reg, housing, housing_labels,\n",
    "                                scoring=\"neg_root_mean_squared_error\", cv=10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 130,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "count       10.000000\n",
       "mean     47019.561281\n",
       "std       1033.957120\n",
       "min      45458.112527\n",
       "25%      46464.031184\n",
       "50%      46967.596354\n",
       "75%      47325.694987\n",
       "max      49243.765795\n",
       "dtype: float64"
      ]
     },
     "execution_count": 130,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.Series(forest_rmses).describe()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let's compare this RMSE measured using cross-validation (the \"validation error\") with the RMSE measured on the training set (the \"training error\"):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 131,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "17474.619286483998"
      ]
     },
     "execution_count": 131,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "forest_reg.fit(housing, housing_labels)\n",
    "housing_predictions = forest_reg.predict(housing)\n",
    "forest_rmse = mean_squared_error(housing_labels, housing_predictions,\n",
    "                                 squared=False)\n",
    "forest_rmse"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The training error is much lower than the validation error, which usually means that the model has overfit the training set. Another possible explanation may be that there's a mismatch between the training data and the validation data, but it's not the case here, since both came from the same dataset that we shuffled and split in two parts."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Fine-Tune Your Model"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Grid Search"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Warning:** the following cell may take a few minutes to run:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 132,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "GridSearchCV(cv=3,\n",
       "             estimator=Pipeline(steps=[('preprocessing',\n",
       "                                        ColumnTransformer(remainder=Pipeline(steps=[('simpleimputer',\n",
       "                                                                                     SimpleImputer(strategy='median')),\n",
       "                                                                                    ('standardscaler',\n",
       "                                                                                     StandardScaler())]),\n",
       "                                                          transformers=[('bedrooms_ratio',\n",
       "                                                                         Pipeline(steps=[('simpleimputer',\n",
       "                                                                                          SimpleImputer(strategy='median')),\n",
       "                                                                                         ('functiontransformer',\n",
       "                                                                                          FunctionTransformer(feature_names_...\n",
       "                                                                         <sklearn.compose._column_transformer.make_column_selector object at 0x7f9b50613dc0>)])),\n",
       "                                       ('random_forest',\n",
       "                                        RandomForestRegressor(random_state=42))]),\n",
       "             param_grid=[{'preprocessing__geo__n_clusters': [5, 8, 10],\n",
       "                          'random_forest__max_features': [4, 6, 8]},\n",
       "                         {'preprocessing__geo__n_clusters': [10, 15],\n",
       "                          'random_forest__max_features': [6, 8, 10]}],\n",
       "             scoring='neg_root_mean_squared_error')"
      ]
     },
     "execution_count": 132,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.model_selection import GridSearchCV\n",
    "\n",
    "full_pipeline = Pipeline([\n",
    "    (\"preprocessing\", preprocessing),\n",
    "    (\"random_forest\", RandomForestRegressor(random_state=42)),\n",
    "])\n",
    "param_grid = [\n",
    "    {'preprocessing__geo__n_clusters': [5, 8, 10],\n",
    "     'random_forest__max_features': [4, 6, 8]},\n",
    "    {'preprocessing__geo__n_clusters': [10, 15],\n",
    "     'random_forest__max_features': [6, 8, 10]},\n",
    "]\n",
    "grid_search = GridSearchCV(full_pipeline, param_grid, cv=3,\n",
    "                           scoring='neg_root_mean_squared_error')\n",
    "grid_search.fit(housing, housing_labels)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "You can get the full list of hyperparameters available for tuning by looking at `full_pipeline.get_params().keys()`:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 133,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "dict_keys(['memory', 'steps', 'verbose', 'preprocessing', 'random_forest', 'preprocessing__n_jobs', 'preprocessing__remainder__memory', 'preprocessing__remainder__steps', 'preprocessing__remainder__verbose', 'preprocessing__remainder__simpleimputer', 'preprocessing__remainder__standardscaler', 'preprocessing__remainder__simpleimputer__add_indicator', 'preprocessing__remainder__simpleimputer__copy', 'preprocessing__remainder__simpleimputer__fill_value', 'preprocessing__remainder__simpleimputer__missing_values', 'preprocessing__remainder__simpleimputer__strategy', 'preprocessing__remainder__simpleimputer__verbose', 'preprocessing__remainder__standardscaler__copy', 'preprocessing__remainder__standardscaler__with_mean', 'preprocessing__remainder__standardscaler__with_std', 'preprocessing__remainder', 'preprocessing__sparse_threshold', 'preprocessing__transformer_weights', 'preprocessing__transformers', 'preprocessing__verbose', 'preprocessing__verbose_feature_names_out', 'preprocessing__be...\n"
     ]
    }
   ],
   "source": [
    "# extra code – shows part of the output of get_params().keys()\n",
    "print(str(full_pipeline.get_params().keys())[:1000] + \"...\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The best hyperparameter combination found:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 134,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'preprocessing__geo__n_clusters': 15, 'random_forest__max_features': 6}"
      ]
     },
     "execution_count": 134,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "grid_search.best_params_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 135,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Pipeline(steps=[('preprocessing',\n",
       "                 ColumnTransformer(remainder=Pipeline(steps=[('simpleimputer',\n",
       "                                                              SimpleImputer(strategy='median')),\n",
       "                                                             ('standardscaler',\n",
       "                                                              StandardScaler())]),\n",
       "                                   transformers=[('bedrooms_ratio',\n",
       "                                                  Pipeline(steps=[('simpleimputer',\n",
       "                                                                   SimpleImputer(strategy='median')),\n",
       "                                                                  ('functiontransformer',\n",
       "                                                                   FunctionTransformer(feature_names_out=['bedrooms_ratio'],\n",
       "                                                                                       func=...\n",
       "                                                  ClusterSimilarity(n_clusters=15,\n",
       "                                                                    random_state=42),\n",
       "                                                  ['latitude', 'longitude']),\n",
       "                                                 ('cat',\n",
       "                                                  Pipeline(steps=[('simpleimputer',\n",
       "                                                                   SimpleImputer(strategy='most_frequent')),\n",
       "                                                                  ('onehotencoder',\n",
       "                                                                   OneHotEncoder(handle_unknown='ignore'))]),\n",
       "                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7f9b410ec490>)])),\n",
       "                ('random_forest',\n",
       "                 RandomForestRegressor(max_features=6, random_state=42))])"
      ]
     },
     "execution_count": 135,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "grid_search.best_estimator_"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let's look at the score of each hyperparameter combination tested during the grid search:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 136,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>n_clusters</th>\n",
       "      <th>max_features</th>\n",
       "      <th>split0</th>\n",
       "      <th>split1</th>\n",
       "      <th>split2</th>\n",
       "      <th>mean_test_rmse</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>15</td>\n",
       "      <td>6</td>\n",
       "      <td>43460</td>\n",
       "      <td>43919</td>\n",
       "      <td>44748</td>\n",
       "      <td>44042</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>15</td>\n",
       "      <td>8</td>\n",
       "      <td>44132</td>\n",
       "      <td>44075</td>\n",
       "      <td>45010</td>\n",
       "      <td>44406</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>15</td>\n",
       "      <td>10</td>\n",
       "      <td>44374</td>\n",
       "      <td>44286</td>\n",
       "      <td>45316</td>\n",
       "      <td>44659</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>10</td>\n",
       "      <td>6</td>\n",
       "      <td>44683</td>\n",
       "      <td>44655</td>\n",
       "      <td>45657</td>\n",
       "      <td>44999</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>10</td>\n",
       "      <td>6</td>\n",
       "      <td>44683</td>\n",
       "      <td>44655</td>\n",
       "      <td>45657</td>\n",
       "      <td>44999</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   n_clusters max_features  split0  split1  split2  mean_test_rmse\n",
       "12         15            6   43460   43919   44748           44042\n",
       "13         15            8   44132   44075   45010           44406\n",
       "14         15           10   44374   44286   45316           44659\n",
       "7          10            6   44683   44655   45657           44999\n",
       "9          10            6   44683   44655   45657           44999"
      ]
     },
     "execution_count": 136,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cv_res = pd.DataFrame(grid_search.cv_results_)\n",
    "cv_res.sort_values(by=\"mean_test_score\", ascending=False, inplace=True)\n",
    "\n",
    "# extra code – these few lines of code just make the DataFrame look nicer\n",
    "cv_res = cv_res[[\"param_preprocessing__geo__n_clusters\",\n",
    "                 \"param_random_forest__max_features\", \"split0_test_score\",\n",
    "                 \"split1_test_score\", \"split2_test_score\", \"mean_test_score\"]]\n",
    "score_cols = [\"split0\", \"split1\", \"split2\", \"mean_test_rmse\"]\n",
    "cv_res.columns = [\"n_clusters\", \"max_features\"] + score_cols\n",
    "cv_res[score_cols] = -cv_res[score_cols].round().astype(np.int64)\n",
    "\n",
    "cv_res.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Randomized Search"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Warning:** the following cell may take a few minutes to run:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 137,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.experimental import enable_halving_search_cv\n",
    "from sklearn.model_selection import HalvingRandomSearchCV"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Try 30 (`n_iter` × `cv`) random combinations of hyperparameters:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 138,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "RandomizedSearchCV(cv=3,\n",
       "                   estimator=Pipeline(steps=[('preprocessing',\n",
       "                                              ColumnTransformer(remainder=Pipeline(steps=[('simpleimputer',\n",
       "                                                                                           SimpleImputer(strategy='median')),\n",
       "                                                                                          ('standardscaler',\n",
       "                                                                                           StandardScaler())]),\n",
       "                                                                transformers=[('bedrooms_ratio',\n",
       "                                                                               Pipeline(steps=[('simpleimputer',\n",
       "                                                                                                SimpleImputer(strategy='median')),\n",
       "                                                                                               ('functiontransformer',\n",
       "                                                                                                FunctionTransformer(feature_...\n",
       "                                                                               <sklearn.compose._column_transformer.make_column_selector object at 0x7f9b50613dc0>)])),\n",
       "                                             ('random_forest',\n",
       "                                              RandomForestRegressor(random_state=42))]),\n",
       "                   param_distributions={'preprocessing__geo__n_clusters': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f9b103bb760>,\n",
       "                                        'random_forest__max_features': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f9b410decd0>},\n",
       "                   random_state=42, scoring='neg_root_mean_squared_error')"
      ]
     },
     "execution_count": 138,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.model_selection import RandomizedSearchCV\n",
    "from scipy.stats import randint\n",
    "\n",
    "param_distribs = {'preprocessing__geo__n_clusters': randint(low=3, high=50),\n",
    "                  'random_forest__max_features': randint(low=2, high=20)}\n",
    "\n",
    "rnd_search = RandomizedSearchCV(\n",
    "    full_pipeline, param_distributions=param_distribs, n_iter=10, cv=3,\n",
    "    scoring='neg_root_mean_squared_error', random_state=42)\n",
    "\n",
    "rnd_search.fit(housing, housing_labels)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 139,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>n_clusters</th>\n",
       "      <th>max_features</th>\n",
       "      <th>split0</th>\n",
       "      <th>split1</th>\n",
       "      <th>split2</th>\n",
       "      <th>mean_test_rmse</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>45</td>\n",
       "      <td>9</td>\n",
       "      <td>41287</td>\n",
       "      <td>42150</td>\n",
       "      <td>42627</td>\n",
       "      <td>42021</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>32</td>\n",
       "      <td>7</td>\n",
       "      <td>41690</td>\n",
       "      <td>42542</td>\n",
       "      <td>43224</td>\n",
       "      <td>42485</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>41</td>\n",
       "      <td>16</td>\n",
       "      <td>42223</td>\n",
       "      <td>42959</td>\n",
       "      <td>43321</td>\n",
       "      <td>42834</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>42</td>\n",
       "      <td>4</td>\n",
       "      <td>41818</td>\n",
       "      <td>43094</td>\n",
       "      <td>43817</td>\n",
       "      <td>42910</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>23</td>\n",
       "      <td>8</td>\n",
       "      <td>42264</td>\n",
       "      <td>42996</td>\n",
       "      <td>43830</td>\n",
       "      <td>43030</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  n_clusters max_features  split0  split1  split2  mean_test_rmse\n",
       "1         45            9   41287   42150   42627           42021\n",
       "8         32            7   41690   42542   43224           42485\n",
       "0         41           16   42223   42959   43321           42834\n",
       "5         42            4   41818   43094   43817           42910\n",
       "2         23            8   42264   42996   43830           43030"
      ]
     },
     "execution_count": 139,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# extra code – displays the random search results, best score first\n",
    "cv_res = (pd.DataFrame(rnd_search.cv_results_)\n",
    "          .sort_values(by=\"mean_test_score\", ascending=False))\n",
    "cv_res = cv_res[[\n",
    "    \"param_preprocessing__geo__n_clusters\",\n",
    "    \"param_random_forest__max_features\",\n",
    "    \"split0_test_score\",\n",
    "    \"split1_test_score\",\n",
    "    \"split2_test_score\",\n",
    "    \"mean_test_score\",\n",
    "]]\n",
    "cv_res.columns = [\"n_clusters\", \"max_features\"] + score_cols\n",
    "# flip the sign of the scores and round them to whole numbers\n",
    "cv_res[score_cols] = -cv_res[score_cols].round().astype(np.int64)\n",
    "cv_res.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Bonus section: how to choose the sampling distribution for a hyperparameter**\n",
    "\n",
    "* `scipy.stats.randint(a, b+1)`: for hyperparameters with _discrete_ values that range from a to b, and all values in that range seem equally likely.\n",
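    "* `scipy.stats.uniform(a, b - a)`: this is very similar, but for _continuous_ hyperparameters that range from a to b. Note that SciPy's `uniform(loc, scale)` covers the interval from `loc` to `loc + scale`, so the second argument is the width of the range, not its upper bound.\n",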
    "* `scipy.stats.geom(1 / scale)`: for discrete values, when you want to sample roughly in a given scale. E.g., with scale=1000 most samples will be in this ballpark, but ~10% of all samples will be <100 and ~10% will be >2300.\n",
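    "* `scipy.stats.expon(scale=scale)`: this is the continuous equivalent of `geom`. Just set `scale` to the most likely value. Pass it as a keyword argument: the first positional argument of `expon()` is `loc`, not `scale`.\n",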
    "* `scipy.stats.reciprocal(a, b)`: when you have almost no idea what the optimal hyperparameter value's scale is. If you set a=0.01 and b=100, then you're just as likely to sample a value between 0.01 and 0.1 as a value between 10 and 100.\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Here are plots of the probability mass functions (for discrete variables), and probability density functions (for continuous variables) for `randint()`, `uniform()`, `geom()` and `expon()`:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 140,
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAuQAAAGxCAYAAAAqD6O8AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/MnkTPAAAACXBIWXMAAAsTAAALEwEAmpwYAABnCElEQVR4nO3deXxU5dn/8c9FCCQE2STsIOCGaAUxAor7UlFQ1MdW0VpRK1LFpVYf6erSPv60Wh+1bqCiohRrUR+polSsKMqOArIIskrYwqLse67fH3MSJ/sMZOZMku/79ZpX5txnu84Qrrlyzn3uY+6OiIiIiIiEo1bYAYiIiIiI1GQqyEVEREREQqSCXEREREQkRCrIRURERERCpIJcRERERCREKshFREREREKU1ILczHqb2UIzW2xmQ0qZf7WZzQlek8ysS0XrmlkTM/vQzL4JfjZO1vGIiFRnytkiIsmRtILczNKAp4ELgM5AfzPrXGyxZcAZ7n488CdgWAzrDgE+cvcjgY+CaREROQjK2SIiyZPMM+TdgcXuvtTd9wCvA/2iF3D3Se7+XTA5BWgTw7r9gFeC968AlyTuEEREagzlbBGRJKmdxH21BlZGTecCPcpZ/gbg/RjWbe7uawDcfY2ZNSttY2Y2EBgIkJWVdWKnTp3iPgARkbDNnDlzg7tnJ2FXytkiIpUglrydzILcSmnzUhc0O4tIcj813nXL4u7DCC6n5uTk+IwZM+JZXUQkJZjZimTtqpQ25WwRkTjFkreT2WUlF2gbNd0GWF18ITM7HngB6OfuG2NYd52ZtQzWbQnkVXLcIiI1kXK2iEiSJLMgnw4caWYdzKwOcCUwJnoBM2sHvAVc4+6LYlx3DHBt8P5a4J0EHoOISE2hnC0ikiRJ67Li7vvMbDAwDkgDhrv7PDMbFMx/DvgjcCjwjJkB7HP3nLLWDTb9EPCGmd0AfAv8JFnHJCJSXSlni4gkj7nH1a2vWlB/RBGpqsxsprvnhB1HMilni0hVFkve1pM6RURERERClMxRVqQGys/PJzc3l+3bt4cdikiVkJWVRZs2bahVS+dLpPrasmULeXl57N27N+xQRA5Keno6zZo1o0GDBge1HRXkklAbNmzAzDj66KNVYIhUID8/n1WrVrFhwwaaNSt1eG6RKm/Lli2sW7eO1q1bk5mZSXD/gUiV4+7s3LmTVatWARxUUa4KSRLq+++/p3nz5irGRWJQq1YtmjdvzubNm8MORSRh8vLyaN26NfXq1VMxLlWamVGvXj1at25NXt7BjeCqKkkSav/+/aSnp4cdhkiVkZ6ezr59+8IOQyRh9u7dS2ZmZthhiFSazMzMg+5+pYJcEk5nQERip/8vUhPo91yqk8r4fVZBLiIiIiISIhXkIiIiIiIh0igrknTth7yX1P0tf6hP0vY1YMAANmzYwLvvvpu0fSbScccdx+WXX859990HQPv27Rk8eDB33XXXQW977969HHvssbzwwgucfvrpB729quDyyy/nlFNO4c477ww7FJGUo++GcC1fvpwOHTowffp0cnIiz7D5/PPP+eUvf8nXX3/NKaecwoQJE5IeV5jfFe+++y6///3v+eKLLxI+OIXOkItUoieeeILXXnst7DASZvr06dx8880xL798+XLMjNKesjhs2DBat25dJMF+9913XHPNNTRs2JCGDRtyzTXX8P3338cV48svv4yZlfqaPn16XNuKx7x587j88svp2LEjZlb4R0y0e++9lz//+c8aRUWkhqkK3w1t27ZlzZo1dO3atbDt9ttvp0uXLixZsoS33norlLjC/K7o27cvaWlpjBw5sjIPqVQqyEUqUcOGDWnUqFHS9pfsh2pkZ2dTr169StnW3/72N2644YYibVdddRVffPEF77//Ph988AFffPEF11xzTVzbveKKK1izZk2R189+9jM6dOhQeNYnFmeeeSYvv/xyzMvv2LGD9u3b8+c//5kO
HTqUusyPfvQjOnbsmPJfzCJSuZL93XAg0tLSaNGiBbVr/9B5YvHixZx99tm0bduWJk2aHNB29+zZc1Bxhf1dcd111/Hkk08e1DHEQgW5SDGffvopPXv2pH79+jRs2JAePXowd+7cwvlTpkzh7LPPJisri4YNG3LOOeewevVqIHJZsm/fvoXLnnnmmQwaNIjbb7+dxo0b07hxY+6++27y8/MBeOCBBzjuuONKxNCrVy9uu+22Im0FZ5tHjRrF2WefTWZmJkOHDmXjxo3079+fNm3akJmZybHHHstLL71UZN0zzzyTm2++md/+9rc0bdqUZs2acddddxXGAZGxgfv160dmZiaHHXYYw4cPLxFX+/btefTRRwunzYxhw4bxk5/8hKysrBLFZkFhetJJJ2FmnHnmmQDMmDGDRYsWFfmsFixYwAcffMCwYcM45ZRTOPnkkxk6dCjvvvsuCxcuLONfq6TMzExatGhR+GrQoAH/+te/+MUvfpHQkR1OOukkHn30Ua666qpy/2i5+OKLGTVqVMLiEJHESPXvhuJXIs2M0aNHF1nmzTff5LzzzqNevXp07tyZDz/8sNTtFLzfvHkz119/PWZWeILi008/pUePHmRkZNC8eXN+9atfFSm6zzzzTH75y19y1113kZ2dTa9evZgwYQJmxvvvv8+JJ55IZmYmp512Grm5uXzyySd06dKF+vXr07dvXzZu3Fi4rVT4rrj44ouZMWMGixcvjnnbB0IFuUiUffv20a9fP0499VRmz57N1KlTuf3220lLSwNg9uzZnHXWWRxxxBF8/vnnTJkyhZ/+9Kfljhs9cuRI8vPzmTx5MkOHDmXYsGE8/vjjAFx//fV8/fXXTJs2rXD5hQsXMmnSpBJnBAr85je/4eabb2b+/Plccskl7Nq1i27duvHuu+8yb948br/9dm666SY++uijEnHUrl2bSZMm8dRTT/H444/zj3/8o3D+gAEDWLx4MePHj+f//u//GDFiBMuXL6/wM3vggQfo168fs2fP5oorruD6669nxYoVAIXH9cEHH7BmzZrCS54TJ07kiCOOKHLGaPLkydSvX59TTjmlsK1Xr15kZWUxadKkCuMoyxtvvMH27du57rrrDngblal79+5MmzaNnTt3hh2KiMSoKnw3xOJ3v/sdt912G7Nnz+akk07iyiuvZNu2bSWWK+i+Uq9ePR5//HHWrFnDFVdcwapVq7jgggs44YQT+PLLL3nxxRcZNWoUv/nNb4qs/9prr+HuTJw4kREjRhS233vvvTz++ONMnTqV7777jiuuuIIHHniAYcOGMWHCBObNm1eku18qfFe0a9eO5s2b88knnxzwtmOR1Js6zaw38ASQBrzg7g8Vm98JeAnoBvzO3R8N2o8G/hG1aEfgj+7+uJndB9wIrA/m/dbdxyb0QKTa2rJlC99//z0XXXQRhx9+OACdOnUqnP+Xv/yFLl26MGzYsMK2Y445ptxttmzZkieffBIzo1OnTixatIjHHnuMO++8kzZt2tC7d2+GDx9O9+7dARg+fDgnnngiXbp0KXV7t956K5dffnmRtrvvvrvw/cCBA/nPf/7DqFGjOOeccwrbO3fuzAMPPADAUUcdxfPPP89HH31E//79WbRoEe+//z6fffYZvXr1AuCVV16hY8eOFX5m11xzDT/72c8A+NOf/sQTTzzBxIkTOeyww8jOzgbg0EMPpUWLFoXrrFixgpYtWxbZztq1a8nOzi5yZsLMaNasGWvXrq0wjrIMGzaMvn37lthfWFq1asXevXtZvXp14e9YqlLOFomoCt8NsfjVr37FRRddBMCDDz7IiBEjmDVrFqeeemqR5Qq6r5gZDRs2LMzfzzzzDC1btuSZZ56hVq1aHHPMMTz00EPcdNNN/OlPfyq8OtihQwf++te/Fm5v3bp1QOQ74rTTTgNg0KBB3HrrrcycOZNu3boBcO211xae1YfU+a5o1apVTCeoDkbSzpCbWRrwNHAB0Bno
b2adiy22CbgNeDS60d0XuntXd+8KnAjsAN6OWuR/C+YrscvBaNKkCQMGDOD888+nT58+PPbYY6xcubJw/pdfflmkyI1Fz54
      "text/plain": [
       "<Figure size 864x504 with 4 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "# extra code – plots a few distributions you can use in randomized search\n",
    "\n",
    "from scipy.stats import randint, uniform, geom, expon\n",
    "\n",
    "# the discrete distributions are evaluated on the integers 0 to 7,\n",
    "# the continuous ones on 500 evenly spaced points between 0 and 7\n",
    "xs1 = np.arange(0, 7 + 1)\n",
    "xs2 = np.linspace(0, 7, 500)\n",
    "xs3 = np.arange(0, 7 + 1)\n",
    "xs4 = np.linspace(0, 7, 500)\n",
    "\n",
    "randint_distrib = randint(0, 7 + 1).pmf(xs1)\n",
    "uniform_distrib = uniform(0, 7).pdf(xs2)\n",
    "geom_distrib = geom(0.5).pmf(xs3)\n",
    "expon_distrib = expon(scale=1).pdf(xs4)\n",
    "\n",
    "# one panel per distribution, laid out on a 2x2 grid\n",
    "fig, axs = plt.subplots(2, 2, figsize=(12, 7))\n",
    "\n",
    "axs[0, 0].bar(xs1, randint_distrib, label=\"scipy.randint(0, 7 + 1)\")\n",
    "axs[0, 0].set_ylabel(\"Probability\")\n",
    "axs[0, 0].legend()\n",
    "axs[0, 0].axis([-1, 8, 0, 0.2])\n",
    "\n",
    "axs[0, 1].fill_between(xs2, uniform_distrib, label=\"scipy.uniform(0, 7)\")\n",
    "axs[0, 1].set_ylabel(\"PDF\")\n",
    "axs[0, 1].legend()\n",
    "axs[0, 1].axis([-1, 8, 0, 0.2])\n",
    "\n",
    "axs[1, 0].bar(xs3, geom_distrib, label=\"scipy.geom(0.5)\")\n",
    "axs[1, 0].set_xlabel(\"Hyperparameter value\")\n",
    "axs[1, 0].set_ylabel(\"Probability\")\n",
    "axs[1, 0].legend()\n",
    "axs[1, 0].axis([0, 7, 0, 1])\n",
    "\n",
    "axs[1, 1].fill_between(xs4, expon_distrib, label=\"scipy.expon(scale=1)\")\n",
    "axs[1, 1].set_xlabel(\"Hyperparameter value\")\n",
    "axs[1, 1].set_ylabel(\"PDF\")\n",
    "axs[1, 1].legend()\n",
    "axs[1, 1].axis([0, 7, 0, 1])\n",
    "\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Here are the PDFs for `expon()` and `reciprocal()` (left column), as well as the PDFs of log(X) (right column). The right column shows the distribution of hyperparameter _scales_. You can see that `expon()` favors hyperparameters with roughly the desired scale, with a longer tail towards the smaller scales. But `reciprocal()` does not favor any scale; they are all equally likely:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 141,
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAuQAAAGxCAYAAAAqD6O8AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/MnkTPAAAACXBIWXMAAAsTAAALEwEAmpwYAACNVklEQVR4nOzdeXwV1f3/8dcnO4Rd9kUBBQWpUkTEukHdV9S2VksVsK2gUrUuFfVX96+7qKUKIoIg7oqVqtUqsqkg+77vEJawb4GEJJ/fH/fmmoSskOTem7yfj8d95M7MmZnP3OSe+eTMmTPm7oiIiIiISHjEhDsAEREREZGqTAm5iIiIiEgYKSEXEREREQkjJeQiIiIiImGkhFxEREREJIyUkIuIiIiIhFFEJ+RmNtzMUs1sQSHLzcz+aWYrzGyemXWq6BhFRCRAdbaIyJGJ6IQceAu4pIjllwJtgq9bgMEVEJOIiBTsLVRni4iUWkQn5O4+CdhRRJEewCgPmArUMbMmFROdiIjkpjpbROTIxIU7gKPUDFifa3pDcN6m/AXN7BYCLTIkJyefdtJJJ1VIgCIiZWnmzJnb3L1BuOM4QqqzRaTKKUm9He0JuRUwzwsq6O5DgaEAJ/2io8+YMaM84xIRKRdmtjbcMRyFI6qzO3furDpbRKJWSertiO6yUgIbgBa5ppsDG4tbaX96ZrkFJCIihTqiOltEpLKL9oR8LHBT8M79rsBudz/s0md++5SQi4iEwxHV2SIilV1Ed1kxs/eAbkB9M9sAPALEA7j7EOBL4DJgBZAG9CnJdg9kZLE/PZPkxIg+fBGRqFJedbaISGUX0Rmpu99QzHIHbi/1doHpa3bQ7cSGRxqaiIjkU151tohIZRftXVaO2I8rt4c7BBERERGRyG4hL0/fL98W7hCqvOzsbDZs2MD+/fvDHYpIxEhOTqZ58+bExFTZ9hKJEHv27CE1NZVDhw6FOxSRiBUfH0/Dhg2pVavWUW2nyibkizbtYcf+DOolJ4Q7lCpr27ZtmBknnniikg8RAv+kpqSksG3bNho2VJc6CZ89e/awZcsWmjVrRrVq1TAraMRKkarN3Tlw4AApKSkAR5WUV+ks6MeVaiUPp127dtGoUSMl4yJBMTExNGrUiN27d4c7FKniUlNTadasGdWrV1cyLlIIM6N69eo0a9aM1NTUo9pWlc6EflihfuThlJWVRXx8fLjDEIko8fHxZGZqaFYJr0OHDlGtWrVwhyESFapVq3bUXbuqdEL+/Yqt4Q6hylPLi0he+k5IpNDfokjJlMV3pUon5Ot3HGDtdt1QKCIiIiLhU6UTcoDJGm1FRERERMKoyo6ykmPSsq38setx4Q5DgloO+KJC97fmmcsrbF+9e/dm27ZtfP755xW2z0j26KOPsm7dOoYPH14h+zMzPvroI377299WyP4K0qVLFwYMGMC1114bthhEjkak19HlWc/27t2b1q1b8/DDD5d4nfnz53PJJZewbNkykpOTyyyWbt260aFDB/71r38VWuatt96if//+7Nu3r8z2K+WnyreQT1m5ncys7HCHIVXAK6+8wujRo8MdRkRITU1l4MCB/L//9//CHUqZmTRpEldddRXNmjXDzHjrrbcOK/OPf/yD+++/n+xs1Tki0WT+/Pl89tln3HXXXQCkpaXRtm1b/vrXv+Ypt2XLFurXr8+zzz4LwC9+8Qu6du3KwIEDyzSeMWPG8PTTT4emW7ZsyQsvvHDU2928eTP169fnxRdfzDN/4cKFJCUl8cEHHxz1PqRgVT4h35ueyZz1u8IdhlQBtWvXpk6dOuEOIyIMGzaMLl260Lp163CHUmb27dtHhw4deOWVVwodneKyyy5j7969/Pe//63g6ETkaAwaNIjf/OY3oXGmq1evzsiRIxkyZAjjxo0LlfvLX/5C27Ztuffee0Pz+vTpw+DBg8t09KR69epRs2bNMttejsaNG/Paa6/x0EMP
sWjRIiAw4s5NN91Ejx49+P3vf1/m+5SAKp+QQ6DbikhJTZo0ia5du1KjRg1q167NGWecwYIFCwCYOnUqv/71r0lOTqZ27dqcf/75bNy4EQhc7rziiitC2+nWrRv9+vXjzjvvpG7dutStW5f77rsv1Hr6+OOP06FDh8P2f9ZZZ3HHHXccNj8lJYXrr78+tK3LL7+c5cuXA7B161aaNGnC448/Hio/b948kpKS+Pjjj4FAF5IOHTowbNgwjj32WKpVq8bVV1/Ntm0/32eRnZ3NE088QYsWLUhMTOQXv/gFn332WWj5mjVrMDM++eQTLrzwQqpXr0779u355ptv8sT67rvvctVVV5X4cy3us/3qq68455xzqFu3LvXq1ePiiy9m8eLFhf4Oi/u8jsRll13GU089xW9/+9tCx9aPjY3lsssu47333jvi/YhIyaWnp3PXXXfRqFEjkpKS6Nq1K99//32eMl988QUnnngiSUlJnHvuubz//vuYGWvWrAECQ/R++OGHh9VZZ555Jvfccw99+vRh9+7dDB8+nHHjxjFq1ChiY2ND5S666CJ27NjBhAkTCo2zcePGeVqfzzrrLGrWrBlK4pcvX46ZhR5A061bN/r37x96v3btWu677z7M7LARP8aNG0eHDh1ITk6me/furF69usjP7LrrruPqq6/mpptuIjMzkyeeeIKNGzcyePDgItcrCyNGjKB9+/YkJSXRtm1bXnrppdA58YknnqBx48Z5xvu+4YYb6NSpExkZGUCga+K//vUvLr/8cqpXr85xxx132JXp+fPnc8EFF1CtWjXq1atH79698zz7Iedc/corr9CsWTPq1q1Lnz59SEtLK9djV0IOTFRCLiWUmZlJjx49OPvss5k7dy4//fQTd955J7GxscydO5fu3btzwgkn8MMPPzB16lSuu+66IltF3nnnHbKzs5kyZQqvv/46Q4cO5eWXXwbg5ptvZsmSJUybNi1UfunSpfz444/86U9/yrOdtLQ0unfvTlJSEhMnTmTKlCk0adKECy64gLS0NBo0aMBbb73Fk08+yZQpUzhw4AA33HADN9xwQ54+1WvWrGH06NF89tlnfPvttyxfvpybb745tPyVV17h+eef59lnn2X+/Plcc801XHvttcyZMydPPA899BB33HEHc+fO5fTTT+f6668P9WPcsWMHixYtonPnziX6XIFiP9v9+/dz1113MW3aNCZMmEDt2rW58sorQ5V0fsV9XgCTJ0+mRo0aRb6eeuqpQn+3henSpQsTJ04s9XoiUnp///vf+eCDDxg+fDizZ8/mF7/4BZdccgmbNm0CYN26dVx77bVcfvnlzJ07lzvuuIO///3vebYxb948du/enafOyvH4449Tu3ZtbrzxRv72t7/x/PPPc8IJJ+Qpk5CQQMeOHYv83p933nmMHz8eCNRPM2bMIDExkRkzZgAwYcIETjjhBJo1a3bYumPGjKF58+Y8/PDDbNq0KXRsEPiH5Omnn2b48OFMmTKFXbt20a9fv2I/t9dee42UlBR69uzJ008/zbBhw6hXr16x6xWnqO56b7zxBg8++CCPP/44ixcv5sUXX+TZZ5/ltddeA+DBBx+kTZs2oXPSqFGj+Oyzz3j33XdJSPj5qeuPPPIIV111FXPmzOGWW27hpptuCn2OaWlpXHLJJdSoUYNp06bx6aef8uOPP+Y5z0Gg/l+wYAHffvstH3zwAZ9++imvvPLKUR9/Uar8TZ0A81J2s31fOsfUSAx3KBLh9uzZw65du7jyyis5/vjjATjppJMA6NmzJ6eeeipDhw4NlW/Xrl2R22vSpAn//Oc/MTNOOukkli1bxsCBA7n77rtp3rw5l1xyCcOHD6dLly4ADB8+nNNOO41TTz01z3bef/993J0RI0aEWkdef/11GjZsyOeff851113HxRdfzG233UbPnj0577zzSE9PZ9CgQXm2c+DAAUaNGsWxxx4b2sY555zD8uXLadOmDS+88AL3
3nsvf/jDH4DAyWjSpEm88MILeVoh/va3v3HllVcC8NRTTzFq1CjmzJnD2Wefzbp163B3mjRpUqLPFeC5554r8rP9zW9+k+c
      "text/plain": [
       "<Figure size 864x504 with 4 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "# extra code – shows the difference between expon and reciprocal\n",
    "\n",
    "from scipy.stats import reciprocal\n",
    "\n",
    "xs1 = np.linspace(0, 7, 500)\n",
    "expon_distrib = expon(scale=1).pdf(xs1)\n",
    "\n",
    "# if X ~ expon(scale=1), then Y = log(X) has the density exp(y - exp(y))\n",
    "log_xs2 = np.linspace(-5, 3, 500)\n",
    "log_expon_distrib = np.exp(log_xs2 - np.exp(log_xs2))\n",
    "\n",
    "xs3 = np.linspace(0.001, 1000, 500)\n",
    "reciprocal_distrib = reciprocal(0.001, 1000).pdf(xs3)\n",
    "\n",
    "# if X ~ reciprocal(a, b), then log(X) is uniform on [log(a), log(b)].\n",
    "# scipy's uniform(loc, scale) covers [loc, loc + scale], so the second\n",
    "# argument must be the width of that interval, not its upper bound\n",
    "log_xs4 = np.linspace(np.log(0.001), np.log(1000), 500)\n",
    "log_reciprocal_distrib = uniform(\n",
    "    np.log(0.001), np.log(1000) - np.log(0.001)).pdf(log_xs4)\n",
    "\n",
    "plt.figure(figsize=(12, 7))\n",
    "\n",
    "# left column: PDFs of the hyperparameter itself\n",
    "# right column: PDFs of its logarithm, i.e. of its scale\n",
    "plt.subplot(2, 2, 1)\n",
    "plt.fill_between(xs1, expon_distrib,\n",
    "                 label=\"scipy.expon(scale=1)\")\n",
    "plt.ylabel(\"PDF\")\n",
    "plt.legend()\n",
    "plt.axis([0, 7, 0, 1])\n",
    "\n",
    "plt.subplot(2, 2, 2)\n",
    "plt.fill_between(log_xs2, log_expon_distrib,\n",
    "                 label=\"log(X) with X ~ expon\")\n",
    "plt.legend()\n",
    "plt.axis([-5, 3, 0, 1])\n",
    "\n",
    "plt.subplot(2, 2, 3)\n",
    "plt.fill_between(xs3, reciprocal_distrib,\n",
    "                 label=\"scipy.reciprocal(0.001, 1000)\")\n",
    "plt.xlabel(\"Hyperparameter value\")\n",
    "plt.ylabel(\"PDF\")\n",
    "plt.legend()\n",
    "plt.axis([0.001, 1000, 0, 0.005])\n",
    "\n",
    "plt.subplot(2, 2, 4)\n",
    "plt.fill_between(log_xs4, log_reciprocal_distrib,\n",
    "                 label=\"log(X) with X ~ reciprocal\")\n",
    "plt.xlabel(\"Log of hyperparameter value\")\n",
    "plt.legend()\n",
    "# wide enough to show the whole support, from log(0.001) to log(1000)\n",
    "plt.axis([-8, 8, 0, 0.2])\n",
    "\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Analyze the Best Models and Their Errors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 142,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0.07, 0.05, 0.05, 0.01, 0.01, 0.01, 0.01, 0.19, 0.04, 0.01, 0.  ,\n",
       "       0.01, 0.01, 0.01, 0.01, 0.01, 0.  , 0.01, 0.01, 0.01, 0.  , 0.01,\n",
       "       0.01, 0.01, 0.01, 0.01, 0.  , 0.  , 0.02, 0.01, 0.01, 0.01, 0.02,\n",
       "       0.01, 0.  , 0.02, 0.03, 0.01, 0.01, 0.01, 0.01, 0.01, 0.02, 0.01,\n",
       "       0.01, 0.02, 0.01, 0.01, 0.01, 0.01, 0.01, 0.02, 0.01, 0.  , 0.07,\n",
       "       0.  , 0.  , 0.  , 0.01])"
      ]
     },
     "execution_count": 142,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# the best pipeline found by the search; it includes the preprocessing step\n",
    "final_model = rnd_search.best_estimator_\n",
    "feature_importances = (\n",
    "    final_model.named_steps[\"random_forest\"].feature_importances_)\n",
    "np.round(feature_importances, 2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 143,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[(0.18694559869103852, 'log__median_income'),\n",
       " (0.0748194905715524, 'cat__ocean_proximity_INLAND'),\n",
       " (0.06926417748515576, 'bedrooms_ratio__bedrooms_ratio'),\n",
       " (0.05446998753775219, 'rooms_per_house__rooms_per_house'),\n",
       " (0.05262301809680712, 'people_per_house__people_per_house'),\n",
       " (0.03819415873915732, 'geo__Cluster 0 similarity'),\n",
       " (0.02879263999929514, 'geo__Cluster 28 similarity'),\n",
       " (0.023530192521380392, 'geo__Cluster 24 similarity'),\n",
       " (0.020544786346378206, 'geo__Cluster 27 similarity'),\n",
       " (0.019873052631077512, 'geo__Cluster 43 similarity'),\n",
       " (0.018597511022930273, 'geo__Cluster 34 similarity'),\n",
       " (0.017409085415656868, 'geo__Cluster 37 similarity'),\n",
       " (0.015546519677632162, 'geo__Cluster 20 similarity'),\n",
       " (0.014230331127504292, 'geo__Cluster 17 similarity'),\n",
       " (0.0141032216204026, 'geo__Cluster 39 similarity'),\n",
       " (0.014065768027447325, 'geo__Cluster 9 similarity'),\n",
       " (0.01354220782825315, 'geo__Cluster 4 similarity'),\n",
       " (0.01348963625822907, 'geo__Cluster 3 similarity'),\n",
       " (0.01338319626383868, 'geo__Cluster 38 similarity'),\n",
       " (0.012240533790212824, 'geo__Cluster 31 similarity'),\n",
       " (0.012089046542256785, 'geo__Cluster 7 similarity'),\n",
       " (0.01152326329703204, 'geo__Cluster 23 similarity'),\n",
       " (0.011397459905603558, 'geo__Cluster 40 similarity'),\n",
       " (0.011282340924816439, 'geo__Cluster 36 similarity'),\n",
       " (0.01104139770781063, 'remainder__housing_median_age'),\n",
       " (0.010671123191312802, 'geo__Cluster 44 similarity'),\n",
       " (0.010296376177202627, 'geo__Cluster 5 similarity'),\n",
       " (0.010184798445004483, 'geo__Cluster 42 similarity'),\n",
       " (0.010121853542225083, 'geo__Cluster 11 similarity'),\n",
       " (0.009795219101117579, 'geo__Cluster 35 similarity'),\n",
       " (0.00952581084310724, 'geo__Cluster 10 similarity'),\n",
       " (0.009433209165984823, 'geo__Cluster 13 similarity'),\n",
       " (0.00915075361116215, 'geo__Cluster 1 similarity'),\n",
       " (0.009021485619463173, 'geo__Cluster 30 similarity'),\n",
       " (0.00894936224917583, 'geo__Cluster 41 similarity'),\n",
       " (0.008901832702357514, 'geo__Cluster 25 similarity'),\n",
       " (0.008897504713401587, 'geo__Cluster 29 similarity'),\n",
       " (0.0086846298524955, 'geo__Cluster 21 similarity'),\n",
       " (0.008061104590483955, 'geo__Cluster 15 similarity'),\n",
       " (0.00786048176566994, 'geo__Cluster 16 similarity'),\n",
       " (0.007793633130749198, 'geo__Cluster 22 similarity'),\n",
       " (0.007501766442066527, 'log__total_rooms'),\n",
       " (0.0072024111938241275, 'geo__Cluster 32 similarity'),\n",
       " (0.006947156598995616, 'log__population'),\n",
       " (0.006800076770899128, 'log__households'),\n",
       " (0.006736105364684462, 'log__total_bedrooms'),\n",
       " (0.006315268213499131, 'geo__Cluster 33 similarity'),\n",
       " (0.005796398579893261, 'geo__Cluster 14 similarity'),\n",
       " (0.005234954623294958, 'geo__Cluster 6 similarity'),\n",
       " (0.0045514083468621595, 'geo__Cluster 12 similarity'),\n",
       " (0.004546042080216035, 'geo__Cluster 18 similarity'),\n",
       " (0.004314514641115755, 'geo__Cluster 2 similarity'),\n",
       " (0.003953528110719969, 'geo__Cluster 19 similarity'),\n",
       " (0.003297404747742136, 'geo__Cluster 26 similarity'),\n",
       " (0.00289453474290887, 'cat__ocean_proximity_<1H OCEAN'),\n",
       " (0.0016978863168109126, 'cat__ocean_proximity_NEAR OCEAN'),\n",
       " (0.0016391131530559377, 'geo__Cluster 8 similarity'),\n",
       " (0.00015061247730531558, 'cat__ocean_proximity_NEAR BAY'),\n",
       " (7.301686597099842e-05, 'cat__ocean_proximity_ISLAND')]"
      ]
     },
     "execution_count": 143,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# importance scores paired with their feature names, largest score first\n",
    "sorted(\n",
    "    zip(feature_importances,\n",
    "        final_model[\"preprocessing\"].get_feature_names_out()),\n",
    "    reverse=True,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Evaluate Your System on the Test Set"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 144,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "41424.40026462184\n"
     ]
    }
   ],
   "source": [
    "# split the test set into features and labels\n",
    "X_test = strat_test_set.drop(\"median_house_value\", axis=1)\n",
    "y_test = strat_test_set[\"median_house_value\"].copy()\n",
    "\n",
    "# final_model is the full pipeline, so it is fed the raw test features\n",
    "final_predictions = final_model.predict(X_test)\n",
    "\n",
    "# take the square root explicitly instead of passing squared=False, which\n",
    "# was deprecated in Scikit-Learn 1.4 and removed in 1.6\n",
    "final_rmse = np.sqrt(mean_squared_error(y_test, final_predictions))\n",
    "print(final_rmse)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We can compute a 95% confidence interval for the test RMSE:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 145,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([39275.40861216, 43467.27680583])"
      ]
     },
     "execution_count": 145,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from scipy import stats\n",
    "\n",
    "confidence = 0.95\n",
    "squared_errors = (final_predictions - y_test) ** 2\n",
    "\n",
    "# Student's t distribution located at the mean squared error and scaled by\n",
    "# its standard error; the square root turns the MSE bounds into RMSE bounds\n",
    "np.sqrt(stats.t(len(squared_errors) - 1,\n",
    "                loc=squared_errors.mean(),\n",
    "                scale=stats.sem(squared_errors)).interval(confidence))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We could compute the interval manually like this:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 146,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(39275.40861216077, 43467.2768058342)"
      ]
     },
     "execution_count": 146,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# extra code – shows how to compute a confidence interval for the RMSE\n",
    "m = len(squared_errors)  # number of squared errors, one per test instance\n",
    "mean = squared_errors.mean()  # mean squared error on the test set\n",
    "# two-sided critical value of Student's t distribution with m - 1 degrees\n",
    "# of freedom\n",
    "tscore = stats.t.ppf((1 + confidence) / 2, df=m - 1)\n",
    "# half-width of the interval: critical value times the standard error\n",
    "tmargin = tscore * squared_errors.std(ddof=1) / np.sqrt(m)\n",
    "# the square roots turn the bounds on the MSE into bounds on the RMSE\n",
    "np.sqrt(mean - tmargin), np.sqrt(mean + tmargin)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Alternatively, we could use a z-score rather than a t-score. Since the test set is not too small, it won't make a big difference:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 147,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(39276.05610140007, 43466.691749969636)"
      ]
     },
     "execution_count": 147,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# extra code – computes a confidence interval again using a z-score\n",
    "# `m`, `mean`, `confidence` and `squared_errors` come from the previous\n",
    "# cells; here the critical value is taken from the standard normal\n",
    "# distribution instead of Student's t distribution\n",
    "zscore = stats.norm.ppf((1 + confidence) / 2)\n",
    "zmargin = zscore * squared_errors.std(ddof=1) / np.sqrt(m)\n",
    "np.sqrt(mean - zmargin), np.sqrt(mean + zmargin)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Model persistence using joblib"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Save the final model:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 148,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['my_california_housing_model.pkl']"
      ]
     },
     "execution_count": 148,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import joblib\n",
    "\n",
    "# serializes the whole fitted pipeline to a file in the current working\n",
    "# directory; joblib.dump() returns the list of file names it wrote\n",
    "joblib.dump(final_model, \"my_california_housing_model.pkl\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now you can deploy this model to production. For example, the following code could be a script that would run in production:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 149,
   "metadata": {},
   "outputs": [],
   "source": [
    "import joblib\n",
    "\n",
    "# extra code – excluded for conciseness\n",
    "from sklearn.cluster import KMeans\n",
    "from sklearn.base import BaseEstimator, TransformerMixin\n",
    "from sklearn.metrics.pairwise import rbf_kernel\n",
    "\n",
    "# NOTE(review): presumably the saved pipeline refers to this function (and\n",
    "# to the ClusterSimilarity class below) by name, so both must be defined\n",
    "# before joblib.load() is called – confirm against the cells that build\n",
    "# the pipeline\n",
    "def column_ratio(X):\n",
    "    \"\"\"Divide column 0 of X by column 1 (assumes X is a 2D NumPy array).\"\"\"\n",
    "    return X[:, [0]] / X[:, [1]]\n",
    "\n",
    "#class ClusterSimilarity(BaseEstimator, TransformerMixin):\n",
    "#    [...]\n",
    "\n",
    "# only load model files you trust: unpickling can execute arbitrary code\n",
    "final_model_reloaded = joblib.load(\"my_california_housing_model.pkl\")\n",
    "\n",
    "new_data = housing.iloc[:5]  # pretend these are new districts\n",
    "predictions = final_model_reloaded.predict(new_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 150,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([442737.15, 457566.06, 105965.  ,  98462.  , 332992.01])"
      ]
     },
     "execution_count": 150,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# predictions made by the reloaded model for the five rows in `new_data`\n",
    "predictions"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This also works with `pickle`, but `joblib` is more efficient."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Exercise solutions"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Exercise: _Try a Support Vector Machine regressor (`sklearn.svm.SVR`) with various hyperparameters, such as `kernel=\"linear\"` (with various values for the `C` hyperparameter) or `kernel=\"rbf\"` (with various values for the `C` and `gamma` hyperparameters). Note that SVMs don't scale well to large datasets, so you should probably train your model on just the first 5,000 instances of the training set and use only 3-fold cross-validation, or else it will take hours. Don't worry about what the hyperparameters mean for now (see the SVM notebook if you're interested). How does the best `SVR` predictor perform?_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 151,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "GridSearchCV(cv=3,\n",
       "             estimator=Pipeline(steps=[('preprocessing',\n",
       "                                        ColumnTransformer(remainder=Pipeline(steps=[('simpleimputer',\n",
       "                                                                                     SimpleImputer(strategy='median')),\n",
       "                                                                                    ('standardscaler',\n",
       "                                                                                     StandardScaler())]),\n",
       "                                                          transformers=[('bedrooms_ratio',\n",
       "                                                                         Pipeline(steps=[('simpleimputer',\n",
       "                                                                                          SimpleImputer(strategy='median')),\n",
       "                                                                                         ('functiontransformer',\n",
       "                                                                                          FunctionTransformer(feature_names_...\n",
       "                                                                         <sklearn.compose._column_transformer.make_column_selector object at 0x7f9b50613dc0>)])),\n",
       "                                       ('svr', SVR())]),\n",
       "             param_grid=[{'svr__C': [10.0, 30.0, 100.0, 300.0, 1000.0, 3000.0,\n",
       "                                     10000.0, 30000.0],\n",
       "                          'svr__kernel': ['linear']},\n",
       "                         {'svr__C': [1.0, 3.0, 10.0, 30.0, 100.0, 300.0,\n",
       "                                     1000.0],\n",
       "                          'svr__gamma': [0.01, 0.03, 0.1, 0.3, 1.0, 3.0],\n",
       "                          'svr__kernel': ['rbf']}],\n",
       "             scoring='neg_root_mean_squared_error')"
      ]
     },
     "execution_count": 151,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.model_selection import GridSearchCV\n",
    "from sklearn.svm import SVR\n",
    "\n",
    "param_grid = [\n",
    "    # linear kernel: only C is searched\n",
    "    {\n",
    "        'svr__kernel': ['linear'],\n",
    "        'svr__C': [10.0, 30.0, 100.0, 300.0, 1000.0, 3000.0, 10000.0,\n",
    "                   30000.0],\n",
    "    },\n",
    "    # RBF kernel: every combination of C and gamma is searched\n",
    "    {\n",
    "        'svr__kernel': ['rbf'],\n",
    "        'svr__C': [1.0, 3.0, 10.0, 30.0, 100.0, 300.0, 1000.0],\n",
    "        'svr__gamma': [0.01, 0.03, 0.1, 0.3, 1.0, 3.0],\n",
    "    },\n",
    "]\n",
    "\n",
    "svr_pipeline = Pipeline(steps=[(\"preprocessing\", preprocessing),\n",
    "                               (\"svr\", SVR())])\n",
    "grid_search = GridSearchCV(estimator=svr_pipeline, param_grid=param_grid,\n",
    "                           cv=3, scoring='neg_root_mean_squared_error')\n",
    "# SVMs scale poorly, so only the first 5,000 training instances are used\n",
    "grid_search.fit(housing.iloc[:5000], housing_labels.iloc[:5000])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The best model achieves the following score (evaluated using 3-fold cross validation):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 152,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "69814.13889867254"
      ]
     },
     "execution_count": 152,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# the search maximizes the negated RMSE, so flip the sign to get the RMSE\n",
    "svr_grid_search_rmse = -grid_search.best_score_\n",
    "svr_grid_search_rmse"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "That's much worse than the `RandomForestRegressor` (but to be fair, we trained the model on much less data). Let's check the best hyperparameters found:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 153,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'svr__C': 10000.0, 'svr__kernel': 'linear'}"
      ]
     },
     "execution_count": 153,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# hyperparameter combination that achieved the best cross-validation score\n",
    "grid_search.best_params_"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The linear kernel seems better than the RBF kernel. Notice that the best value of `C` is near the top of the tested range. When the best value is the maximum tested value, you definitely want to launch the grid search again with higher values for `C` (removing the smallest values), because it is likely that higher values of `C` will be better."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Exercise: _Try replacing the `GridSearchCV` with a `RandomizedSearchCV`._"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Warning:** the following cell will take several minutes to run. You can specify `verbose=2` when creating the `RandomizedSearchCV` if you want to see the training details."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 154,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "RandomizedSearchCV(cv=3,\n",
       "                   estimator=Pipeline(steps=[('preprocessing',\n",
       "                                              ColumnTransformer(remainder=Pipeline(steps=[('simpleimputer',\n",
       "                                                                                           SimpleImputer(strategy='median')),\n",
       "                                                                                          ('standardscaler',\n",
       "                                                                                           StandardScaler())]),\n",
       "                                                                transformers=[('bedrooms_ratio',\n",
       "                                                                               Pipeline(steps=[('simpleimputer',\n",
       "                                                                                                SimpleImputer(strategy='median')),\n",
       "                                                                                               ('functiontransformer',\n",
       "                                                                                                FunctionTransformer(feature_...\n",
       "                                                                               <sklearn.compose._column_transformer.make_column_selector object at 0x7f9b50613dc0>)])),\n",
       "                                             ('svr', SVR())]),\n",
       "                   n_iter=50,\n",
       "                   param_distributions={'svr__C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f9ae254b9d0>,\n",
       "                                        'svr__gamma': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f9b734dbe50>,\n",
       "                                        'svr__kernel': ['linear', 'rbf']},\n",
       "                   random_state=42, scoring='neg_root_mean_squared_error')"
      ]
     },
     "execution_count": 154,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.model_selection import RandomizedSearchCV\n",
    "from scipy.stats import expon, reciprocal\n",
    "\n",
    "# see https://docs.scipy.org/doc/scipy/reference/stats.html\n",
    "# for `expon()` and `reciprocal()` documentation and more probability distribution functions.\n",
    "\n",
    "# Note: gamma is ignored when kernel is \"linear\"\n",
    "param_distribs = {\n",
    "        'svr__kernel': ['linear', 'rbf'],\n",
    "        'svr__C': reciprocal(20, 200_000),\n",
    "        'svr__gamma': expon(scale=1.0),\n",
    "    }\n",
    "\n",
    "rnd_search = RandomizedSearchCV(svr_pipeline,\n",
    "                                param_distributions=param_distribs,\n",
    "                                n_iter=50, cv=3,\n",
    "                                scoring='neg_root_mean_squared_error',\n",
    "                                random_state=42)\n",
    "rnd_search.fit(housing.iloc[:5000], housing_labels.iloc[:5000])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The best model achieves the following score (evaluated using 3-fold cross validation):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 155,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "55853.88100300133"
      ]
     },
     "execution_count": 155,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "svr_rnd_search_rmse = -rnd_search.best_score_\n",
    "svr_rnd_search_rmse"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now that's really much better, but still far from the `RandomForestRegressor`'s performance. Let's check the best hyperparameters found:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 156,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'svr__C': 157055.10989448498,\n",
       " 'svr__gamma': 0.26497040005002437,\n",
       " 'svr__kernel': 'rbf'}"
      ]
     },
     "execution_count": 156,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rnd_search.best_params_"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This time the search found a good set of hyperparameters for the RBF kernel. Randomized search tends to find better hyperparameters than grid search in the same amount of time."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Note that we used the `expon()` distribution for `gamma`, with a scale of 1, so `RandomizedSearchCV` mostly sampled values roughly of that scale: about 80% of the samples were between 0.1 and 2.3 (roughly 10% were smaller and 10% were larger):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 157,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.80066"
      ]
     },
     "execution_count": 157,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.random.seed(42)\n",
    "\n",
    "s = expon(scale=1).rvs(100_000)  # get 100,000 samples\n",
    "((s > 0.105) & (s < 2.29)).sum() / 100_000"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We used the `reciprocal()` (log-uniform) distribution for `C`, which is a sensible choice when you do not have a clue what the optimal scale of a hyperparameter is: every order of magnitude is equally likely to be sampled. The random search explored the range from 20 to 200 just as much as the range from 2,000 to 20,000 or from 20,000 to 200,000."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Exercise: _Try adding a `SelectFromModel` transformer in the preparation pipeline to select only the most important attributes._"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let's create a new pipeline that runs the previously defined preparation pipeline, and adds a `SelectFromModel` transformer based on a `RandomForestRegressor` before the final regressor:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 158,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.feature_selection import SelectFromModel\n",
    "\n",
    "selector_pipeline = Pipeline([\n",
    "    ('preprocessing', preprocessing),\n",
    "    ('selector', SelectFromModel(RandomForestRegressor(random_state=42),\n",
    "                                 threshold=0.005)),  # min feature importance\n",
    "    ('svr', SVR(C=rnd_search.best_params_[\"svr__C\"],\n",
    "                gamma=rnd_search.best_params_[\"svr__gamma\"],\n",
    "                kernel=rnd_search.best_params_[\"svr__kernel\"])),\n",
    "])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 159,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "count        3.000000\n",
       "mean     56211.362086\n",
       "std       1922.002802\n",
       "min      54150.008629\n",
       "25%      55339.929909\n",
       "50%      56529.851189\n",
       "75%      57242.038815\n",
       "max      57954.226441\n",
       "dtype: float64"
      ]
     },
     "execution_count": 159,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "selector_rmses = -cross_val_score(selector_pipeline,\n",
    "                                  housing.iloc[:5000],\n",
    "                                  housing_labels.iloc[:5000],\n",
    "                                  scoring=\"neg_root_mean_squared_error\",\n",
    "                                  cv=3)\n",
    "pd.Series(selector_rmses).describe()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Oh well, feature selection does not seem to help. But maybe that's just because the threshold we used was not optimal. Perhaps try tuning it using random search or grid search?"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Exercise: _Try creating a custom transformer that trains a k-Nearest Neighbors regressor (`sklearn.neighbors.KNeighborsRegressor`) in its `fit()` method, and outputs the model's predictions in its `transform()` method. Then add this feature to the preprocessing pipeline, using latitude and longitude as the inputs to this transformer. This will add a feature in the model that corresponds to the housing median price of the nearest districts._"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Rather than restrict ourselves to k-Nearest Neighbors regressors, let's create a transformer that accepts any regressor. For this, we can extend the `MetaEstimatorMixin` and have a required `estimator` argument in the constructor. The `fit()` method must work on a clone of this estimator, and it must also save `feature_names_in_`. The `MetaEstimatorMixin` will ensure that `estimator` is listed as a required parameter, and since `estimator` is a constructor argument, `get_params()` and `set_params()` (inherited from `BaseEstimator`) will make the estimator's hyperparameters available for tuning. Lastly, we create a `get_feature_names_out()` method: the output column name is the estimator's class name in lowercase, followed by `_prediction_` and the index of the output (e.g., `kneighborsregressor_prediction_0`)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 160,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.neighbors import KNeighborsRegressor\n",
    "from sklearn.base import MetaEstimatorMixin, clone\n",
    "\n",
    "class FeatureFromRegressor(MetaEstimatorMixin, BaseEstimator, TransformerMixin):\n",
    "    \"\"\"Transformer whose output features are a regressor's predictions.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    estimator : regressor\n",
    "        Template estimator. `fit()` trains a clone of it (stored in\n",
    "        `estimator_`), so the object passed here is left untouched.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, estimator):\n",
    "        self.estimator = estimator\n",
    "\n",
    "    def fit(self, X, y=None):\n",
    "        \"\"\"Fit a clone of `estimator` on (X, y) and record the input features.\"\"\"\n",
    "        estimator_ = clone(self.estimator)\n",
    "        estimator_.fit(X, y)\n",
    "        self.estimator_ = estimator_\n",
    "        self.n_features_in_ = self.estimator_.n_features_in_\n",
    "        # Read the names from the *fitted* clone: the `self.estimator` template\n",
    "        # is not fitted here, so it either lacks this attribute or holds names\n",
    "        # left over from an unrelated earlier fit.\n",
    "        if hasattr(self.estimator_, \"feature_names_in_\"):\n",
    "            self.feature_names_in_ = self.estimator_.feature_names_in_\n",
    "        return self  # always return self!\n",
    "\n",
    "    def transform(self, X):\n",
    "        \"\"\"Return the fitted estimator's predictions as a 2D array.\"\"\"\n",
    "        check_is_fitted(self)\n",
    "        predictions = self.estimator_.predict(X)\n",
    "        if predictions.ndim == 1:\n",
    "            # 1D predictions: reshape into a single column\n",
    "            predictions = predictions.reshape(-1, 1)\n",
    "        return predictions\n",
    "\n",
    "    def get_feature_names_out(self, names=None):\n",
    "        \"\"\"Return one name per output: `<estimator class>_prediction_<i>`.\n",
    "\n",
    "        `names` is accepted but not used by this method.\n",
    "        \"\"\"\n",
    "        check_is_fitted(self)\n",
    "        # fall back to a single output when the estimator has no `n_outputs_`\n",
    "        n_outputs = getattr(self.estimator_, \"n_outputs_\", 1)\n",
    "        estimator_class_name = self.estimator_.__class__.__name__\n",
    "        estimator_short_name = estimator_class_name.lower().replace(\"_\", \"\")\n",
    "        return [f\"{estimator_short_name}_prediction_{i}\"\n",
    "                for i in range(n_outputs)]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let's ensure it complies with Scikit-Learn's API:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 161,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.utils.estimator_checks import check_estimator\n",
    "\n",
    "check_estimator(FeatureFromRegressor(KNeighborsRegressor()))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Good! Now let's test it:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 162,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[456667.        ],\n",
       "       [435250.        ],\n",
       "       [105100.        ],\n",
       "       ...,\n",
       "       [148800.        ],\n",
       "       [500001.        ],\n",
       "       [234333.33333333]])"
      ]
     },
     "execution_count": 162,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "knn_reg = KNeighborsRegressor(n_neighbors=3, weights=\"distance\")\n",
    "knn_transformer = FeatureFromRegressor(knn_reg)\n",
    "geo_features = housing[[\"latitude\", \"longitude\"]]\n",
    "knn_transformer.fit_transform(geo_features, housing_labels)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "And what does its output feature name look like?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 163,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['kneighborsregressor_prediction_0']"
      ]
     },
     "execution_count": 163,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "knn_transformer.get_feature_names_out()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Okay, now let's include this transformer in our preprocessing pipeline:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 164,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.base import clone\n",
    "\n",
    "transformers = [(name, clone(transformer), columns)\n",
    "                for name, transformer, columns in preprocessing.transformers]\n",
    "geo_index = [name for name, _, _ in transformers].index(\"geo\")\n",
    "transformers[geo_index] = (\"geo\", knn_transformer, [\"latitude\", \"longitude\"])\n",
    "\n",
    "new_geo_preprocessing = ColumnTransformer(transformers)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 165,
   "metadata": {},
   "outputs": [],
   "source": [
    "new_geo_pipeline = Pipeline([\n",
    "    ('preprocessing', new_geo_preprocessing),\n",
    "    ('svr', SVR(C=rnd_search.best_params_[\"svr__C\"],\n",
    "                gamma=rnd_search.best_params_[\"svr__gamma\"],\n",
    "                kernel=rnd_search.best_params_[\"svr__kernel\"])),\n",
    "])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 166,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "count         3.000000\n",
       "mean     104992.095758\n",
       "std        3112.486560\n",
       "min      101550.880533\n",
       "25%      103682.876337\n",
       "50%      105814.872141\n",
       "75%      106712.703370\n",
       "max      107610.534600\n",
       "dtype: float64"
      ]
     },
     "execution_count": 166,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "new_pipe_rmses = -cross_val_score(new_geo_pipeline,\n",
    "                                  housing.iloc[:5000],\n",
    "                                  housing_labels.iloc[:5000],\n",
    "                                  scoring=\"neg_root_mean_squared_error\",\n",
    "                                  cv=3)\n",
    "pd.Series(new_pipe_rmses).describe()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Yikes, that's terrible! Apparently the cluster similarity features were much better. But perhaps we should tune the `KNeighborsRegressor`'s hyperparameters? That's what the next exercise is about."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Exercise: _Automatically explore some preparation options using `RandomizedSearchCV`._"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 167,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "RandomizedSearchCV(cv=3,\n",
       "                   estimator=Pipeline(steps=[('preprocessing',\n",
       "                                              ColumnTransformer(transformers=[('bedrooms_ratio',\n",
       "                                                                               Pipeline(steps=[('simpleimputer',\n",
       "                                                                                                SimpleImputer(strategy='median')),\n",
       "                                                                                               ('functiontransformer',\n",
       "                                                                                                FunctionTransformer(feature_names_out=['bedrooms_ratio'],\n",
       "                                                                                                                    func=<function column_ratio at 0x7f9b505e5670>)),\n",
       "                                                                                               ('standardscaler',\n",
       "                                                                                                StandardScaler())]),\n",
       "                                                                               ['...\n",
       "                   param_distributions={'preprocessing__geo__estimator__n_neighbors': range(1, 30),\n",
       "                                        'preprocessing__geo__estimator__weights': ['distance',\n",
       "                                                                                   'uniform'],\n",
       "                                        'svr__C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f9acb940bb0>,\n",
       "                                        'svr__gamma': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f9acb940a30>},\n",
       "                   random_state=42, scoring='neg_root_mean_squared_error')"
      ]
     },
     "execution_count": 167,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "param_distribs = {\n",
    "    \"preprocessing__geo__estimator__n_neighbors\": range(1, 30),\n",
    "    \"preprocessing__geo__estimator__weights\": [\"distance\", \"uniform\"],\n",
    "    \"svr__C\": reciprocal(20, 200_000),\n",
    "    \"svr__gamma\": expon(scale=1.0),\n",
    "}\n",
    "\n",
    "new_geo_rnd_search = RandomizedSearchCV(new_geo_pipeline,\n",
    "                                        param_distributions=param_distribs,\n",
    "                                        n_iter=50,\n",
    "                                        cv=3,\n",
    "                                        scoring='neg_root_mean_squared_error',\n",
    "                                        random_state=42)\n",
    "new_geo_rnd_search.fit(housing.iloc[:5000], housing_labels.iloc[:5000])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 168,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "106775.63787128967"
      ]
     },
     "execution_count": 168,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "new_geo_rnd_search_rmse = -new_geo_rnd_search.best_score_\n",
    "new_geo_rnd_search_rmse"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Oh well... at least we tried! It looks like the cluster similarity features are definitely better than the KNN feature. But perhaps you could try having both? And maybe training on the full training set would help as well."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Exercise: _Try to implement the `StandardScalerClone` class again from scratch, then add support for the `inverse_transform()` method: executing `scaler.inverse_transform(scaler.fit_transform(X))` should return an array very close to `X`. Then add support for feature names: set `feature_names_in_` in the `fit()` method if the input is a DataFrame. This attribute should be a NumPy array of column names. Lastly, implement the `get_feature_names_out()` method: it should have one optional `input_features=None` argument. If passed, the method should check that its length matches `n_features_in_`, and it should match `feature_names_in_` if it is defined, then `input_features` should be returned. If `input_features` is `None`, then the method should return `feature_names_in_` if it is defined or `np.array([\"x0\", \"x1\", ...])` with length `n_features_in_` otherwise._"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 169,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.base import BaseEstimator, TransformerMixin\n",
    "from sklearn.utils.validation import check_array, check_is_fitted\n",
    "\n",
    "class StandardScalerClone(BaseEstimator, TransformerMixin):\n",
    "    \"\"\"Simplified clone of Scikit-Learn's `StandardScaler`.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    with_mean : bool, default=True\n",
    "        If True, subtract the per-feature mean before scaling.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, with_mean=True):  # no *args or **kwargs!\n",
    "        self.with_mean = with_mean\n",
    "\n",
    "    def fit(self, X, y=None):  # y is required even though we don't use it\n",
    "        \"\"\"Learn the per-feature mean and standard deviation of X.\"\"\"\n",
    "        X_orig = X\n",
    "        X = check_array(X)  # checks that X is an array with finite float values\n",
    "        self.mean_ = X.mean(axis=0)\n",
    "        self.scale_ = X.std(axis=0)\n",
    "        # constant features have a zero std: scale them by 1 instead, so that\n",
    "        # transform() does not divide by zero and output NaN or inf values\n",
    "        self.scale_[self.scale_ == 0.0] = 1.0\n",
    "        self.n_features_in_ = X.shape[1]  # every estimator stores this in fit()\n",
    "        if hasattr(X_orig, \"columns\"):\n",
    "            self.feature_names_in_ = np.array(X_orig.columns, dtype=object)\n",
    "        return self  # always return self!\n",
    "\n",
    "    def transform(self, X):\n",
    "        \"\"\"Center X (if `with_mean`) and divide it by the learned scale.\"\"\"\n",
    "        check_is_fitted(self)  # looks for learned attributes (with trailing _)\n",
    "        X = check_array(X)\n",
    "        if self.n_features_in_ != X.shape[1]:\n",
    "            raise ValueError(\"Unexpected number of features\")\n",
    "        if self.with_mean:\n",
    "            X = X - self.mean_\n",
    "        return X / self.scale_\n",
    "\n",
    "    def inverse_transform(self, X):\n",
    "        \"\"\"Undo transform(): multiply by the scale, then add back the mean.\"\"\"\n",
    "        check_is_fitted(self)\n",
    "        X = check_array(X)\n",
    "        if self.n_features_in_ != X.shape[1]:\n",
    "            raise ValueError(\"Unexpected number of features\")\n",
    "        X = X * self.scale_\n",
    "        return X + self.mean_ if self.with_mean else X\n",
    "\n",
    "    def get_feature_names_out(self, input_features=None):\n",
    "        \"\"\"Return the output feature names (same as the input feature names).\n",
    "\n",
    "        If `input_features` is None, return `feature_names_in_` when it is\n",
    "        defined, or else the NumPy array [\"x0\", \"x1\", ...]. Otherwise check\n",
    "        `input_features` against `n_features_in_` (and `feature_names_in_`\n",
    "        if defined), raising a ValueError on mismatch, and return it as is.\n",
    "        \"\"\"\n",
    "        check_is_fitted(self)  # clear error instead of a bare AttributeError\n",
    "        if input_features is None:\n",
    "            if hasattr(self, \"feature_names_in_\"):\n",
    "                return self.feature_names_in_\n",
    "            # default names are a NumPy array, just like `feature_names_in_`\n",
    "            return np.array([f\"x{i}\" for i in range(self.n_features_in_)],\n",
    "                            dtype=object)\n",
    "        if len(input_features) != self.n_features_in_:\n",
    "            raise ValueError(\"Invalid number of features\")\n",
    "        if hasattr(self, \"feature_names_in_\") and not np.all(\n",
    "            self.feature_names_in_ == input_features\n",
    "        ):\n",
    "            raise ValueError(\"input_features ≠ feature_names_in_\")\n",
    "        return input_features"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let's test our custom transformer:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 170,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.utils.estimator_checks import check_estimator\n",
    " \n",
    "check_estimator(StandardScalerClone())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "No errors, that's a great start, we respect the Scikit-Learn API."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now let's ensure the transformation works as expected:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 171,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.random.seed(42)\n",
    "X = np.random.rand(1000, 3)\n",
    "\n",
    "scaler = StandardScalerClone()\n",
    "X_scaled = scaler.fit_transform(X)\n",
    "\n",
    "assert np.allclose(X_scaled, (X - X.mean(axis=0)) / X.std(axis=0))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "How about setting `with_mean=False`?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 172,
   "metadata": {},
   "outputs": [],
   "source": [
    "scaler = StandardScalerClone(with_mean=False)\n",
    "X_scaled_uncentered = scaler.fit_transform(X)\n",
    "\n",
    "assert np.allclose(X_scaled_uncentered, X / X.std(axis=0))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "And does the inverse work?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 173,
   "metadata": {},
   "outputs": [],
   "source": [
    "scaler = StandardScalerClone()\n",
    "X_back = scaler.inverse_transform(scaler.fit_transform(X))\n",
    "assert np.allclose(X, X_back)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "How about the feature names out?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 174,
   "metadata": {},
   "outputs": [],
   "source": [
    "assert np.all(scaler.get_feature_names_out() == [\"x0\", \"x1\", \"x2\"])\n",
    "assert np.all(scaler.get_feature_names_out([\"a\", \"b\", \"c\"]) == [\"a\", \"b\", \"c\"])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "And if we fit a DataFrame, are the feature names in and out OK?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 175,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.DataFrame({\"a\": np.random.rand(100), \"b\": np.random.rand(100)})\n",
    "scaler = StandardScalerClone()\n",
    "X_scaled = scaler.fit_transform(df)\n",
    "\n",
    "assert np.all(scaler.feature_names_in_ == [\"a\", \"b\"])\n",
    "assert np.all(scaler.get_feature_names_out() == [\"a\", \"b\"])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "All good! That's all for today! 😀"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Congratulations! You already know quite a lot about Machine Learning. :)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.12"
  },
  "nav_menu": {
   "height": "279px",
   "width": "309px"
  },
  "toc": {
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": "block",
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								{
 								 "cells": [
 								  {
 								   "cell_type": "markdown",
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "source": [
-												Large change: replace os.path with pathlib, move to Python 3.7

											
										
										
											2021-10-15 10:46:27 +02:00
+								    "**Chapter 2 – End-to-end Machine Learning project**"
 								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
 								   "metadata": {},
 								   "source": [
-												Fix a few typos and deprecated np.object reference

											
										
										
											2022-05-09 10:31:09 +02:00
+								    "*This notebook contains all the sample code and solutions to the exercises in chapter 2.*"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
-												Make notebooks 1 to 9 runnable in Colab without changes

											
										
										
											2019-11-05 15:26:52 +01:00
+								  {
 								   "cell_type": "markdown",
 								   "metadata": {},
 								   "source": [
 								    "<table align=\"left\">\n",
 								    "  <td>\n",
-												Replace handson-ml2 with handson-ml3, and fix figure chapter numbers

											
										
										
											2021-11-23 03:42:16 +01:00
+								    "    <a href=\"https://colab.research.google.com/github/ageron/handson-ml3/blob/main/02_end_to_end_machine_learning_project.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>\n",
-												Make notebooks 1 to 9 runnable in Colab without changes

											
										
										
											2019-11-05 15:26:52 +01:00
+								    "  </td>\n",
-												Add 'Open in Kaggle' button

											
										
										
											2021-05-25 21:31:19 +02:00
+								    "  <td>\n",
-												Replace handson-ml2 with handson-ml3, and fix figure chapter numbers

											
										
										
											2021-11-23 03:42:16 +01:00
+								    "    <a target=\"_blank\" href=\"https://kaggle.com/kernels/welcome?src=https://github.com/ageron/handson-ml3/blob/main/02_end_to_end_machine_learning_project.ipynb\"><img src=\"https://kaggle.com/static/images/open-in-kaggle.svg\" /></a>\n",
-												Add 'Open in Kaggle' button

											
										
										
											2021-05-25 21:31:19 +02:00
+								    "  </td>\n",
-												Make notebooks 1 to 9 runnable in Colab without changes

											
										
										
											2019-11-05 15:26:52 +01:00
+								    "</table>"
 								   ]
 								  },
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								  {
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								   "cell_type": "code",
 								   "execution_count": 1,
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "name": "stdout",
 								     "output_type": "stream",
 								     "text": [
 								      "Welcome to Machine Learning!\n"
 								     ]
 								    }
 								   ],
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "print(\"Welcome to Machine Learning!\")"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "source": [
-												Require Python 3.7+

											
										
										
											2022-02-19 11:03:20 +01:00
+								    "This project requires Python 3.7 or above:"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								   "execution_count": 2,
-												Clarify stratified sampling paragraph in ch02

											
										
										
											2017-10-16 14:19:08 +02:00
+								   "metadata": {},
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "outputs": [],
 								   "source": [
-												Drop Python 2 (woohoo!) and import matplotlib as mpl

											
										
										
											2019-01-16 16:42:00 +01:00
+								    "import sys\n",
-												Add fundamentals and training_linear_models notebooks

											
										
										
											2016-05-22 16:01:18 +02:00
+								    "\n",
-												Require Python 3.7+

											
										
										
											2022-02-19 11:03:20 +01:00
+								    "assert sys.version_info >= (3, 7)"
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
 								   "metadata": {},
 								   "source": [
-												Sync notebook code examples with book

											
										
										
											2021-11-03 03:53:04 +01:00
+								    "It also requires Scikit-Learn ≥ 1.0.1:"
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
 								   "execution_count": 3,
 								   "metadata": {},
 								   "outputs": [],
 								   "source": [
-												Create image directory and check for sklearn >= 0.20

											
										
										
											2019-01-21 11:42:31 +01:00
+								    "import sklearn\n",
-												Add fundamentals and training_linear_models notebooks

											
										
										
											2016-05-22 16:01:18 +02:00
+								    "\n",
-												Remove scikit-learn html sections

											
										
										
											2021-10-29 01:12:47 +02:00
+								    "assert sklearn.__version__ >= \"1.0.1\""
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
-												Update all notebooks assuming we are all in the future now: sklearn 0.20+, python 3.5+, TF 2.0 preview

											
										
										
											2019-01-18 16:08:37 +01:00
+								  {
 								   "cell_type": "markdown",
 								   "metadata": {},
 								   "source": [
-												Add some section headers

											
										
										
											2021-10-02 13:14:44 +02:00
+								    "# Get the Data"
 								   ]
 								  },
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								  {
 								   "cell_type": "markdown",
 								   "metadata": {},
 								   "source": [
 								    "*Welcome to Machine Learning Housing Corp.! Your task is to predict median house values in Californian districts, given a number of features from these districts.*"
 								   ]
 								  },
-												Add some section headers

											
										
										
											2021-10-02 13:14:44 +02:00
+								  {
 								   "cell_type": "markdown",
 								   "metadata": {},
 								   "source": [
 								    "## Download the Data"
-												Update all notebooks assuming we are all in the future now: sklearn 0.20+, python 3.5+, TF 2.0 preview

											
										
										
											2019-01-18 16:08:37 +01:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								   "execution_count": 4,
-												Update all notebooks assuming we are all in the future now: sklearn 0.20+, python 3.5+, TF 2.0 preview

											
										
										
											2019-01-18 16:08:37 +01:00
+								   "metadata": {},
 								   "outputs": [],
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "source": [
-												Large change: replace os.path with pathlib, move to Python 3.7

											
										
										
											2021-10-15 10:46:27 +02:00
+								    "from pathlib import Path\n",
-												Tarballs like housing.tgz include the parent directory

											
										
										
											2022-02-20 21:51:32 +01:00
+								    "import pandas as pd\n",
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								    "import tarfile\n",
-												Update libraries to latest version, including TensorFlow 2.4.1 and Scikit-Learn 0.24.1

											
										
										
											2021-02-14 03:02:09 +01:00
+								    "import urllib.request\n",
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "\n",
-												Large change: replace os.path with pathlib, move to Python 3.7

											
										
										
											2021-10-15 10:46:27 +02:00
+								    "def load_housing_data():\n",
-												Tarballs like housing.tgz include the parent directory

											
										
										
											2022-02-20 21:51:32 +01:00
+								    "    tarball_path = Path(\"datasets/housing.tgz\")\n",
 								    "    if not tarball_path.is_file():\n",
 								    "        Path(\"datasets\").mkdir(parents=True, exist_ok=True)\n",
 								    "        url = \"https://github.com/ageron/data/raw/main/housing.tgz\"\n",
 								    "        urllib.request.urlretrieve(url, tarball_path)\n",
 								    "        with tarfile.open(tarball_path) as housing_tarball:\n",
 								    "            housing_tarball.extractall(path=\"datasets\")\n",
 								    "    return pd.read_csv(Path(\"datasets/housing/housing.csv\"))\n",
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "\n",
-												Large change: replace os.path with pathlib, move to Python 3.7

											
										
										
											2021-10-15 10:46:27 +02:00
+								    "housing = load_housing_data()"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
-												Add some section headers

											
										
										
											2021-10-02 13:14:44 +02:00
+								  {
 								   "cell_type": "markdown",
 								   "metadata": {},
 								   "source": [
 								    "## Take a Quick Look at the Data Structure"
 								   ]
 								  },
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								  {
 								   "cell_type": "code",
-												Crop long outputs so they show up nicer on github.com

											
										
										
											2019-04-15 18:06:57 +02:00
+								   "execution_count": 5,
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "text/html": [
 								       "<div>\n",
 								       "<style scoped>\n",
 								       "    .dataframe tbody tr th:only-of-type {\n",
 								       "        vertical-align: middle;\n",
 								       "    }\n",
 								       "\n",
 								       "    .dataframe tbody tr th {\n",
 								       "        vertical-align: top;\n",
 								       "    }\n",
 								       "\n",
 								       "    .dataframe thead th {\n",
 								       "        text-align: right;\n",
 								       "    }\n",
 								       "</style>\n",
 								       "<table border=\"1\" class=\"dataframe\">\n",
 								       "  <thead>\n",
 								       "    <tr style=\"text-align: right;\">\n",
 								       "      <th></th>\n",
 								       "      <th>longitude</th>\n",
 								       "      <th>latitude</th>\n",
 								       "      <th>housing_median_age</th>\n",
 								       "      <th>total_rooms</th>\n",
 								       "      <th>total_bedrooms</th>\n",
 								       "      <th>population</th>\n",
 								       "      <th>households</th>\n",
 								       "      <th>median_income</th>\n",
 								       "      <th>median_house_value</th>\n",
 								       "      <th>ocean_proximity</th>\n",
 								       "    </tr>\n",
 								       "  </thead>\n",
 								       "  <tbody>\n",
 								       "    <tr>\n",
 								       "      <th>0</th>\n",
 								       "      <td>-122.23</td>\n",
 								       "      <td>37.88</td>\n",
 								       "      <td>41.0</td>\n",
 								       "      <td>880.0</td>\n",
 								       "      <td>129.0</td>\n",
 								       "      <td>322.0</td>\n",
 								       "      <td>126.0</td>\n",
 								       "      <td>8.3252</td>\n",
 								       "      <td>452600.0</td>\n",
 								       "      <td>NEAR BAY</td>\n",
 								       "    </tr>\n",
 								       "    <tr>\n",
 								       "      <th>1</th>\n",
 								       "      <td>-122.22</td>\n",
 								       "      <td>37.86</td>\n",
 								       "      <td>21.0</td>\n",
 								       "      <td>7099.0</td>\n",
 								       "      <td>1106.0</td>\n",
 								       "      <td>2401.0</td>\n",
 								       "      <td>1138.0</td>\n",
 								       "      <td>8.3014</td>\n",
 								       "      <td>358500.0</td>\n",
 								       "      <td>NEAR BAY</td>\n",
 								       "    </tr>\n",
 								       "    <tr>\n",
 								       "      <th>2</th>\n",
 								       "      <td>-122.24</td>\n",
 								       "      <td>37.85</td>\n",
 								       "      <td>52.0</td>\n",
 								       "      <td>1467.0</td>\n",
 								       "      <td>190.0</td>\n",
 								       "      <td>496.0</td>\n",
 								       "      <td>177.0</td>\n",
 								       "      <td>7.2574</td>\n",
 								       "      <td>352100.0</td>\n",
 								       "      <td>NEAR BAY</td>\n",
 								       "    </tr>\n",
 								       "    <tr>\n",
 								       "      <th>3</th>\n",
 								       "      <td>-122.25</td>\n",
 								       "      <td>37.85</td>\n",
 								       "      <td>52.0</td>\n",
 								       "      <td>1274.0</td>\n",
 								       "      <td>235.0</td>\n",
 								       "      <td>558.0</td>\n",
 								       "      <td>219.0</td>\n",
 								       "      <td>5.6431</td>\n",
 								       "      <td>341300.0</td>\n",
 								       "      <td>NEAR BAY</td>\n",
 								       "    </tr>\n",
 								       "    <tr>\n",
 								       "      <th>4</th>\n",
 								       "      <td>-122.25</td>\n",
 								       "      <td>37.85</td>\n",
 								       "      <td>52.0</td>\n",
 								       "      <td>1627.0</td>\n",
 								       "      <td>280.0</td>\n",
 								       "      <td>565.0</td>\n",
 								       "      <td>259.0</td>\n",
 								       "      <td>3.8462</td>\n",
 								       "      <td>342200.0</td>\n",
 								       "      <td>NEAR BAY</td>\n",
 								       "    </tr>\n",
 								       "  </tbody>\n",
 								       "</table>\n",
 								       "</div>"
 								      ],
 								      "text/plain": [
 								       "   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \\\n",
 								       "0    -122.23     37.88                41.0        880.0           129.0   \n",
 								       "1    -122.22     37.86                21.0       7099.0          1106.0   \n",
 								       "2    -122.24     37.85                52.0       1467.0           190.0   \n",
 								       "3    -122.25     37.85                52.0       1274.0           235.0   \n",
 								       "4    -122.25     37.85                52.0       1627.0           280.0   \n",
 								       "\n",
 								       "   population  households  median_income  median_house_value ocean_proximity  \n",
 								       "0       322.0       126.0         8.3252            452600.0        NEAR BAY  \n",
 								       "1      2401.0      1138.0         8.3014            358500.0        NEAR BAY  \n",
 								       "2       496.0       177.0         7.2574            352100.0        NEAR BAY  \n",
 								       "3       558.0       219.0         5.6431            341300.0        NEAR BAY  \n",
 								       "4       565.0       259.0         3.8462            342200.0        NEAR BAY  "
 								      ]
 								     },
 								     "execution_count": 5,
 								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "source": [
 								    "housing.head()"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Crop long outputs so they show up nicer on github.com

											
										
										
											2019-04-15 18:06:57 +02:00
+								   "execution_count": 6,
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "name": "stdout",
 								     "output_type": "stream",
 								     "text": [
 								      "<class 'pandas.core.frame.DataFrame'>\n",
 								      "RangeIndex: 20640 entries, 0 to 20639\n",
 								      "Data columns (total 10 columns):\n",
 								      " #   Column              Non-Null Count  Dtype  \n",
 								      "---  ------              --------------  -----  \n",
 								      " 0   longitude           20640 non-null  float64\n",
 								      " 1   latitude            20640 non-null  float64\n",
 								      " 2   housing_median_age  20640 non-null  float64\n",
 								      " 3   total_rooms         20640 non-null  float64\n",
 								      " 4   total_bedrooms      20433 non-null  float64\n",
 								      " 5   population          20640 non-null  float64\n",
 								      " 6   households          20640 non-null  float64\n",
 								      " 7   median_income       20640 non-null  float64\n",
 								      " 8   median_house_value  20640 non-null  float64\n",
 								      " 9   ocean_proximity     20640 non-null  object \n",
 								      "dtypes: float64(9), object(1)\n",
 								      "memory usage: 1.6+ MB\n"
 								     ]
 								    }
 								   ],
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "source": [
 								    "housing.info()"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Crop long outputs so they show up nicer on github.com

											
										
										
											2019-04-15 18:06:57 +02:00
+								   "execution_count": 7,
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "text/plain": [
 								       "<1H OCEAN     9136\n",
 								       "INLAND        6551\n",
 								       "NEAR OCEAN    2658\n",
 								       "NEAR BAY      2290\n",
 								       "ISLAND           5\n",
 								       "Name: ocean_proximity, dtype: int64"
 								      ]
 								     },
 								     "execution_count": 7,
 								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "source": [
 								    "housing[\"ocean_proximity\"].value_counts()"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Crop long outputs so they show up nicer on github.com

											
										
										
											2019-04-15 18:06:57 +02:00
+								   "execution_count": 8,
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "text/html": [
 								       "<div>\n",
 								       "<style scoped>\n",
 								       "    .dataframe tbody tr th:only-of-type {\n",
 								       "        vertical-align: middle;\n",
 								       "    }\n",
 								       "\n",
 								       "    .dataframe tbody tr th {\n",
 								       "        vertical-align: top;\n",
 								       "    }\n",
 								       "\n",
 								       "    .dataframe thead th {\n",
 								       "        text-align: right;\n",
 								       "    }\n",
 								       "</style>\n",
 								       "<table border=\"1\" class=\"dataframe\">\n",
 								       "  <thead>\n",
 								       "    <tr style=\"text-align: right;\">\n",
 								       "      <th></th>\n",
 								       "      <th>longitude</th>\n",
 								       "      <th>latitude</th>\n",
 								       "      <th>housing_median_age</th>\n",
 								       "      <th>total_rooms</th>\n",
 								       "      <th>total_bedrooms</th>\n",
 								       "      <th>population</th>\n",
 								       "      <th>households</th>\n",
 								       "      <th>median_income</th>\n",
 								       "      <th>median_house_value</th>\n",
 								       "    </tr>\n",
 								       "  </thead>\n",
 								       "  <tbody>\n",
 								       "    <tr>\n",
 								       "      <th>count</th>\n",
 								       "      <td>20640.000000</td>\n",
 								       "      <td>20640.000000</td>\n",
 								       "      <td>20640.000000</td>\n",
 								       "      <td>20640.000000</td>\n",
 								       "      <td>20433.000000</td>\n",
 								       "      <td>20640.000000</td>\n",
 								       "      <td>20640.000000</td>\n",
 								       "      <td>20640.000000</td>\n",
 								       "      <td>20640.000000</td>\n",
 								       "    </tr>\n",
 								       "    <tr>\n",
 								       "      <th>mean</th>\n",
 								       "      <td>-119.569704</td>\n",
 								       "      <td>35.631861</td>\n",
 								       "      <td>28.639486</td>\n",
 								       "      <td>2635.763081</td>\n",
 								       "      <td>537.870553</td>\n",
 								       "      <td>1425.476744</td>\n",
 								       "      <td>499.539680</td>\n",
 								       "      <td>3.870671</td>\n",
 								       "      <td>206855.816909</td>\n",
 								       "    </tr>\n",
 								       "    <tr>\n",
 								       "      <th>std</th>\n",
 								       "      <td>2.003532</td>\n",
 								       "      <td>2.135952</td>\n",
 								       "      <td>12.585558</td>\n",
 								       "      <td>2181.615252</td>\n",
 								       "      <td>421.385070</td>\n",
 								       "      <td>1132.462122</td>\n",
 								       "      <td>382.329753</td>\n",
 								       "      <td>1.899822</td>\n",
 								       "      <td>115395.615874</td>\n",
 								       "    </tr>\n",
 								       "    <tr>\n",
 								       "      <th>min</th>\n",
 								       "      <td>-124.350000</td>\n",
 								       "      <td>32.540000</td>\n",
 								       "      <td>1.000000</td>\n",
 								       "      <td>2.000000</td>\n",
 								       "      <td>1.000000</td>\n",
 								       "      <td>3.000000</td>\n",
 								       "      <td>1.000000</td>\n",
 								       "      <td>0.499900</td>\n",
 								       "      <td>14999.000000</td>\n",
 								       "    </tr>\n",
 								       "    <tr>\n",
 								       "      <th>25%</th>\n",
 								       "      <td>-121.800000</td>\n",
 								       "      <td>33.930000</td>\n",
 								       "      <td>18.000000</td>\n",
 								       "      <td>1447.750000</td>\n",
 								       "      <td>296.000000</td>\n",
 								       "      <td>787.000000</td>\n",
 								       "      <td>280.000000</td>\n",
 								       "      <td>2.563400</td>\n",
 								       "      <td>119600.000000</td>\n",
 								       "    </tr>\n",
 								       "    <tr>\n",
 								       "      <th>50%</th>\n",
 								       "      <td>-118.490000</td>\n",
 								       "      <td>34.260000</td>\n",
 								       "      <td>29.000000</td>\n",
 								       "      <td>2127.000000</td>\n",
 								       "      <td>435.000000</td>\n",
 								       "      <td>1166.000000</td>\n",
 								       "      <td>409.000000</td>\n",
 								       "      <td>3.534800</td>\n",
 								       "      <td>179700.000000</td>\n",
 								       "    </tr>\n",
 								       "    <tr>\n",
 								       "      <th>75%</th>\n",
 								       "      <td>-118.010000</td>\n",
 								       "      <td>37.710000</td>\n",
 								       "      <td>37.000000</td>\n",
 								       "      <td>3148.000000</td>\n",
 								       "      <td>647.000000</td>\n",
 								       "      <td>1725.000000</td>\n",
 								       "      <td>605.000000</td>\n",
 								       "      <td>4.743250</td>\n",
 								       "      <td>264725.000000</td>\n",
 								       "    </tr>\n",
 								       "    <tr>\n",
 								       "      <th>max</th>\n",
 								       "      <td>-114.310000</td>\n",
 								       "      <td>41.950000</td>\n",
 								       "      <td>52.000000</td>\n",
 								       "      <td>39320.000000</td>\n",
 								       "      <td>6445.000000</td>\n",
 								       "      <td>35682.000000</td>\n",
 								       "      <td>6082.000000</td>\n",
 								       "      <td>15.000100</td>\n",
 								       "      <td>500001.000000</td>\n",
 								       "    </tr>\n",
 								       "  </tbody>\n",
 								       "</table>\n",
 								       "</div>"
 								      ],
 								      "text/plain": [
 								       "          longitude      latitude  housing_median_age   total_rooms  \\\n",
 								       "count  20640.000000  20640.000000        20640.000000  20640.000000   \n",
 								       "mean    -119.569704     35.631861           28.639486   2635.763081   \n",
 								       "std        2.003532      2.135952           12.585558   2181.615252   \n",
 								       "min     -124.350000     32.540000            1.000000      2.000000   \n",
 								       "25%     -121.800000     33.930000           18.000000   1447.750000   \n",
 								       "50%     -118.490000     34.260000           29.000000   2127.000000   \n",
 								       "75%     -118.010000     37.710000           37.000000   3148.000000   \n",
 								       "max     -114.310000     41.950000           52.000000  39320.000000   \n",
 								       "\n",
 								       "       total_bedrooms    population    households  median_income  \\\n",
 								       "count    20433.000000  20640.000000  20640.000000   20640.000000   \n",
 								       "mean       537.870553   1425.476744    499.539680       3.870671   \n",
 								       "std        421.385070   1132.462122    382.329753       1.899822   \n",
 								       "min          1.000000      3.000000      1.000000       0.499900   \n",
 								       "25%        296.000000    787.000000    280.000000       2.563400   \n",
 								       "50%        435.000000   1166.000000    409.000000       3.534800   \n",
 								       "75%        647.000000   1725.000000    605.000000       4.743250   \n",
 								       "max       6445.000000  35682.000000   6082.000000      15.000100   \n",
 								       "\n",
 								       "       median_house_value  \n",
 								       "count        20640.000000  \n",
 								       "mean        206855.816909  \n",
 								       "std         115395.615874  \n",
 								       "min          14999.000000  \n",
 								       "25%         119600.000000  \n",
 								       "50%         179700.000000  \n",
 								       "75%         264725.000000  \n",
 								       "max         500001.000000  "
 								      ]
 								     },
 								     "execution_count": 8,
 								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "source": [
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								    "housing.describe()"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
-												Add some section headers

											
										
										
											2021-10-02 13:14:44 +02:00
+								  {
 								   "cell_type": "markdown",
 								   "metadata": {},
 								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "The following cell is not shown in the book either. It creates the `images/end_to_end_project` folder (if it doesn't already exist), and it defines the `save_fig()` function, which is used throughout this notebook to save the figures in high-res for the book."
-												Add some section headers

											
										
										
											2021-10-02 13:14:44 +02:00
+								   ]
 								  },
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 9,
-												Clarify stratified sampling paragraph in ch02

											
										
										
											2017-10-16 14:19:08 +02:00
+								   "metadata": {},
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   "outputs": [],
 								   "source": [
-												Sync notebook with book's code examples, and better identify extra code

											
										
										
											2022-02-19 06:17:36 +01:00
+								    "# extra code – code to save the figures as high-res PNGs for the book\n",
-												Clarify a few messages

											
										
										
											2021-10-28 05:02:31 +02:00
+								    "\n",
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "IMAGES_PATH = Path() / \"images\" / \"end_to_end_project\"\n",
 								    "IMAGES_PATH.mkdir(parents=True, exist_ok=True)\n",
 								    "\n",
 								    "def save_fig(fig_id, tight_layout=True, fig_extension=\"png\", resolution=300):\n",
 								    "    path = IMAGES_PATH / f\"{fig_id}.{fig_extension}\"\n",
 								    "    if tight_layout:\n",
 								    "        plt.tight_layout()\n",
 								    "    plt.savefig(path, format=fig_extension, dpi=resolution)"
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   ]
 								  },
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 10,
-												Clarify stratified sampling paragraph in ch02

											
										
										
											2017-10-16 14:19:08 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "image/png": "iVBORw0KGgoAAAANSUhEUgAAA04AAAIwCAYAAACx/zuEAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/MnkTPAAAACXBIWXMAAAsTAAALEwEAmpwYAACBGklEQVR4nOzde7xcVX3//9ebO4IoFDkNCTWoQblEQVIKxepRRKJQwQt+4w8FFBu1WNHGSqKt19JGK6hopUZFQuUWL5SUixiQU0vLRUAkXEQCpBASCSpCghpJ+Pz+WGvIzmTOmTlnbntm3s/HYx5n9tq3z9pnZs1ee6+1tiICMzMzMzMzG90W3Q7AzMzMzMys7FxxMjMzMzMzq8MVJzMzMzMzszpccTIzMzMzM6vDFSczMzMzM7M6XHEyMzMzMzOrwxWnPifpHEmXdnifJ0pa28btr5V0Yru2b2at04oySNInJN3eqpiqtr2rpJA03I7tm/USSSOSvtzlGJZL+lA3Y2in6nOYXP68uYsh2Ti44mTtcBHwvMpEO096zKx/SJqaTyJmVM36HPCKwnIdvyBkZh3zp8BXuh1EB00C/rPbQVhjtup2ANZ/IuJ3wO+6HYeZ9YeIWAu07S62mZVHRDzS7Rg6KSJ+0e0YrHG+4zRAJG0r6QuSHpb0e0nXS3pZYf5wvtp7mKQbJP1W0k2SXlq1nXdKeiDP/09Jfy0pCvOfbqqXb0d/HNg3bzsqt6hr3Z6uvkUv6QW56cDvJd0t6aga+Zos6UJJj+bXZZKmteaomVmrSJop6b/z9/TXkq6UtHdhkfvz3x/n8mEkr/f0XWtJnwBOAI4slCnDo92tqi5nJP2ppJtzmfIT4M9qxLlPLkfWSFot6QJJf9zSg2FWXltI+idJv8yf/89J2gJA0s6SFubv8O8kXSVp38qKtZrqF84tds3Tz5L073nbv5d0n6QPFJavPg8ISbMlfVvSE3n5t1Xt488k3VL5Xkt6XaNNcAvxvTaXDb/L5dQUSa+Q9FOl5nWXSvqjqnXfIenOvN+fS/pg5Vjl+Y2cw1SXUfPzsr/Lx+KzkrYrzP+EpNslzZJ0by6n/qNyfBvI759K+kH+/z4u6VpJh1Qts5ek/yrE/Tpt3sRwIM+9XHEaLJ8F/h/wTuAAYCnwfUmTqpb7Z2Au8FLgV8B5kgSQv1xfB/4V2B9YDHxyjH1eBJwO3E26HT0pp9WVC5+LSZ/TQ3LcnwC2LSzzDOAa4PekpjyHAKuAq/I8MyuPHYAvAAcBw8BjwH9K2ibPPyj/nUkqK95YYxufAxYBV7GxTPnfRnYuaQfgMuA+YAapnPtc1TKTgB8Bt+d4Xg3sCCwunhCZ9bHjgPXAnwPvAz5AOncAOId0seFo0vfjt6TziO3Hsf1/BKYDRwEvIv22P1RnnY8BlwAvIZ1DnC3puQCSdgQuBX4GHAh8GPiXccRT8UlSXv8M2Dnv52PAbFJ5tS/pHIS8378C/ikvszcwBzgV+Os8v+45zCieyMvunbc1C/ho1TJTSf+TNwCvIZ3TndZgPp8J/DvwF6T/4a3A5YWKbSXu9cDBwImkC+A+9wKICL/6+EUq5C4lnbD8ATi+MG9L4F7gH/P0MBDAEYVlDs1pU/L0BcD3q/axIH2Unp4+EVhbmP4EcHuN2AJ4c1XacuBD+f1rgA3AnxTmvyyvd2KefidwD6CqfP0KeEu3j79ffg36q1IGjTJvh/wdf1menpq/3zOqltukDKm1zTHWfbqcIZ0A/QbYsTD/bXmZ4Tz9KeDqqm3snJc5qNvH0y+/2vkCRoDrqtKWkC6YTsvfg5cX5j2LdAHkXXl6k9//nFY5t9g1Ty8GvjlGDE+fB+TpAP65ML0VqcL2tjz9buDXwPaFZf6/4ve6Tp4r8RXPfd6X015aSKsuhx4A3l61rQ8Ad+b3dc9hCvl78xjxvQdYVhXH74FnFdI+WlxmnP9zkSo9leN5BKn
SNLmwzJ/jcy8iwn2cBsjzga2B/6kkRMQGSdcB+1Qte1vh/cr8dzdgBenqUHUnxhuAv2pptMnewEMR8UDVvp4qTB8I7AmsyTfFKp5ByrOZlYSk5wOfJl3RfQ7pSuwWwJ90KIS9gdsi9ZmquK5qmQOBl1c3N8qeD9zYruDMSuK2qumVpHOAvUm/v09/ZyLiMUlL2fw8YixnAd9R6gawBPjPiPivRmOKiPWSHskxQTovuT1S/+qKG8YRz2b7AB7Of5dWpe0GIOk5wB7AVyWdVVhmK1JFBBo7h9lMbrb3AeAFpLvdW+ZX0f9FxGOF6cr/qC5Ju5HK4VcCQ3nb27OxHH4RsDIiincBf4zPvQAPDjFIKp/sqDGvOu3JGvMqTVQ0yjYmIgpxVWxdeF89r5YtSLeZZ9WY9+uJhWVmbfKfpCY5785/1wN3AtuMtVKDKj/qT5cbkrauWqbRMuUyoNZwyA/XSDPrN09WTQfpezHW96dyXvBUjeU2+R5GxBW5md1rgcOAyyR9OyLeMYGYoHXnJZud+0REdVpln5W/72H0psKNlDebriAdDFxIajb4QdId8tdT1aSYsY9HPQtJFaYPku7urQOuZmM53MjxHNhzL1ecBscyUlO9l5Ha9yNpS1K71PPHsZ272NgPoaJ6utof2PxqCcAjpP4J5HiGitOkE6rJkvaIiAcL+yoWDrcAbwV+GRG/qRu9mXVF7lS9N3ByRFyT017Kpr9Df8h/a5UXRbXKlMpIXMUyZP+qZe4ETpC0Q0Q8kdMOrlrmFuAtpCu61ScnZoPsTjb21/kRgKSdSP2VvpmXeQR4hqSdIuLxnLZ/9YYi4pekfjb/LukK4AJJ74mIdROI6y7geEnbF+461TsvaUpEPCzpIeD5EXHuKIs1cg5T7VDSXapPVxIqfbla6GXA+yPisrz96nOvu0hx7x4RlVZHM/C5F+DBIQZGPkk4C5ifR0fZO08PMb7nJZwJvEbS30maJukkUufEsSwHnivppUoPm6x0MPwhcLKkGZIOIPVb+H1hvatInT3PlbR/Hpji86Sr1BXnka4CX5JHv9lT0sslnT4Io7uY9ZBHgV8Cf5VHmnoF8G9s+n1eTXqUwRGShiQ9a5RtLQf2k/TCXKZsnU+YrgdOlbSvpD9n86u05+f9nZ2XOZzNO13/K6nfxkVKI3U9T9KrJS2Q9MyJZ9+st0XEPaQBGr4q6S8kTQe+BTzOxguwN5AGN/jn/D1/E3mwhApJn5J0TD6H2Js0CMx9E6w0QToP2AB8TWlEzFcDH6mEPcFtNuITwIeVRtJ7oaT9JB0vaV6e38g5TLWfkyotx+Wy572kCkor/Rx4Wz5Wf0q6w/WHwvwlpAG9Fkp6Sb4LdkaOu3I8B/bcyxWnwXIqaTSqb5Jusb4YmBkRqxrdQERcR+rP9H5Se+BjgM+waYWn2neBy0m3gh9hYyEwh3T3awT4Dqnz6erCvp4iVcq2IBXG55JG41lXWOa3wMvzdr5NKqQWkjpzP9povsysvfL3+f+Ryp3bSRWUf2DT7/N6UtnyLlKb/UtG2dzXSFdFbyKVKYfm9Hfmvz8Gvgr8fVUMa0kjeU0jXTH9HKlcLC6zMm/vKeD7wB051nXFWM0G1DtI/fwW57/PIJ1H/A4gIn5NGpXvcFL/oNmk73nROtIIcD8l9bt+JvCXEw0of6//kjTq3U9II+p9Is8e69ykKRHxdVKZ83ZSXv6blN/78/y65zA1tvmfpPi/QDrHOpw0al8rvZPUd+pmUqXpbNLFqEoMlbi3Jf2PF5L+X0E+noN87qU8EobZhEn6PPDqiJje7VjMzMxssEk6mjSk9m65WaA1QdJLSBfcZ0TEzV0Op6vcx8nGTdLfkW7lriU94+Q9bLwtbmZmZtYxkk4g3f14ENiPdMfmP11pmhhJbyA1ubyH9KiHM0h31W7pYlil4IqTTcQM0ohTzyLdkp4HfLGrEZmZmdmgGiKNRDcJ+AVpZMxTAST9G+l5bbV8KyLe05E
IO2SURylUvDYi/ruBzTyT1A1jD1LTuxHgg+Fmam6qZ2ZmZmb9KT+3aKdRZj8eEatHmdeTJL1gjNkPVT3vysbJFSczMzMzM7M6PKqemZm
 								      "text/plain": [
 								       "<Figure size 864x576 with 9 Axes>"
 								      ]
 								     },
 								     "metadata": {
 								      "needs_background": "light"
 								     },
 								     "output_type": "display_data"
 								    }
 								   ],
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "import matplotlib.pyplot as plt\n",
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								    "\n",
-												Sync notebook with book's code examples, and better identify extra code

											
										
										
											2022-02-19 06:17:36 +01:00
+								    "# extra code – the next 5 lines define the default font sizes\n",
-												Make font sizes consistent across notebooks

											
										
										
											2021-11-27 11:03:26 +01:00
+								    "plt.rc('font', size=14)\n",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								    "plt.rc('axes', labelsize=14, titlesize=14)\n",
 								    "plt.rc('legend', fontsize=14)\n",
-												Remove redundant comment

											
										
										
											2021-12-08 03:16:42 +01:00
+								    "plt.rc('xtick', labelsize=10)\n",
 								    "plt.rc('ytick', labelsize=10)\n",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								    "\n",
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "housing.hist(bins=50, figsize=(12, 8))\n",
-												Sync notebook with book's code examples, and better identify extra code

											
										
										
											2022-02-19 06:17:36 +01:00
+								    "save_fig(\"attribute_histogram_plots\")  # extra code\n",
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "plt.show()"
 								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
 								   "metadata": {},
 								   "source": [
 								    "## Create a Test Set"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 11,
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "outputs": [],
 								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "import numpy as np\n",
 								    "\n",
 								    "def shuffle_and_split_data(data, test_ratio):\n",
 								    "    shuffled_indices = np.random.permutation(len(data))\n",
 								    "    test_set_size = int(len(data) * test_ratio)\n",
 								    "    test_indices = shuffled_indices[:test_set_size]\n",
 								    "    train_indices = shuffled_indices[test_set_size:]\n",
 								    "    return data.iloc[train_indices], data.iloc[test_indices]"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 12,
-												Update all notebooks assuming we are all in the future now: sklearn 0.20+, python 3.5+, TF 2.0 preview

											
										
										
											2019-01-18 16:08:37 +01:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "text/plain": [
 								       "16512"
 								      ]
 								     },
 								     "execution_count": 12,
 								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
-												Update all notebooks assuming we are all in the future now: sklearn 0.20+, python 3.5+, TF 2.0 preview

											
										
										
											2019-01-18 16:08:37 +01:00
+								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "train_set, test_set = shuffle_and_split_data(housing, 0.2)\n",
 								    "len(train_set)"
-												Update all notebooks assuming we are all in the future now: sklearn 0.20+, python 3.5+, TF 2.0 preview

											
										
										
											2019-01-18 16:08:37 +01:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 13,
-												Clarify stratified sampling paragraph in ch02

											
										
										
											2017-10-16 14:19:08 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "text/plain": [
 								       "4128"
 								      ]
 								     },
 								     "execution_count": 13,
 								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "len(test_set)"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
-												Improve the implementation of the test_set_check() function: faster, supports python 2 and 3, and more fine grain split (32 bits intead of 8)

											
										
										
											2018-04-03 16:45:53 +02:00
+								  {
 								   "cell_type": "markdown",
 								   "metadata": {},
 								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "To ensure that this notebook's outputs remain the same every time we run it, we need to set the random seed:"
-												Improve the implementation of the test_set_check() function: faster, supports python 2 and 3, and more fine grain split (32 bits intead of 8)

											
										
										
											2018-04-03 16:45:53 +02:00
+								   ]
 								  },
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 14,
-												Clarify stratified sampling paragraph in ch02

											
										
										
											2017-10-16 14:19:08 +02:00
+								   "metadata": {},
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   "outputs": [],
 								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "np.random.seed(42)"
 								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
 								   "metadata": {},
 								   "source": [
 								    "Sadly, this won't guarantee that this notebook will output exactly the same results as in the book, since there are other possible sources of variation. The most important is the fact that algorithms get tweaked over time when libraries evolve. So please tolerate some minor differences: hopefully, most of the outputs should be the same, or at least in the right ballpark."
-												Improve the implementation of the test_set_check() function: faster, supports python 2 and 3, and more fine grain split (32 bits intead of 8)

											
										
										
											2018-04-03 16:45:53 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
 								   "metadata": {},
 								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "Note: another source of randomness is the order of Python sets: it is based on Python's `hash()` function, which is randomly \"salted\" when Python starts up (this started in Python 3.3, to prevent some denial-of-service attacks). To remove this randomness, the solution is to set the `PYTHONHASHSEED` environment variable to `\"0\"` _before_ Python even starts up. Nothing will happen if you do it after that. Luckily, if you're running this notebook on Colab, the variable is already set for you."
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 15,
-												Clarify stratified sampling paragraph in ch02

											
										
										
											2017-10-16 14:19:08 +02:00
+								   "metadata": {},
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "outputs": [],
-												Improve the implementation of the test_set_check() function: faster, supports python 2 and 3, and more fine grain split (32 bits intead of 8)

											
										
										
											2018-04-03 16:45:53 +02:00
+								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "from zlib import crc32\n",
 								    "\n",
 								    "def is_id_in_test_set(identifier, test_ratio):\n",
 								    "    return crc32(np.int64(identifier)) < test_ratio * 2**32\n",
 								    "\n",
 								    "def split_data_with_id_hash(data, test_ratio, id_column):\n",
 								    "    ids = data[id_column]\n",
 								    "    in_test_set = ids.apply(lambda id_: is_id_in_test_set(id_, test_ratio))\n",
 								    "    return data.loc[~in_test_set], data.loc[in_test_set]"
-												Improve the implementation of the test_set_check() function: faster, supports python 2 and 3, and more fine grain split (32 bits intead of 8)

											
										
										
											2018-04-03 16:45:53 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 16,
-												Improve the implementation of the test_set_check() function: faster, supports python 2 and 3, and more fine grain split (32 bits intead of 8)

											
										
										
											2018-04-03 16:45:53 +02:00
+								   "metadata": {},
 								   "outputs": [],
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "housing_with_id = housing.reset_index()  # adds an `index` column\n",
 								    "train_set, test_set = split_data_with_id_hash(housing_with_id, 0.2, \"index\")"
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 17,
-												Clarify stratified sampling paragraph in ch02

											
										
										
											2017-10-16 14:19:08 +02:00
+								   "metadata": {},
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   "outputs": [],
 								   "source": [
 								    "housing_with_id[\"id\"] = housing[\"longitude\"] * 1000 + housing[\"latitude\"]\n",
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "train_set, test_set = split_data_with_id_hash(housing_with_id, 0.2, \"id\")"
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 18,
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   "outputs": [],
 								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "from sklearn.model_selection import train_test_split\n",
 								    "\n",
 								    "train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 19,
-												Clarify stratified sampling paragraph in ch02

											
										
										
											2017-10-16 14:19:08 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "text/plain": [
 								       "44"
 								      ]
 								     },
 								     "execution_count": 19,
 								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "source": [
-												Replace np.round(a) with a.round(), and other similar changes

											
										
										
											2021-11-01 02:42:42 +01:00
+								    "test_set[\"total_bedrooms\"].isnull().sum()"
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
 								   "metadata": {},
 								   "source": [
 								    "To find the probability that a random sample of 1,000 people contains less than 48.5% female or more than 53.5% female when the population's female ratio is 51.1%, we use the [binomial distribution](https://en.wikipedia.org/wiki/Binomial_distribution). The `cdf()` method of the binomial distribution gives us the probability that the number of females will be equal or less than the given value."
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 20,
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "name": "stdout",
 								     "output_type": "stream",
 								     "text": [
 								      "0.10736798530929946\n"
 								     ]
 								    }
 								   ],
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   "source": [
-												Sync notebook with book's code examples, and better identify extra code

											
										
										
											2022-02-19 06:17:36 +01:00
+								    "# extra code – shows how to compute the 10.7% proba of getting a bad sample\n",
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "\n",
 								    "from scipy.stats import binom\n",
 								    "\n",
 								    "sample_size = 1000\n",
 								    "ratio_female = 0.511\n",
 								    "proba_too_small = binom(sample_size, ratio_female).cdf(485 - 1)\n",
 								    "proba_too_large = 1 - binom(sample_size, ratio_female).cdf(535)\n",
 								    "print(proba_too_small + proba_too_large)"
 								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
 								   "metadata": {},
 								   "source": [
 								    "If you prefer simulations over maths, here's how you could get roughly the same result:"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 21,
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "text/plain": [
 								       "0.1071"
 								      ]
 								     },
 								     "execution_count": 21,
 								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "source": [
-												Sync notebook with book's code examples, and better identify extra code

											
										
										
											2022-02-19 06:17:36 +01:00
+								    "# extra code – shows another way to estimate the probability of bad sample\n",
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "\n",
 								    "np.random.seed(42)\n",
 								    "\n",
 								    "samples = (np.random.rand(100_000, sample_size) < ratio_female).sum(axis=1)\n",
 								    "((samples < 485) | (samples > 535)).mean()"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 22,
-												Clarify stratified sampling paragraph in ch02

											
										
										
											2017-10-16 14:19:08 +02:00
+								   "metadata": {},
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "outputs": [],
 								   "source": [
-												Use pd.cut() for income_cat

											
										
										
											2019-03-15 16:49:03 +01:00
+								    "housing[\"income_cat\"] = pd.cut(housing[\"median_income\"],\n",
 								    "                               bins=[0., 1.5, 3.0, 4.5, 6., np.inf],\n",
 								    "                               labels=[1, 2, 3, 4, 5])"
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 23,
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAaAAAAEQCAYAAAD2/KAsAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/MnkTPAAAACXBIWXMAAAsTAAALEwEAmpwYAAAiEUlEQVR4nO3de5gV1Znv8e9PvBEVL1F7EFA4CXFEeDTSQ3AcTauZiNETnOQYyTERo4aJMUYnZBSSY66HSC460ZnRCcYEGTUMJ8YDx0sSRFsnEy8BxRDxAioqQsC7tBdM43v+qNVh2+7urrb37tp79+/zPPvZVauqVr17KbxU1aq1FBGYmZn1t22KDsDMzAYmJyAzMyuEE5CZmRXCCcjMzArhBGRmZoVwAjIzs0JsW3QA/WXPPfeMkSNHFh0GAK+88go77bRT0WHUHLdLeW6Xrrltyquldlm2bNmzEbFXuW0DJgGNHDmSpUuXFh0GAK2trbS0tBQdRs1xu5Tnduma26a8WmoXSU90tc234MzMrBBOQGZmVggnIDMzK4QTkJmZFcIJyMzMCuEEZGZmhXACMjOzQjgBmZlZIZyAzMysEANmJASzIoyccWOf65g+rp1TK1DPmtnH9bkOs0ryFZCZmRXCCcjMzArRbwlI0v6Slpd8XpZ0rqQ9JC2WtCp9715yzExJqyU9LOmYkvLxklakbZdKUn/9DjMzq4x+S0AR8XBEHBwRBwPjgVeB64EZwJKIGA0sSetIGgNMAQ4EJgGXSRqUqrscmAaMTp9J/fU7zMysMoq6BXc08GhEPAFMBq5K5VcBJ6TlycD8iNgcEY8Dq4EJkoYCQyLizogIYF7JMWZmVieKSkBTgJ+l5aaIWA+QvvdO5cOAp0qOWZvKhqXlzuVmZlZH+r0btqTtgY8CM3vatUxZdFNe7lzTyG7V0dTURGtra/5Aq6itra1mYqkljdgu08e197mOpsGVqafR2hYa8/+ZSqiXdiniPaBjgXsjYkNa3yBpaESsT7fXNqbytcCIkuOGA+tS+fAy5W8TEXOAOQDNzc1RKzME1tJshbWkEdulEu/vTB/XzkUr+v5Hdc3JLX2uo9Y04v8zlVAv7VLELbhPsvX2G8AiYGpangosLCmfImkHSaPIOhvck27TbZI0MfV+O6XkGDMzqxP9egUk6V3A3wJ/X1I8G1gg6XTgSeBEgIh4QNICYCXQDpwVEVvSMWcCc4HBwM3pY2ZmdaRfE1BEvAq8u1PZc2S94srtPwuYVaZ8KTC2GjGamVn/8EgIZmZWCCcgMzMrhBOQmZkVwgnIzMwK4QRkZmaFcAIyM7NCOAGZmVkhPCW3VYSnnjaz3vIVkJmZFcIJyMzMCuEEZGZmhXACMjOzQjgBmZlZIZyAzMysEE5AZmZWCCcgMzMrhBOQmZkVwgnIzMwK4QRkZmaFcAIyM7NC9GsCkrSbpJ9LekjSg5IOlbSHpMWSVqXv3Uv2nylptaSHJR1TUj5e0oq07VJJ6s/fYWZmfdffV0CXAL+MiL8EDgIeBGYASyJiNLAkrSNpDDAFOBCYBFwmaVCq53JgGjA6fSb1548wM7O+e8cJSNJ7Je3Yi/2HAEcAVwJExBsR8SIwGbgq7XYVcEJangzMj4jNEfE4sBqYIGkoMCQi7oyIAOaVHGNmZnUiVwKS9B1JU9OyJC0GHgHWS/pAznP9N+AZ4KeS7pP0Y0k7AU0RsR4gfe+d9h8GPFVy/NpUNiwtdy43M7M6kndCupOBk9LyscDBwMRUPhs4Mue5DgHOjoi7JV1Cut3WhXLPdaKb8rdXIE0ju1VHU1MTra2tOcKsvra2tpqJpVKmj2vvcx1NgytTTy21rduluhrxz1Il1Eu75E1ATWy96vgIsCAi7pH0PLA0Zx1rgbURcXda/zlZAtogaWhErE+31zaW7D+i5PjhwLpUPrxM+dtExBxgDkBzc3O0tLTkDLW6WltbqZVYKqU
SM5lOH9fORSv6PknvmpNb+lxHpbhdqqsR/yxVQr20S95nQM8B+6XlDwO3puVtKX9F8jYR8UfgKUn7p6KjgZXAImBqKpsKLEzLi4ApknaQNIqss8E96TbdJkkTU++3U0qOMTOzOpH3n1XXAddKegTYA/hlKj+YrHNAXmcD10jaHngM+AxZElwg6XTgSeBEgIh4QNICsiTVDpwVEVtSPWcCc4HBwM3pY2ZmdSRvAvoS8ASwL3BeRLySyoeSdYnOJSKWA81lNh3dxf6zgFllypcCY/Oe18zMak/eBLQP8E8R8Wan8h/y1uc0ZmZmueR9BvQ4sGeZ8j3SNjMzs17Jm4BE+a7OOwOvVy4cMzMbKLq9BSfp0rQYwIWSXi3ZPAiYACyvTmhmZtbIenoGNC59CzgAeKNk2xvAvcAPqhCXmZk1uG4TUEQcCSDpp8A5EfFyv0RlZmYNL+8zoK8AQzoXShouqamyIZmZ2UCQNwHNIxsDrrNjgH+vXDhmZjZQ5E1AfwXcUab8Pyn/YqmZmVm38iagbYEdypTv2EW5mZlZt/ImoLvJxl/r7Czgd5ULx8zMBoq8Q/F8FbhV0kFk02YDHAW8H/hQNQIzM7PGlusKKCLuAg4lG8H6Y8DHyYbgOTQiflu98MzMrFHlnuUqIu4HPlXFWMzMbADpMgFJ2iMinu9Y7q6Sjv3MzMzy6u4K6Jk0VfZG4FnKD0baMUjpoGoEZ2Zmjau7BHQU0HFlc2Q/xGJmZgNIlwkoIm4HkLQtcCDwfyNiXX8FZmZmja3HXnAR0Q58H9iu+uGYmdlAkfdF1LuA8dUMxMzMBpa83bCvAH4gaV9gGfBK6caIuDdPJZLWAJuALUB7RDSnHnb/AYwE1gCfiIgX0v4zgdPT/l+MiF+l8vHAXGAwcBPZVBHlOkmYmVmNypuArk3fF5fZ1ttecEdGxLMl6zOAJRExW9KMtH6+pDHAFLLnT/sAt0h6X0RsAS4HppFdmd0ETAJu7kUMZmZWsLwJaFQVY5gMtKTlq4BW4PxUPj8iNgOPS1oNTEhXUUMi4k4ASfOAE3ACMjOrK3mfAe0HPB0RT5R+gKfTtrwC+LWkZZKmpbKmiFgPkL73TuXDgKdKjl2byoal5c7lZmZWR/JeAd0GDAU2dirfNW3LewvusIhYJ2lvYLGkh7rZV2XKopvyt1eQJblpAE1NTbS2tuYMs7ra2tpqJpZKmT6uvc91NA2uTD211LZul+pqxD9LlVAv7ZI3AXWMeNDZu+nUIaE7He8RRcRGSdcDE4ANacSF9ZJKk9xaYETJ4cOBdal8eJnycuebA8wBaG5ujpaWlryhVlVrayu1EkulnDrjxj7XMX1cOxetyD08YZfWnNzS5zoqxe1SXY34Z6kS6qVduv2/WtKitBjA1ZI2l2weBIwFco2GLWknYJuI2JSWPwx8C1gETAVmp++F6ZBFwLWSLibrhDAauCcitkjaJGki2TxFpwD/nCcGMzOrHT39s+q59C3gBeC1km1vAL8h66KdRxNwvaSO814bEb+U9DtggaTTgSeBEwEi4gFJC4CVQDtwVuoBB9nkeHPJumHfjDsgmJnVnW4TUER8Bv78/s4PIiL37bYydT0GHFSm/Dng6C6OmQXMKlO+lOzqy8zM6lTeXnDfpuTqR9JfSDpD0l9XJywzM2t0eRPQjcDZAJJ2BpaSjQ93u6RTqhSbmZk1sLwJaDxwa1r+GPAy2fs6nwW+XIW4zMysweVNQLsAL6blDwPXR8SfyJLSe6oQl5mZNbi8CehJ4LDUffoYYHEq3wN4tRqBmZlZY8v7dtvFwL8DbcATwB2p/AhgRRXiMjOzBpcrAUXEjyQtIxuZYHFEvJk2PQpcUK3gzMysceUe3yO9e7O0U1nfxxkxM7MBqcsEJOlLwGUR8Xpa7lJElJsnyMzMrEvdXQGdTTY/z+tpuStB+YnqzMzMutRlAoqIUeWWzczMKiFvN2wzM7OK6u4Z0NfyVhIR36pMOGZmNlB09wz
oxE7r+wHvYuvkb/uQvYS6hmxeHzMzs9y6ewY0rmNZ0mfIJn6bGhFPprJ9gZ8C11Q7SDMzazx5nwF9DTi3I/kApOXpwNerEZiZmTW2vAm
 								      "text/plain": [
 								       "<Figure size 432x288 with 1 Axes>"
 								      ]
 								     },
 								     "metadata": {
 								      "needs_background": "light"
 								     },
 								     "output_type": "display_data"
 								    }
 								   ],
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "housing[\"income_cat\"].value_counts().sort_index().plot.bar(rot=0, grid=True)\n",
 								    "plt.xlabel(\"Income category\")\n",
 								    "plt.ylabel(\"Number of districts\")\n",
-												Sync notebook with book's code examples, and better identify extra code

											
										
										
											2022-02-19 06:17:36 +01:00
+								    "save_fig(\"housing_income_cat_bar_plot\")  # extra code\n",
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "plt.show()"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 24,
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Adding missing figure in chapter 02

											
										
										
											2017-06-08 14:23:33 +02:00
+								   "outputs": [],
 								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "from sklearn.model_selection import StratifiedShuffleSplit\n",
 								    "\n",
 								    "splitter = StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=42)\n",
 								    "strat_splits = []\n",
 								    "for train_index, test_index in splitter.split(housing, housing[\"income_cat\"]):\n",
 								    "    strat_train_set_n = housing.loc[train_index]\n",
 								    "    strat_test_set_n = housing.loc[test_index]\n",
 								    "    strat_splits.append([strat_train_set_n, strat_test_set_n])"
-												Adding missing figure in chapter 02

											
										
										
											2017-06-08 14:23:33 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 25,
-												Clarify stratified sampling paragraph in ch02

											
										
										
											2017-10-16 14:19:08 +02:00
+								   "metadata": {},
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "outputs": [],
 								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "strat_train_set, strat_test_set = strat_splits[0]"
 								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
 								   "metadata": {},
 								   "source": [
 								    "It's much shorter to get a single stratified split:"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 26,
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   "outputs": [],
 								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "strat_train_set, strat_test_set = train_test_split(\n",
 								    "    housing, test_size=0.2, stratify=housing[\"income_cat\"], random_state=42)"
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 27,
-												Clarify stratified sampling paragraph in ch02

											
										
										
											2017-10-16 14:19:08 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "text/plain": [
 								       "3    0.350533\n",
 								       "2    0.318798\n",
 								       "4    0.176357\n",
 								       "5    0.114341\n",
 								       "1    0.039971\n",
 								       "Name: income_cat, dtype: float64"
 								      ]
 								     },
 								     "execution_count": 27,
 								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
-												Clarify stratified sampling paragraph in ch02

											
										
										
											2017-10-16 14:19:08 +02:00
+								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "strat_test_set[\"income_cat\"].value_counts() / len(strat_test_set)"
-												Clarify stratified sampling paragraph in ch02

											
										
										
											2017-10-16 14:19:08 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 28,
-												Clarify stratified sampling paragraph in ch02

											
										
										
											2017-10-16 14:19:08 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "text/html": [
 								       "<div>\n",
 								       "<style scoped>\n",
 								       "    .dataframe tbody tr th:only-of-type {\n",
 								       "        vertical-align: middle;\n",
 								       "    }\n",
 								       "\n",
 								       "    .dataframe tbody tr th {\n",
 								       "        vertical-align: top;\n",
 								       "    }\n",
 								       "\n",
 								       "    .dataframe thead th {\n",
 								       "        text-align: right;\n",
 								       "    }\n",
 								       "</style>\n",
 								       "<table border=\"1\" class=\"dataframe\">\n",
 								       "  <thead>\n",
 								       "    <tr style=\"text-align: right;\">\n",
 								       "      <th></th>\n",
 								       "      <th>Overall %</th>\n",
 								       "      <th>Stratified %</th>\n",
 								       "      <th>Random %</th>\n",
 								       "      <th>Strat. Error %</th>\n",
 								       "      <th>Rand. Error %</th>\n",
 								       "    </tr>\n",
 								       "    <tr>\n",
 								       "      <th>Income Category</th>\n",
 								       "      <th></th>\n",
 								       "      <th></th>\n",
 								       "      <th></th>\n",
 								       "      <th></th>\n",
 								       "      <th></th>\n",
 								       "    </tr>\n",
 								       "  </thead>\n",
 								       "  <tbody>\n",
 								       "    <tr>\n",
 								       "      <th>1</th>\n",
 								       "      <td>3.98</td>\n",
 								       "      <td>4.00</td>\n",
 								       "      <td>4.24</td>\n",
 								       "      <td>0.36</td>\n",
 								       "      <td>6.45</td>\n",
 								       "    </tr>\n",
 								       "    <tr>\n",
 								       "      <th>2</th>\n",
 								       "      <td>31.88</td>\n",
 								       "      <td>31.88</td>\n",
 								       "      <td>30.74</td>\n",
 								       "      <td>-0.02</td>\n",
 								       "      <td>-3.59</td>\n",
 								       "    </tr>\n",
 								       "    <tr>\n",
 								       "      <th>3</th>\n",
 								       "      <td>35.06</td>\n",
 								       "      <td>35.05</td>\n",
 								       "      <td>34.52</td>\n",
 								       "      <td>-0.01</td>\n",
 								       "      <td>-1.53</td>\n",
 								       "    </tr>\n",
 								       "    <tr>\n",
 								       "      <th>4</th>\n",
 								       "      <td>17.63</td>\n",
 								       "      <td>17.64</td>\n",
 								       "      <td>18.41</td>\n",
 								       "      <td>0.03</td>\n",
 								       "      <td>4.42</td>\n",
 								       "    </tr>\n",
 								       "    <tr>\n",
 								       "      <th>5</th>\n",
 								       "      <td>11.44</td>\n",
 								       "      <td>11.43</td>\n",
 								       "      <td>12.09</td>\n",
 								       "      <td>-0.08</td>\n",
 								       "      <td>5.63</td>\n",
 								       "    </tr>\n",
 								       "  </tbody>\n",
 								       "</table>\n",
 								       "</div>"
 								      ],
 								      "text/plain": [
 								       "                 Overall %  Stratified %  Random %  Strat. Error %  \\\n",
 								       "Income Category                                                      \n",
 								       "1                     3.98          4.00      4.24            0.36   \n",
 								       "2                    31.88         31.88     30.74           -0.02   \n",
 								       "3                    35.06         35.05     34.52           -0.01   \n",
 								       "4                    17.63         17.64     18.41            0.03   \n",
 								       "5                    11.44         11.43     12.09           -0.08   \n",
 								       "\n",
 								       "                 Rand. Error %  \n",
 								       "Income Category                 \n",
 								       "1                         6.45  \n",
 								       "2                        -3.59  \n",
 								       "3                        -1.53  \n",
 								       "4                         4.42  \n",
 								       "5                         5.63  "
 								      ]
 								     },
 								     "execution_count": 28,
 								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "source": [
-												Sync notebook with book's code examples, and better identify extra code

											
										
										
											2022-02-19 06:17:36 +01:00
+								    "# extra code – computes the data for Figure 2–10\n",
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "\n",
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								    "def income_cat_proportions(data):\n",
 								    "    return data[\"income_cat\"].value_counts() / len(data)\n",
 								    "\n",
 								    "train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)\n",
 								    "\n",
 								    "compare_props = pd.DataFrame({\n",
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "    \"Overall %\": income_cat_proportions(housing),\n",
 								    "    \"Stratified %\": income_cat_proportions(strat_test_set),\n",
 								    "    \"Random %\": income_cat_proportions(test_set),\n",
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								    "}).sort_index()\n",
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "compare_props.index.name = \"Income Category\"\n",
 								    "compare_props[\"Strat. Error %\"] = (compare_props[\"Stratified %\"] /\n",
 								    "                                   compare_props[\"Overall %\"] - 1)\n",
 								    "compare_props[\"Rand. Error %\"] = (compare_props[\"Random %\"] /\n",
-												Clarify the 'not in the book' comments

											
										
										
											2021-11-21 04:40:36 +01:00
+								    "                                  compare_props[\"Overall %\"] - 1)\n",
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "(compare_props * 100).round(2)"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 29,
-												Clarify stratified sampling paragraph in ch02

											
										
										
											2017-10-16 14:19:08 +02:00
+								   "metadata": {},
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "outputs": [],
 								   "source": [
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								    "for set_ in (strat_train_set, strat_test_set):\n",
 								    "    set_.drop(\"income_cat\", axis=1, inplace=True)"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "source": [
-												Add some section headers

											
										
										
											2021-10-02 13:14:44 +02:00
+								    "# Discover and Visualize the Data to Gain Insights"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 30,
-												Fix hyperparameter search and comment at the end of the solution of exercise 5, chapter 2

											
										
										
											2018-01-14 09:11:47 +01:00
+								   "metadata": {},
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "outputs": [],
 								   "source": [
 								    "housing = strat_train_set.copy()"
 								   ]
 								  },
-												Add some section headers

											
										
										
											2021-10-02 13:14:44 +02:00
+								  {
 								   "cell_type": "markdown",
 								   "metadata": {},
 								   "source": [
 								    "## Visualizing Geographical Data"
 								   ]
 								  },
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 31,
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAaAAAAEQCAYAAAD2/KAsAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/MnkTPAAAACXBIWXMAAAsTAAALEwEAmpwYAABo/ElEQVR4nO2deXwV1d24nzNzl4QEAgYNSwDBiAgoUWkRQWRR64LYvlLaSmtbq7Z9tVZFoa2lirb9qWi1KrW1vl14a4uIb2WxiwsggoiiBiQUMYJCQFEiW0K4y8z5/TF3LneZu+YuITnP5xPlznLmzNzkfOe7CyklCoVCoVAUGq3YE1AoFApF50QJIIVCoVAUBSWAFAqFQlEUlABSKBQKRVFQAkihUCgURUEJIIVCoVAUBVexJ5ANPXv2lCeeeGJBrtXS0kJZWVlBrtVe6ezPoLPfP6hnAOoZvPnmm3ullMfncsxjUgCdeOKJrF+/viDXWrlyJePHjy/Itdornf0ZdPb7B/UMQD0DIcSHuR5TmeAUCoVCURSUAFIoFApFUVACSKFQKBRFoeACSAihCyHeFkIsC32eK4TYIoTYKIT4uxCie6HnpFAoFIrCUwwN6IfAfyI+vwAMl1KeDmwFflyEOSkUCoWiwBRUAAkhqoFLgSfsbVLK56WUwdDH14DqQs4pkqZmHxt27qep2VesKbSJY33+CoWic1HoMOyHgJlA1wT7rwaeKthsInjytQ+Zs2wzHl0QNCX3XXE6U2r7FmMqWbG4bhezntmIW9MImOYxN3+FQtH5EIXqBySEmAxcIqX8byHEeOBWKeXkiP23AyOB/5IOkxJCXAdcB1BVVXXWggULcja3z1r87NrfGrVNE4IhvbrSeriF8vLyqH2GKfEbJh5dQ9dE1tfN5ThbPj6EGfHY7Pm3ZVyb5uZmSruU5WSuxyLNzc1xvwOdDfUM1DOYMGHCm1LKkbkcs5Aa0BhgihDiEqAE6CaE+IuU8utCiG8Ck4FJTsIHQEr5OPA4wMiRI2VbEsIa9hyibud+avt1p0eZh9H/7yX8RvSjKPPq/PXcM9j3fl1U8llbNI2mZh+N+1qp7lHK6oa9OdNYNuzcz29fXschXzC8ravXxbyRw6go9VDdo5TKcm9WYwMs/ucLXL/iyDGrXUU+92yeQ2dPQAT1DEA9g3xQMAEkpfwxoQCDCA3o60KIi4BZwHlSysP5nsfPnn2H+a/tCH8+e2APAka8zAsYkuoepeyL2NbU7GPWMxs5EjA5ggnAzGc2MqamZ8qFLVJw+Q0TwzQJmmQ8jhPVPUoJmGbUtiNBg2vnr8ej620SGvbifSSgpzXXtiz2bRUUTmT7whA5F4VCkR/aQymeRwEv8IIQAuA1KeX38nGhhj2HooQPwGvb9zkee+uFg+MWwcZ9rbg1LbwQA7g1jcZ9rUkXTCfBFUs64ySistzLfVeczq1PbwDAlBLTlAQk+IKWVpStgGvc10qswS3RXNuiHebDh5XtC0PsXO49pz38mSgUHY+iJKJKKVfa/h8pZY2Usp+Usjb0kxfhA1C3c3/axw7pFR8n4aRpBEwz5VuyLbiSkc44yVj/wWf4DYnfkARNiFXqbKGRKdU9SonVD53mGrnYH/IFORIwmfnMxrQi8tpybjKcnnuq5+A0l8Z9rSqyUKHIA52qEkJtv+5pH3uwNRi3zdY0StwaXb0uStwa911xekqtwklwuXWB1yUyGicRTppdLNkKuMpyL9U9SlPeczaLfS7OTUaZR8cXNKK2pXoOTnMRoe0KhSK3dCrbQk1VV64a3Z/5a5Mv1gDdSp0fzZTavoyp6ZmRr6Ky3MvsS4cyZ2k9bl3DkFaYd6bjJCIdzW72pUOzvkZFqZs1s8YlnWu22mFbz02EbUbTNAGGxKsLhCZSCnqnucjQdoVCkVs6lQYEcNflp/HizeO487JTEx7j1gXD+lREbYtM8qws9zKiX/e0F/T
Fdbu4+7nNeFwaAVMye/JQptT2zXicRKTS7Mq8OsP7ViQ9JhWp5pqtdpjpuekk20b5fgKWMJFCsOyGsSn9Sk5zyWVQhEKhOEqn0oBsaqq6UlPVlePKvMxYWEfQtN5y3ZpA0wRzp0Yvfm0NvbYXQ5u7l23momG9MlrUkkWIpdLs/MG2aRPpzisb7dAmnXPT/R6cgkW8ukaL34g7Np25vLN+bdr3oVAo0qdTCiCbyIWmzKPT4jfiFj/DlFmHXkP2kXOR2AuvLgQBw+SOy4Yx/ewBUcfcdflpXHX2ify7/mPuf35rVOBAZGpVLkOdEwmEbMetLPcmPDeTiLZcmPSSzUWhUOSGTi2AIPVC4zfMNgmQti6GThrU7c9uAgHTR0ULoZqqrrT4DX778raopNRSt4vGfa38a9PHzFm6GV0DU8LcqdmHOrclJyobMhHkthltZoxwTHde+chHUigU8XR6AeRE5ALk0bU2CZC2LoaN+1rRRXzpmzlLnc14iQTeum1N/PKfW6wNIUvUjKc3ZC0wcqHZZUKmgtzWbut3HwAEw/p0S+s6TlpdemcqFIpMUQIoBqckxEgB4jdMrh9fE3desrfmtvhGqnuUEjDik1fdukj77X/25KHcuWRT3BgBQ/J8/cd8LUaTSkcDyHXkWqprZiPIMy13lEirmzehJKt7UigUyVECKAKnBahxXytfPbcna2ZN5Ml1O5i34j0eX7WNeSsbwgtaOs7xTHwKsYvxHZcNs8xuERimTPn2b49Rv/sgGgLiUkrhx3/fxH8+Pshdl58GpK8BtFWziyTd4IJMBHk2JsJEWp3f4QVAoVC0HSWAInBagOwkxOoepfxmZQO+oIwqbzO0d7ec+kKcFuPpZw8AYZnd3LrACLWLSDa+LfAW1+1i5qKN+Bzq3dnMX7uDq84+kR5lnow0gLZodjaZCop0BXk2JsJEWp1H73TZCgpFQVACKIJkSYiJFrS6nftz5gtJthhPHzWAi4b1SrjYO5mw7PF8wdRv8HcsrmfmxUMy1gDaGi2WL19SNibCRFqdvv+9rOehUCgSowRQBPYCdNuiDehCw5Bm1ILutKDV9uueM19IqsU40WKfyITlNF4i1mxrIhA0Cq4BpCMosolKy9ZE6KTVrVypBJBCkQ+UbSEGy1AlLNtbRB3oRNn6NVVds64AEEs2b+3JCnk6jZeMD5oOO95LPhvQpaqCsLhuF2PuXc7Xn1jHmHuXs6RuV9pjT6nty5pZE/nLNaNYM2ti2iHnuapQoVAokqM0oBBNzT7qdx+0/CURJiu7EnJluTehzyPR9kzf3LN5a0+mNY3o1z0ugs8fNB1CESxq+3WnpqprwTWAZM+vrf41lVCqULRflAAionAlwtFfUr/7IOMGHw8kXtBit2dbvidTx34qrSm22sPFD7/i2IDvqtH9qanqmvQe85mg6XTNfPiHVJKpQtF+6PQCyKnSQCSmlFw7f31GVQPa+uaezlt75EKaSmuyx9uwcz8lLp2AcbRKgtel8cCXT2fyiL4Jx4f0BWouF/hc5xrlo+mdQqHInk4vgBr3tSLNxCHKAL6gmVHVAKfqBbmsEuC0kK6ZNTGr5FGQjD6pZ9Lxf3m2zk9eSi5QG/Yc4qEXt/LPTR9T4tYwJG1e4HOZa1To0kEKhSI1nT4IocyjJ82RsQkYMlTWJTWbdh2Iq7yczpt7pq0GIoMOgISOc3tcsISCK+JbNyWsadibdPzdB47g0pwFKsDPnn2H8x9cxbJ3PsaQ0OI329zV1J7zmJqeWQUSxJKvpncKhSJ7Or0GtOXjg2kf69QlNZamZh93P7c5bvst5w9O+qadyjxkm7YOtPoz8ovEjjt78lB0TSMY0oQChozSBBIl4/pjhLQtUJN1Y9UEWWl9+TCV5aPpnUKhaBudWgNaXLeLG/5Wl/bxibqkRuL0pg1w/wtbo0KII7WdZKHU9jztUORr56/nSBptppuafaza+ikzF0WPO2fp5qTaTKLQ7TsuG0qJW6P
Mq+NxacyebHVYTdaN1W8kLheUiFTPIlva0jBPoVDkh06rATXsOcRNC+oyOqdPRXbtpcFqCmdrGrFFMq8fX5NQqwGYuWgDvqAM73dpVvC
 								      "text/plain": [
 								       "<Figure size 432x288 with 1 Axes>"
 								      ]
 								     },
 								     "metadata": {
 								      "needs_background": "light"
 								     },
 								     "output_type": "display_data"
 								    }
 								   ],
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "housing.plot(kind=\"scatter\", x=\"longitude\", y=\"latitude\", grid=True)\n",
-												Sync notebook with book's code examples, and better identify extra code

											
										
										
											2022-02-19 06:17:36 +01:00
+								    "save_fig(\"bad_visualization_plot\")  # extra code\n",
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "plt.show()"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 32,
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAaAAAAEQCAYAAAD2/KAsAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/MnkTPAAAACXBIWXMAAAsTAAALEwEAmpwYAACxSklEQVR4nOz9eZhcV3qfCb7n7rEvua/YNxIkQBIsssgSiyxKllylvV2S3S23ZLul8TqaGXsky26Pxt3W2B7bbftx99M9ctv9qNttl6WyZEmlKtUqFmshixtAAiRA7LmvkbFH3P3MHzcjkZnITGQCiaWI+z6iChmZcePEjczznW/7fUJKSUxMTExMzL1Gud8LiImJiYl5OIkNUExMTEzMfSE2QDExMTEx94XYAMXExMTE3BdiAxQTExMTc1+IDVBMTExMzH1Bu98LuB26u7vl3r1778lrNZtNUqnUPXmtB5WH/R487O8f4nsA8T14++23F6WUPbt5ze9LA7R3717eeuute/Jar7zyCi+++OI9ea0HlYf9Hjzs7x/iewDxPRBCjO32NeMQXExMTEzMfSE2QDExMTEx94XYAMXExMTE3BfuuQESQqhCiNNCiC8sf/1PhBAXhBDvCSF+TwiRv9driomJiYm599wPD+iXgfOrvv4qcFxK+ThwEfi1+7CmmJiYmJh7zD01QEKIYeAzwP/aeUxK+RUppb/85evA8L1c02r8IKTl+vhBeL+WcEd8v68/Jibm4eJel2H/C+BXgMwm3/+LwH+8Z6tZxWLd5t3JKgCmrvDYYJ5cUr8fS7ktKi2Xc9NVgkCiquL7bv0xMTEPH+JezQMSQvwo8Gkp5V8VQrwI/C0p5Y+u+v7fBU4BPy03WJQQ4peAXwLo6+t76nOf+9yurc0PJeWWiwAUITBUBQSkTY1Go0E6nb7pOaEERdz5a+/WdRqOjyIEQoCUEEpJ2tyd80XnHuzWWr/f2Ox34GEivgfxPXjppZfellKe2s1r3ksD9A+BPw/4gAVkgd+VUv6cEOLngb8MvCylbN3qWqdOnZJ30ohquz4N1ydtaGiqwisX57ky1yCXNAilJJCSwVyCjx/o4o3vfntN89mdeBp+EOIGIYaq0HD8XfNYWq7Pm9eWKKbMlceWmg5PjObRVAVDVdDU24+2fv0bf4I5evz71rtafd9v5z487A2IEN8DiO+BEGLXDdA9C8FJKX+N5QKDVR7QzwkhfgT4VeCT2zE+d8rYYoMvvT+L54fomsKz+wpcn29wZb6BoSskTZ20qTKYS0Se0Cr8IOTcdBVLUzEtFccPODtd4dl9Xbfc2FYbLgG0/YCulLnj62yEoSqoqsDxA0wtup7tB5yeqIDkjoyGH4S0vYDcNt/znWz2d2ooNuJ2Dwyr1xITE3N3eBCkeP5HwAS+KoQAeF1K+ZfvxgvZrs+X3p8lbWqkMjofzlT4259/j5rj44eS7pRJd9qkO2Pys0+P3LQJukFIEEhMSwXA1FSajo8bhFtumOsNV9V2uTLVpO9gYkfX2QxNjXJW74wtMdFuYigKQhWkDG3FIN2ugXOXCxpM7dbv+U68w7uRw7rdA8P6tQRhPLY+JuZucF8MkJTyFeCV5X8fvFev23B9PD8kldFpex5vjJcptz0E4PshczUbQxMc7EuT0NWbnr+Rp6Gq4pan5PWGK21oSCFpOB65hLHt62xFpeXwzniZiXLkRHZnTJ7f34OWUu7IwHXWdKv3fCfe4Z08dytu58Cw0VoqXoB/m4eDmJiYzXmo/qLShoauKTRdj0bbp1R3MVQVIQRZy0BRo/+tND388OZS5o6nYfsBS00H2w94bDB/y41pteGCqOjhUG+GQModXWczbNfnj87O0HB8Rgop8pbGW9fL/PH707w7WaHUdG7bwGmqQkJXb/meVzb7VZ5SEMgVD2or7uS5W6EAXhiVpgPbMvQbraXzeExMzO7yIITg7hmWofGZ44P
80blpyi0HpKSYMqjZPq4fIEOJpigM5C00ZeNNKpfUeXZf145yFZqqcKw/y+mJMgoCQ1d4Zl8XKVPdlZxHw/WxvQBdVVGFoOGEFBIGoRQ0bY8LszU++9TNIcXtoirilu/5dr3DO33uZnTCaK4nOT1fZrSQJJvUb2noN1pL5/GYmJjd5aEyQAAjXUl+4eN7qbQ9+rIJXrkwT6XtIyXsKSY5MZLj2GCOpLH21qxPkO9kM6+0XM7P1hAIJHCsP7uS39iNsE7a0LB0lbrtYIuowMHUFfZ1J3h8JIfjSRLGzSHFnXCr99zxDs9OV2g6/koeZ7sGervP3U6hwuowWq5o0JXWaTgBp0YLWMbWv/IbrSWhq3H4LSbmLvDQGSCIPKF+Q+O/eHKEkWKSqaUmCw2Xvd1p+jImJ0YKazacOy29XtkMrSjfc362tuP8xlYbr2Vo/NjjQ/yn0xOMl5osNR36MwkUoXBxrsFwPnnXTvCr13U73mGH7Tx3u5/D+txP0tCxvZDtBtHWr+Xbkw9h81NMzD3goTRAHXJJnZeO9OIGIQoQwoab350kyG+3cm41N8JJISGSJ0YKdKXNNT8z0pXkL79wgMWmw/eulCi3PTRFIZQSyY0qrt0sdd7MINzudbfysnZSqLAbIb2derkxMTE756E2QHDrjSaU3JEBudPNsLPxen7IdMWm7flcnm/w2adGbjJClqFRBLozFgd6s3hBiK4qVNsubhBSabmcHi/jh5KUoXFipHDbpc53q3JtM3ZiyO8kHNh5b7vdjxQTE3MzD70B2ojVG5AiuCMDcqeboRuEuF5kfHRVIWVaLDZszkxU+OThnk1P/34YYuk31luzXf7D98ZZqLuoimAgZ2H7IZ862ntbm+xueHY7YaeGvBNG61TAJbeZA9vIq4uJibk7xAZoHRs1IT6xyoAI4FD/zVqqW52a7yQ3YqgKIZK255MyLdwgwDJulAZv5/R/rD/L22NLLNRdulImoZRMVdvUWi4nR7L0ZpPbfi+r17WblWu3es3bMeQ7lTvazKuLiYm5O8QGaBWbNSGmTJVn93WxUHe4OF/n4mydK2pjZUPbTnJ8JzmF9ZvxEyMFLs83WGzYWIbKaD6Fqm1eGrze4LVcn5bjI4Sg7fqMl1tcW2ygqgKE4L96Zi8jXZER2q4HcKee3Wq2W1ywE0N+OyHCzby6WAghJubuEBugVWy0AXUeN1SFK4uNm+RtTo0WdjUXstFm3JU2+exTI5yZqACgatxys+8YvErL5b2JChOVNo7rM11xmC7bqEIwXEiw1HT5z6cn+cUX9qOpyo48gDvx7Drs1FBs15DfTohwM69OxkVwMTF3hTjDuor1igWrmxA369ZvuP6udfGv3oyLKRNLUzk7XcEPQrrSJp883MPHD3Tx7L6umzyEjYbRda6XNDVOjXYxVExQbXvYfoiuqkwu2Vyca/Ddq4t8OFvf9D1u5QFoqkJyWVX8drhbKggbfZa3ChFupnQRExNzd4g9oFV0NqAzk2WWGg6moa5pQtzodJw2tF3Lhdzq1L7Z6X+zENbq65kaPLe/m2vzTZKGQ7nlk9BUvDAkZRh86/IiB3pS99wD2E4u6Xaq0m43RLgbXl1MTMz2iP+61iGRCAlSgFh18t/sdGwZ2m3pw23E7Zzat/Ka1l/Pk5K9PSmO9mVp2tHIBkNTeXK0iJQSOwjvuQdwK329Ssvl9Wsl3ry2xOvXSlRb3rav3TEmT+8rbug1brWmO/HqYmJitkfsAS3TCWG9N1EhaWoUUiaOHzC7Sgl5s9PxZo/v9OR+O6f2rbym5LJxXF3Bt6cryULNZrQrgSYEw8UUaUul5S1L+hjaPfcAtrp/d5pfixtKY2IeXGIDxI0QVsv2ubTQ4MmRIqbGipJAy/XJJgxg8w1t/eO3K9+z0xDQrUJYq6+nAK9eWqRq+zw5WuTsZIVK26XhWPzY44MrOmmbvce72aC50WvejV6juMk0JubB4aE3QJ1Ttq4oZBMGvif
5ygdTHB/IsdjyKHghb18v8+Se4m3pv93OyX07p/bVG+mtvKbO9Vquj6kpPDlSwAtDPn6gyFzN5hOHuimmrE2vD9s3qLu5we92r9HdGHo
 								      "text/plain": [
 								       "<Figure size 432x288 with 1 Axes>"
 								      ]
 								     },
 								     "metadata": {
 								      "needs_background": "light"
 								     },
 								     "output_type": "display_data"
 								    }
 								   ],
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								   "source": [
 								    "housing.plot(kind=\"scatter\", x=\"longitude\", y=\"latitude\", grid=True, alpha=0.2)\n",
-												Sync notebook with book's code examples, and better identify extra code

											
										
										
											2022-02-19 06:17:36 +01:00
+								    "save_fig(\"better_visualization_plot\")  # extra code\n",
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "plt.show()"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 33,
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "image/png": "iVBORw0KGgoAAAANSUhEUgAAArMAAAHoCAYAAABaRmeyAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/MnkTPAAAACXBIWXMAAAsTAAALEwEAmpwYAAEAAElEQVR4nOzdd3hURRfA4d9szaaHklBCr9KkCSJIVcACKjZsoGIXsRfsoljAggUV/bBSFStYEJQqSFOQJr0mISEkpG3fne+P2ZCEJKSQbALM+zxrNnfvnTt7E+LZuWfOCCklmqZpmqZpmnYqMlR1BzRN0zRN0zStvHQwq2mapmmapp2ydDCraZqmaZqmnbJ0MKtpmqZpmqadsnQwq2mapmmapp2ydDCraZqmaZqmnbJ0MKtpmqZpmnaaEELsFUJsFEKsF0KsDWyrIYRYIITYEfgak2//sUKInUKIbUKIQfm2dwm0s1MI8Y4QQgS2W4UQswPbVwkhGuc7ZmTgHDuEECOD9Z51MKtpmqZpmnZ66Sel7Cil7Br4/gngdyllC+D3wPcIIdoAw4G2wGDgfSGEMXDMB8AdQIvAY3Bg+yggXUrZHHgLeC3QVg3gOaA70A14Ln/QXJl0MKtpmqZpmnZ6uwz4PPD8c+DyfNtnSSldUso9wE6gmxCiLhAppVwp1epaXxx3TG5bc4ABgVHbQcACKWWalDIdWEBeAFypdDCraZqmaZp2+pDAb0KIdUKIOwLb4qSUSQCBr7GB7fWBA/mOPRjYVj/w/PjtBY6RUnqBDKDmCdqqdKZgnKSi1apVSzZu3Liqu1GsnJwcwsLCqrobZzT9M6ha+vpXLX39q5a+/lUrJyeH//77L1VKWbuq+tBcCGmvhHaTYDPgzLfpIynlR8ft1lNKmSiEiAUWCCH+O0GTooht8gTby3tMpTolg9nGjRuzdu3aqu5GsRYvXkzfvn2ruhtnNP0zqFr6+lctff2rlr7+VWvx4sX069dvX1X2wQ7cWQntPg/OfHmwRZJSJga+pgghvkPlryYLIepKKZMCKQQpgd0PAg3yHR4PJAa2xxexPf8xB4UQJiAKSAts73vcMYvL+BbLRacZaJqmaZqmVSCBGi2s6EeJ5xUiTAgRkfscGAhsAn4EcqsLjAR+CDz/ERgeqFDQBDXRa3UgFSFLCHFuIB92xHHH5LZ1FfBHIK92PjBQCBETmPg1MLCt0p2SI7OapmmapmnVlQDMVXPqOOC7QBUtEzBDSvmrEGIN8JUQYhSwH7gaQEq5WQjxFbAF8AL3Sil9gbbuBj4DbMAvgQfAVOBLIcRO1Ijs8EBbaUKIF4E1gf3GSSnTKvPN5tLBrKZpmqZp2mlASrkbOLuI7UeAAcUcMx4YX8T2tUC7IrY7CQTDRbz2CfBJ2Xp98nQwq2mapmmaVoFy0wy04NA5s5qmaZqmadopS39w0DRN0055mZmZpKSk4PF4qrorREVFsXXr1qruxmnLbDYTGxtLZGRkVXelWFWYM3tG0sGspmmadkrLzMwkOTmZ+vXrY7PZCEx+qTJZWVlERERUaR9OV1JKHA4HCQkJANU6oNWCRwezmqZp2iktJSWF+vXrExoaWtVd0SqZEILQ0FDq169PYmJitQ1mdc5scOlrrWmapp3SPB4PNputqruhBZHNZqsWKSXF0WkGwaUngGmapmmnvKpOLdCCS/+8tfz0yKymaZqmaVoF0mkGwaVHZjVN0zTtDPbZZ58RHh5+0u0sXrwYIQSpqakV0CtNK72gB7NCCKMQ4h8hxLzA9xOFEP8JIf4VQnwnhIgOdp80TdM0DUC63Ui3u6q7Ue01btyY119/vcC28847j6SkJGrWrFlFvao+cnNmK/qhFa0qRmbvB/IX4FsAtJNSdgC2A2OroE+apmnaGUpKiX3GDJLbtSPRZiPRZiO5TRvs06Yhpazq7p0yLBYLderU0fms5KUZVPRDK1pQg1khRDx
wCfC/3G1Syt+klN7At38B8cHsk6ZpmnbmklKSfuutHL3jDrybN4PfD34/3q1bOXrXXaSPGFFpAW3fvn256667uP/++4mJiSEmJoZHH30Uv98PQHp6OiNHjiQmJgabzcYFF1zA5s2bjx2fmx4wd+5cWrZsSUhICP369WP37t3H9nn++edp165dgfOWlFawa9cuLrvsMurUqUNYWBidO3dm3rx5Bfq9b98+Hn30UYQQx4LXotIMvv32W9q3b4/VaqVBgwaMHz++wPVs3LgxL730EnfeeSeRkZHEx8czceLEcl5R7UwV7JHZScBjgL+Y128Ffglab8opIR2mLIZ560F/aC/a0aM+7PbifsyapmnVg3POHJxff43MySn0mszJwfnddzhmz66080+fPh2/38/KlSuZMmUKH330EZMmTQLg5ptvZtWqVfzwww+sXr2a0NBQBg8ejMPhOHa8y+XihRde4NNPP2XlypX4fD6uuOKKkwrAs7Ozueiii1iwYAEbNmzgyiuvZNiwYfz333+AClDj4+N59tlnSUpKIikpqch21q1bx9VXX82wYcPYuHEjr776Kq+88grvvfdegf3eeust2rdvz99//83jjz/OY489xsqVK8vd/+pApxkEV9BGrYUQlwIpUsp1Qoi+Rbz+FOAFphdz/B3AHQBxcXEsXry40vp6Ih4fbE5UQeyO/TBtNzSoUXCf7OzsKutfdXDokI/ERDXY3ry5mcjI4GeznOk/g6qmr3/VOtOuf1RUFFlZWeU6Nvull4oMZHPJnBwyXnoJ3yWXlLpNn89Xqv74fD7i4uIYP348Qgjq16/PmDFjeOONN+jXrx8//vgjv/zyC506dQLg/fffp23btkydOpWRI0fidDrxer288sordOjQAYAPPviADh06MHfuXPr164fL5cLv9xfoj9PpBDi27fjvmzZtStOmTY/tP2bMGL7//numT5/OY489htlsxmAwYLFYCAsLO3as3W4H1O+f1Wrltddeo1evXjzyyCMADB06lE2bNvHqq69y8803q+srJf369WPkyJGACuAnTZrEzz//XGhE+XhOp7PI3/Ps7OwSr712eglmCkZPYKgQ4mIgBIgUQkyTUt4ohBgJXAoMkMV8nJRSfgR8BNC1a1fZt2/fIHW7oBtehhnbAKP63mIE18cF91m8eDFV1b+q5vdLrNadeAOJI+3aWdi4sVHQ+3Em/wyqA339q9aZdv23bt1aruVjpZRk5rttXxz/1q2Eh4UhDKX7YF7a5WyNRiPnnXdegVWs+vbty0svvcSBAwcwGAwMGDAAs1mNyUVERNC+fXt2795NREQEISEhGAwG+vbte2yftm3bUq9ePfbu3UtERARWqxWDwVCgPyEhIcfaK+r7nJwcXnjhBebNm0dSUhIejwen00mnTp2O7SOEwGq1Fmg3dwW28PBwIiIi2LlzJ5dcckmBfQYMGMCrr76KlJLIyEiEEHTp0qXAPvHx8WRkZJR4DUNCQo4F+vlVhw9yetGE4ApaMCulHEtgcldgZPaRQCA7GHgc6COltAerP+WRlgZffQGcw7FgNrocqyd6fWAyVmTPqg+DQVCzppGUFB8mEzRqVLUp63a7D78fwsNP0wuuaVpwBHlS04nSBMoywcpgMBRqq6SVsx555BF+/fVXXn/9dVq0aEFoaCgjRozAXcYqD1LKYvuaf3tuIJ7/tdy84VOZnrAVPNWhzux7QASwQAixXgjxYVV3qDgeDxj2AQcACSY/fHNv2doY/T+wXAetxsCR8t0VK7WjRyXPPefh0Ufd7NsXvD8Mf/xRn4svDmX48Ag++6xO0M57vFGjdhAV9RcxMX9x+eVbcbtP/T+OmqZVHCEElm7dStzP3KVLpc3QX7VqVYFg86+//qJevXq0adPmWC5trszMTDZu3EibNm2ObfP7/axZs+bY9/v37ycxMZGzzjoLgNq1a5OcnFzgHOvXrz9hn5YvX86IESO48sor6dChA/Hx8ezatav
APhaLBZ/Pd8J22rRpw/Llywu1HR8fX66RdE0rTpUEs1LKxVLKSwPPm0spG0gpOwYed1VFn/L7axN0Ggltroe5+f4dxsXBtbcDh8G4Ea5
 								      "text/plain": [
 								       "<Figure size 720x504 with 2 Axes>"
 								      ]
 								     },
 								     "metadata": {
 								      "needs_background": "light"
 								     },
 								     "output_type": "display_data"
 								    }
 								   ],
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "housing.plot(kind=\"scatter\", x=\"longitude\", y=\"latitude\", grid=True,\n",
 								    "             s=housing[\"population\"] / 100, label=\"population\",\n",
-												Sync notebook code examples with book

											
										
										
											2021-11-03 03:53:04 +01:00
+								    "             c=\"median_house_value\", cmap=\"jet\", colorbar=True,\n",
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "             legend=True, sharex=False, figsize=(10, 7))\n",
-												Sync notebook with book's code examples, and better identify extra code

											
										
										
											2022-02-19 06:17:36 +01:00
+								    "save_fig(\"housing_prices_scatterplot\")  # extra code\n",
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "plt.show()"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
-												Adding missing figure in chapter 02

											
										
										
											2017-06-08 14:23:33 +02:00
+								  {
 								   "cell_type": "markdown",
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Adding missing figure in chapter 02

											
										
										
											2017-06-08 14:23:33 +02:00
+								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "The argument `sharex=False` fixes a display bug: without it, the x-axis values and label are not displayed (see: https://github.com/pandas-dev/pandas/issues/10611)."
-												Adding missing figure in chapter 02

											
										
										
											2017-06-08 14:23:33 +02:00
+								   ]
 								  },
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								  {
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								   "cell_type": "markdown",
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "source": [
-												Clarify the 'not in the book' comments

											
										
										
											2021-11-21 04:40:36 +01:00
+								    "The next cell generates the first figure in the chapter (this code is not in the book). It's just a beautified version of the previous figure, with an image of California added in the background, nicer label names and no grid."
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 34,
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAosAAAHoCAYAAAAhYqV+AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/MnkTPAAAACXBIWXMAAAsTAAALEwEAmpwYAAEAAElEQVR4nOyddZgkxdnAf9Xd47Kut3u754YczuHuwV1CIB8WQiBIgISgIYFAcA6HBNfg7hJcDs5d1313fKa7vj9m1nXWbqV/z7PP7s50VVdVV1e//dYrQkqJiYmJiYmJiYmJSVcom7sBJiYmJiYmJiYmIxdTWDQxMTExMTExMekWU1g0MTExMTExMTHpFlNYNDExMTExMTEx6RZTWDQxMTExMTExMekWU1g0MTExMTExMTHpFlNYNDExMTExMTHpA0KIdUKIhUKIBUKI7xOfpQsh3hdCrEz8Tmtz/JVCiFVCiOVCiAPbfL5dop5VQoi7hBAi8blNCPFc4vNvhBDFbcqcnjjHSiHE6cPYbVNYNDExMTExMTFJgr2llHOllNsn/r8C+FBKOQ34MPE/QojZwInAHOAgYL4QQk2UuQ84G5iW+Dko8flvgTop5VTgduDmRF3pwDXATsCOwDVthdKhZtiFRSGEKoT4SQjxRuL/W4QQy4QQvwghXhZCpA53m0xMTExMTExM+skRwH8Sf/8HOLLN589KKcNSyrXAKmBHIUQe4JVSfiXjmVEe71Cmua4XgX0TWscDgfellLVSyjrgfVoFzCFnc2gWLwSWtvn/fWALKeVWwArgys3QJhMTExMTExOT3pDAe0KIH4QQZyc+y5FSlgEkfmcnPp8AbGxTdlPiswmJvzt+3q6MlDIGNAAZPdQ1LGjDdSIAIUQBcChwI3AxgJTyvTaHfA0c21s9ntQMmZlX2OnzaFTHThivTR+cBndANyQIUOOmBUNOVDewqCPfUiBqSDQBYpjGZSQS0Q2so+BaDSdjdUzGWr+klNTX+7HbrTgc1s3dnHHD5p5HMUOi9mPd3lBaRXVd42Zf7KcKIQNDUG8ZLAZCbT56UEr5YJv/d5VSlgohsoH3hRDLeqiuq3GSPXze3zJDzrAKi8AdwJ8ATzffnwk811slmXmFXPvoe11+t2hRKXnWBvLdEQ6c0oBdG5yxNKSktDFEnseOqgzPfbKpIUiux442TOfrL+VNYVIdGnZN7f3gMcq6ugBFqY5xLTC3RUrJ+vogxWnOzd2UQWdjfZB87/CtA0NNOBzl1Te+Z+aMfLbaomhzN2dcMBLujwpfmBR78uv2ridcMUQtSo4AcM4Q1HsthNrYInZCSlma+F0phHiZuP1ghRAiT0pZlthirkwcvgloq9kqAEoTnxd08XnbMpuEEBqQAtQmPt+rQ5lP+tHFfjFswqIQ4jCgUkr5gxBiry6+/wsQA57qpvzZxI1Bycgp6OoQAObMyUPKPOqCEZ5fuYFDJlaQ6WqvaZQSHlgwhY825LBHQSW/23YVva37AshwWqkJRMh221rromtxf6DU1DRRVlaP36Z127ac7BS83vaLjZSSNWsrMQwjqfOVVtpZsjqFuTPryEyLJFW2KRyjzqIOWKidWJiJzWYZUB2bg2F7tTMx2YwM1VpnMnoYSWudYPi1XUIIF6BIKZsSfx8AXA+8BpwO3JT4/WqiyGvA00KI24B84o4s30opdSFEkxBiZ+Ab4NfA3W3KnA58RXyn9SMppRRCvAv8vY1TywEMo9necI71rsDhQohDADvgFUI8KaU8NeECfhiwb8LYsxMJNfCDAJNmze12zgohEAKcLhthbRJPLteY0fghVtmqVX6lYh5Plm1BWFp5f20WP6+o4cTcz3rtgCHjW9EWdeiXzJgU6BKqejhm7YYaVNF5KCKGSOqmLqnwcvlt+yGQKIrkjiveIC0lmHSbB8r6jdWj9mEUjhms1sbO1uRgMFbHJKwbLFeVUTtXO2Ikltz
166uoqmrczK0ZP2zu+yOqS1RF9Koo6UhT0/A/G7pCAJtBtZADvJzYQdKAp6WU7wghvgOeF0L8FtgAHAcgpVwshHgeWEJcGXa+lLJZe3Ue8G/AAbyd+AF4BHhCCLGKuEbxxERdtUKIG4DvEsddL6WsHcrOtmXYhEUp5ZUkpOCEZvHShKB4EHA5sKeUg2uCYLNp1IRsZEwqwmONAaDrgpf+txdhZ9w2J2xYWRDemotnlgzmqdsRaPKhaho2h73PZXKL88iZmNvjMRtXbKC2vKb9hwK23GVrlCRsYe68W8WQGpGIwOOWuKf8iiOPHRq7z55Y8eMygr7BXYjqqmpIyUxDEWNPaOkri79eiNNppbAgY9jPHdENDAn2MSg0jiUyM1OIGgYOy/g1JTHpmpWrytA0lUnFcZ8NbRzPESnlGmDrLj6vAfbtpsyNxP00On7+PbBFF5+HSAibXXz3KPBocq0eHIZbi9sV9wA24oaiAF9LKc8dzBOkTZ3NnGIVIQTvve/Gv94WVwZroCHZd65g7p7b9rm+aAwsvY1cYs9G13Vqyipxely4U7wD6UYnJs4oYuKMgdsY7TJPoqpgsUgMCdttKzeL7d2UrWeiaYN73jWLllM0cwqqNhKm+vAjpWTpd4vxeBxsOWfisJ+/MRxDNyRpjtFnXjCeCER0/FGdLJfp4GLSnnXrqnA4rC3rh8M+MubI5tiGHs9sltd9KeUnUsrDEn9PlVIWJgJczh1sQRFgTXmrlszpNLAslrACRINkW2+AfxxV3qd6DAOO/FcR9tO3YN7VUwhGehBsEl8FfQF0Xe9VUGy7bSwlvPhfF9den85339u6LTNY7LiD5IN3I/ztuhj/+yzClCmDb5nSU42rVxvM2qIJu7uJ7LwmPv4kNujnT7ZNQ1Own3VsbkOhzX3+8cR4Gevx0k8TkzHCmBXM/UHBC2+4WL1uCrOzaxCiCYBddwlwym/qefr9VPJcUf5xWTl2S99Wrl822PlwsRsQLNlk44NFbn61bVP3bWhsIhIKY7P3LvC1FTv/eWsqN/49nUBQcOttqXz4Xgk77RjuUxv7y847SXbeaei2nnvSFx50qJ+16yRSQk2t5MijA6xc5iY7e2jfZfqtw2xTsN9G/8kU6mdDB+15nMT5x5oTxLD3ZwQPns9wUEcWMTn6tiHNedk3NKGTRhVuZWTYJfbEZrJZHLeMWWHxzkdSWLPBQkwX/P5OJwXZ69lvez93vZjBkwvTiGQIVgdt/OrPxZx5cB13XljWa535aVEEEiEkuhRMyW7vNSylJOQPUFsZtyO0Ox1YbVY8aSlJtf3Jpz34A3FBKRiE199w9SgsDsbC0VUdlZWClasUtpijk5JcF/pMU5Nkw8a4oNiMqsLiJcaQC4uDwWAv2IYhCQbB5Rp4zYLhV+CMpQcyjL3+9Bef4aBWyadgQj52m8UMETUGkVISCkfZVGIBo3RUCIwmw8fIfxr3k1Xr44IiQDCs8OkCF2U1GlfcmUuwWkGPCiSCQEjl0bfS+W6Zo9c6s1N0Pr9mDVcfXck7V6xldkGrACelpKGmDl+jj5yJ+eRMzCc9J7NXQfGXhYLDj7Rw/EkWNmyIf7bVlhGs1njoG6dTMmdOz6FsBmPZ7ljH198oTJ3p4rAjHEyb6WLt2qF5OLhcYO1gAhOJwoT8vp0vFpOsX28QCIz+fa2mJsn02THSsmLccdfwOxiZmHRHHVkUTMjHYbeaguIYRQiBw26lYEI+dWRt7ub0SrPN4mD/mHTNmBUWiUCLusqAhV/bmTJ5BpHvBCwCfqQlRnsoInj18745n2xdFOLqoyvZbUar47ah61RuKkNRFLLyc9A0DU3Tel1UQyHYez8rb76t8PIrCvsdFJea7runkiOP8DNjRoTLLqnjxON9SXZ+4FxzrQ2/X9DYKKirF9x979AYNSuK4Il/23E6IcULTidccpGV6dN73+ratMlg2iwfW8z1kTOhiXfeHbitY22tZK99/HjSGjnwED9
NTcMnhC74WVJRAboODz+SXJzMjsy/38I++zv45NPRt2VoMvKISRX7KIyBapI8dptlVJgaNG9DD/aPSdeMSUHaMMDYCGQJsALVgje+9SJ
 								      "text/plain": [
 								       "<Figure size 720x504 with 2 Axes>"
 								      ]
 								     },
 								     "metadata": {
 								      "needs_background": "light"
 								     },
 								     "output_type": "display_data"
 								    }
 								   ],
-												Make notebooks 1 to 9 runnable in Colab without changes

											
										
										
											2019-11-05 15:26:52 +01:00
+								   "source": [
-												Sync notebook with book's code examples, and better identify extra code

											
										
										
											2022-02-19 06:17:36 +01:00
+								    "# extra code – this cell generates the first figure in the chapter\n",
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "\n",
-												Make notebooks 1 to 9 runnable in Colab without changes

											
										
										
											2019-11-05 15:26:52 +01:00
+								    "# Download the California image\n",
 								    "filename = \"california.png\"\n",
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "if not (IMAGES_PATH / filename).is_file():\n",
-												Move datasets to project ageron/data to shrink this repo

											
										
										
											2022-02-19 09:36:43 +01:00
+								    "    homl3_root = \"https://github.com/ageron/handson-ml3/raw/main/\"\n",
 								    "    url = homl3_root + \"images/end_to_end_project/\" + filename\n",
-												Large change: replace os.path with pathlib, move to Python 3.7

											
										
										
											2021-10-15 10:46:27 +02:00
+								    "    print(\"Downloading\", filename)\n",
-												Clarify the 'not in the book' comments

											
										
										
											2021-11-21 04:40:36 +01:00
+								    "    urllib.request.urlretrieve(url, IMAGES_PATH / filename)\n",
-												Sync notebook code examples with book

											
										
										
											2021-11-03 03:53:04 +01:00
+								    "\n",
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "housing_renamed = housing.rename(columns={\n",
 								    "    \"latitude\": \"Latitude\", \"longitude\": \"Longitude\",\n",
 								    "    \"population\": \"Population\",\n",
 								    "    \"median_house_value\": \"Median house value (ᴜsᴅ)\"})\n",
 								    "housing_renamed.plot(\n",
 								    "             kind=\"scatter\", x=\"Longitude\", y=\"Latitude\",\n",
 								    "             s=housing_renamed[\"Population\"] / 100, label=\"Population\",\n",
-												Sync notebook code examples with book

											
										
										
											2021-11-03 03:53:04 +01:00
+								    "             c=\"Median house value (ᴜsᴅ)\", cmap=\"jet\", colorbar=True,\n",
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "             legend=True, sharex=False, figsize=(10, 7))\n",
-												Large change: replace os.path with pathlib, move to Python 3.7

											
										
										
											2021-10-15 10:46:27 +02:00
+								    "\n",
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "california_img = plt.imread(IMAGES_PATH / filename)\n",
 								    "axis = -124.55, -113.95, 32.45, 42.05\n",
 								    "plt.axis(axis)\n",
 								    "plt.imshow(california_img, extent=axis)\n",
-												Add fundamentals and training_linear_models notebooks

											
										
										
											2016-05-22 16:01:18 +02:00
+								    "\n",
-												Update chapters 1, 2 and 4

											
										
										
											2016-09-27 16:39:16 +02:00
+								    "save_fig(\"california_housing_prices_plot\")\n",
-												Add fundamentals and training_linear_models notebooks

											
										
										
											2016-05-22 16:01:18 +02:00
+								    "plt.show()"
 								   ]
 								  },
-												Add some section headers

											
										
										
											2021-10-02 13:14:44 +02:00
+								  {
 								   "cell_type": "markdown",
 								   "metadata": {},
 								   "source": [
 								    "## Looking for Correlations"
 								   ]
 								  },
-												Add fundamentals and training_linear_models notebooks

											
										
										
											2016-05-22 16:01:18 +02:00
+								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 35,
-												Fix hyperparameter search and comment at the end of the solution of exercise 5, chapter 2

											
										
										
											2018-01-14 09:11:47 +01:00
+								   "metadata": {},
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   "outputs": [],
 								   "source": [
 								    "# Restrict to numeric columns explicitly: since pandas 2.0, DataFrame.corr()\n",
 								    "# no longer drops non-numeric columns (e.g. ocean_proximity) and raises instead.\n",
 								    "corr_matrix = housing.select_dtypes(include=\"number\").corr()"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 36,
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "text/plain": [
 								       "median_house_value    1.000000\n",
 								       "median_income         0.688380\n",
 								       "total_rooms           0.137455\n",
 								       "housing_median_age    0.102175\n",
 								       "households            0.071426\n",
 								       "total_bedrooms        0.054635\n",
 								       "population           -0.020153\n",
 								       "longitude            -0.050859\n",
 								       "latitude             -0.139584\n",
 								       "Name: median_house_value, dtype: float64"
 								      ]
 								     },
 								     "execution_count": 36,
 								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "source": [
 								    "corr_matrix[\"median_house_value\"].sort_values(ascending=False)"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 37,
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "image/png": "iVBORw0KGgoAAAANSUhEUgAAA1AAAAJECAYAAAAYK8UIAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/MnkTPAAAACXBIWXMAAAsTAAALEwEAmpwYAAEAAElEQVR4nOz9d7yl11mfjV9P3b2d3svMmd6lUZdlWZLlhowLprhhIEAIEAh5CeH9ERJIIQkhL4GEgCmmuGPjXmTJ6nU0vffT+z67t6eu3x9rnzPnzIzkGUuyZfm5Pp+Z2bPLs5/d1lr3uu/7+1WEEAQEBAQEBAQEBAQEBAR8Z9Tv9wkEBAQEBAQEBAQEBAT8oBAEUAEBAQEBAQEBAQEBAddIEEAFBAQEBAQEBAQEBARcI0EAFRAQEBAQEBAQEBAQcI0EAVRAQEBAQEBAQEBAQMA1EgRQAQEBAQEBAQEBAQEB10gQQAUEBAQEBAQEBAQEBFwjQQAVEBAQEBAQEBAQEBBwjQQBVEBAQEBAQEBAQEBAwDUSBFABAQEBAQEBAQEBAQHXyHUHUIqi7FAU5X8rivINRVG6m9e9S1GUPa/86QUEBAQEBAQEBAQEBLx2uK4ASlGU+4EXgF7gHiDSvGk98O9f2VMLCAgICAgICAgICAh4bXG9Gaj/CPyGEOLdgL3q+seAm1+pkwoICAgICAgICAgICHgtcr0B1Dbg61e5Pge0vPzTCQgICAgICAgICAgIeO1yvQFUHlm+dzk3AFMv/3QCAgICAgICAgICAgJeu1xvAPVJ4A8VRekDBKArivJG4H8Af/9Kn1xAQEBAQEBAQEBAQMBrCUUIce13VhQD+FvgJwEF8Jv/fhL4iBDCexXOMSAgICAgICAgICAg4DXBdQVQKw9SlPXAHmQG65AQ4twrfWIBAQEBAQEBAQEBAQGvNb6rACogICAgICAgICAgIOCHEf167qwoyp+81O1CiH/58k4nICAgICAgICAgICDgtct1BVDAjsv+bwCbm8c5+IqcUUBAQEBAQEBAQEBAwGuU6wqghBBvuvw6RVHCwF8DT75SJxUQEBAQEBAQEBAQEPBa5BXpgVIUZSvwoBCi/+WfUkBAQEBAQEBAQEBAwGuT6/WBejHagfgrdKyAgICAgICAgICAgIDXJNcrIvEbl18FdAMfAL7+Sp1UQEBAQEBAQEBAQEDAa5HrNdIdvewqH1gEHgH+QAhRfgXPLSAgICAgICAgICAg4DXFD7UPVFtbmxgaGvp+n0ZAwMviwIEDWSFE+yt93OD3EfB6IPh9BAS8OMHvIyDgxXmp38f1ypi/rhgaGmL//v3f79MICHhZKIoy/mocd/n3IYTgwRNzTOXr3L2pg5GOOA8en+OLh6cZao3yc3eu48tHZ/jqkRkyMYN/dd8mtvemABhfqvLZ/ZMcGi9QqtuULZftvSl+5U0b+INvnOLAeJ54SOPGwQxPnM3iC0EibKAqgnzFpuG/Gq/s9U1Ig7ChUnyF3jxTUxhuj7GlM8EzF5dYLNuoCnQmTXrTMRYrFqW6TcPx8YVAAAoKg20RNnQksBzBbLHObKFO3fGIh3USYR3bFagKREyNquXiC9jWk2KwNcJUvkGhZrOpM8EDu3u5cTDDx54a5aFTcxTrLoMtUbb0JOlMhHnn7h7OL1T4kT99auWcf+ue9fzS/ZuB783v46b/+C2yNffVeJqXRFMgrKu4vsAX0Boz6UyFKNQd9vRn+Lk7hvnPXz/FdKFOa9ykLR4iW7GZyNVoi5nsGUpzbq7CQqlBueHSlQrzxz+5my3dqZd1Xo7n89WjM+SrDm/Z3kVvOrJyW932+MqRGRquR0vUZK7U4KahFnb1p1/muxHw3fBq/z4u58hkgRfGcmzsTLClO8nXj80SMTR2DSR44E+ewfEE92/tYKFsc2KmyEBLlD/5qT386SPnSYR1/tmdg3zob/ZTtVz+xd0jnJgp8vx
ojhsGMrx5awd/+OBZEiGdP/2p3fzqpw5Ttlx+8y0b+cwLU5yaLbG7P82Hbhvijx86S2vc5NfvWcfP/N1BHF/w6/eN8I/7p5nK19jWk2K4LcqXDs8Q0jV+8Y4B/r/HZBHW7etaeGEsj+MLUmENTVXJ1RwAfvHWfv7iuUkA3r2riy8dncMXkArraJpCrirvd9tgnGfHKwBs64pxYq4KSGGCsKFSc+T4vaND59iCHFs2ZlTO5uX1hgoo4HjyfR1JqZwvytv2dJkcmrMBiBoqluvjNXMl2ztjHJ+Xz/XGoQiPj9VXnnf1jBHRod4c0oZjMFq9+uccUsFqPvDW/ijPTdbk598SYbbYwPIEhgojHTFONV/j+2/q4ZMvzADQmwoxX7ZwfdBVGGyJciErj/HLd/bzf56S7+XewRQHxosI5Lh3/7YOvnl8AV1T+LOf2sWvf/YYri/4zTdv5H88dJaG69MSM7hzfRtfOzZL2ND41/eN8PtfPwPAj+3pxkPl+HSRd+7q4Y8eOrvymjYD3/yv7wBe+vfxHTNQ38k8dzU/aEa6e/fuFUEAFfCDjqIoB4QQe1/p4y7/PpYqFn//rBxDetMR3ntjH7/1uaNM5msYmsK79/TxyOl5Dk8WiId03rKti9+4fxMAXzg0xVePzHB6rsxS1cZQVVIRg/fc0MvfPztGqeGiqQoq4AmxMoj6Qv4JeG0QNzUipsZS1V75XHQFQoaKAtRsf83kqwCGBgOtMeq2R6G2HGCBrioIZGDm+IKQpmJ7HoamETU1hlpjzJcbeL6gOxXh3i2d/Pwbhvnpj+1jYqlGqeHQmQgx0BJjW2+Kt2zrYt/oEv/hKyfXnPPYpQnwVf19TC5VecMfPvZKH/67QlUgamooQH8mykhnjP1jBfI1G1NXiZsauZqDLwQKCj3pCMW6Q6kur4uaGu+/ZYDffvvW7/hctutjaAqKolxx2/hSlX86OA3Apq4Eb9/RvXLbiZki3zoxj+cLpvI1BltjcmH8hnWv2PsQcO282r+Py/mrJy9SbsiV+fa+JMenSgAcmsjxxLklAEKagusLPCG/0z975yDHpmSHSCaq88jpRQA6k2HKDRfP91EUheG2GBcXZVCyvTfF8ekiACMdCU7OlhBCoKoKt69v48ycfN6WmMmJGXm5LaaTrcpzUxUFQ1OoNwMZDfBe6Tcp4LsmpClYzaiwNWaSr8mAMRNRWapdmo0ihrryGYY0sJofogZ0pcMAJEIGp+fXdiBdy/xxLSp8O67xz/ZrOFZAQMAPGKmIQVcqjKLAhs44mqqwuz+Npip0pyLsGUwz3BYjbGjEwwY3D7esPHZDR4LOZJhk2KA1amLqKt3pMLeta6UzKY9paiobuxIoyMlS11SihlwEBlw/KmC+UvqqzeMlIjrrO2JEDG3l+lhIoyMZJmzqhHQFTZU7gyryc8xETdoTITqTYTLRECFdRVdl0JWK6JiGRjykk4zoxEwdU1Poa4nQl4nQlQyTjBh0p8Js6oqjayrbe1MkwzrJsEFLPER/a5R4SKc3E2FjV2LNOf/I1le8IulF6UqFCevfv2+r0vyjAhFTpSVmEjI0WuMmD+zsIRkxCBsamYhBezJMezyEpqokIwbr26OkowbRkIahKSTCBndt7PiOz3lipsifPXaejz83juVeuazsSITJRA00VWFDx1qB3r6M/NzChsquvjQAGzsTVxwj4PXJ8me9rj3G5s4khqYQD+l85PYh1ObPaENHHEOTg5iqKNw10oGuKiTDOu+/ZYCQoaEoCm/Y0Mb65vdruC3GvVs6UBSFiKnz4VsHCDfv96bN7XQlQwgBvakId21oQ1EUkhGDD986iK7KjYB7tnSRihgAdKVC9GVk5lRR4I716ZXX0JcOrVxWFbnpt8zNg8mVy1s6wiuXDZU1c1rKvHQ5zFpWD9+RVZdDvDirb0usqi27fGRa9bQkjZc44HdBfNWJR3Rl5bkVZCCzzLqW8Jr7rSYZvjTH7OiKrlzORNY
WzG1qjvmqAj9zW7/ciFUU3r6zb+U+hgJbuhMoity4e8eqjZwdPXGGWmMA3DiYvq7XucwPdQ9UkIEKeD3wvdhBFELg+mJlUgNoOB6mpqK
 								      "text/plain": [
 								       "<Figure size 864x576 with 16 Axes>"
 								      ]
 								     },
 								     "metadata": {
 								      "needs_background": "light"
 								     },
 								     "output_type": "display_data"
 								    }
 								   ],
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "source": [
-												Provide workaround and explanations about the breakage of LabelBinarizer by Scikit-Learn 0.19.0

											
										
										
											2017-09-15 14:40:13 +02:00
+								    "from pandas.plotting import scatter_matrix\n",
-												Merge PR #36 by lsshawn: adds helpful notes and fixes the null rows sampling code

											
										
										
											2017-06-08 17:11:08 +02:00
+								    "\n",
 								    "attributes = [\"median_house_value\", \"median_income\", \"total_rooms\",\n",
 								    "              \"housing_median_age\"]\n",
 								    "scatter_matrix(housing[attributes], figsize=(12, 8))\n",
-												Sync notebook with book's code examples, and better identify extra code

											
										
										
											2022-02-19 06:17:36 +01:00
+								    "save_fig(\"scatter_matrix_plot\")  # extra code\n",
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "plt.show()"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 38,
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAaAAAAEQCAYAAAD2/KAsAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/MnkTPAAAACXBIWXMAAAsTAAALEwEAmpwYAAEAAElEQVR4nOz9ebBtWZ7Xh33WWns+853ffWPONWV3U109IMA0AjcKS2FwWAjCstU2ROCwHQxGCgOehJFbQiGDHBIh2dhGAuxGYDkUaoeEUTNU0E031V1zZWVVzm+683DGPe+1lv9Y+5y87+XLzPeysiozq+4v4sV9Z5+z91lnD+u3fr/f9/f9Cmstl3Zpl3Zpl3ZpP2iTH/UALu3SLu3SLu1H0y4d0KVd2qVd2qV9JHbpgC7t0i7t0i7tI7FLB3Rpl3Zpl3ZpH4ldOqBLu7RLu7RL+0js0gFd2qVd2qVd2kdi3kc9gI+zbWxs2Fu3bq1ep2lKp9P56Ab0BPZJGesnZZzwyRnrJ2Wc8MkZ6+U4P7h95StfObXWbj7yTWvt5b93+feTP/mT9qL943/8j+0nxT4pY/2kjNPaT85YPynjtPaTM9bLcX5wA75s32WOvUzBXdqlXdqlXdpHYpcO6NIu7dIu7dI+Ert0QJd2aZd2aZf2kdilA7q0S7u0S7u0j8R+oA5ICHFbCPEtIcTXhRBfbretCSF+RQjxWvt3dOHzf14I8boQ4hUhxO+/sP0n2+O8LoT4D4QQot0eCiH+Trv9S0KIWxf2+YX2O14TQvzCD/BnX9qlXdqlXdoj7KOAYf8ea+3phdd/DviH1tq/JIT4c+3rPyuE+AzwR4DPArvAPxBCPG+t1cB/DPxx4J8B/zXwLwB/D/hjwNha+6wQ4o8A/y7wh4UQa8C/CXwBsMBXhBC/bK0dfz9+oDEWbS1KCKQU73iv1gYAX8kH3l/uZ7WlwRJIiee9+xph+XlhQbes5koIam0w1tI0ZrX/8nubxlAbgxSCJPDwPElVaeZ1jbKCyFM0uGN6nkRYKBpNow2V0VS1oeN7KE+S1TV53hBHHoFSbiy4E1w2DWleozwBBpCC0JNUlUH5glB6gCWvGn7rjQOSyIfGMjc1gZH0uyG+UighqIymLDReIKgqQ+ArfCXJ8hoRCGLhEYYeXd8HAeOsYJ5XRIHC9z1sY8iqBuUJBAIpBLHyiCIPqy2lNUhryWpNIAUNYIyhqBuwgq1eTNFoylpzOF6QNZqm1gS+wvMUoZDMqoqibOiGPkiBNoZGG+rG4PkSD0HZaCyQBB7GQt1od59INyZPKXqBT2UNTW1IIg+jYV6XFIVmrRPi+x5CQKMNZa3RGE6mGed5xVYvYq0bkVcNX7t7RCAVCoHyJdoYjIZBHGCVQJea3GiaRiOEYJREDKOQTDdMFgW1MfieRFqB8iShp7DtfaSEIPDdtSlqTakbrIHIV1SNAQHDOCQOPLS1GGPJiprTNKexhq4fsNaJEEBRa37ttft0A59+FBKHPpFUzMuKs0WO8iWjMAQpEBaEEpRVwzgrqLSmo3yG/QgfyaKukQjWuhFSCLKqwQqLsILIVygp8ZX7Z4V7VoyxZFVDpTVaW3xPEiqFlO7ZVUK4Z9JCXjbuOTIGYyyekkSeotaGrGnwhSQJPIRyY7WCB/7W2lBrg3LrZbS1q/HU2qCte+atcOd5+f3L/XwlCX212vfh594+ONU8cs4Rlgfmlnebqy5u/0evvcWXvrPgZz7d5fe98PT7zn9PYh+HPqA/APxc+/+/AXwR+LPt9v/MWlsCbwkhXgd+WghxG+hba38DQAjxN4E/iHNAfwD4C+2x/nPgr7bR0e8HfsVae97u8ys4p/W3P+wfU9Saw2mBsRYpBDuDiKi9aYpac+c05XheArDVD7m53iHy1Wq/aV7x+vGCfuQTB4rPXR0wTIJ3/Z68bjicFJhWVSOrG2Z5Q5LX/JPXTvj8zRGRr7h
zlvLm8ZzvHMyYFTWjTsCntvvc2ujwm7fPuX2aUjWabuix1g1pjGWzGzLLa04WBW8epxzMcnwp8aQg9iX70wLj7muGnYBQQmWhrjVH84JGa/LaoiR4ErSFQLlbrhd7TLKSP/pszZ/4R19d/S4DKCD0YJD4CAF53WC1JW/cdiHAGhAStIZuKNke9bg2CMmqhteOU9KyRhtDJ/Ao2om/ri2BAt/3GHZC1js+jQGJZW+aE0nFtCgJPMWiqtEaIl/iKUGgPP4Ht3L+d3/1V/GVpGg0nlCMEo+ytiyqCm0AAbGvqKuG0rrfLCxYC8aAVKCkmzAabd33S7etH/sEnsCTCgVowBOSg1mGFBJfSm5udhHWMM8b5lXFybQmM2/fFyHwJ3+s4U/+R19GAr4H0oKWEHkKX0E38piXmqJqqGoIfNjpx+yOYuZZw96sIC8rhABfCpLQI/B9d+6NJPAlg9hDSMEsazhZlPhCUFs3SXZCj6c3uvzU02sMopDvHE34te8ec3+WUVaGtU7IM1tdxouK37u24C//w28ggGEs2elHKCU4npXMsgYEhJ5g1IlAWhptmKQVaQUNEAnodhQhYKXAV4r1Tsgw8Sgay3laMYg8OqHHs1tdro5itvsJNzcSam24fZrynb0prx7P0drSS3yu9COujmI2+zEKMFaQZxX/r9+8zXhRcp43VJVmaxDRDRXTrGGSVYS+4qn1hM9eH1A30Is85kVDN1QcTQuO5gWTtKZoGgQC33P3TzcIKI1u7xFDL/ZpNCSBW9SdpyVpqRnEAS/sdOmFPsZaDqYFFosFFG6uqRpDUetHzjlpWTHOGja6IXGgeHarS1Gbd8xVF+ewP/NLv8nXD3IA/m//DH7mxh5/53/+u77XaXJlP2gHZIH/Rghhgf+rtfavAdvW2gMAa+2BEGKr/exVXISztPvttrr9/8Pbl/vca4/VCCGmwPrF7Y/Y50MzYyyH0wJfudVsow2H04IbawkA+5OcSV7Ri9xpH6cVgZJcHyUcTguEtdw9y4h9t9oMlOClvSk/+9T6A5HQ8nuUgHnRkFYNQrjV0CsHc3b6EZ4UpGXD1++NudqPOUkLDmcFi1ITKIWUkjsnKb/11glR4LPZC3nzZMHBdMG1UnNrs8M37o4x1pDmNQeTjMYYtLQUecl50TAIFUhFXjZMi4J+6COs4ThtaGpoLFSAwDkVA/iioRvCndOGBjcxNw+fR6BuIJ/VqxvU4C58qd1NZIGgPbaxBjWeMZ4rZlmDH0p0YyhrmJQNYbu/BfIGurphbAwnk5TNXsh5VuFLy34JUsBJahC4h8PUhsxCx9MYY5ikBokh9KGyDfO8oTHOwXoS0gbGeUPQjle1f5cmjBsHWFQ7JqPdb6mqGiEhVNCNAxZZRVpDP4QGTVlpvrs/phcqqkYzyyF/SM6rBBddLV+3J1dpKGuNAk4WmkBCYcAHdAPHk5y985ygdfBF5a6fLy15XaNkTeAJun6AZwSTzE1QRW2Ifcl5pqkbQxwq1rsBb52l5GXFp68O+NIbpxxMM3RjCZRglpZ85c2SRQX//MhdwwY4yw1VlVFb0AZCH7IaUm1ZlDndEGblg/dLYaFYaHxgEIEKLW+dlMQ+dOMIJeBsoWmM5s6Zi8IbbenFiv1pzluHc44XJdZYKm2Y5hVlpd3njMFagZQwsJa7pxmLoqZsDL3Y43CaM1tUSE/w9GaPaV7z1mmKtpbPXR1w9yxjqxdw5zzjdFYwLxskgqNpAUJwcz3hJK149WjBM5tdAik5npeczkteuNLnaJZzPCsIPcXOMKJuDF+5M+azu318T5KWjYtarHNCi3YeeNScE3uSN08q9yxZiy/hn75+yhdujuiE3mquujaMV3PYr715b+V8lvaluzP+wStvfmiR0A/aAf0Oa+1+62R+RQjx3ff47KMCSvse2z/oPg9+qRB/HJfeY3t7my9+8Yur9xaLxQOvH3XAujHvCGXfap1H1Ri0sSzfNhamUvCWkjRtiKx
rjZCCwlrOpERbyz+573Exk7f8HiHcMU2bIrAWbjQafywxukQcvswMyJVEG0uv1jxvLcKATN0BO9rgVQLVCHqeoelZPCGIp4pPCQ0CdGL
 								      "text/plain": [
 								       "<Figure size 432x288 with 1 Axes>"
 								      ]
 								     },
 								     "metadata": {
 								      "needs_background": "light"
 								     },
 								     "output_type": "display_data"
 								    }
 								   ],
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "source": [
-												Merge PR #36 by lsshawn: adds helpful notes and fixes the null rows sampling code

											
										
										
											2017-06-08 17:11:08 +02:00
+								    "housing.plot(kind=\"scatter\", x=\"median_income\", y=\"median_house_value\",\n",
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "             alpha=0.1, grid=True)\n",
-												Sync notebook with book's code examples, and better identify extra code

											
										
										
											2022-02-19 06:17:36 +01:00
+								    "save_fig(\"income_vs_house_value_scatterplot\")  # extra code\n",
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "plt.show()"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
-												Add some section headers

											
										
										
											2021-10-02 13:14:44 +02:00
+								  {
 								   "cell_type": "markdown",
 								   "metadata": {},
 								   "source": [
 								    "## Experimenting with Attribute Combinations"
 								   ]
 								  },
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 39,
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "outputs": [],
 								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "housing[\"rooms_per_house\"] = housing[\"total_rooms\"] / housing[\"households\"]\n",
 								    "housing[\"bedrooms_ratio\"] = housing[\"total_bedrooms\"] / housing[\"total_rooms\"]\n",
 								    "housing[\"people_per_house\"] = housing[\"population\"] / housing[\"households\"]"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 40,
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "text/plain": [
 								       "median_house_value    1.000000\n",
 								       "median_income         0.688380\n",
 								       "rooms_per_house       0.143663\n",
 								       "total_rooms           0.137455\n",
 								       "housing_median_age    0.102175\n",
 								       "households            0.071426\n",
 								       "total_bedrooms        0.054635\n",
 								       "population           -0.020153\n",
 								       "people_per_house     -0.038224\n",
 								       "longitude            -0.050859\n",
 								       "latitude             -0.139584\n",
 								       "bedrooms_ratio       -0.256397\n",
 								       "Name: median_house_value, dtype: float64"
 								      ]
 								     },
 								     "execution_count": 40,
 								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "# Numeric columns only: pandas >= 2.0 raises in corr() on string columns\n",
 								    "# (e.g. ocean_proximity) instead of silently skipping them.\n",
 								    "corr_matrix = housing.select_dtypes(include=\"number\").corr()\n",
 								    "corr_matrix[\"median_house_value\"].sort_values(ascending=False)"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
 								  {
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								   "cell_type": "markdown",
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "# Prepare the Data for Machine Learning Algorithms"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "Let's revert to the original training set and separate the target (note that `strat_train_set.drop()` creates a copy of `strat_train_set` without the column; it doesn't modify `strat_train_set` itself unless you pass `inplace=True`):"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 41,
-												Fix hyperparameter search and comment at the end of the solution of exercise 5, chapter 2

											
										
										
											2018-01-14 09:11:47 +01:00
+								   "metadata": {},
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "outputs": [],
 								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "housing = strat_train_set.drop(\"median_house_value\", axis=1)\n",
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								    "housing_labels = strat_train_set[\"median_house_value\"].copy()"
 								   ]
 								  },
-												Add some section headers

											
										
										
											2021-10-02 13:14:44 +02:00
+								  {
 								   "cell_type": "markdown",
 								   "metadata": {},
 								   "source": [
 								    "## Data Cleaning"
 								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
 								   "metadata": {},
 								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "In the book 3 options are listed to handle the NaN values:\n",
-												Add some section headers

											
										
										
											2021-10-02 13:14:44 +02:00
+								    "\n",
 								    "```python\n",
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "housing.dropna(subset=[\"total_bedrooms\"], inplace=True)    # option 1\n",
 								    "\n",
-												Add some section headers

											
										
										
											2021-10-02 13:14:44 +02:00
+								    "housing.drop(\"total_bedrooms\", axis=1)       # option 2\n",
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "\n",
-												Add some section headers

											
										
										
											2021-10-02 13:14:44 +02:00
+								    "median = housing[\"total_bedrooms\"].median()  # option 3\n",
 								    "housing[\"total_bedrooms\"].fillna(median, inplace=True)\n",
 								    "```\n",
 								    "\n",
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "For each option, we'll create a copy of `housing` and work on that copy to avoid breaking `housing`. We'll also show the output of each option, filtered down to the rows that originally contained a NaN value."
-												Add some section headers

											
										
										
											2021-10-02 13:14:44 +02:00
+								   ]
 								  },
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 42,
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "text/html": [
 								       "<div>\n",
 								       "<style scoped>\n",
 								       "    .dataframe tbody tr th:only-of-type {\n",
 								       "        vertical-align: middle;\n",
 								       "    }\n",
 								       "\n",
 								       "    .dataframe tbody tr th {\n",
 								       "        vertical-align: top;\n",
 								       "    }\n",
 								       "\n",
 								       "    .dataframe thead th {\n",
 								       "        text-align: right;\n",
 								       "    }\n",
 								       "</style>\n",
 								       "<table border=\"1\" class=\"dataframe\">\n",
 								       "  <thead>\n",
 								       "    <tr style=\"text-align: right;\">\n",
 								       "      <th></th>\n",
 								       "      <th>longitude</th>\n",
 								       "      <th>latitude</th>\n",
 								       "      <th>housing_median_age</th>\n",
 								       "      <th>total_rooms</th>\n",
 								       "      <th>total_bedrooms</th>\n",
 								       "      <th>population</th>\n",
 								       "      <th>households</th>\n",
 								       "      <th>median_income</th>\n",
 								       "      <th>ocean_proximity</th>\n",
 								       "    </tr>\n",
 								       "  </thead>\n",
 								       "  <tbody>\n",
 								       "    <tr>\n",
 								       "      <th>14452</th>\n",
 								       "      <td>-120.67</td>\n",
 								       "      <td>40.50</td>\n",
 								       "      <td>15.0</td>\n",
 								       "      <td>5343.0</td>\n",
 								       "      <td>NaN</td>\n",
 								       "      <td>2503.0</td>\n",
 								       "      <td>902.0</td>\n",
 								       "      <td>3.5962</td>\n",
 								       "      <td>INLAND</td>\n",
 								       "    </tr>\n",
 								       "    <tr>\n",
 								       "      <th>18217</th>\n",
 								       "      <td>-117.96</td>\n",
 								       "      <td>34.03</td>\n",
 								       "      <td>35.0</td>\n",
 								       "      <td>2093.0</td>\n",
 								       "      <td>NaN</td>\n",
 								       "      <td>1755.0</td>\n",
 								       "      <td>403.0</td>\n",
 								       "      <td>3.4115</td>\n",
 								       "      <td>&lt;1H OCEAN</td>\n",
 								       "    </tr>\n",
 								       "    <tr>\n",
 								       "      <th>11889</th>\n",
 								       "      <td>-118.05</td>\n",
 								       "      <td>34.04</td>\n",
 								       "      <td>33.0</td>\n",
 								       "      <td>1348.0</td>\n",
 								       "      <td>NaN</td>\n",
 								       "      <td>1098.0</td>\n",
 								       "      <td>257.0</td>\n",
 								       "      <td>4.2917</td>\n",
 								       "      <td>&lt;1H OCEAN</td>\n",
 								       "    </tr>\n",
 								       "    <tr>\n",
 								       "      <th>20325</th>\n",
 								       "      <td>-118.88</td>\n",
 								       "      <td>34.17</td>\n",
 								       "      <td>15.0</td>\n",
 								       "      <td>4260.0</td>\n",
 								       "      <td>NaN</td>\n",
 								       "      <td>1701.0</td>\n",
 								       "      <td>669.0</td>\n",
 								       "      <td>5.1033</td>\n",
 								       "      <td>&lt;1H OCEAN</td>\n",
 								       "    </tr>\n",
 								       "    <tr>\n",
 								       "      <th>14360</th>\n",
 								       "      <td>-117.87</td>\n",
 								       "      <td>33.62</td>\n",
 								       "      <td>8.0</td>\n",
 								       "      <td>1266.0</td>\n",
 								       "      <td>NaN</td>\n",
 								       "      <td>375.0</td>\n",
 								       "      <td>183.0</td>\n",
 								       "      <td>9.8020</td>\n",
 								       "      <td>&lt;1H OCEAN</td>\n",
 								       "    </tr>\n",
 								       "  </tbody>\n",
 								       "</table>\n",
 								       "</div>"
 								      ],
 								      "text/plain": [
 								       "       longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \\\n",
 								       "14452    -120.67     40.50                15.0       5343.0             NaN   \n",
 								       "18217    -117.96     34.03                35.0       2093.0             NaN   \n",
 								       "11889    -118.05     34.04                33.0       1348.0             NaN   \n",
 								       "20325    -118.88     34.17                15.0       4260.0             NaN   \n",
 								       "14360    -117.87     33.62                 8.0       1266.0             NaN   \n",
 								       "\n",
 								       "       population  households  median_income ocean_proximity  \n",
 								       "14452      2503.0       902.0         3.5962          INLAND  \n",
 								       "18217      1755.0       403.0         3.4115       <1H OCEAN  \n",
 								       "11889      1098.0       257.0         4.2917       <1H OCEAN  \n",
 								       "20325      1701.0       669.0         5.1033       <1H OCEAN  \n",
 								       "14360       375.0       183.0         9.8020       <1H OCEAN  "
 								      ]
 								     },
 								     "execution_count": 42,
 								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "null_rows_idx = housing.isnull().any(axis=1)\n",
 								    "housing.loc[null_rows_idx].head()"
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 43,
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "text/html": [
 								       "<div>\n",
 								       "<style scoped>\n",
 								       "    .dataframe tbody tr th:only-of-type {\n",
 								       "        vertical-align: middle;\n",
 								       "    }\n",
 								       "\n",
 								       "    .dataframe tbody tr th {\n",
 								       "        vertical-align: top;\n",
 								       "    }\n",
 								       "\n",
 								       "    .dataframe thead th {\n",
 								       "        text-align: right;\n",
 								       "    }\n",
 								       "</style>\n",
 								       "<table border=\"1\" class=\"dataframe\">\n",
 								       "  <thead>\n",
 								       "    <tr style=\"text-align: right;\">\n",
 								       "      <th></th>\n",
 								       "      <th>longitude</th>\n",
 								       "      <th>latitude</th>\n",
 								       "      <th>housing_median_age</th>\n",
 								       "      <th>total_rooms</th>\n",
 								       "      <th>total_bedrooms</th>\n",
 								       "      <th>population</th>\n",
 								       "      <th>households</th>\n",
 								       "      <th>median_income</th>\n",
 								       "      <th>ocean_proximity</th>\n",
 								       "    </tr>\n",
 								       "  </thead>\n",
 								       "  <tbody>\n",
 								       "  </tbody>\n",
 								       "</table>\n",
 								       "</div>"
 								      ],
 								      "text/plain": [
 								       "Empty DataFrame\n",
 								       "Columns: [longitude, latitude, housing_median_age, total_rooms, total_bedrooms, population, households, median_income, ocean_proximity]\n",
 								       "Index: []"
 								      ]
 								     },
 								     "execution_count": 43,
 								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "housing_option1 = housing.copy()\n",
 								    "\n",
 								    "housing_option1.dropna(subset=[\"total_bedrooms\"], inplace=True)  # option 1\n",
 								    "\n",
 								    "housing_option1.loc[null_rows_idx].head()"
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 44,
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "text/html": [
 								       "<div>\n",
 								       "<style scoped>\n",
 								       "    .dataframe tbody tr th:only-of-type {\n",
 								       "        vertical-align: middle;\n",
 								       "    }\n",
 								       "\n",
 								       "    .dataframe tbody tr th {\n",
 								       "        vertical-align: top;\n",
 								       "    }\n",
 								       "\n",
 								       "    .dataframe thead th {\n",
 								       "        text-align: right;\n",
 								       "    }\n",
 								       "</style>\n",
 								       "<table border=\"1\" class=\"dataframe\">\n",
 								       "  <thead>\n",
 								       "    <tr style=\"text-align: right;\">\n",
 								       "      <th></th>\n",
 								       "      <th>longitude</th>\n",
 								       "      <th>latitude</th>\n",
 								       "      <th>housing_median_age</th>\n",
 								       "      <th>total_rooms</th>\n",
 								       "      <th>population</th>\n",
 								       "      <th>households</th>\n",
 								       "      <th>median_income</th>\n",
 								       "      <th>ocean_proximity</th>\n",
 								       "    </tr>\n",
 								       "  </thead>\n",
 								       "  <tbody>\n",
 								       "    <tr>\n",
 								       "      <th>14452</th>\n",
 								       "      <td>-120.67</td>\n",
 								       "      <td>40.50</td>\n",
 								       "      <td>15.0</td>\n",
 								       "      <td>5343.0</td>\n",
 								       "      <td>2503.0</td>\n",
 								       "      <td>902.0</td>\n",
 								       "      <td>3.5962</td>\n",
 								       "      <td>INLAND</td>\n",
 								       "    </tr>\n",
 								       "    <tr>\n",
 								       "      <th>18217</th>\n",
 								       "      <td>-117.96</td>\n",
 								       "      <td>34.03</td>\n",
 								       "      <td>35.0</td>\n",
 								       "      <td>2093.0</td>\n",
 								       "      <td>1755.0</td>\n",
 								       "      <td>403.0</td>\n",
 								       "      <td>3.4115</td>\n",
 								       "      <td>&lt;1H OCEAN</td>\n",
 								       "    </tr>\n",
 								       "    <tr>\n",
 								       "      <th>11889</th>\n",
 								       "      <td>-118.05</td>\n",
 								       "      <td>34.04</td>\n",
 								       "      <td>33.0</td>\n",
 								       "      <td>1348.0</td>\n",
 								       "      <td>1098.0</td>\n",
 								       "      <td>257.0</td>\n",
 								       "      <td>4.2917</td>\n",
 								       "      <td>&lt;1H OCEAN</td>\n",
 								       "    </tr>\n",
 								       "    <tr>\n",
 								       "      <th>20325</th>\n",
 								       "      <td>-118.88</td>\n",
 								       "      <td>34.17</td>\n",
 								       "      <td>15.0</td>\n",
 								       "      <td>4260.0</td>\n",
 								       "      <td>1701.0</td>\n",
 								       "      <td>669.0</td>\n",
 								       "      <td>5.1033</td>\n",
 								       "      <td>&lt;1H OCEAN</td>\n",
 								       "    </tr>\n",
 								       "    <tr>\n",
 								       "      <th>14360</th>\n",
 								       "      <td>-117.87</td>\n",
 								       "      <td>33.62</td>\n",
 								       "      <td>8.0</td>\n",
 								       "      <td>1266.0</td>\n",
 								       "      <td>375.0</td>\n",
 								       "      <td>183.0</td>\n",
 								       "      <td>9.8020</td>\n",
 								       "      <td>&lt;1H OCEAN</td>\n",
 								       "    </tr>\n",
 								       "  </tbody>\n",
 								       "</table>\n",
 								       "</div>"
 								      ],
 								      "text/plain": [
 								       "       longitude  latitude  housing_median_age  total_rooms  population  \\\n",
 								       "14452    -120.67     40.50                15.0       5343.0      2503.0   \n",
 								       "18217    -117.96     34.03                35.0       2093.0      1755.0   \n",
 								       "11889    -118.05     34.04                33.0       1348.0      1098.0   \n",
 								       "20325    -118.88     34.17                15.0       4260.0      1701.0   \n",
 								       "14360    -117.87     33.62                 8.0       1266.0       375.0   \n",
 								       "\n",
 								       "       households  median_income ocean_proximity  \n",
 								       "14452       902.0         3.5962          INLAND  \n",
 								       "18217       403.0         3.4115       <1H OCEAN  \n",
 								       "11889       257.0         4.2917       <1H OCEAN  \n",
 								       "20325       669.0         5.1033       <1H OCEAN  \n",
 								       "14360       183.0         9.8020       <1H OCEAN  "
 								      ]
 								     },
 								     "execution_count": 44,
 								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "housing_option2 = housing.copy()\n",
 								    "\n",
 								    "housing_option2.drop(\"total_bedrooms\", axis=1, inplace=True)  # option 2\n",
 								    "\n",
 								    "housing_option2.loc[null_rows_idx].head()"
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 45,
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "text/html": [
 								       "<div>\n",
 								       "<style scoped>\n",
 								       "    .dataframe tbody tr th:only-of-type {\n",
 								       "        vertical-align: middle;\n",
 								       "    }\n",
 								       "\n",
 								       "    .dataframe tbody tr th {\n",
 								       "        vertical-align: top;\n",
 								       "    }\n",
 								       "\n",
 								       "    .dataframe thead th {\n",
 								       "        text-align: right;\n",
 								       "    }\n",
 								       "</style>\n",
 								       "<table border=\"1\" class=\"dataframe\">\n",
 								       "  <thead>\n",
 								       "    <tr style=\"text-align: right;\">\n",
 								       "      <th></th>\n",
 								       "      <th>longitude</th>\n",
 								       "      <th>latitude</th>\n",
 								       "      <th>housing_median_age</th>\n",
 								       "      <th>total_rooms</th>\n",
 								       "      <th>total_bedrooms</th>\n",
 								       "      <th>population</th>\n",
 								       "      <th>households</th>\n",
 								       "      <th>median_income</th>\n",
 								       "      <th>ocean_proximity</th>\n",
 								       "    </tr>\n",
 								       "  </thead>\n",
 								       "  <tbody>\n",
 								       "    <tr>\n",
 								       "      <th>14452</th>\n",
 								       "      <td>-120.67</td>\n",
 								       "      <td>40.50</td>\n",
 								       "      <td>15.0</td>\n",
 								       "      <td>5343.0</td>\n",
 								       "      <td>434.0</td>\n",
 								       "      <td>2503.0</td>\n",
 								       "      <td>902.0</td>\n",
 								       "      <td>3.5962</td>\n",
 								       "      <td>INLAND</td>\n",
 								       "    </tr>\n",
 								       "    <tr>\n",
 								       "      <th>18217</th>\n",
 								       "      <td>-117.96</td>\n",
 								       "      <td>34.03</td>\n",
 								       "      <td>35.0</td>\n",
 								       "      <td>2093.0</td>\n",
 								       "      <td>434.0</td>\n",
 								       "      <td>1755.0</td>\n",
 								       "      <td>403.0</td>\n",
 								       "      <td>3.4115</td>\n",
 								       "      <td>&lt;1H OCEAN</td>\n",
 								       "    </tr>\n",
 								       "    <tr>\n",
 								       "      <th>11889</th>\n",
 								       "      <td>-118.05</td>\n",
 								       "      <td>34.04</td>\n",
 								       "      <td>33.0</td>\n",
 								       "      <td>1348.0</td>\n",
 								       "      <td>434.0</td>\n",
 								       "      <td>1098.0</td>\n",
 								       "      <td>257.0</td>\n",
 								       "      <td>4.2917</td>\n",
 								       "      <td>&lt;1H OCEAN</td>\n",
 								       "    </tr>\n",
 								       "    <tr>\n",
 								       "      <th>20325</th>\n",
 								       "      <td>-118.88</td>\n",
 								       "      <td>34.17</td>\n",
 								       "      <td>15.0</td>\n",
 								       "      <td>4260.0</td>\n",
 								       "      <td>434.0</td>\n",
 								       "      <td>1701.0</td>\n",
 								       "      <td>669.0</td>\n",
 								       "      <td>5.1033</td>\n",
 								       "      <td>&lt;1H OCEAN</td>\n",
 								       "    </tr>\n",
 								       "    <tr>\n",
 								       "      <th>14360</th>\n",
 								       "      <td>-117.87</td>\n",
 								       "      <td>33.62</td>\n",
 								       "      <td>8.0</td>\n",
 								       "      <td>1266.0</td>\n",
 								       "      <td>434.0</td>\n",
 								       "      <td>375.0</td>\n",
 								       "      <td>183.0</td>\n",
 								       "      <td>9.8020</td>\n",
 								       "      <td>&lt;1H OCEAN</td>\n",
 								       "    </tr>\n",
 								       "  </tbody>\n",
 								       "</table>\n",
 								       "</div>"
 								      ],
 								      "text/plain": [
 								       "       longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \\\n",
 								       "14452    -120.67     40.50                15.0       5343.0           434.0   \n",
 								       "18217    -117.96     34.03                35.0       2093.0           434.0   \n",
 								       "11889    -118.05     34.04                33.0       1348.0           434.0   \n",
 								       "20325    -118.88     34.17                15.0       4260.0           434.0   \n",
 								       "14360    -117.87     33.62                 8.0       1266.0           434.0   \n",
 								       "\n",
 								       "       population  households  median_income ocean_proximity  \n",
 								       "14452      2503.0       902.0         3.5962          INLAND  \n",
 								       "18217      1755.0       403.0         3.4115       <1H OCEAN  \n",
 								       "11889      1098.0       257.0         4.2917       <1H OCEAN  \n",
 								       "20325      1701.0       669.0         5.1033       <1H OCEAN  \n",
 								       "14360       375.0       183.0         9.8020       <1H OCEAN  "
 								      ]
 								     },
 								     "execution_count": 45,
 								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "housing_option3 = housing.copy()\n",
 								    "\n",
-												Merge PR #36 by lsshawn: adds helpful notes and fixes the null rows sampling code

											
										
										
											2017-06-08 17:11:08 +02:00
+								    "median = housing[\"total_bedrooms\"].median()\n",
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "housing_option3[\"total_bedrooms\"] = housing_option3[\"total_bedrooms\"].fillna(median)  # option 3\n",
 								    "\n",
 								    "housing_option3.loc[null_rows_idx].head()"
-												Update notebooks 1 to 8 to latest library versions (in particular Scikit-Learn 0.20)

											
										
										
											2018-12-21 03:18:31 +01:00
+								   ]
 								  },
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 46,
-												Fix hyperparameter search and comment at the end of the solution of exercise 5, chapter 2

											
										
										
											2018-01-14 09:11:47 +01:00
+								   "metadata": {},
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   "outputs": [],
 								   "source": [
-												Update all notebooks assuming we are all in the future now: sklearn 0.20+, python 3.5+, TF 2.0 preview

											
										
										
											2019-01-18 16:08:37 +01:00
+								    "from sklearn.impute import SimpleImputer\n",
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "\n",
-												Update notebooks 1 to 8 to latest library versions (in particular Scikit-Learn 0.20)

											
										
										
											2018-12-21 03:18:31 +01:00
+								    "imputer = SimpleImputer(strategy=\"median\")"
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   ]
 								  },
 								  {
-												Merge PR #36 by lsshawn: adds helpful notes and fixes the null rows sampling code

											
										
										
											2017-06-08 17:11:08 +02:00
+								   "cell_type": "markdown",
 								   "metadata": {},
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "Separating out the numerical attributes to use the `\"median\"` strategy (as it cannot be calculated on text attributes like `ocean_proximity`):"
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 47,
-												Fix hyperparameter search and comment at the end of the solution of exercise 5, chapter 2

											
										
										
											2018-01-14 09:11:47 +01:00
+								   "metadata": {},
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   "outputs": [],
 								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "housing_num = housing.select_dtypes(include=[np.number])"
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 48,
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "text/plain": [
 								       "SimpleImputer(strategy='median')"
 								      ]
 								     },
 								     "execution_count": 48,
 								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   "source": [
-												Merge PR #36 by lsshawn: adds helpful notes and fixes the null rows sampling code

											
										
										
											2017-06-08 17:11:08 +02:00
+								    "imputer.fit(housing_num)"
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 49,
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "text/plain": [
 								       "array([-118.51  ,   34.26  ,   29.    , 2125.    ,  434.    , 1167.    ,\n",
 								       "        408.    ,    3.5385])"
 								      ]
 								     },
 								     "execution_count": 49,
 								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   "source": [
-												Merge PR #36 by lsshawn: adds helpful notes and fixes the null rows sampling code

											
										
										
											2017-06-08 17:11:08 +02:00
+								    "imputer.statistics_"
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   ]
 								  },
 								  {
-												Merge PR #36 by lsshawn: adds helpful notes and fixes the null rows sampling code

											
										
										
											2017-06-08 17:11:08 +02:00
+								   "cell_type": "markdown",
 								   "metadata": {},
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "source": [
-												Merge PR #36 by lsshawn: adds helpful notes and fixes the null rows sampling code

											
										
										
											2017-06-08 17:11:08 +02:00
+								    "Check that this is the same as manually computing the median of each attribute:"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 50,
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "text/plain": [
 								       "array([-118.51  ,   34.26  ,   29.    , 2125.    ,  434.    , 1167.    ,\n",
 								       "        408.    ,    3.5385])"
 								      ]
 								     },
 								     "execution_count": 50,
 								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "source": [
-												Merge PR #36 by lsshawn: adds helpful notes and fixes the null rows sampling code

											
										
										
											2017-06-08 17:11:08 +02:00
+								    "housing_num.median().values"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
 								  {
-												Merge PR #36 by lsshawn: adds helpful notes and fixes the null rows sampling code

											
										
										
											2017-06-08 17:11:08 +02:00
+								   "cell_type": "markdown",
 								   "metadata": {},
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "source": [
-												Merge PR #36 by lsshawn: adds helpful notes and fixes the null rows sampling code

											
										
										
											2017-06-08 17:11:08 +02:00
+								    "Transform the training set:"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 51,
-												Fix hyperparameter search and comment at the end of the solution of exercise 5, chapter 2

											
										
										
											2018-01-14 09:11:47 +01:00
+								   "metadata": {},
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "outputs": [],
 								   "source": [
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								    "X = imputer.transform(housing_num)"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 52,
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "text/plain": [
 								       "array(['longitude', 'latitude', 'housing_median_age', 'total_rooms',\n",
 								       "       'total_bedrooms', 'population', 'households', 'median_income'],\n",
 								       "      dtype=object)"
 								      ]
 								     },
 								     "execution_count": 52,
 								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								   "source": [
 								    "imputer.feature_names_in_"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 53,
-												Fix hyperparameter search and comment at the end of the solution of exercise 5, chapter 2

											
										
										
											2018-01-14 09:11:47 +01:00
+								   "metadata": {},
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "outputs": [],
 								   "source": [
-												Merge PR #36 by lsshawn: adds helpful notes and fixes the null rows sampling code

											
										
										
											2017-06-08 17:11:08 +02:00
+								    "housing_tr = pd.DataFrame(X, columns=housing_num.columns,\n",
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "                          index=housing_num.index)"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 54,
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "text/html": [
 								       "<div>\n",
 								       "<style scoped>\n",
 								       "    .dataframe tbody tr th:only-of-type {\n",
 								       "        vertical-align: middle;\n",
 								       "    }\n",
 								       "\n",
 								       "    .dataframe tbody tr th {\n",
 								       "        vertical-align: top;\n",
 								       "    }\n",
 								       "\n",
 								       "    .dataframe thead th {\n",
 								       "        text-align: right;\n",
 								       "    }\n",
 								       "</style>\n",
 								       "<table border=\"1\" class=\"dataframe\">\n",
 								       "  <thead>\n",
 								       "    <tr style=\"text-align: right;\">\n",
 								       "      <th></th>\n",
 								       "      <th>longitude</th>\n",
 								       "      <th>latitude</th>\n",
 								       "      <th>housing_median_age</th>\n",
 								       "      <th>total_rooms</th>\n",
 								       "      <th>total_bedrooms</th>\n",
 								       "      <th>population</th>\n",
 								       "      <th>households</th>\n",
 								       "      <th>median_income</th>\n",
 								       "    </tr>\n",
 								       "  </thead>\n",
 								       "  <tbody>\n",
 								       "    <tr>\n",
 								       "      <th>14452</th>\n",
 								       "      <td>-120.67</td>\n",
 								       "      <td>40.50</td>\n",
 								       "      <td>15.0</td>\n",
 								       "      <td>5343.0</td>\n",
 								       "      <td>434.0</td>\n",
 								       "      <td>2503.0</td>\n",
 								       "      <td>902.0</td>\n",
 								       "      <td>3.5962</td>\n",
 								       "    </tr>\n",
 								       "    <tr>\n",
 								       "      <th>18217</th>\n",
 								       "      <td>-117.96</td>\n",
 								       "      <td>34.03</td>\n",
 								       "      <td>35.0</td>\n",
 								       "      <td>2093.0</td>\n",
 								       "      <td>434.0</td>\n",
 								       "      <td>1755.0</td>\n",
 								       "      <td>403.0</td>\n",
 								       "      <td>3.4115</td>\n",
 								       "    </tr>\n",
 								       "    <tr>\n",
 								       "      <th>11889</th>\n",
 								       "      <td>-118.05</td>\n",
 								       "      <td>34.04</td>\n",
 								       "      <td>33.0</td>\n",
 								       "      <td>1348.0</td>\n",
 								       "      <td>434.0</td>\n",
 								       "      <td>1098.0</td>\n",
 								       "      <td>257.0</td>\n",
 								       "      <td>4.2917</td>\n",
 								       "    </tr>\n",
 								       "    <tr>\n",
 								       "      <th>20325</th>\n",
 								       "      <td>-118.88</td>\n",
 								       "      <td>34.17</td>\n",
 								       "      <td>15.0</td>\n",
 								       "      <td>4260.0</td>\n",
 								       "      <td>434.0</td>\n",
 								       "      <td>1701.0</td>\n",
 								       "      <td>669.0</td>\n",
 								       "      <td>5.1033</td>\n",
 								       "    </tr>\n",
 								       "    <tr>\n",
 								       "      <th>14360</th>\n",
 								       "      <td>-117.87</td>\n",
 								       "      <td>33.62</td>\n",
 								       "      <td>8.0</td>\n",
 								       "      <td>1266.0</td>\n",
 								       "      <td>434.0</td>\n",
 								       "      <td>375.0</td>\n",
 								       "      <td>183.0</td>\n",
 								       "      <td>9.8020</td>\n",
 								       "    </tr>\n",
 								       "  </tbody>\n",
 								       "</table>\n",
 								       "</div>"
 								      ],
 								      "text/plain": [
 								       "       longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \\\n",
 								       "14452    -120.67     40.50                15.0       5343.0           434.0   \n",
 								       "18217    -117.96     34.03                35.0       2093.0           434.0   \n",
 								       "11889    -118.05     34.04                33.0       1348.0           434.0   \n",
 								       "20325    -118.88     34.17                15.0       4260.0           434.0   \n",
 								       "14360    -117.87     33.62                 8.0       1266.0           434.0   \n",
 								       "\n",
 								       "       population  households  median_income  \n",
 								       "14452      2503.0       902.0         3.5962  \n",
 								       "18217      1755.0       403.0         3.4115  \n",
 								       "11889      1098.0       257.0         4.2917  \n",
 								       "20325      1701.0       669.0         5.1033  \n",
 								       "14360       375.0       183.0         9.8020  "
 								      ]
 								     },
 								     "execution_count": 54,
 								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "housing_tr.loc[null_rows_idx].head()"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 55,
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "text/plain": [
 								       "'median'"
 								      ]
 								     },
 								     "execution_count": 55,
 								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "source": [
 								    "imputer.strategy"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 56,
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "outputs": [],
 								   "source": [
-												Add index=housing.index when wrapping array in a DataFrame, fixes #426

											
										
										
											2019-05-12 15:28:56 +02:00
+								    "housing_tr = pd.DataFrame(X, columns=housing_num.columns,\n",
 								    "                          index=housing_num.index)"
-												Do not use LabelEncoder and LabelBinarizer, use factorize() and CategoricalEncoder instead.

											
										
										
											2017-09-19 13:01:23 +02:00
+								   ]
 								  },
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 57,
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "text/html": [
 								       "<div>\n",
 								       "<style scoped>\n",
 								       "    .dataframe tbody tr th:only-of-type {\n",
 								       "        vertical-align: middle;\n",
 								       "    }\n",
 								       "\n",
 								       "    .dataframe tbody tr th {\n",
 								       "        vertical-align: top;\n",
 								       "    }\n",
 								       "\n",
 								       "    .dataframe thead th {\n",
 								       "        text-align: right;\n",
 								       "    }\n",
 								       "</style>\n",
 								       "<table border=\"1\" class=\"dataframe\">\n",
 								       "  <thead>\n",
 								       "    <tr style=\"text-align: right;\">\n",
 								       "      <th></th>\n",
 								       "      <th>longitude</th>\n",
 								       "      <th>latitude</th>\n",
 								       "      <th>housing_median_age</th>\n",
 								       "      <th>total_rooms</th>\n",
 								       "      <th>total_bedrooms</th>\n",
 								       "      <th>population</th>\n",
 								       "      <th>households</th>\n",
 								       "      <th>median_income</th>\n",
 								       "    </tr>\n",
 								       "  </thead>\n",
 								       "  <tbody>\n",
 								       "    <tr>\n",
 								       "      <th>14452</th>\n",
 								       "      <td>-120.67</td>\n",
 								       "      <td>40.50</td>\n",
 								       "      <td>15.0</td>\n",
 								       "      <td>5343.0</td>\n",
 								       "      <td>434.0</td>\n",
 								       "      <td>2503.0</td>\n",
 								       "      <td>902.0</td>\n",
 								       "      <td>3.5962</td>\n",
 								       "    </tr>\n",
 								       "    <tr>\n",
 								       "      <th>18217</th>\n",
 								       "      <td>-117.96</td>\n",
 								       "      <td>34.03</td>\n",
 								       "      <td>35.0</td>\n",
 								       "      <td>2093.0</td>\n",
 								       "      <td>434.0</td>\n",
 								       "      <td>1755.0</td>\n",
 								       "      <td>403.0</td>\n",
 								       "      <td>3.4115</td>\n",
 								       "    </tr>\n",
 								       "    <tr>\n",
 								       "      <th>11889</th>\n",
 								       "      <td>-118.05</td>\n",
 								       "      <td>34.04</td>\n",
 								       "      <td>33.0</td>\n",
 								       "      <td>1348.0</td>\n",
 								       "      <td>434.0</td>\n",
 								       "      <td>1098.0</td>\n",
 								       "      <td>257.0</td>\n",
 								       "      <td>4.2917</td>\n",
 								       "    </tr>\n",
 								       "    <tr>\n",
 								       "      <th>20325</th>\n",
 								       "      <td>-118.88</td>\n",
 								       "      <td>34.17</td>\n",
 								       "      <td>15.0</td>\n",
 								       "      <td>4260.0</td>\n",
 								       "      <td>434.0</td>\n",
 								       "      <td>1701.0</td>\n",
 								       "      <td>669.0</td>\n",
 								       "      <td>5.1033</td>\n",
 								       "    </tr>\n",
 								       "    <tr>\n",
 								       "      <th>14360</th>\n",
 								       "      <td>-117.87</td>\n",
 								       "      <td>33.62</td>\n",
 								       "      <td>8.0</td>\n",
 								       "      <td>1266.0</td>\n",
 								       "      <td>434.0</td>\n",
 								       "      <td>375.0</td>\n",
 								       "      <td>183.0</td>\n",
 								       "      <td>9.8020</td>\n",
 								       "    </tr>\n",
 								       "  </tbody>\n",
 								       "</table>\n",
 								       "</div>"
 								      ],
 								      "text/plain": [
 								       "       longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \\\n",
 								       "14452    -120.67     40.50                15.0       5343.0           434.0   \n",
 								       "18217    -117.96     34.03                35.0       2093.0           434.0   \n",
 								       "11889    -118.05     34.04                33.0       1348.0           434.0   \n",
 								       "20325    -118.88     34.17                15.0       4260.0           434.0   \n",
 								       "14360    -117.87     33.62                 8.0       1266.0           434.0   \n",
 								       "\n",
 								       "       population  households  median_income  \n",
 								       "14452      2503.0       902.0         3.5962  \n",
 								       "18217      1755.0       403.0         3.4115  \n",
 								       "11889      1098.0       257.0         4.2917  \n",
 								       "20325      1701.0       669.0         5.1033  \n",
 								       "14360       375.0       183.0         9.8020  "
 								      ]
 								     },
 								     "execution_count": 57,
 								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								   "source": [
 								    "housing_tr.loc[null_rows_idx].head()  # not shown in the book"
 								   ]
 								  },
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 58,
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "outputs": [],
 								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "# from sklearn import set_config\n",
 								    "#\n",
 								    "# set_config(transform_output=\"pandas\")  # available since Scikit-Learn 1.2 – see SLEP018"
 								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
 								   "metadata": {},
 								   "source": [
 								    "Now let's drop some outliers:"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 59,
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								   "metadata": {},
 								   "outputs": [],
 								   "source": [
 								    "from sklearn.ensemble import IsolationForest\n",
 								    "\n",
 								    "isolation_forest = IsolationForest(random_state=42)\n",
 								    "outlier_pred = isolation_forest.fit_predict(X)"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 60,
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "text/plain": [
 								       "array([-1,  1,  1, ...,  1,  1,  1])"
 								      ]
 								     },
 								     "execution_count": 60,
 								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								   "source": [
 								    "outlier_pred"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 61,
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								   "metadata": {},
 								   "outputs": [],
 								   "source": [
 								    "#housing = housing.iloc[outlier_pred == 1]\n",
 								    "#housing_labels = housing_labels.iloc[outlier_pred == 1]"
-												Do not use LabelEncoder and LabelBinarizer, use factorize() and CategoricalEncoder instead.

											
										
										
											2017-09-19 13:01:23 +02:00
+								   ]
 								  },
-												Add some section headers

											
										
										
											2021-10-02 13:14:44 +02:00
+								  {
 								   "cell_type": "markdown",
 								   "metadata": {},
 								   "source": [
 								    "## Handling Text and Categorical Attributes"
 								   ]
 								  },
-												Do not use LabelEncoder and LabelBinarizer, use factorize() and CategoricalEncoder instead.

											
										
										
											2017-09-19 13:01:23 +02:00
+								  {
 								   "cell_type": "markdown",
 								   "metadata": {},
 								   "source": [
-												Update all notebooks assuming we are all in the future now: sklearn 0.20+, python 3.5+, TF 2.0 preview

											
										
										
											2019-01-18 16:08:37 +01:00
+								    "Now let's preprocess the categorical input feature, `ocean_proximity`:"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 62,
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "text/html": [
 								       "<div>\n",
 								       "<style scoped>\n",
 								       "    .dataframe tbody tr th:only-of-type {\n",
 								       "        vertical-align: middle;\n",
 								       "    }\n",
 								       "\n",
 								       "    .dataframe tbody tr th {\n",
 								       "        vertical-align: top;\n",
 								       "    }\n",
 								       "\n",
 								       "    .dataframe thead th {\n",
 								       "        text-align: right;\n",
 								       "    }\n",
 								       "</style>\n",
 								       "<table border=\"1\" class=\"dataframe\">\n",
 								       "  <thead>\n",
 								       "    <tr style=\"text-align: right;\">\n",
 								       "      <th></th>\n",
 								       "      <th>ocean_proximity</th>\n",
 								       "    </tr>\n",
 								       "  </thead>\n",
 								       "  <tbody>\n",
 								       "    <tr>\n",
 								       "      <th>13096</th>\n",
 								       "      <td>NEAR BAY</td>\n",
 								       "    </tr>\n",
 								       "    <tr>\n",
 								       "      <th>14973</th>\n",
 								       "      <td>&lt;1H OCEAN</td>\n",
 								       "    </tr>\n",
 								       "    <tr>\n",
 								       "      <th>3785</th>\n",
 								       "      <td>INLAND</td>\n",
 								       "    </tr>\n",
 								       "    <tr>\n",
 								       "      <th>14689</th>\n",
 								       "      <td>INLAND</td>\n",
 								       "    </tr>\n",
 								       "    <tr>\n",
 								       "      <th>20507</th>\n",
 								       "      <td>NEAR OCEAN</td>\n",
 								       "    </tr>\n",
 								       "    <tr>\n",
 								       "      <th>1286</th>\n",
 								       "      <td>INLAND</td>\n",
 								       "    </tr>\n",
 								       "    <tr>\n",
 								       "      <th>18078</th>\n",
 								       "      <td>&lt;1H OCEAN</td>\n",
 								       "    </tr>\n",
 								       "    <tr>\n",
 								       "      <th>4396</th>\n",
 								       "      <td>NEAR BAY</td>\n",
 								       "    </tr>\n",
 								       "  </tbody>\n",
 								       "</table>\n",
 								       "</div>"
 								      ],
 								      "text/plain": [
 								       "      ocean_proximity\n",
 								       "13096        NEAR BAY\n",
 								       "14973       <1H OCEAN\n",
 								       "3785           INLAND\n",
 								       "14689          INLAND\n",
 								       "20507      NEAR OCEAN\n",
 								       "1286           INLAND\n",
 								       "18078       <1H OCEAN\n",
 								       "4396         NEAR BAY"
 								      ]
 								     },
 								     "execution_count": 62,
 								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "source": [
-												Update all notebooks assuming we are all in the future now: sklearn 0.20+, python 3.5+, TF 2.0 preview

											
										
										
											2019-01-18 16:08:37 +01:00
+								    "housing_cat = housing[[\"ocean_proximity\"]]\n",
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "housing_cat.head(8)"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 63,
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "outputs": [],
-												Do not use LabelEncoder and LabelBinarizer, use factorize() and CategoricalEncoder instead.

											
										
										
											2017-09-19 13:01:23 +02:00
+								   "source": [
-												Update all notebooks assuming we are all in the future now: sklearn 0.20+, python 3.5+, TF 2.0 preview

											
										
										
											2019-01-18 16:08:37 +01:00
+								    "from sklearn.preprocessing import OrdinalEncoder\n",
 								    "\n",
-												Use OrdinalEncoder and OneHotEncoder from Scikit-Learn 0.20 instead of CategoricalEncoder

											
										
										
											2018-05-07 11:27:59 +02:00
+								    "ordinal_encoder = OrdinalEncoder()\n",
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "housing_cat_encoded = ordinal_encoder.fit_transform(housing_cat)"
-												Do not use LabelEncoder and LabelBinarizer, use factorize() and CategoricalEncoder instead.

											
										
										
											2017-09-19 13:01:23 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 64,
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "text/plain": [
 								       "array([[3.],\n",
 								       "       [0.],\n",
 								       "       [1.],\n",
 								       "       [1.],\n",
 								       "       [4.],\n",
 								       "       [1.],\n",
 								       "       [0.],\n",
 								       "       [3.]])"
 								      ]
 								     },
 								     "execution_count": 64,
 								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								   "source": [
 								    "housing_cat_encoded[:8]"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 65,
-												Do not use LabelEncoder and LabelBinarizer, use factorize() and CategoricalEncoder instead.

											
										
										
											2017-09-19 13:01:23 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "text/plain": [
 								       "[array(['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'],\n",
 								       "       dtype=object)]"
 								      ]
 								     },
 								     "execution_count": 65,
 								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "source": [
-												Use OrdinalEncoder and OneHotEncoder from Scikit-Learn 0.20 instead of CategoricalEncoder

											
										
										
											2018-05-07 11:27:59 +02:00
+								    "ordinal_encoder.categories_"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 66,
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "outputs": [],
 								   "source": [
-												Update all notebooks assuming we are all in the future now: sklearn 0.20+, python 3.5+, TF 2.0 preview

											
										
										
											2019-01-18 16:08:37 +01:00
+								    "from sklearn.preprocessing import OneHotEncoder\n",
-												Clarify why we are using OrdinalEncoder and OneHotEncoder

											
										
										
											2018-05-07 20:17:30 +02:00
+								    "\n",
-												Use OrdinalEncoder and OneHotEncoder from Scikit-Learn 0.20 instead of CategoricalEncoder

											
										
										
											2018-05-07 11:27:59 +02:00
+								    "cat_encoder = OneHotEncoder()\n",
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "housing_cat_1hot = cat_encoder.fit_transform(housing_cat)"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 67,
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "text/plain": [
 								       "<16512x5 sparse matrix of type '<class 'numpy.float64'>'\n",
 								       "\twith 16512 stored elements in Compressed Sparse Row format>"
 								      ]
 								     },
 								     "execution_count": 67,
 								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								   "source": [
-												Use OrdinalEncoder and OneHotEncoder from Scikit-Learn 0.20 instead of CategoricalEncoder

											
										
										
											2018-05-07 11:27:59 +02:00
+								    "housing_cat_1hot"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
-												Do not use LabelEncoder and LabelBinarizer, use factorize() and CategoricalEncoder instead.

											
										
										
											2017-09-19 13:01:23 +02:00
+								  {
 								   "cell_type": "markdown",
 								   "metadata": {},
 								   "source": [
-												Use OrdinalEncoder and OneHotEncoder from Scikit-Learn 0.20 instead of CategoricalEncoder

											
										
										
											2018-05-07 11:27:59 +02:00
+								    "By default, the output of a `OneHotEncoder` is a SciPy sparse matrix (as shown above) rather than a NumPy array, but we can convert it to a dense array if needed by calling its `toarray()` method:"
-												Do not use LabelEncoder and LabelBinarizer, use factorize() and CategoricalEncoder instead.

											
										
										
											2017-09-19 13:01:23 +02:00
+								   ]
 								  },
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 68,
-												Fix hyperparameter search and comment at the end of the solution of exercise 5, chapter 2

											
										
										
											2018-01-14 09:11:47 +01:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "text/plain": [
 								       "array([[0., 0., 0., 1., 0.],\n",
 								       "       [1., 0., 0., 0., 0.],\n",
 								       "       [0., 1., 0., 0., 0.],\n",
 								       "       ...,\n",
 								       "       [0., 0., 0., 0., 1.],\n",
 								       "       [1., 0., 0., 0., 0.],\n",
 								       "       [0., 0., 0., 0., 1.]])"
 								      ]
 								     },
 								     "execution_count": 68,
 								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
-												Do not use LabelEncoder and LabelBinarizer, use factorize() and CategoricalEncoder instead.

											
										
										
											2017-09-19 13:01:23 +02:00
+								   "source": [
-												Use OrdinalEncoder and OneHotEncoder from Scikit-Learn 0.20 instead of CategoricalEncoder

											
										
										
											2018-05-07 11:27:59 +02:00
+								    "housing_cat_1hot.toarray()"
-												Do not use LabelEncoder and LabelBinarizer, use factorize() and CategoricalEncoder instead.

											
										
										
											2017-09-19 13:01:23 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
 								   "metadata": {},
 								   "source": [
-												Use OrdinalEncoder and OneHotEncoder from Scikit-Learn 0.20 instead of CategoricalEncoder

											
										
										
											2018-05-07 11:27:59 +02:00
+								    "Alternatively, you can set `sparse=False` when creating the `OneHotEncoder`:"
-												Do not use LabelEncoder and LabelBinarizer, use factorize() and CategoricalEncoder instead.

											
										
										
											2017-09-19 13:01:23 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 69,
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "text/plain": [
 								       "array([[0., 0., 0., 1., 0.],\n",
 								       "       [1., 0., 0., 0., 0.],\n",
 								       "       [0., 1., 0., 0., 0.],\n",
 								       "       ...,\n",
 								       "       [0., 0., 0., 0., 1.],\n",
 								       "       [1., 0., 0., 0., 0.],\n",
 								       "       [0., 0., 0., 0., 1.]])"
 								      ]
 								     },
 								     "execution_count": 69,
 								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "source": [
-												Use OrdinalEncoder and OneHotEncoder from Scikit-Learn 0.20 instead of CategoricalEncoder

											
										
										
											2018-05-07 11:27:59 +02:00
+								    "cat_encoder = OneHotEncoder(sparse=False)\n",
 								    "housing_cat_1hot = cat_encoder.fit_transform(housing_cat)\n",
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								    "housing_cat_1hot"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 70,
-												Do not use LabelEncoder and LabelBinarizer, use factorize() and CategoricalEncoder instead.

											
										
										
											2017-09-19 13:01:23 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "text/plain": [
 								       "[array(['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'],\n",
 								       "       dtype=object)]"
 								      ]
 								     },
 								     "execution_count": 70,
 								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
-												Do not use LabelEncoder and LabelBinarizer, use factorize() and CategoricalEncoder instead.

											
										
										
											2017-09-19 13:01:23 +02:00
+								   "source": [
 								    "cat_encoder.categories_"
 								   ]
 								  },
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 71,
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "text/html": [
 								       "<div>\n",
 								       "<style scoped>\n",
 								       "    .dataframe tbody tr th:only-of-type {\n",
 								       "        vertical-align: middle;\n",
 								       "    }\n",
 								       "\n",
 								       "    .dataframe tbody tr th {\n",
 								       "        vertical-align: top;\n",
 								       "    }\n",
 								       "\n",
 								       "    .dataframe thead th {\n",
 								       "        text-align: right;\n",
 								       "    }\n",
 								       "</style>\n",
 								       "<table border=\"1\" class=\"dataframe\">\n",
 								       "  <thead>\n",
 								       "    <tr style=\"text-align: right;\">\n",
 								       "      <th></th>\n",
 								       "      <th>ocean_proximity_INLAND</th>\n",
 								       "      <th>ocean_proximity_NEAR BAY</th>\n",
 								       "    </tr>\n",
 								       "  </thead>\n",
 								       "  <tbody>\n",
 								       "    <tr>\n",
 								       "      <th>0</th>\n",
 								       "      <td>1</td>\n",
 								       "      <td>0</td>\n",
 								       "    </tr>\n",
 								       "    <tr>\n",
 								       "      <th>1</th>\n",
 								       "      <td>0</td>\n",
 								       "      <td>1</td>\n",
 								       "    </tr>\n",
 								       "  </tbody>\n",
 								       "</table>\n",
 								       "</div>"
 								      ],
 								      "text/plain": [
 								       "   ocean_proximity_INLAND  ocean_proximity_NEAR BAY\n",
 								       "0                       1                         0\n",
 								       "1                       0                         1"
 								      ]
 								     },
 								     "execution_count": 71,
 								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								   "source": [
 								    "df_test = pd.DataFrame({\"ocean_proximity\": [\"INLAND\", \"NEAR BAY\"]})\n",
 								    "pd.get_dummies(df_test)"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 72,
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "text/plain": [
 								       "array([[0., 1., 0., 0., 0.],\n",
 								       "       [0., 0., 0., 1., 0.]])"
 								      ]
 								     },
 								     "execution_count": 72,
 								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								   "source": [
 								    "cat_encoder.transform(df_test)"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 73,
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "text/html": [
 								       "<div>\n",
 								       "<style scoped>\n",
 								       "    .dataframe tbody tr th:only-of-type {\n",
 								       "        vertical-align: middle;\n",
 								       "    }\n",
 								       "\n",
 								       "    .dataframe tbody tr th {\n",
 								       "        vertical-align: top;\n",
 								       "    }\n",
 								       "\n",
 								       "    .dataframe thead th {\n",
 								       "        text-align: right;\n",
 								       "    }\n",
 								       "</style>\n",
 								       "<table border=\"1\" class=\"dataframe\">\n",
 								       "  <thead>\n",
 								       "    <tr style=\"text-align: right;\">\n",
 								       "      <th></th>\n",
 								       "      <th>ocean_proximity_&lt;2H OCEAN</th>\n",
 								       "      <th>ocean_proximity_ISLAND</th>\n",
 								       "    </tr>\n",
 								       "  </thead>\n",
 								       "  <tbody>\n",
 								       "    <tr>\n",
 								       "      <th>0</th>\n",
 								       "      <td>1</td>\n",
 								       "      <td>0</td>\n",
 								       "    </tr>\n",
 								       "    <tr>\n",
 								       "      <th>1</th>\n",
 								       "      <td>0</td>\n",
 								       "      <td>1</td>\n",
 								       "    </tr>\n",
 								       "  </tbody>\n",
 								       "</table>\n",
 								       "</div>"
 								      ],
 								      "text/plain": [
 								       "   ocean_proximity_<2H OCEAN  ocean_proximity_ISLAND\n",
 								       "0                          1                       0\n",
 								       "1                          0                       1"
 								      ]
 								     },
 								     "execution_count": 73,
 								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								   "source": [
 								    "df_test_unknown = pd.DataFrame({\"ocean_proximity\": [\"<2H OCEAN\", \"ISLAND\"]})\n",
 								    "pd.get_dummies(df_test_unknown)"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 74,
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "text/plain": [
 								       "array([[0., 0., 0., 0., 0.],\n",
 								       "       [0., 0., 1., 0., 0.]])"
 								      ]
 								     },
 								     "execution_count": 74,
 								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								   "source": [
 								    "cat_encoder.handle_unknown = \"ignore\"\n",
 								    "cat_encoder.transform(df_test_unknown)"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 75,
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "text/plain": [
 								       "array(['ocean_proximity'], dtype=object)"
 								      ]
 								     },
 								     "execution_count": 75,
 								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								   "source": [
 								    "cat_encoder.feature_names_in_"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 76,
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "text/plain": [
 								       "array(['ocean_proximity_<1H OCEAN', 'ocean_proximity_INLAND',\n",
 								       "       'ocean_proximity_ISLAND', 'ocean_proximity_NEAR BAY',\n",
 								       "       'ocean_proximity_NEAR OCEAN'], dtype=object)"
 								      ]
 								     },
 								     "execution_count": 76,
 								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								   "source": [
 								    "cat_encoder.get_feature_names_out()"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 77,
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								   "metadata": {},
 								   "outputs": [],
 								   "source": [
-												Sync notebook code examples with book

											
										
										
											2021-11-03 03:53:04 +01:00
+								    "df_output = pd.DataFrame(cat_encoder.transform(df_test_unknown),\n",
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "                         columns=cat_encoder.get_feature_names_out(),\n",
-												Sync notebook code examples with book

											
										
										
											2021-11-03 03:53:04 +01:00
+								    "                         index=df_test_unknown.index)"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 78,
-												Sync notebook code examples with book

											
										
										
											2021-11-03 03:53:04 +01:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "text/html": [
 								       "<div>\n",
 								       "<style scoped>\n",
 								       "    .dataframe tbody tr th:only-of-type {\n",
 								       "        vertical-align: middle;\n",
 								       "    }\n",
 								       "\n",
 								       "    .dataframe tbody tr th {\n",
 								       "        vertical-align: top;\n",
 								       "    }\n",
 								       "\n",
 								       "    .dataframe thead th {\n",
 								       "        text-align: right;\n",
 								       "    }\n",
 								       "</style>\n",
 								       "<table border=\"1\" class=\"dataframe\">\n",
 								       "  <thead>\n",
 								       "    <tr style=\"text-align: right;\">\n",
 								       "      <th></th>\n",
 								       "      <th>ocean_proximity_&lt;1H OCEAN</th>\n",
 								       "      <th>ocean_proximity_INLAND</th>\n",
 								       "      <th>ocean_proximity_ISLAND</th>\n",
 								       "      <th>ocean_proximity_NEAR BAY</th>\n",
 								       "      <th>ocean_proximity_NEAR OCEAN</th>\n",
 								       "    </tr>\n",
 								       "  </thead>\n",
 								       "  <tbody>\n",
 								       "    <tr>\n",
 								       "      <th>0</th>\n",
 								       "      <td>0.0</td>\n",
 								       "      <td>0.0</td>\n",
 								       "      <td>0.0</td>\n",
 								       "      <td>0.0</td>\n",
 								       "      <td>0.0</td>\n",
 								       "    </tr>\n",
 								       "    <tr>\n",
 								       "      <th>1</th>\n",
 								       "      <td>0.0</td>\n",
 								       "      <td>0.0</td>\n",
 								       "      <td>1.0</td>\n",
 								       "      <td>0.0</td>\n",
 								       "      <td>0.0</td>\n",
 								       "    </tr>\n",
 								       "  </tbody>\n",
 								       "</table>\n",
 								       "</div>"
 								      ],
 								      "text/plain": [
 								       "   ocean_proximity_<1H OCEAN  ocean_proximity_INLAND  ocean_proximity_ISLAND  \\\n",
 								       "0                        0.0                     0.0                     0.0   \n",
 								       "1                        0.0                     0.0                     1.0   \n",
 								       "\n",
 								       "   ocean_proximity_NEAR BAY  ocean_proximity_NEAR OCEAN  \n",
 								       "0                       0.0                         0.0  \n",
 								       "1                       0.0                         0.0  "
 								      ]
 								     },
 								     "execution_count": 78,
 								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
-												Sync notebook code examples with book

											
										
										
											2021-11-03 03:53:04 +01:00
+								   "source": [
 								    "df_output"
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
 								   "metadata": {},
 								   "source": [
 								    "## Feature Scaling"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 79,
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								   "metadata": {},
 								   "outputs": [],
 								   "source": [
 								    "from sklearn.preprocessing import MinMaxScaler\n",
 								    "\n",
 								    "min_max_scaler = MinMaxScaler(feature_range=(-1, 1))\n",
 								    "housing_num_min_max_scaled = min_max_scaler.fit_transform(housing_num)"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 80,
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								   "metadata": {},
 								   "outputs": [],
 								   "source": [
 								    "from sklearn.preprocessing import StandardScaler\n",
 								    "\n",
 								    "std_scaler = StandardScaler()\n",
 								    "housing_num_std_scaled = std_scaler.fit_transform(housing_num)"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 81,
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAi8AAADICAYAAAAgCgFXAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/MnkTPAAAACXBIWXMAAAsTAAALEwEAmpwYAAAgMElEQVR4nO3debgcVbnv8e+PgATCIIjkhjAENXqZrkFiDIIYQCWAHpAjj/igDILcI6hR8SqIQzzKIQoyqfAAigFlkCOjIB4jsEU0DIEDBMIJBAgQiAREkEQIBN77x1oNnU4Ptffuce/f53n66epV1VVvrXRV3r1qVS1FBGZmZma9YrVOB2BmZmbWH05ezMzMrKc4eTEzM7Oe4uTFzMzMeoqTFzMzM+spTl7MzMysp6ze6QDaZaONNopx48Y1XG7ZsmWMGjWq9QF1OddD4npI2l0Pt99++9MR8eZmrKvosd9KQ+V35P3oLkNxP4oe+8MmeRk3bhxz5sxpuFxfXx9TpkxpfUBdzvWQuB6SdteDpEeata6ix34rDZXfkfejuwzF/Sh67PuykZmZmfUUJy9mZmbWU5y8mJmZWU9x8mJmZmY9xcmLmZmZ9ZRhc7dRM4075pqVPi+csXeHIjEzMxt+3PJiZmZmPWXAyYukt0ka2cxgzMzMzBoplLxI+g9JB+dpSZoF3A8slvSeVgZoZmZmVq5oy8uBwPw8vScwAZgMnA/MaH5YZmZmZtUV7bA7GliUp/cCLomIWyU9A3T2udtmZmY2rBRtefkbsEWe/hBwfZ5eHVCzgzIzMzOrpWjLy6XAhZLuBzYEfpfLJwALWhCXmZmZWVVFk5cvA48AmwNfjYhluXwMcGYrAjMzMzOrpmjysglwSkS8WlF+KrBZUyMyMzMzq6Non5eHgY2qlG+Y55mZmZm1RdHkRUBUKV8HeLF54ZiZmZnVV/eykaTT82QAJ0j6Z9nsEcAk4M7WhGZmZma2qkZ9XrbL7wK2Al4qm/cScAdwUgviMjMzM6uqbvISEbsCSPo5MC0i/tGWqHpM5SjT4JGmzczMWqVon5evA+tVFkraVNLo5oZkZmZmVlvR5OV80phGlfYAftG8cMzMzMzqK5q8vBu4sUr5n4CJzQvHzMzMrL6iycvqwJpVykfWKDczMzNriaLJyy3AZ6uUHwXcVmQFkkZKulXSXZLulfSdXL6hpFmSHsjvG5R951hJCyTNl7RHWfkOkubmeadL8uCQZmZmw0TR4QGOA66X9E7guly2G7A98IGC61gO7BYRSyWtAdwk6VpgP+C6iJgh6RjgGOBrkrYGDgC2IQ1P8AdJb4+IV0jjKR0B3Az8FpgKXFswDjMzM+thhVpeIuJmYEfgIVKy8a+kYQF2jIi/FFxHRMTS/HGN/ApgH+C8XH4esG+e3ge4OCKWR8TDpNGrJ0kaA6wXEbMjIkidiUvfMTMzsyGuaMsLEXEX8MnBbEzSCOB24G3ATyLiFkmjI2Jx3sZiSRvnxceSWlZKFuWyl/N0ZbmZmZkNAzWTF0kbRsQzpel6Kykt10i+5DNB0huByyVtW2fxav1Yok75qiuQjiBdXmL06NH09fU1jHHp0qUNlzt6uxUN11NkW92sSD0MB66HpNfqYSDHfiv1Wv3V4v3oLsN5P+q1vDwlaUxELAGepnqCUBqwcUR/NhoRz0rqI/VVeTJvZ3G+JLQkL7YI2Kzsa5sCT+TyTauUV9vO2cDZABMnTowpU6Y0jK2vr49Gyx1S5Ym6lRYe2Hhb3axIPQwHroek1+phIMd+K/Va/dXi/eguw3k/6iUvuwGlFpVdBxjTayS9GXg5Jy5rkTr6fh+4CjgYmJHfr8xfuQq4UNLJpA6744FbI+IVSc9Lmky6C+og4EeDjc/MzMx6Q83kJSL+CCBpddIdP1dERNUWjoLGAOflfi+rAZdExNWSZgOXSDoMeBTYP2//XkmXAPOAFcBR+bITpNu2ZwJ
rke4y8p1GZmZmw0TDDrsRsULSiUDjayX113M36dbqyvK/AbvX+M7xwPFVyucA9frLmJmZ2RBV9CF1NwM7tDIQMzMzsyKK3ip9DnCSpM1JtzovK58ZEXc0OzAzMzOzaoomLxfm95OrzOv33UZmZmZmA1U0edmypVGYmZmZFVQ0edkC+EtErPR0tnwn0nuBR5odmJmZmVk1RTvs3gBUe8ru+nmemZmZWVsUTV5KT9Kt9CYqOu+amZmZtVLdy0aSrsqTAfxS0vKy2SNIz1opNKq0mZmZWTM06vPyt/wu4O/AC2XzXgJuIt1GbWZmZtYWdZOXiDgUQNJC4KSI8CUiMzMz66iifV6+S1mri6T/JelwSe9tTVhmZmZm1RVNXq4BPg8gaR1gDnAi8EdJB7UoNjMzM7NVFE1edgCuz9P7Af8ANgY+A3ylBXGZmZmZVVU0eVkXeDZPfwi4PCJeJiU0b21BXGZmZmZVFU1eHgV2kjQK2AOYlcs3BP7ZisDMzMzMqik6PMDJwC+ApaShAG7M5bsAc1sQl5mZmVlVhZKXiDhL0u3AZsCsiHg1z3oQ+GargjMzMzOrVLTlhYiYQ7rLqLzsmqZHZGZmZlZHzeRF0peBMyLixTxdU0Sc3PTIzMzMzKqo1/LyeeA84MU8XUuQ+sSYmZmZtVzN5CUitqw2bWZmZtZJRW+VNjMzM+sK9fq8fKvoSiLi35sTztAx7piV+zIvnLF3hyIxMzMbWur1edm/4vMWwNrAE/nzJqQH1C0EnLyYmZlZW9Tr87JdaVrSocBBwMER8Wgu2xz4OXBBq4M0MzMzKyna5+VbwBdLiQtAnj4a+HYrAjMzMzOrpmjyMhpYq0r5SGCj5oVjZmZmVl/R5GUWcI6kyZJG5Ndk4CxeH6TRzMzMrOWKJi+HA48BfyE9tO5F4M/A48BnWhOamZmZ2aqKDsz4FLCXpPHAVoCA+yLi/lYGZ2ZmZlap8MCMABHxAPBAi2IxMzMza6htT9iVtJmkGyTdJ+leSdNy+YaSZkl6IL9vUPadYyUtkDRf0h5l5TtImpvnnS5J7doPMzMz66x2Dg+wAjg6IrYCJgNHSdoaOAa4LiLGA9flz+R5BwDbAFOBMySNyOs6EzgCGJ9fU9u4H2ZmZtZBbUteImJxRNyRp58H7gPGAvuQRq8mv++bp/cBLo6I5RHxMLAAmCRpDLBeRMyOiADOL/uOmZmZDXE1kxdJ50paN0/vIqlf/WPqkTQO2B64BRgdEYshJTjAxnmxsaQ7nEoW5bKxebqy3MzMzIaBegnJJ4GvA88DNwBjgCWD3aCkdYBLSU/s/Ued7irVZkSd8mrbOoJ0eYnRo0fT19fXML6lS5c2XO7o7VY0XE+lItvuJkXqYThwPSS9Vg8DOfZbqdfqrxbvR3cZzvtRL3lZCHxe0u9JCcOOkv5ebcGIuLHIxiStQUpcLoiIy3Lxk5LGRMTifEmolCAtAjYr+/qmpEEhF+XpyvJqcZ0NnA0wceLEmDJlSsMY+/r6qFyucoToft6kBcDCAxtvu5tUq4fhyPWQ9Fo9DOTYb6Veq79avB/dZTjvR73/hf8fcA5wLKll4/IaywUwosa81+Q7gn5Gej7MyWWzrgIOBmbk9yvLyi+UdDJpBOvxwK0R8Yqk5/MTfm8hDRj5o0bbNzMza4fKP3gXzti7Q5EMXfVGlb4SuFLSG4FnSHf9DOay0U7Ap4C5ku7MZV8nJS2XSDoMeBTYP2//XkmXAPNIdyodFRGv5O99FphJGm/p2vwyMzOzYaDh9Y+IeFbSrsADEdH/zh6vr+cmqvdXAdi9xneOB46vUj4H2HagsZiZmVnvKjo8wB8lrSnpIGBr0qWiecCFEbG8lQGamZmZlSuUvOQHxl0LrA/MzcWfAaZLmhoR97UoPjMzs57mPjDNV/QhdacBdwKbR8T7IuJ9wObAXcCprQnNzMzMbFVF7/ndCXh3RPyjVJCf0XIccHNLIjMzM+sBqz5Ow1qtaMvLi8A
bq5Svn+eZmZmZtUXR5OU3wDmSdpI0Ir92Bs4iPY/FzMzMrC2KJi/TgAeAP5FaWl4E/gjcD3yxJZGZmZmZVVH0VulngX0kvQ3YivS8lnk
 								      "text/plain": [
 								       "<Figure size 576x216 with 2 Axes>"
 								      ]
 								     },
 								     "metadata": {
 								      "needs_background": "light"
 								     },
 								     "output_type": "display_data"
 								    }
 								   ],
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								   "source": [
-												Sync notebook with book's code examples, and better identify extra code

											
										
										
											2022-02-19 06:17:36 +01:00
+								    "# extra code – this cell generates Figure 2–17\n",
-												Remove redundant comment

											
										
										
											2021-12-08 03:16:42 +01:00
+								    "fig, axs = plt.subplots(1, 2, figsize=(8, 3), sharey=True)\n",
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "housing[\"population\"].hist(ax=axs[0], bins=50)\n",
 								    "housing[\"population\"].apply(np.log).hist(ax=axs[1], bins=50)\n",
 								    "axs[0].set_xlabel(\"Population\")\n",
 								    "axs[1].set_xlabel(\"Log of population\")\n",
 								    "axs[0].set_ylabel(\"Number of districts\")\n",
 								    "save_fig(\"long_tail_plot\")\n",
 								    "plt.show()"
 								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
 								   "metadata": {},
 								   "source": [
 								    "What if we replace each value with its percentile?"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 82,
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYkAAAEMCAYAAAAxoErWAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/MnkTPAAAACXBIWXMAAAsTAAALEwEAmpwYAAAiNElEQVR4nO3deZhdVZnv8e+PgBITZRAtw5iosRWJgpRpFLWrhJY4xgmNrTRRMPYVEezYdNCronSuXgVEW/AaxAZELHNVJAIOGCiQlsEEkUCYggmQgAloGAohGnj7j72KnBzOrtpVdaY6+/d5nvOcs9ce1rtOVe239rSWIgIzM7Natml1AGZm1r6cJMzMLJeThJmZ5XKSMDOzXE4SZmaWy0nCzMxyNS1JSNpe0rWSfi/pJkmfT+UnSFon6fr0elPFOsdLWiXpVkmHNCtWMzPLqFnPSUgSMCkiBiRtB1wJHAPMAgYi4qSq5fcGvg/MBHYFfgW8KCIez6tjl112ialTpxaO6ZFHHmHSpEkjbcq4V8Z2l7HNUM52l7HNMLZ2L1++/P6IeE6teduOKaoRiCwbDaTJ7dJrqAw1G+iLiE3AakmryBLGVXkrTJ06lWXLlhWOqb+/n56ensLLd4oytruMbYZytruMbYaxtVvSnXnzmnpNQtIESdcDG4BLIuKaNOtjkm6Q9B1JO6Wy3YC7K1Zfm8rMzKxJmna6aatKpR2B84GjgfuA+8mOKk4EpkTEhySdBlwVEeemdc4ELo6IH1Vtax4wD6Crq2v/vr6+wnEMDAwwefLksTdonClju8vYZihnu8vYZhhbu3t7e5dHRHeteU073VQpIh6Q1A/MqrwWIekM4MI0uRbYo2K13YF7amxrEbAIoLu7O0ZyuOXD0vIoY5uhnO0uY5uhce1u5t1Nz0lHEEiaCBwM3CJpSsVi7wBuTJ+XAHMkPV3SNGA6cG2z4jUzs+YeSUwBzpY0gSw5LY6ICyV9V9K+ZKeb1gAfAYiImyQtBlYCm4GjhrqzyczM6q+ZdzfdAOxXo/ywIdZZCCxsZFxmZpbPT1ybmVkuJwkzM8vlJGFmZrlacgusjc3UBRfVLF/zpTc3ORKrB/88rZ05SZRY3s4JWreD8g7TrL04SZiNUrsltFrxzJ+xmZ7mh2IdxEmiQqP/6Ee6/aH+0283rdphFv2O5s/YzNwFF404nvH0MxhP2u1vrV6acXSeV8dZsxrT862TRAGN3lE0e/uDO8x6bKveyzdau/33Pxr1+hm0Y8KsRx3zZ2wmb9fWyt/HdvtbKMpJwoz6/gGPl53BeInTWsu3wJqZWS4fSVhN7fZfZrvFY1YWThIdxDvSzuKfp7UDn24yM7NcThJmZpbLScLMzHI5SZiZWS4nCTMzy+UkYWZmuZwkzMwsl5OEmZnlcpIwM7NcThJmZparaUlC0vaSrpX0e0k3Sfp8Kt9Z0iWSbk/vO1Wsc7ykVZJulXRIs2I1M7NMM48kNgGvj4iXA/sCsyQdACwAlkbEdGBpmkbS3sAc4KXALOB0SROaGK+ZWek1LUlEZiBNbpdeAcwGzk7lZwNvT59nA30RsSkiVgOrgJnNitfMzEAR0bzKsiOB5cALgdMi4t8lPRARO1YsszEidpL0DeDqiDg3lZ8J/Cwifli1zXnAPICurq79+/r6CsczMDDA5MmTn5xese7BUbdtPOmaCOsfbXUUzVXGNkM5213GNgNM22HCVvuzkejt7V0eEd215jW1q/CIeBzYV9KOwPmS9hlicdXaRI1tLgIWAXR3d0dPT0/hePr7+6lcfrRDeo4382ds5uQV5eolvoxthnK2u4xthmyM65Hs/4pqyd1NEfEA0E92rWG9pCkA6X1DWmwtsEfFarsD9zQvSjMza+bdTc9JRxBImggcDNwCLAEOT4sdDlyQPi8B5kh6uqRpwHTg2mbFa2ZmzT3dNAU4O12X2AZYHBEXSroKWCzpCOAu4FC
AiLhJ0mJgJbAZOCqdrjIzsyZpWpKIiBuA/WqU/wk4KGedhcDCBodmZmY5/MS1mZnlcpIwM7NcThJmZpbLScLMzHKNOklIeqGk7esZjJmZtZdCSULS/5F0ePosSZcAtwH3Svr7RgZoZmatU/RI4v3ArenzG8l6cT0AOAf4Uv3DMjOzdlD0OYkusm4yAN5E9iDctZL+DCxrSGRmZtZyRY8k/gTslT6/Abg0fd6W2h3xmZlZByh6JPEj4DxJtwE7Az9P5fuSjfNgZmYdqGiS+FfgTmBP4LiIeCSVTwG+2YjAzMys9YomiV2Br0bEE1Xlp7J1d95mZtZBil6TWA3sUqN85zTPzMw6UNEkIWqMCgdMBh6rXzhmZtZOhjzdJOnr6WMAX5T0l4rZE4CZwPWNCc3MzFptuGsSM9K7gJcAf62Y91fgOuCkBsRlZmZtYMgkERG9AJL+CzgmIh5qSlRmZtYWil6T+BTwrOpCSbtL6qpvSGZm1i6KJolzyPpsqnYI8N36hWNmZu2kaJJ4JXBFjfJfA931C8fMzNpJ0SSxLfD0GuXb55SbmVkHKJokrgH+V43yo4Df1i8cMzNrJ0W75fg0cKmklwNLU9nrgf2Ag4tsQNIeZNc2ngc8ASyKiK9JOgH4MHBfWvRTEXFxWud44AjgceDjEfGLgvGamVkdFEoSEXG1pFcB/wa8k+y5ieuAj0bE7wvWtRmYHxHXSXomsDyNcAdZv1BbPW8haW9gDvBSsr6jfiXpRRHxeMH6zMxsjIoeSZCSwQdGW1FE3Avcmz4/LOlmYLchVpkN9EXEJmC1pFVkT3hfNdoYzMxsZBRRq0smkLRzRPx58PNQGxlcrnCl0lSyu6X2IeuGfC7wENkod/MjYqOkbwBXR8S5aZ0zgZ9FxA+rtjUPmAfQ1dW1f19fX+E4BgYGmDx58pPTK9Y9OJJmjFtdE2H9o62OornK2GYoZ7vL2GaAaTtM2Gp/NhK9vb3LI6LmnapDHUncJ2lKRGwA7qd2B3+DHf9NKBqMpMlkgxgdGxEPSfomcGLazonAycCHqD3i3VNiiIhFwCKA7u7u6OnpKRoK/f39VC4/d8FFhdcdz+bP2MzJKwofRHaEMrYZytnuMrYZ4KxZkxjJ/q+oob7J1wODRwi99ahM0nZkCeJ7EfFjgIhYXzH/DODCNLmWrceq2B24px5xmJlZMblJIiIuB5C0LdnF459ExKh30pIEnAncHBGnVJRPSdcrAN4B3Jg+LyEbMvUUsgvX04FrR1u/mZmN3LDHZBGxWdJXgLGeizkQOAxYIen6VPYp4H2S9iU7lbQG+Eiq9yZJi4GVZHdGHeU7m8zMmqvoiburgf3JxrkelYi4ktrXGS4eYp2FwMLR1mlmZmNTNEmcAZwkaU9gOfBI5cyIuK7egZmZWesVTRLnpfdTaswb0d1NZmY2fhRNEtMaGoWZmbWlokliL+A3EbG5sjDd+fRqxnCtwszM2lfRXmAvA2o9db1DmmdmZh2oaJIYfLK62rOpuohtZmadY8jTTZKWpI8BnCtpU8XsCWR9L/2mQbGZmVmLDXdN4k/pXcBGoLLbrL8CV5LdHmtmZh1oyCQRER8EkLQGOCkifGrJzKxEil6TOJGKowhJz5N0pKRXNyYsMzNrB0WTxEXA0fBkV9/LgK8Al0v65wbFZmZmLVY0SewPXJo+v5NsgKDnko1N/ckGxGVmZm2gaJJ4JvBA+vwG4PyI+BtZ4nhBA+IyM7M2UDRJ3AUcKGkScAhwSSrfGfhLIwIzM7PWK9otxynAd4EBsi44rkjlrwNWNCAuMzNrA4WSRER8S9JysuFEL4mIJ9KsO4DPNCo4MzNrrcKjhUfEMrK7mirLxjpanZmZtbHcJCHpX4HTI+Kx9DlX5ZjVZmbWOYY6kjgaOBt4LH3OE9QejMjMzMa53CQREdNqfTYzs/IoegusmZmV0FDXJD5bdCMR8YXhlpG0B3AO8DzgCWBRRHxN0s7AD4CpwBrgPRGxMa1
zPHAE8Djw8Yj4RdGYzMxs7Ia6JnFo1fRewDOAe9L0rmQP0q0Bhk0SwGZgfkRcJ+mZwHJJlwBzgaUR8SVJC4AFwL9L2huYA7w01fUrSS+
 								      "text/plain": [
 								       "<Figure size 432x288 with 1 Axes>"
 								      ]
 								     },
 								     "metadata": {
 								      "needs_background": "light"
 								     },
 								     "output_type": "display_data"
 								    }
 								   ],
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								   "source": [
-												Sync notebook with book's code examples, and better identify extra code

											
										
										
											2022-02-19 06:17:36 +01:00
+								    "# extra code – just shows that we get a uniform distribution\n",
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "percentiles = [np.percentile(housing[\"median_income\"], p)\n",
 								    "               for p in range(1, 100)]\n",
 								    "flattened_median_income = pd.cut(housing[\"median_income\"],\n",
 								    "                                 bins=[-np.inf] + percentiles + [np.inf],\n",
 								    "                                 labels=range(1, 100 + 1))\n",
 								    "flattened_median_income.hist(bins=50)\n",
 								    "plt.xlabel(\"Median income percentile\")\n",
 								    "plt.ylabel(\"Number of districts\")\n",
 								    "plt.show()\n",
 								    "# Note: incomes below the 1st percentile are labeled 1, and incomes above the\n",
 								    "# 99th percentile are labeled 100. This is why the distribution below ranges\n",
 								    "# from 1 to 100 (not 0 to 100)."
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 83,
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								   "metadata": {},
 								   "outputs": [],
 								   "source": [
 								    "from sklearn.metrics.pairwise import rbf_kernel\n",
 								    "\n",
 								    "age_simil_35 = rbf_kernel(housing[[\"housing_median_age\"]], [[35]], gamma=0.1)"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 84,
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAaAAAAEQCAYAAAD2/KAsAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/MnkTPAAAACXBIWXMAAAsTAAALEwEAmpwYAABSr0lEQVR4nO2deXxcVfXAv2eyJ933faOlUKAUKGVT9h0EXEAQhSKLKCLuFNGfgqLFhUVBsAIKCALKVgFBZBGQtrR0AdpSW9rSJl3SLWmzZ5Lz++O+aSaTyeRNZiYzk5zv5/M+M3PfffedN5m88869ZxFVxTAMwzC6mkC6BTAMwzB6JqaADMMwjLRgCsgwDMNIC6aADMMwjLRgCsgwDMNIC6aADMMwjLSQm24BuopAIKBFRUXpFsMwDCNp1NTUqKpmrSHRYxRQUVER1dXV6RbDMAwjaYhIbbplSISs1ZyGYRhGdmMKyDAMw0gLpoAMwzCMtGAKyDAMw0gLpoAMwzC6KSI8IEK5CB+0s19E+K0Ia0R4T4RDu1K+HuMF1xG7d++mvLycxsbGdItidFNKSkoYNWoUgYA99xldxp+Bu4CH2tl/BjDJ244A7vFeuwRTQDjls3XrVkaOHElRUREikm6RjG5Gc3MzZWVlbN++nSFDhqRbHKOHoMobIoyL0eVc4CFVFJgvQj8RhquyuSvks0cxoLy8nJEjR1JcXGzKx0gJgUCAoUOHUllZmW5R2qAKjz0GRx8N550HK1emWyKjCxkJbAz7XOq1dQlmAQGNjY1YlgQj1eTl5REMBtMtRiuqquBLX4Jnnmlp69cP/vznNAlkxEuuiCwK+zxHVefEcXy0J+4uq1JqCsjDLB8j1WTab0wVzj8fXnrJfX70UXj6aff6s5+5/aNHp1fG7sa4Wc9HbV8/+6zODhlU1emdFshZPOF/5VHApgTGiwubgjOMHooIXHYZDB0Khx8OF10Ev/wlNDXBddfBxInw7rvpltJIMXOBSzxvuCOByq5a/wFTQIbRI1FvkmXiRNiyBa66yn0eNw5OPhkWLoT+/eHqq51CMrITEf4KzAMmi1AqwuUiXC3C1V6XF4C1wBrgj8DXulI+U0BGj+DJJ2HKFCgocK9PP93xMdddB9OnQ2GhuzFH4/334bjjoKgIRo6Em29uublnMhddBHfe6dZ+cnLg059u2Xf++bBxI3znO7BoETz8cNrENBJElYtUGa5KniqjVLlflXtVudfbr6pco8o+qhykyqKOxkwmXaaAROQBESkXkQ/C2gaIyMsistp77R+27wYRWSMiq0TktLD2w0TkfW/fbyXTJtaNjGPePPj85+Hii2HpUvd6/vmwYEHs45qb4dJL4ZJLou/fvRtOOcVNYS1cCL/9LfzqV3DbbUm/hKSyeDE8/jjU1MCrr7rpt4EDW/affLJ7LSiAadPg5z+HDPOdMLoJXWkB/Rk4PaJtFvCKqk4CXvE+IyJTgAuBA7xjfi8iOd4x9wBX0RI8FTlmj6G62t0ce/VyN8Ff/ALOPhtmzmzp85e/uBtM794wZIi78ZaVtex//XW3FvDPf8Jhh7kn+U9+EkpL4T//gYMPduOffTbs2NFy3MyZru3WW2HYMOjbF2bNcjftn/zEnWvYMLc/nNtug6lToaTEWQxXXAEVFSn7igC44w444QS48UbYf3/3evzxrj0Wv/sdXHst7Ltv9P2PPOJu4g8+CAceCJ/9LFx/vbvGTLaCfvEL9/eaORPeeQeOPbb1/nHjYPx4eO01+L//gw0bnCVkGMmmyxSQqr4B7IxoPhd40Hv/IHBeWPtjqlqvqutw85MzRGQ40EdV56mq4qJ7z6OH8p3vOCXx9NPuSXbZMnjzzdZ9Ghrgppvcvueeg+3b3fRLJD/+sbshL1gAu3Y5i+Hmm2HOHKekli93iiWcN96Adevc/nvvdQvYZ54J9fXw1luu/6xZrReyAwF3nuXLnbfVO++
4m3wsrr7aKcFY24YN7R8/bx6cemrrttNOg7ffjn3ejpg3zynrcA/+006DTZtg/frExk4VGzfCU0/BV78KH34IjY1tFRDAiSe6v+vZZ7sHliOP7HJRjR5Aut2wh6rqZgBV3SwioRDxkcD8sH6h4KhG731ke1L55jfdVE1XMm1ax0/k4VRVwQMPwEMPuWkggPvvh1GjWvf78pdb3k+YAPfc46yA0tLWfX/6U3czBXfDv/ZapzgO9TJDXXop/P3vrcfu2xfuvtutIey3H/zmN+7m++KLbv+++8Ls2e5J+rDDXNs3v9ly/LhxTmmde66zItrLUHPzzfDd78b+PkaMaH/fli3OQgxn6FDXnghbtrT9vkPn2bLFWRGZxgMPOOvsqqvcdy4Cn/hE234nnuh+T8uXu98mOGWVl9el4hrdnHQroPZoLzgqrqApEbkKN11Hfn5+ciTLED76yN0QZsxoaSspcVNB4Sxe7CygpUth586WqaENG1rfPKdObXkfuokedFDrtvLy1mNPmeKUT3iffv1a94k87tVX3RTQypVQWek8rBoa3A27PSUyZIjbEiFypVC1bVuyxo3Wnimcfrqz2MaPdxbc1KnuQSKS6V5kyeLFrs9JJ7nfw29/27XyGt2bdCugrSIy3LN+hgOhW1V7wVGl3vvI9qh4EcFzAEpKSnzPysdjiaQLPze66mo3JXTyyc6TacgQNwX3yU+6m3444U+2oTEj25qb2z8m1CdaW+i4jz+Gs86CK690Vs3Age4Gd9FFbeUJ5+qr3VpWLFasgDFjou8bNqyttVNe3tYqipf2xoXEx04VRxzhNlVYssRNsUVj4kQ3tblkibOiBw6EJ56A229v/dBhGImQbjfsucCl3vtLgWfD2i8UkQIRGY9zNnjHm67bIyJHet5vl4Qd06OYONHd7N95p6WtpgY+CEu6/uGHTuH8/Odunn+//dpaMV3JokVO0dx+Oxx1lJui2+Qj5vrmm50FF2uLNQV31FHw8sut215+2eU+S4SjjnJrbnV1rccdMaJ9t+108vTTTuGDU5zbtjknk2gEAm5fqP+FF8LWrW7N0TCSRVe6YYcFREmpiFwOzAZOEZHVwCneZ1R1OfAEsAJ4EbhGVUPhcF8F7sM5JnwE/LOrriGT6NXLPZlefz288oqzAK64wlkbIQtmzBjnSnvXXbB2LTz/PPzoR+mTedIkJ98ddzjnhb/+1Z+1OWSIU7ixttwYtvx117VM/X34oXt97bXW61F33eUUdDhr1jjltmmTU5whZRey1r7wBSgudt5kH3zgFvdnz4ZvfzvzpuCCQWdJhrwSly1zr6H1nWgceqjr19TknEtKSpz7tmEkiy6bglPVKL5XAJzUTv9bgFuitC8CDmx7RM/j179202znnOMU0re+5Z5SCwvd/sGD3ULzD37gnAWmTnUuwqenyXF96lQX/HjrrfDDHzoL5Ne/dh53qeToo1225x/+0Hn77bOPu5EeEVb1ZPt2WLWq9XFXXNH6if+QQ9zrunXOwunb11k811zj1kz693eeid/+dmqvpzO88Yazfi+4wH0OOdmEr/1Fcsgh7ve1Zg1Mnux+Z3//u1PW5oxgJAPRTA5YSCIlJSVaXV0ddd/KlSvZf//9u1ii5FNfD2PHwve+526ERuaRrt9aaB2tvNxZbRdd5JwQYrmLL1nirKDHH3eK6+23nVfcl77U8pBjxEeyk5GKSI2qliQiUzpJtxOCkQBLljhvshkzYM8eZ1ns2ZN6i8LILlRh7lw44wynfMClEIpl/YCzekTctCU4SzLRdTPDCCfdTghGgtx2m5sqOfFEN/32xhttY1OMns2GDS5t0Jlnus9NTbB6tYsHi0VxsbOoQwoIYPNmlwHCMJKBWUBZzCGHWIoUo2PGjnVplELu8OvXO0eKyZM7Pna//VoroCefdEHKM2Y4pxLDSASzgAyjB1BQ0JIyKORs4VcBrVrVorxCVtQ/e6TvqZFsTAEZRjemstKlQQqlR4L
4FVBNjUvdBC6d0+TJ8MILyZfV6HmYAjKMbsy//+2CSUvC/KRWrYIBA2DQoI6PD8VGhU/DnXGGS1QaHoBrGJ3BFJBhdGNeecXFiIVns16
 								      "text/plain": [
 								       "<Figure size 432x288 with 2 Axes>"
 								      ]
 								     },
 								     "metadata": {
 								      "needs_background": "light"
 								     },
 								     "output_type": "display_data"
 								    }
 								   ],
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								   "source": [
-												Sync notebook with book's code examples, and better identify extra code

											
										
										
											2022-02-19 06:17:36 +01:00
+								    "# extra code – this cell generates Figure 2–18\n",
-												Sync notebook code examples with book

											
										
										
											2021-11-03 03:53:04 +01:00
+								    "\n",
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "ages = np.linspace(housing[\"housing_median_age\"].min(),\n",
 								    "                   housing[\"housing_median_age\"].max(),\n",
 								    "                   500).reshape(-1, 1)\n",
 								    "gamma1 = 0.1\n",
 								    "gamma2 = 0.03\n",
 								    "rbf1 = rbf_kernel(ages, [[35]], gamma=gamma1)\n",
 								    "rbf2 = rbf_kernel(ages, [[35]], gamma=gamma2)\n",
 								    "\n",
 								    "fig, ax1 = plt.subplots()\n",
 								    "\n",
 								    "ax1.set_xlabel(\"Housing median age\")\n",
 								    "ax1.set_ylabel(\"Number of districts\")\n",
 								    "ax1.hist(housing[\"housing_median_age\"], bins=50)\n",
 								    "\n",
 								    "ax2 = ax1.twinx()  # create a twin axis that shares the same x-axis\n",
 								    "color = \"blue\"\n",
 								    "ax2.plot(ages, rbf1, color=color, label=\"gamma = 0.10\")\n",
-												Remove redundant comment

											
										
										
											2021-12-08 03:16:42 +01:00
+								    "ax2.plot(ages, rbf2, color=color, label=\"gamma = 0.03\", linestyle=\"--\")\n",
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "ax2.tick_params(axis='y', labelcolor=color)\n",
 								    "ax2.set_ylabel(\"Age similarity\", color=color)\n",
 								    "\n",
-												Remove labelcolor argument in plt.legend(), as it requires recent matplotlib

											
										
										
											2022-02-20 00:55:08 +01:00
+								    "plt.legend(loc=\"upper left\")\n",
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "save_fig(\"age_similarity_plot\")\n",
 								    "plt.show()"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 85,
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								   "metadata": {},
 								   "outputs": [],
 								   "source": [
 								    "from sklearn.linear_model import LinearRegression\n",
 								    "\n",
 								    "target_scaler = StandardScaler()\n",
 								    "scaled_labels = target_scaler.fit_transform(housing_labels.to_frame())\n",
 								    "\n",
 								    "model = LinearRegression()\n",
 								    "model.fit(housing[[\"median_income\"]], scaled_labels)\n",
 								    "some_new_data = housing[[\"median_income\"]].iloc[:5]  # pretend this is new data\n",
 								    "\n",
 								    "scaled_predictions = model.predict(some_new_data)\n",
-												Sync notebook code examples with book

											
										
										
											2021-11-03 03:53:04 +01:00
+								    "predictions = target_scaler.inverse_transform(scaled_predictions)"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 86,
-												Sync notebook code examples with book

											
										
										
											2021-11-03 03:53:04 +01:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "text/plain": [
 								       "array([[131997.15275877],\n",
 								       "       [299359.35844434],\n",
 								       "       [146023.37185694],\n",
 								       "       [138840.33653057],\n",
 								       "       [192016.61557639]])"
 								      ]
 								     },
 								     "execution_count": 86,
 								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
-												Sync notebook code examples with book

											
										
										
											2021-11-03 03:53:04 +01:00
+								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "predictions"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 87,
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								   "metadata": {},
 								   "outputs": [],
 								   "source": [
 								    "from sklearn.compose import TransformedTargetRegressor\n",
 								    "\n",
 								    "model = TransformedTargetRegressor(LinearRegression(),\n",
 								    "                                   transformer=StandardScaler())\n",
 								    "model.fit(housing[[\"median_income\"]], housing_labels)\n",
 								    "predictions = model.predict(some_new_data)"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 88,
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "text/plain": [
 								       "array([131997.15275877, 299359.35844434, 146023.37185694, 138840.33653057,\n",
 								       "       192016.61557639])"
 								      ]
 								     },
 								     "execution_count": 88,
 								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								   "source": [
 								    "predictions"
 								   ]
 								  },
-												Add some section headers

											
										
										
											2021-10-02 13:14:44 +02:00
+								  {
 								   "cell_type": "markdown",
 								   "metadata": {},
 								   "source": [
 								    "## Custom Transformers"
 								   ]
 								  },
-												Do not use LabelEncoder and LabelBinarizer, use factorize() and CategoricalEncoder instead.

											
										
										
											2017-09-19 13:01:23 +02:00
+								  {
 								   "cell_type": "markdown",
 								   "metadata": {},
 								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "To create simple transformers:"
-												Do not use LabelEncoder and LabelBinarizer, use factorize() and CategoricalEncoder instead.

											
										
										
											2017-09-19 13:01:23 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 89,
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								   "metadata": {},
 								   "outputs": [],
 								   "source": [
 								    "from sklearn.preprocessing import FunctionTransformer\n",
 								    "\n",
 								    "log_transformer = FunctionTransformer(np.log, inverse_func=np.exp)\n",
 								    "log_pop = log_transformer.transform(housing[[\"population\"]])"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 90,
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								   "metadata": {},
 								   "outputs": [],
 								   "source": [
 								    "rbf_transformer = FunctionTransformer(rbf_kernel,\n",
 								    "                                      kw_args=dict(Y=[[35.]], gamma=0.1))\n",
-												Sync notebook code examples with book

											
										
										
											2021-11-03 03:53:04 +01:00
+								    "age_simil_35 = rbf_transformer.transform(housing[[\"housing_median_age\"]])"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 91,
-												Sync notebook code examples with book

											
										
										
											2021-11-03 03:53:04 +01:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "text/plain": [
 								       "array([[2.81118530e-13],\n",
 								       "       [8.20849986e-02],\n",
 								       "       [6.70320046e-01],\n",
 								       "       ...,\n",
 								       "       [9.55316054e-22],\n",
 								       "       [6.70320046e-01],\n",
 								       "       [3.03539138e-04]])"
 								      ]
 								     },
 								     "execution_count": 91,
 								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
-												Sync notebook code examples with book

											
										
										
											2021-11-03 03:53:04 +01:00
+								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "age_simil_35"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 92,
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								   "metadata": {},
 								   "outputs": [],
 								   "source": [
 								    "sf_coords = 37.7749, -122.41\n",
 								    "sf_transformer = FunctionTransformer(rbf_kernel,\n",
-												Sync notebook code examples with book

											
										
										
											2021-11-03 03:53:04 +01:00
+								    "                                     kw_args=dict(Y=[sf_coords], gamma=0.1))\n",
 								    "sf_simil = sf_transformer.transform(housing[[\"latitude\", \"longitude\"]])"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 93,
-												Sync notebook code examples with book

											
										
										
											2021-11-03 03:53:04 +01:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "text/plain": [
 								       "array([[0.999927  ],\n",
 								       "       [0.05258419],\n",
 								       "       [0.94864161],\n",
 								       "       ...,\n",
 								       "       [0.00388525],\n",
 								       "       [0.05038518],\n",
 								       "       [0.99868067]])"
 								      ]
 								     },
 								     "execution_count": 93,
 								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
-												Sync notebook code examples with book

											
										
										
											2021-11-03 03:53:04 +01:00
+								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "sf_simil"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 94,
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "text/plain": [
 								       "array([[0.5 ],\n",
 								       "       [0.75]])"
 								      ]
 								     },
 								     "execution_count": 94,
 								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								   "source": [
 								    "ratio_transformer = FunctionTransformer(lambda X: X[:, [0]] / X[:, [1]])\n",
 								    "ratio_transformer.transform(np.array([[1., 2.], [3., 4.]]))"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 95,
-												Fix hyperparameter search and comment at the end of the solution of exercise 5, chapter 2

											
										
										
											2018-01-14 09:11:47 +01:00
+								   "metadata": {},
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "outputs": [],
 								   "source": [
 								    "from sklearn.base import BaseEstimator, TransformerMixin\n",
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "from sklearn.utils.validation import check_array, check_is_fitted\n",
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								    "\n",
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "class StandardScalerClone(BaseEstimator, TransformerMixin):\n",
 								    "    def __init__(self, with_mean=True):  # no *args or **kwargs!\n",
 								    "        self.with_mean = with_mean\n",
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								    "\n",
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "    def fit(self, X, y=None):  # y is required even though we don't use it\n",
 								    "        X = check_array(X)  # checks that X is an array with finite float values\n",
-												Replace np.round(a) with a.round(), and other similar changes

											
										
										
											2021-11-01 02:42:42 +01:00
+								    "        self.mean_ = X.mean(axis=0)\n",
 								    "        self.scale_ = X.std(axis=0)\n",
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "        self.n_features_in_ = X.shape[1]  # every estimator stores this in fit()\n",
 								    "        return self  # always return self!\n",
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								    "\n",
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "    def transform(self, X):\n",
 								    "        check_is_fitted(self)  # looks for learned attributes (with trailing _)\n",
 								    "        X = check_array(X)\n",
 								    "        assert self.n_features_in_ == X.shape[1]\n",
 								    "        if self.with_mean:\n",
 								    "            X = X - self.mean_\n",
-												Move StandardScalerClone inverse_transform and get_feature_names_out to exercise

											
										
										
											2021-11-15 05:45:26 +01:00
+								    "        return X / self.scale_"
-												Update to latest library versions

											
										
										
											2020-11-21 00:22:42 +01:00
+								   ]
 								  },
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 96,
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   "outputs": [],
-												Update to latest library versions

											
										
										
											2020-11-21 00:22:42 +01:00
+								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "from sklearn.cluster import KMeans\n",
 								    "\n",
 								    "class ClusterSimilarity(BaseEstimator, TransformerMixin):\n",
 								    "    def __init__(self, n_clusters=10, gamma=1.0, random_state=None):\n",
 								    "        self.n_clusters = n_clusters\n",
 								    "        self.gamma = gamma\n",
 								    "        self.random_state = random_state\n",
 								    "\n",
 								    "    def fit(self, X, y=None, sample_weight=None):\n",
 								    "        self.kmeans_ = KMeans(self.n_clusters, random_state=self.random_state)\n",
 								    "        self.kmeans_.fit(X, sample_weight=sample_weight)\n",
 								    "        return self  # always return self!\n",
 								    "\n",
 								    "    def transform(self, X):\n",
 								    "        return rbf_kernel(X, self.kmeans_.cluster_centers_, gamma=self.gamma)\n",
 								    "    \n",
 								    "    def get_feature_names_out(self, names=None):\n",
 								    "        return [f\"Cluster {i} similarity\" for i in range(self.n_clusters)]"
-												Update to latest library versions

											
										
										
											2020-11-21 00:22:42 +01:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 97,
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								   "metadata": {},
 								   "outputs": [],
 								   "source": [
 								    "cluster_simil = ClusterSimilarity(n_clusters=10, gamma=1., random_state=42)\n",
 								    "similarities = cluster_simil.fit_transform(housing[[\"latitude\", \"longitude\"]],\n",
 								    "                                           sample_weight=housing_labels)"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 98,
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "text/plain": [
 								       "array([[0.  , 0.14, 0.  , 0.  , 0.  , 0.08, 0.  , 0.99, 0.  , 0.6 ],\n",
 								       "       [0.63, 0.  , 0.99, 0.  , 0.  , 0.  , 0.04, 0.  , 0.11, 0.  ],\n",
 								       "       [0.  , 0.29, 0.  , 0.  , 0.01, 0.44, 0.  , 0.7 , 0.  , 0.3 ]])"
 								      ]
 								     },
 								     "execution_count": 98,
 								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								   "source": [
-												Replace np.round(a) with a.round(), and other similar changes

											
										
										
											2021-11-01 02:42:42 +01:00
+								    "similarities[:3].round(2)"
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 99,
-												Update to latest library versions

											
										
										
											2020-11-21 00:22:42 +01:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAp0AAAHoCAYAAAAL0lTRAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/MnkTPAAAACXBIWXMAAAsTAAALEwEAmpwYAAEAAElEQVR4nOydd3hURReH39m+m4T0BAgl9N4RBUFBpAkWULB/iF3sigUrKp8VO5bPBlbELoKCKCCgKL33FloIJQkp2/fO98fdNNJ7wHmfZ59k986dOXd2N/ndM3POEVJKFAqFQqFQKBSK6sRQ2wYoFAqFQqFQKE5/lOhUKBQKhUKhUFQ7SnQqFAqFQqFQKKodJToVCoVCoVAoFNWOEp0KhUKhUCgUimpHiU6FQqFQKBQKRbVjqm0DKkJMTIxMTEysbTOKJTs7m5CQkNo241+Neg9qFzX/tYua/9pFzX/tkp2dzdatW49JKWNry4aWQkhnNfSbDPOklEOroesa4ZQUnYmJiaxcubK2zSiWRYsW0b9//9o241+Neg9qFzX/tYua/9pFzX/tsmjRIgYMGJBUmzY4gVuqod9JEFMN3dYYp6ToVCgUCoVCoairCJTAKgo1JwqFQqFQKBRViADMtW1EHUQFEikUCoVCoVAoqh3l6VQoFAqFQqGoQtTyetEoT6dCoVAoFAqFotpRQlyhUCgUlSYjI4MjR47g8/lq25RaJzw8nC1bttS2GactZrOZuLg46tWrV9umFIva01k0SnQqFAqFolJkZGSQkpJCQkICdrsdIURtm1SrZGZmEhYWVttmnJZIKXG5XBw8eBCgTgtPRWHU8rpCoVAoKsWRI0dISEjA4XCUS3AuXLiQxMREFi5cWCXtFKc/QggcDgcJCQkcOXKkts0plpw9nVX9ONVRolOhUCgUlcLn82G328t1zsKFCxkxYgRJSUmMGDGCRYsWlamdEp4KALvdXqe3cuQsr1f141RHiU6FQqFQVJryejhHjBiB06kXCnQ6nQwfPryQ8CyqnRKeCijf501Rd1CiU6FQKBQ1xslCMoeThWdJ7ZTwVNR11PJ60SjRqVAoFIoaoTghmUOO8HzxxRdLbVeTwjMxMZEpU6bUyFgKxelMjYtOIYRRCLFGCDE7+PwlIcRWIcR6IcT3QoiImrZJoVAoFNXPuHHjihWSOTidTp566qkytRs3blylbUpJSeHuu++mRYsWWK1WEhISGDZsGD///HOl+y6O6667jhEjRlRb/zXB3r17EUKwcuXK2jalTqL2dBZNbXg67wbyJzCbD3SUUnYGtgMTa8EmhUKhUFQz06ZNw+FwlNquNMEJ4HA4mD59eqXs2bt3L927d2fevHk899xzrF+/nt9++43hw4dz6623VqrvmsDv9yOlrG0zKo3X661tE6octbxeNDUqOoUQjYDhwAc5r0kpf5VS+oNP/wYa1aRNCoVCoagZBgwYwOzZs8skPEvC4XAwZ84c+vfvX6l+xo8fj5SSlStXMmbMGNq0aUO7du244447WLduXbHnCSH45ptvCrx28hL8//73P1q3bo3NZiM2NpYhQ4bg9/uZNGkSH3/8MXPmzEEIgRAidx/rwYMHueKKK4iMjCQyMpLhw4ezY8eO3D4nTZpEx44dmT59eq5nNjs7u0gbt27dykUXXUR4eDihoaH07t2bDRs25B6fNm0a7du3x2az0bp1a1599VU0TStwje+99x6jR48mJCSE5s2b89lnn+Ueb9asGQBnnHEGQogC70VZ+n7rrbcYNWoUISEhPPLII/h8Pu666y4aNmyI1WqlcePGPPzww8W+B4pTk5oWzq8BDwLFZc29HphZY9ZUEI8nwJo1R6lf30FiokpMq1AoFGUlR3iWtGezJKpKcKampjJ37lwmT55MaGhooeORkZEV7nv16tXcfvvtfPzxx/Tt25f09HQWLFgAwIQJE9iyZQupqal8+umnAERFReF
0OhkwYAB9+vThjz/+wGKxMGXKFM4//3y2bNmSK9T37NnDF198wddff43FYsFmsxUa/9ChQ/Tt25ezzz6b+fPnExERwfLlywkEAgC8//77PPHEE7z55pv06NGDjRs3ctNNN2E2m7njjjty+3n66ad5/vnnee655/jwww+5/vrr6devH02bNmX58uX06tWLuXPn0qVLFywWS7n6fuqpp3j22WeZMmUKQgjeeOMNvv/+e7788ksSExM5cOAA27Ztq/B7UNuoikRFU2OiUwgxAjgipVwlhOhfxPFHAT/weTHn3wzcDBAfH19sTrfqRtMkmzen4fPpX97GjUOJiSmYny4rK6vW7KsLpKf7SEpyYjQKWrQIwW431rgN//b3oLZR81+71PT8h4eHk5mZWeb2PXv25KuvvmL06NG4XK4yn2e32/n666/p0aNHucYrinXr1iGlJDExsdS+pJR4PJ4C7VwuV4Hn+dvs27ePkJAQBgwYQFhYGFFRUTRv3jz3Wk0mE0ajkZCQEAA8Hg+ffvopgUCAN954Izcd0JQpU2jevDlff/01o0aNwuPx4PV6eeedd4iLi8u142ReffVVHA4HH330Ua4YvPjiiwG9WtLTTz/NU089xZAhQwDo378/99xzD1OnTmXs2LG5/Vx++eW55z344IO8/vrr/Prrr1xxxRW5eVltNlvudZSn75EjR3L55ZfnPt+xYwctWrSga9euCCGIjIykU6dOJb43bre7yM95VlZWsecoapea9HSeDVwkhLgAsAH1hBCfSSmvEUKMBUYAA2UxG1SklO8B7wH07NlTVvYut6I89NCfTJlykJyVAofDRHb2bQXaLFq0qNJ34acqUkpCQmbhcumivFcvA//807/G7fg3vwd1ATX/tUtNz/+WLVvKXfZx+PDhTJo0qUxBQ6B7OCdNmsQFF1xQUTML9Qe6kC3NdiEEVqu1QLuTz8vfZuDAgTRt2pTOnTszZMgQBg8ezKhRo3Lbm81mTCZTgfM3bdpEUlISDRs2LDC20+nk0KFDhIWFYbVaadSoES1atCjR3k2bNtGvXz+io6MLHTt69CgHDhzgnnvu4b777st9PWd/aH6bevbsWeB5bGxsbonPHO9wSEhIbpvy9N2nT58Cz2+++WYGDRpEjx49GDx4MBdccAHDhg3DYCh+F6DNZqNbt26FXq8LN7zK01k0NSY6pZQTCQYJBT2dE4KCcyjwEHCulLL8ay01SFaWl9dfX0u+rSkYDCpB7cnk5OwVAoxGNT8KhaIwCxcuLLPgBF18TZo0iTPOOKNKBHWrVq0QQrBlyxZGjhxZrnOFEIUCePJXxwkLC2P16tUsXryY+fPn89xzz/HII4+wYsWKQqIyB03T6Nq1K19++WWhY1FRUbm/53gVS6Kk4KKcvZXvvvsuffr0KbEfs7mgbBJCFNibWZm+T76O7t27s3fvXubOncuCBQsYO3YsXbp0Yf78+SUKT8WpRV14J6ei7/GcL4RYK4R4t7YNKo60NE+B5wYDTJ16brn6+PTTPbRqNYexY//B5yv+y1sVaJpk3rx0fvghFbe7esfKQd9gfyaNGtlo1y6MDz/sXiPjFsX06Vto3vxjmjSZxiuvrDktojwVitOB0vJ1FkdxlYsqQlRUFEOGDGHq1KlFLsemp6cXe25sbCzJycm5z1NSUgo8B30J/bzzzsuNis/Ozmb27NkAWCyW3P2VOXTv3p2dO3cSExNDy5YtCzzyi86y0L17d5YuXVpkVHh8fDwJCQns2rWr0DgtW7Ys8xg5y/b5r6OyfYeFhTF69Gjeeecd5syZw4IFC9i5c2eZbaprqOj1wtTKNUgpFwGLgr+X/VNeQ6SkOHnxxVV4PAHuuacrLVtGANCoUSjnntuIBQuS0TTBvfd24ppr2pS53xMnvNx440q8Xo1Dh1ycd14cY8c2q6argKuu2smcOekAtGpl459/OmA2V/99xrBh9dm/v2qWwCrKiRNebr99EU6nnhjh8cf/Jjz
cwg03dKhVuxSKfzsVFZw55AjPqggmevvtt+nTpw89e/bkmWeeoXPnzkgpWbhwIc899xz79u0r8rzzzjuPt956iz59+mA0GnnkkUcKBPT
 								      "text/plain": [
 								       "<Figure size 720x504 with 2 Axes>"
 								      ]
 								     },
 								     "metadata": {
 								      "needs_background": "light"
 								     },
 								     "output_type": "display_data"
 								    }
 								   ],
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   "source": [
-												Sync notebook with book's code examples, and better identify extra code

											
										
										
											2022-02-19 06:17:36 +01:00
+								    "# extra code – this cell generates Figure 2–19\n",
-												Sync notebook code examples with book

											
										
										
											2021-11-03 03:53:04 +01:00
+								    "\n",
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "housing_renamed = housing.rename(columns={\n",
 								    "    \"latitude\": \"Latitude\", \"longitude\": \"Longitude\",\n",
 								    "    \"population\": \"Population\",\n",
 								    "    \"median_house_value\": \"Median house value (ᴜsᴅ)\"})\n",
 								    "housing_renamed[\"Max cluster similarity\"] = similarities.max(axis=1)\n",
 								    "\n",
 								    "housing_renamed.plot(kind=\"scatter\", x=\"Longitude\", y=\"Latitude\", grid=True,\n",
 								    "                     s=housing_renamed[\"Population\"] / 100, label=\"Population\",\n",
 								    "                     c=\"Max cluster similarity\",\n",
-												Sync notebook code examples with book

											
										
										
											2021-11-03 03:53:04 +01:00
+								    "                     cmap=\"jet\", colorbar=True,\n",
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "                     legend=True, sharex=False, figsize=(10, 7))\n",
 								    "plt.plot(cluster_simil.kmeans_.cluster_centers_[:, 1],\n",
 								    "         cluster_simil.kmeans_.cluster_centers_[:, 0],\n",
 								    "         linestyle=\"\", color=\"black\", marker=\"X\", markersize=20,\n",
 								    "         label=\"Cluster centers\")\n",
 								    "plt.legend(loc=\"upper right\")\n",
 								    "save_fig(\"district_cluster_plot\")\n",
 								    "plt.show()"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
-												Add some section headers

											
										
										
											2021-10-02 13:14:44 +02:00
+								  {
 								   "cell_type": "markdown",
 								   "metadata": {},
 								   "source": [
 								    "## Transformation Pipelines"
 								   ]
 								  },
-												Do not use LabelEncoder and LabelBinarizer, use factorize() and CategoricalEncoder instead.

											
										
										
											2017-09-19 13:01:23 +02:00
+								  {
 								   "cell_type": "markdown",
 								   "metadata": {},
 								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "Now let's build a pipeline to preprocess the numerical attributes:"
-												Do not use LabelEncoder and LabelBinarizer, use factorize() and CategoricalEncoder instead.

											
										
										
											2017-09-19 13:01:23 +02:00
+								   ]
 								  },
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 100,
-												Fix hyperparameter search and comment at the end of the solution of exercise 5, chapter 2

											
										
										
											2018-01-14 09:11:47 +01:00
+								   "metadata": {},
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "outputs": [],
 								   "source": [
 								    "from sklearn.pipeline import Pipeline\n",
 								    "\n",
 								    "num_pipeline = Pipeline([\n",
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "    (\"impute\", SimpleImputer(strategy=\"median\")),\n",
 								    "    (\"standardize\", StandardScaler()),\n",
 								    "])"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 101,
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								   "metadata": {},
 								   "outputs": [],
 								   "source": [
 								    "from sklearn.pipeline import make_pipeline\n",
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								    "\n",
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "num_pipeline = make_pipeline(SimpleImputer(strategy=\"median\"), StandardScaler())"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 102,
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "text/plain": [
 								       "Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='median')),\n",
 								       "                ('standardscaler', StandardScaler())])"
 								      ]
 								     },
 								     "execution_count": 102,
 								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "from sklearn import set_config\n",
 								    "\n",
 								    "set_config(display='diagram')\n",
 								    "\n",
 								    "num_pipeline"
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   ]
 								  },
-												Silence gelsd warning

											
										
										
											2018-07-31 21:22:05 +02:00
+								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 103,
-												Replace FeatureUnion + DataFrameSelector with new ColumnTransformer

											
										
										
											2018-07-31 21:08:33 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "text/plain": [
 								       "array([[-1.42,  1.01,  1.86,  0.31,  1.37,  0.14,  1.39, -0.94],\n",
 								       "       [ 0.6 , -0.7 ,  0.91, -0.31, -0.44, -0.69, -0.37,  1.17]])"
 								      ]
 								     },
 								     "execution_count": 103,
 								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
-												Replace FeatureUnion + DataFrameSelector with new ColumnTransformer

											
										
										
											2018-07-31 21:08:33 +02:00
+								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "housing_num_prepared = num_pipeline.fit_transform(housing_num)\n",
-												Replace np.round(a) with a.round(), and other similar changes

											
										
										
											2021-11-01 02:42:42 +01:00
+								    "housing_num_prepared[:2].round(2)"
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 104,
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "name": "stdout",
 								     "output_type": "stream",
 								     "text": [
 								      "Monkey-patching SimpleImputer.get_feature_names_out()\n",
 								      "Monkey-patching FunctionTransformer.get_feature_names_out()\n"
 								     ]
 								    }
 								   ],
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								   "source": [
 								    "def monkey_patch_get_signature_names_out():\n",
 								    "    \"\"\"Monkey patch some classes which did not handle get_feature_names_out()\n",
 								    "       correctly in 1.0.0.\"\"\"\n",
 								    "    from inspect import Signature, signature, Parameter\n",
 								    "    import pandas as pd\n",
 								    "    from sklearn.impute import SimpleImputer\n",
 								    "    from sklearn.pipeline import make_pipeline, Pipeline\n",
 								    "    from sklearn.preprocessing import FunctionTransformer, StandardScaler\n",
-												Update all notebooks assuming we are all in the future now: sklearn 0.20+, python 3.5+, TF 2.0 preview

											
										
										
											2019-01-18 16:08:37 +01:00
+								    "\n",
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "    default_get_feature_names_out = StandardScaler.get_feature_names_out\n",
 								    "\n",
 								    "    if not hasattr(SimpleImputer, \"get_feature_names_out\"):\n",
 								    "      print(\"Monkey-patching SimpleImputer.get_feature_names_out()\")\n",
 								    "      SimpleImputer.get_feature_names_out = default_get_feature_names_out\n",
 								    "\n",
 								    "    if not hasattr(FunctionTransformer, \"get_feature_names_out\"):\n",
 								    "        print(\"Monkey-patching FunctionTransformer.get_feature_names_out()\")\n",
 								    "        orig_init = FunctionTransformer.__init__\n",
 								    "        orig_sig = signature(orig_init)\n",
-												Replace FeatureUnion + DataFrameSelector with new ColumnTransformer

											
										
										
											2018-07-31 21:08:33 +02:00
+								    "\n",
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "        def __init__(*args, feature_names_out=None, **kwargs):\n",
 								    "            orig_sig.bind(*args, **kwargs)\n",
 								    "            orig_init(*args, **kwargs)\n",
 								    "            args[0].feature_names_out = feature_names_out\n",
-												Replace FeatureUnion + DataFrameSelector with new ColumnTransformer

											
										
										
											2018-07-31 21:08:33 +02:00
+								    "\n",
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "        __init__.__signature__ = Signature(\n",
 								    "            list(signature(orig_init).parameters.values()) + [\n",
 								    "                Parameter(\"feature_names_out\", Parameter.KEYWORD_ONLY)])\n",
 								    "\n",
 								    "        def get_feature_names_out(self, names=None):\n",
 								    "            if self.feature_names_out is None:\n",
 								    "                return default_get_feature_names_out(self, names)\n",
 								    "            elif callable(self.feature_names_out):\n",
 								    "                return self.feature_names_out(names)\n",
 								    "            else:\n",
 								    "                return self.feature_names_out\n",
 								    "\n",
 								    "        FunctionTransformer.__init__ = __init__\n",
 								    "        FunctionTransformer.get_feature_names_out = get_feature_names_out\n",
 								    "\n",
 								    "monkey_patch_get_signature_names_out()"
-												Replace FeatureUnion + DataFrameSelector with new ColumnTransformer

											
										
										
											2018-07-31 21:08:33 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 105,
-												Replace FeatureUnion + DataFrameSelector with new ColumnTransformer

											
										
										
											2018-07-31 21:08:33 +02:00
+								   "metadata": {},
 								   "outputs": [],
 								   "source": [
-												Sync notebook code examples with book

											
										
										
											2021-11-03 03:53:04 +01:00
+								    "df_housing_num_prepared = pd.DataFrame(\n",
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "    housing_num_prepared, columns=num_pipeline.get_feature_names_out(),\n",
-												Clarify the 'not in the book' comments

											
										
										
											2021-11-21 04:40:36 +01:00
+								    "    index=housing_num.index)"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 106,
-												Clarify the 'not in the book' comments

											
										
										
											2021-11-21 04:40:36 +01:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "text/html": [
 								       "<div>\n",
 								       "<style scoped>\n",
 								       "    .dataframe tbody tr th:only-of-type {\n",
 								       "        vertical-align: middle;\n",
 								       "    }\n",
 								       "\n",
 								       "    .dataframe tbody tr th {\n",
 								       "        vertical-align: top;\n",
 								       "    }\n",
 								       "\n",
 								       "    .dataframe thead th {\n",
 								       "        text-align: right;\n",
 								       "    }\n",
 								       "</style>\n",
 								       "<table border=\"1\" class=\"dataframe\">\n",
 								       "  <thead>\n",
 								       "    <tr style=\"text-align: right;\">\n",
 								       "      <th></th>\n",
 								       "      <th>longitude</th>\n",
 								       "      <th>latitude</th>\n",
 								       "      <th>housing_median_age</th>\n",
 								       "      <th>total_rooms</th>\n",
 								       "      <th>total_bedrooms</th>\n",
 								       "      <th>population</th>\n",
 								       "      <th>households</th>\n",
 								       "      <th>median_income</th>\n",
 								       "    </tr>\n",
 								       "  </thead>\n",
 								       "  <tbody>\n",
 								       "    <tr>\n",
 								       "      <th>13096</th>\n",
 								       "      <td>-1.423037</td>\n",
 								       "      <td>1.013606</td>\n",
 								       "      <td>1.861119</td>\n",
 								       "      <td>0.311912</td>\n",
 								       "      <td>1.368167</td>\n",
 								       "      <td>0.137460</td>\n",
 								       "      <td>1.394812</td>\n",
 								       "      <td>-0.936491</td>\n",
 								       "    </tr>\n",
 								       "    <tr>\n",
 								       "      <th>14973</th>\n",
 								       "      <td>0.596394</td>\n",
 								       "      <td>-0.702103</td>\n",
 								       "      <td>0.907630</td>\n",
 								       "      <td>-0.308620</td>\n",
 								       "      <td>-0.435925</td>\n",
 								       "      <td>-0.693771</td>\n",
 								       "      <td>-0.373485</td>\n",
 								       "      <td>1.171942</td>\n",
 								       "    </tr>\n",
 								       "  </tbody>\n",
 								       "</table>\n",
 								       "</div>"
 								      ],
 								      "text/plain": [
 								       "       longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \\\n",
 								       "13096  -1.423037  1.013606            1.861119     0.311912        1.368167   \n",
 								       "14973   0.596394 -0.702103            0.907630    -0.308620       -0.435925   \n",
 								       "\n",
 								       "       population  households  median_income  \n",
 								       "13096    0.137460    1.394812      -0.936491  \n",
 								       "14973   -0.693771   -0.373485       1.171942  "
 								      ]
 								     },
 								     "execution_count": 106,
 								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
-												Clarify the 'not in the book' comments

											
										
										
											2021-11-21 04:40:36 +01:00
+								   "source": [
-												Sync notebook with book's code examples, and better identify extra code

											
										
										
											2022-02-19 06:17:36 +01:00
+								    "df_housing_num_prepared.head(2)  # extra code"
-												Replace FeatureUnion + DataFrameSelector with new ColumnTransformer

											
										
										
											2018-07-31 21:08:33 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 107,
-												Replace FeatureUnion + DataFrameSelector with new ColumnTransformer

											
										
										
											2018-07-31 21:08:33 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "text/plain": [
 								       "[('simpleimputer', SimpleImputer(strategy='median')),\n",
 								       " ('standardscaler', StandardScaler())]"
 								      ]
 								     },
 								     "execution_count": 107,
 								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
-												Replace FeatureUnion + DataFrameSelector with new ColumnTransformer

											
										
										
											2018-07-31 21:08:33 +02:00
+								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "num_pipeline.steps"
-												Replace FeatureUnion + DataFrameSelector with new ColumnTransformer

											
										
										
											2018-07-31 21:08:33 +02:00
+								   ]
 								  },
 								  {
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 108,
-												Replace FeatureUnion + DataFrameSelector with new ColumnTransformer

											
										
										
											2018-07-31 21:08:33 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "text/plain": [
 								       "StandardScaler()"
 								      ]
 								     },
 								     "execution_count": 108,
 								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
-												Replace FeatureUnion + DataFrameSelector with new ColumnTransformer

											
										
										
											2018-07-31 21:08:33 +02:00
+								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "num_pipeline[1]"
-												Replace FeatureUnion + DataFrameSelector with new ColumnTransformer

											
										
										
											2018-07-31 21:08:33 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 109,
-												Fix hyperparameter search and comment at the end of the solution of exercise 5, chapter 2

											
										
										
											2018-01-14 09:11:47 +01:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "text/plain": [
 								       "Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='median'))])"
 								      ]
 								     },
 								     "execution_count": 109,
 								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "num_pipeline[:-1]"
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   ]
 								  },
-												Provide workaround and explanations about the breakage of LabelBinarizer by Scikit-Learn 0.19.0

											
										
										
											2017-09-15 14:40:13 +02:00
+								  {
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 110,
-												Provide workaround and explanations about the breakage of LabelBinarizer by Scikit-Learn 0.19.0

											
										
										
											2017-09-15 14:40:13 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "text/plain": [
 								       "SimpleImputer(strategy='median')"
 								      ]
 								     },
 								     "execution_count": 110,
 								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
-												Provide workaround and explanations about the breakage of LabelBinarizer by Scikit-Learn 0.19.0

											
										
										
											2017-09-15 14:40:13 +02:00
+								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "num_pipeline.named_steps[\"simpleimputer\"]"
-												Provide workaround and explanations about the breakage of LabelBinarizer by Scikit-Learn 0.19.0

											
										
										
											2017-09-15 14:40:13 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 111,
-												Do not use LabelEncoder and LabelBinarizer, use factorize() and CategoricalEncoder instead.

											
										
										
											2017-09-19 13:01:23 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "text/plain": [
 								       "Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='median')),\n",
 								       "                ('standardscaler', StandardScaler())])"
 								      ]
 								     },
 								     "execution_count": 111,
 								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "num_pipeline.set_params(simpleimputer__strategy=\"median\")"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 112,
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								   "metadata": {},
 								   "outputs": [],
 								   "source": [
 								    "from sklearn.compose import ColumnTransformer\n",
 								    "\n",
 								    "num_attribs = [\"longitude\", \"latitude\", \"housing_median_age\", \"total_rooms\",\n",
 								    "               \"total_bedrooms\", \"population\", \"households\", \"median_income\"]\n",
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								    "cat_attribs = [\"ocean_proximity\"]\n",
 								    "\n",
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "cat_pipeline = make_pipeline(\n",
 								    "    SimpleImputer(strategy=\"most_frequent\"),\n",
 								    "    OneHotEncoder(handle_unknown=\"ignore\"))\n",
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								    "\n",
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "preprocessing = ColumnTransformer([\n",
 								    "    (\"num\", num_pipeline, num_attribs),\n",
 								    "    (\"cat\", cat_pipeline, cat_attribs),\n",
 								    "])"
-												Make notebook code match book examples more closely in chapter 2

											
										
										
											2017-06-01 09:53:20 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 113,
-												Fix hyperparameter search and comment at the end of the solution of exercise 5, chapter 2

											
										
										
											2018-01-14 09:11:47 +01:00
+								   "metadata": {},
-												Make notebook code match book examples more closely in chapter 2

											
										
										
											2017-06-01 09:53:20 +02:00
+								   "outputs": [],
 								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "from sklearn.compose import make_column_selector, make_column_transformer\n",
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								    "\n",
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "preprocessing = make_column_transformer(\n",
 								    "    (num_pipeline, make_column_selector(dtype_include=np.number)),\n",
-												Fix a few typos and deprecated np.object reference

											
										
										
											2022-05-09 10:31:09 +02:00
+								    "    (cat_pipeline, make_column_selector(dtype_include=object)),\n",
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    ")"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 114,
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "outputs": [],
 								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "housing_prepared = preprocessing.fit_transform(housing)"
-												Replace FeatureUnion + DataFrameSelector with new ColumnTransformer

											
										
										
											2018-07-31 21:08:33 +02:00
+								   ]
 								  },
 								  {
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 115,
-												Replace FeatureUnion + DataFrameSelector with new ColumnTransformer

											
										
										
											2018-07-31 21:08:33 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "text/html": [
 								       "<div>\n",
 								       "<style scoped>\n",
 								       "    .dataframe tbody tr th:only-of-type {\n",
 								       "        vertical-align: middle;\n",
 								       "    }\n",
 								       "\n",
 								       "    .dataframe tbody tr th {\n",
 								       "        vertical-align: top;\n",
 								       "    }\n",
 								       "\n",
 								       "    .dataframe thead th {\n",
 								       "        text-align: right;\n",
 								       "    }\n",
 								       "</style>\n",
 								       "<table border=\"1\" class=\"dataframe\">\n",
 								       "  <thead>\n",
 								       "    <tr style=\"text-align: right;\">\n",
 								       "      <th></th>\n",
 								       "      <th>pipeline-1__longitude</th>\n",
 								       "      <th>pipeline-1__latitude</th>\n",
 								       "      <th>pipeline-1__housing_median_age</th>\n",
 								       "      <th>pipeline-1__total_rooms</th>\n",
 								       "      <th>pipeline-1__total_bedrooms</th>\n",
 								       "      <th>pipeline-1__population</th>\n",
 								       "      <th>pipeline-1__households</th>\n",
 								       "      <th>pipeline-1__median_income</th>\n",
 								       "      <th>pipeline-2__ocean_proximity_&lt;1H OCEAN</th>\n",
 								       "      <th>pipeline-2__ocean_proximity_INLAND</th>\n",
 								       "      <th>pipeline-2__ocean_proximity_ISLAND</th>\n",
 								       "      <th>pipeline-2__ocean_proximity_NEAR BAY</th>\n",
 								       "      <th>pipeline-2__ocean_proximity_NEAR OCEAN</th>\n",
 								       "    </tr>\n",
 								       "  </thead>\n",
 								       "  <tbody>\n",
 								       "    <tr>\n",
 								       "      <th>13096</th>\n",
 								       "      <td>-1.423037</td>\n",
 								       "      <td>1.013606</td>\n",
 								       "      <td>1.861119</td>\n",
 								       "      <td>0.311912</td>\n",
 								       "      <td>1.368167</td>\n",
 								       "      <td>0.137460</td>\n",
 								       "      <td>1.394812</td>\n",
 								       "      <td>-0.936491</td>\n",
 								       "      <td>0.0</td>\n",
 								       "      <td>0.0</td>\n",
 								       "      <td>0.0</td>\n",
 								       "      <td>1.0</td>\n",
 								       "      <td>0.0</td>\n",
 								       "    </tr>\n",
 								       "    <tr>\n",
 								       "      <th>14973</th>\n",
 								       "      <td>0.596394</td>\n",
 								       "      <td>-0.702103</td>\n",
 								       "      <td>0.907630</td>\n",
 								       "      <td>-0.308620</td>\n",
 								       "      <td>-0.435925</td>\n",
 								       "      <td>-0.693771</td>\n",
 								       "      <td>-0.373485</td>\n",
 								       "      <td>1.171942</td>\n",
 								       "      <td>1.0</td>\n",
 								       "      <td>0.0</td>\n",
 								       "      <td>0.0</td>\n",
 								       "      <td>0.0</td>\n",
 								       "      <td>0.0</td>\n",
 								       "    </tr>\n",
 								       "  </tbody>\n",
 								       "</table>\n",
 								       "</div>"
 								      ],
 								      "text/plain": [
 								       "       pipeline-1__longitude  pipeline-1__latitude  \\\n",
 								       "13096              -1.423037              1.013606   \n",
 								       "14973               0.596394             -0.702103   \n",
 								       "\n",
 								       "       pipeline-1__housing_median_age  pipeline-1__total_rooms  \\\n",
 								       "13096                        1.861119                 0.311912   \n",
 								       "14973                        0.907630                -0.308620   \n",
 								       "\n",
 								       "       pipeline-1__total_bedrooms  pipeline-1__population  \\\n",
 								       "13096                    1.368167                0.137460   \n",
 								       "14973                   -0.435925               -0.693771   \n",
 								       "\n",
 								       "       pipeline-1__households  pipeline-1__median_income  \\\n",
 								       "13096                1.394812                  -0.936491   \n",
 								       "14973               -0.373485                   1.171942   \n",
 								       "\n",
 								       "       pipeline-2__ocean_proximity_<1H OCEAN  \\\n",
 								       "13096                                    0.0   \n",
 								       "14973                                    1.0   \n",
 								       "\n",
 								       "       pipeline-2__ocean_proximity_INLAND  pipeline-2__ocean_proximity_ISLAND  \\\n",
 								       "13096                                 0.0                                 0.0   \n",
 								       "14973                                 0.0                                 0.0   \n",
 								       "\n",
 								       "       pipeline-2__ocean_proximity_NEAR BAY  \\\n",
 								       "13096                                   1.0   \n",
 								       "14973                                   0.0   \n",
 								       "\n",
 								       "       pipeline-2__ocean_proximity_NEAR OCEAN  \n",
 								       "13096                                     0.0  \n",
 								       "14973                                     0.0  "
 								      ]
 								     },
 								     "execution_count": 115,
 								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
-												Replace FeatureUnion + DataFrameSelector with new ColumnTransformer

											
										
										
											2018-07-31 21:08:33 +02:00
+								   "source": [
-												Sync notebook with book's code examples, and better identify extra code

											
										
										
											2022-02-19 06:17:36 +01:00
+								    "# extra code – shows that we can get a DataFrame out if we want\n",
-												Clarify the 'not in the book' comments

											
										
										
											2021-11-21 04:40:36 +01:00
+								    "housing_prepared_fr = pd.DataFrame(\n",
 								    "    housing_prepared,\n",
 								    "    columns=preprocessing.get_feature_names_out(),\n",
 								    "    index=housing.index)\n",
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "housing_prepared_fr.head(2)"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 116,
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "outputs": [],
 								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "def column_ratio(X):\n",
 								    "    return X[:, [0]] / X[:, [1]]\n",
 								    "\n",
 								    "def ratio_pipeline(name=None):\n",
 								    "    return make_pipeline(\n",
 								    "        SimpleImputer(strategy=\"median\"),\n",
 								    "        FunctionTransformer(column_ratio,\n",
 								    "                            feature_names_out=[name]),\n",
 								    "        StandardScaler())\n",
 								    "\n",
 								    "log_pipeline = make_pipeline(SimpleImputer(strategy=\"median\"),\n",
 								    "                             FunctionTransformer(np.log),\n",
 								    "                             StandardScaler())\n",
 								    "cluster_simil = ClusterSimilarity(n_clusters=10, gamma=1., random_state=42)\n",
 								    "default_num_pipeline = make_pipeline(SimpleImputer(strategy=\"median\"),\n",
 								    "                                     StandardScaler())\n",
 								    "preprocessing = ColumnTransformer([\n",
 								    "        (\"bedrooms_ratio\", ratio_pipeline(\"bedrooms_ratio\"),\n",
 								    "                           [\"total_bedrooms\", \"total_rooms\"]),\n",
 								    "        (\"rooms_per_house\", ratio_pipeline(\"rooms_per_house\"),\n",
 								    "                            [\"total_rooms\", \"households\"]),\n",
-												Fix people_per_house feature name, and fix solution to last exercise

											
										
										
											2021-11-15 08:56:11 +01:00
+								    "        (\"people_per_house\", ratio_pipeline(\"people_per_house\"),\n",
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "                             [\"population\", \"households\"]),\n",
 								    "        (\"log\", log_pipeline, [\"total_bedrooms\", \"total_rooms\",\n",
 								    "                               \"population\", \"households\", \"median_income\"]),\n",
 								    "        (\"geo\", cluster_simil, [\"latitude\", \"longitude\"]),\n",
-												Fix a few typos and deprecated np.object reference

											
										
										
											2022-05-09 10:31:09 +02:00
+								    "        (\"cat\", cat_pipeline, make_column_selector(dtype_include=object)),\n",
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "    ],\n",
 								    "    remainder=default_num_pipeline)  # one column remaining: housing_median_age"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 117,
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "text/plain": [
 								       "(16512, 24)"
 								      ]
 								     },
 								     "execution_count": 117,
 								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								   "source": [
 								    "housing_prepared = preprocessing.fit_transform(housing)\n",
 								    "housing_prepared.shape"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 118,
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "text/plain": [
 								       "array(['bedrooms_ratio__bedrooms_ratio',\n",
 								       "       'rooms_per_house__rooms_per_house',\n",
 								       "       'people_per_house__people_per_house', 'log__total_bedrooms',\n",
 								       "       'log__total_rooms', 'log__population', 'log__households',\n",
 								       "       'log__median_income', 'geo__Cluster 0 similarity',\n",
 								       "       'geo__Cluster 1 similarity', 'geo__Cluster 2 similarity',\n",
 								       "       'geo__Cluster 3 similarity', 'geo__Cluster 4 similarity',\n",
 								       "       'geo__Cluster 5 similarity', 'geo__Cluster 6 similarity',\n",
 								       "       'geo__Cluster 7 similarity', 'geo__Cluster 8 similarity',\n",
 								       "       'geo__Cluster 9 similarity', 'cat__ocean_proximity_<1H OCEAN',\n",
 								       "       'cat__ocean_proximity_INLAND', 'cat__ocean_proximity_ISLAND',\n",
 								       "       'cat__ocean_proximity_NEAR BAY', 'cat__ocean_proximity_NEAR OCEAN',\n",
 								       "       'remainder__housing_median_age'], dtype=object)"
 								      ]
 								     },
 								     "execution_count": 118,
 								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								   "source": [
 								    "preprocessing.get_feature_names_out()"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "source": [
-												Add some section headers

											
										
										
											2021-10-02 13:14:44 +02:00
+								    "# Select and Train a Model"
 								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
 								   "metadata": {},
 								   "source": [
 								    "## Training and Evaluating on the Training Set"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 119,
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "text/plain": [
 								       "Pipeline(steps=[('columntransformer',\n",
 								       "                 ColumnTransformer(remainder=Pipeline(steps=[('simpleimputer',\n",
 								       "                                                              SimpleImputer(strategy='median')),\n",
 								       "                                                             ('standardscaler',\n",
 								       "                                                              StandardScaler())]),\n",
 								       "                                   transformers=[('bedrooms_ratio',\n",
 								       "                                                  Pipeline(steps=[('simpleimputer',\n",
 								       "                                                                   SimpleImputer(strategy='median')),\n",
 								       "                                                                  ('functiontransformer',\n",
 								       "                                                                   FunctionTransformer(feature_names_out=['bedrooms_ratio'],\n",
 								       "                                                                                       f...\n",
 								       "                                                   'median_income']),\n",
 								       "                                                 ('geo',\n",
 								       "                                                  ClusterSimilarity(random_state=42),\n",
 								       "                                                  ['latitude', 'longitude']),\n",
 								       "                                                 ('cat',\n",
 								       "                                                  Pipeline(steps=[('simpleimputer',\n",
 								       "                                                                   SimpleImputer(strategy='most_frequent')),\n",
 								       "                                                                  ('onehotencoder',\n",
 								       "                                                                   OneHotEncoder(handle_unknown='ignore'))]),\n",
 								       "                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7f9b50613dc0>)])),\n",
 								       "                ('linearregression', LinearRegression())])"
 								      ]
 								     },
 								     "execution_count": 119,
 								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "source": [
 								    "from sklearn.linear_model import LinearRegression\n",
 								    "\n",
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "lin_reg = make_pipeline(preprocessing, LinearRegression())\n",
 								    "lin_reg.fit(housing, housing_labels)"
 								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
 								   "metadata": {},
 								   "source": [
 								    "Let's run the full pipeline (preprocessing + linear regression) on the training set and look at the first five predictions:"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 120,
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "text/plain": [
 								       "array([243700., 372400., 128800.,  94400., 328300.])"
 								      ]
 								     },
 								     "execution_count": 120,
 								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "housing_predictions = lin_reg.predict(housing)\n",
-												Sync notebook code examples with book

											
										
										
											2021-11-03 03:53:04 +01:00
+								    "housing_predictions[:5].round(-2)  # -2 = rounded to the nearest hundred"
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   ]
 								  },
-												Merge PR #36 by lsshawn: adds helpful notes and fixes the null rows sampling code

											
										
										
											2017-06-08 17:11:08 +02:00
+								  {
 								   "cell_type": "markdown",
 								   "metadata": {},
 								   "source": [
 								    "Compare against the actual values:"
 								   ]
 								  },
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 121,
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "text/plain": [
 								       "array([458300., 483800., 101700.,  96100., 361800.])"
 								      ]
 								     },
 								     "execution_count": 121,
 								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "housing_labels.iloc[:5].values"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 122,
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "name": "stdout",
 								     "output_type": "stream",
 								     "text": [
 								      "-46.8%, -23.0%, 26.6%, -1.8%, -9.3%\n"
 								     ]
 								    }
 								   ],
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   "source": [
-												Sync notebook with book's code examples, and better identify extra code

											
										
										
											2022-02-19 06:17:36 +01:00
+								    "# extra code – computes the error ratios discussed in the book\n",
-												Replace np.round(a) with a.round(), and other similar changes

											
										
										
											2021-11-01 02:42:42 +01:00
+								    "error_ratios = housing_predictions[:5].round(-2) / housing_labels.iloc[:5].values - 1\n",
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "print(\", \".join([f\"{100 * ratio:.1f}%\" for ratio in error_ratios]))"
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 123,
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "text/plain": [
 								       "68687.89176589991"
 								      ]
 								     },
 								     "execution_count": 123,
 								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "source": [
 								    "from sklearn.metrics import mean_squared_error\n",
 								    "\n",
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "# RMSE = sqrt(MSE); avoids the `squared` argument (deprecated in Scikit-Learn 1.4, removed in 1.6)\n",
 								    "lin_rmse = np.sqrt(mean_squared_error(housing_labels,\n",
 								    "                                      housing_predictions))\n",
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								    "lin_rmse"
 								   ]
 								  },
-												Add not about squared=False, fixes #361

											
										
										
											2021-03-01 10:18:40 +01:00
+								  {
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 124,
-												Add not about squared=False, fixes #361

											
										
										
											2021-03-01 10:18:40 +01:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "text/plain": [
 								       "Pipeline(steps=[('columntransformer',\n",
 								       "                 ColumnTransformer(remainder=Pipeline(steps=[('simpleimputer',\n",
 								       "                                                              SimpleImputer(strategy='median')),\n",
 								       "                                                             ('standardscaler',\n",
 								       "                                                              StandardScaler())]),\n",
 								       "                                   transformers=[('bedrooms_ratio',\n",
 								       "                                                  Pipeline(steps=[('simpleimputer',\n",
 								       "                                                                   SimpleImputer(strategy='median')),\n",
 								       "                                                                  ('functiontransformer',\n",
 								       "                                                                   FunctionTransformer(feature_names_out=['bedrooms_ratio'],\n",
 								       "                                                                                       f...\n",
 								       "                                                  ClusterSimilarity(random_state=42),\n",
 								       "                                                  ['latitude', 'longitude']),\n",
 								       "                                                 ('cat',\n",
 								       "                                                  Pipeline(steps=[('simpleimputer',\n",
 								       "                                                                   SimpleImputer(strategy='most_frequent')),\n",
 								       "                                                                  ('onehotencoder',\n",
 								       "                                                                   OneHotEncoder(handle_unknown='ignore'))]),\n",
 								       "                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7f9b50613dc0>)])),\n",
 								       "                ('decisiontreeregressor',\n",
 								       "                 DecisionTreeRegressor(random_state=42))])"
 								      ]
 								     },
 								     "execution_count": 124,
 								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
-												Add not about squared=False, fixes #361

											
										
										
											2021-03-01 10:18:40 +01:00
+								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "from sklearn.tree import DecisionTreeRegressor\n",
 								    "\n",
 								    "tree_reg = make_pipeline(preprocessing, DecisionTreeRegressor(random_state=42))\n",
 								    "tree_reg.fit(housing, housing_labels)"
-												Add not about squared=False, fixes #361

											
										
										
											2021-03-01 10:18:40 +01:00
+								   ]
 								  },
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 125,
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "text/plain": [
 								       "0.0"
 								      ]
 								     },
 								     "execution_count": 125,
 								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "housing_predictions = tree_reg.predict(housing)\n",
 								    "# RMSE = sqrt(MSE); avoids the `squared` argument (deprecated in Scikit-Learn 1.4, removed in 1.6)\n",
 								    "tree_rmse = np.sqrt(mean_squared_error(housing_labels,\n",
 								    "                                       housing_predictions))\n",
 								    "tree_rmse"
 								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
 								   "metadata": {},
 								   "source": [
 								    "## Better Evaluation Using Cross-Validation"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 126,
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "outputs": [],
 								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "from sklearn.model_selection import cross_val_score\n",
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								    "\n",
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "tree_rmses = -cross_val_score(tree_reg, housing, housing_labels,\n",
 								    "                              scoring=\"neg_root_mean_squared_error\", cv=10)"
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 127,
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "text/plain": [
 								       "count       10.000000\n",
 								       "mean     66868.027288\n",
 								       "std       2060.966425\n",
 								       "min      63649.536493\n",
 								       "25%      65338.078316\n",
 								       "50%      66801.953094\n",
 								       "75%      68229.934454\n",
 								       "max      70094.778246\n",
 								       "dtype: float64"
 								      ]
 								     },
 								     "execution_count": 127,
 								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "pd.Series(tree_rmses).describe()"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
 								  {
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 128,
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "text/plain": [
 								       "count       10.000000\n",
 								       "mean     69858.018195\n",
 								       "std       4182.205077\n",
 								       "min      65397.780144\n",
 								       "25%      68070.536263\n",
 								       "50%      68619.737842\n",
 								       "75%      69810.076342\n",
 								       "max      80959.348171\n",
 								       "dtype: float64"
 								      ]
 								     },
 								     "execution_count": 128,
 								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "source": [
-												Sync notebook with book's code examples, and better identify extra code

											
										
										
											2022-02-19 06:17:36 +01:00
+								    "# extra code – computes the error stats for the linear model\n",
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "lin_rmses = -cross_val_score(lin_reg, housing, housing_labels,\n",
 								    "                              scoring=\"neg_root_mean_squared_error\", cv=10)\n",
 								    "pd.Series(lin_rmses).describe()"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
-												Clarify a few messages

											
										
										
											2021-10-28 05:02:31 +02:00
+								  {
 								   "cell_type": "markdown",
 								   "metadata": {},
 								   "source": [
 								    "**Warning:** the following cell may take a few minutes to run:"
 								   ]
 								  },
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 129,
-												Fix hyperparameter search and comment at the end of the solution of exercise 5, chapter 2

											
										
										
											2018-01-14 09:11:47 +01:00
+								   "metadata": {},
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "outputs": [],
 								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "from sklearn.ensemble import RandomForestRegressor\n",
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								    "\n",
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "forest_reg = make_pipeline(preprocessing,\n",
 								    "                           RandomForestRegressor(random_state=42))\n",
 								    "forest_rmses = -cross_val_score(forest_reg, housing, housing_labels,\n",
 								    "                                scoring=\"neg_root_mean_squared_error\", cv=10)"
 								   ]
 								  },
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 130,
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "text/plain": [
 								       "count       10.000000\n",
 								       "mean     47019.561281\n",
 								       "std       1033.957120\n",
 								       "min      45458.112527\n",
 								       "25%      46464.031184\n",
 								       "50%      46967.596354\n",
 								       "75%      47325.694987\n",
 								       "max      49243.765795\n",
 								       "dtype: float64"
 								      ]
 								     },
 								     "execution_count": 130,
 								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "pd.Series(forest_rmses).describe()"
 								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
 								   "metadata": {},
 								   "source": [
-												Clarify a few messages

											
										
										
											2021-10-28 05:02:31 +02:00
+								    "Let's compare this RMSE measured using cross-validation (the \"validation error\") with the RMSE measured on the training set (the \"training error\"):"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 131,
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "text/plain": [
 								       "17474.619286483998"
 								      ]
 								     },
 								     "execution_count": 131,
 								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "forest_reg.fit(housing, housing_labels)\n",
 								    "housing_predictions = forest_reg.predict(housing)\n",
 								    "forest_rmse = mean_squared_error(housing_labels, housing_predictions,\n",
 								    "                                 squared=False)\n",
 								    "forest_rmse"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
-												Clarify a few messages

											
										
										
											2021-10-28 05:02:31 +02:00
+								  {
 								   "cell_type": "markdown",
 								   "metadata": {},
 								   "source": [
 								    "The training error is much lower than the validation error, which usually means that the model has overfit the training set. Another possible explanation may be that there's a mismatch between the training data and the validation data, but it's not the case here, since both came from the same dataset that we shuffled and split into two parts."
 								   ]
 								  },
-												Update notebooks 1 to 8 to latest library versions (in particular Scikit-Learn 0.20)

											
										
										
											2018-12-21 03:18:31 +01:00
+								  {
 								   "cell_type": "markdown",
 								   "metadata": {},
 								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "# Fine-Tune Your Model"
 								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
 								   "metadata": {},
 								   "source": [
 								    "## Grid Search"
 								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
 								   "metadata": {},
 								   "source": [
-												Fix a few typos and deprecated np.object reference

											
										
										
											2022-05-09 10:31:09 +02:00
+								    "**Warning:** the following cell may take a few minutes to run:"
-												Update notebooks 1 to 8 to latest library versions (in particular Scikit-Learn 0.20)

											
										
										
											2018-12-21 03:18:31 +01:00
+								   ]
 								  },
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 132,
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "text/plain": [
 								       "GridSearchCV(cv=3,\n",
 								       "             estimator=Pipeline(steps=[('preprocessing',\n",
 								       "                                        ColumnTransformer(remainder=Pipeline(steps=[('simpleimputer',\n",
 								       "                                                                                     SimpleImputer(strategy='median')),\n",
 								       "                                                                                    ('standardscaler',\n",
 								       "                                                                                     StandardScaler())]),\n",
 								       "                                                          transformers=[('bedrooms_ratio',\n",
 								       "                                                                         Pipeline(steps=[('simpleimputer',\n",
 								       "                                                                                          SimpleImputer(strategy='median')),\n",
 								       "                                                                                         ('functiontransformer',\n",
 								       "                                                                                          FunctionTransformer(feature_names_...\n",
 								       "                                                                         <sklearn.compose._column_transformer.make_column_selector object at 0x7f9b50613dc0>)])),\n",
 								       "                                       ('random_forest',\n",
 								       "                                        RandomForestRegressor(random_state=42))]),\n",
 								       "             param_grid=[{'preprocessing__geo__n_clusters': [5, 8, 10],\n",
 								       "                          'random_forest__max_features': [4, 6, 8]},\n",
 								       "                         {'preprocessing__geo__n_clusters': [10, 15],\n",
 								       "                          'random_forest__max_features': [6, 8, 10]}],\n",
 								       "             scoring='neg_root_mean_squared_error')"
 								      ]
 								     },
 								     "execution_count": 132,
 								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "from sklearn.model_selection import GridSearchCV\n",
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								    "\n",
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "full_pipeline = Pipeline([\n",
 								    "    (\"preprocessing\", preprocessing),\n",
 								    "    (\"random_forest\", RandomForestRegressor(random_state=42)),\n",
 								    "])\n",
 								    "param_grid = [\n",
 								    "    {'preprocessing__geo__n_clusters': [5, 8, 10],\n",
 								    "     'random_forest__max_features': [4, 6, 8]},\n",
 								    "    {'preprocessing__geo__n_clusters': [10, 15],\n",
 								    "     'random_forest__max_features': [6, 8, 10]},\n",
 								    "]\n",
 								    "grid_search = GridSearchCV(full_pipeline, param_grid, cv=3,\n",
 								    "                           scoring='neg_root_mean_squared_error')\n",
 								    "grid_search.fit(housing, housing_labels)"
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   ]
 								  },
 								  {
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								   "cell_type": "markdown",
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Finish exercise solution for chapter 9, and ensure sync between notebook and book for chapter 2

											
										
										
											2017-05-28 18:14:49 +02:00
+								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "You can get the full list of hyperparameters available for tuning by looking at `full_pipeline.get_params().keys()`:"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 133,
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "name": "stdout",
 								     "output_type": "stream",
 								     "text": [
 								      "dict_keys(['memory', 'steps', 'verbose', 'preprocessing', 'random_forest', 'preprocessing__n_jobs', 'preprocessing__remainder__memory', 'preprocessing__remainder__steps', 'preprocessing__remainder__verbose', 'preprocessing__remainder__simpleimputer', 'preprocessing__remainder__standardscaler', 'preprocessing__remainder__simpleimputer__add_indicator', 'preprocessing__remainder__simpleimputer__copy', 'preprocessing__remainder__simpleimputer__fill_value', 'preprocessing__remainder__simpleimputer__missing_values', 'preprocessing__remainder__simpleimputer__strategy', 'preprocessing__remainder__simpleimputer__verbose', 'preprocessing__remainder__standardscaler__copy', 'preprocessing__remainder__standardscaler__with_mean', 'preprocessing__remainder__standardscaler__with_std', 'preprocessing__remainder', 'preprocessing__sparse_threshold', 'preprocessing__transformer_weights', 'preprocessing__transformers', 'preprocessing__verbose', 'preprocessing__verbose_feature_names_out', 'preprocessing__be...\n"
 								     ]
 								    }
 								   ],
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "source": [
-												Sync notebook with book's code examples, and better identify extra code

											
										
										
											2022-02-19 06:17:36 +01:00
+								    "# extra code – shows part of the output of get_params().keys()\n",
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "print(str(full_pipeline.get_params().keys())[:1000] + \"...\")"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
 								  {
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								   "cell_type": "markdown",
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "The best hyperparameter combination found:"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 134,
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "text/plain": [
 								       "{'preprocessing__geo__n_clusters': 15, 'random_forest__max_features': 6}"
 								      ]
 								     },
 								     "execution_count": 134,
 								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "grid_search.best_params_"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
-												Add some section headers

											
										
										
											2021-10-02 13:14:44 +02:00
+								  {
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 135,
-												Add some section headers

											
										
										
											2021-10-02 13:14:44 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "text/plain": [
 								       "Pipeline(steps=[('preprocessing',\n",
 								       "                 ColumnTransformer(remainder=Pipeline(steps=[('simpleimputer',\n",
 								       "                                                              SimpleImputer(strategy='median')),\n",
 								       "                                                             ('standardscaler',\n",
 								       "                                                              StandardScaler())]),\n",
 								       "                                   transformers=[('bedrooms_ratio',\n",
 								       "                                                  Pipeline(steps=[('simpleimputer',\n",
 								       "                                                                   SimpleImputer(strategy='median')),\n",
 								       "                                                                  ('functiontransformer',\n",
 								       "                                                                   FunctionTransformer(feature_names_out=['bedrooms_ratio'],\n",
 								       "                                                                                       func=...\n",
 								       "                                                  ClusterSimilarity(n_clusters=15,\n",
 								       "                                                                    random_state=42),\n",
 								       "                                                  ['latitude', 'longitude']),\n",
 								       "                                                 ('cat',\n",
 								       "                                                  Pipeline(steps=[('simpleimputer',\n",
 								       "                                                                   SimpleImputer(strategy='most_frequent')),\n",
 								       "                                                                  ('onehotencoder',\n",
 								       "                                                                   OneHotEncoder(handle_unknown='ignore'))]),\n",
 								       "                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7f9b410ec490>)])),\n",
 								       "                ('random_forest',\n",
 								       "                 RandomForestRegressor(max_features=6, random_state=42))])"
 								      ]
 								     },
 								     "execution_count": 135,
 								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
-												Add some section headers

											
										
										
											2021-10-02 13:14:44 +02:00
+								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "grid_search.best_estimator_"
-												Add some section headers

											
										
										
											2021-10-02 13:14:44 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
 								   "metadata": {},
 								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "Let's look at the score of each hyperparameter combination tested during the grid search:"
-												Add some section headers

											
										
										
											2021-10-02 13:14:44 +02:00
+								   ]
 								  },
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 136,
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "text/html": [
 								       "<div>\n",
 								       "<style scoped>\n",
 								       "    .dataframe tbody tr th:only-of-type {\n",
 								       "        vertical-align: middle;\n",
 								       "    }\n",
 								       "\n",
 								       "    .dataframe tbody tr th {\n",
 								       "        vertical-align: top;\n",
 								       "    }\n",
 								       "\n",
 								       "    .dataframe thead th {\n",
 								       "        text-align: right;\n",
 								       "    }\n",
 								       "</style>\n",
 								       "<table border=\"1\" class=\"dataframe\">\n",
 								       "  <thead>\n",
 								       "    <tr style=\"text-align: right;\">\n",
 								       "      <th></th>\n",
 								       "      <th>n_clusters</th>\n",
 								       "      <th>max_features</th>\n",
 								       "      <th>split0</th>\n",
 								       "      <th>split1</th>\n",
 								       "      <th>split2</th>\n",
 								       "      <th>mean_test_rmse</th>\n",
 								       "    </tr>\n",
 								       "  </thead>\n",
 								       "  <tbody>\n",
 								       "    <tr>\n",
 								       "      <th>12</th>\n",
 								       "      <td>15</td>\n",
 								       "      <td>6</td>\n",
 								       "      <td>43460</td>\n",
 								       "      <td>43919</td>\n",
 								       "      <td>44748</td>\n",
 								       "      <td>44042</td>\n",
 								       "    </tr>\n",
 								       "    <tr>\n",
 								       "      <th>13</th>\n",
 								       "      <td>15</td>\n",
 								       "      <td>8</td>\n",
 								       "      <td>44132</td>\n",
 								       "      <td>44075</td>\n",
 								       "      <td>45010</td>\n",
 								       "      <td>44406</td>\n",
 								       "    </tr>\n",
 								       "    <tr>\n",
 								       "      <th>14</th>\n",
 								       "      <td>15</td>\n",
 								       "      <td>10</td>\n",
 								       "      <td>44374</td>\n",
 								       "      <td>44286</td>\n",
 								       "      <td>45316</td>\n",
 								       "      <td>44659</td>\n",
 								       "    </tr>\n",
 								       "    <tr>\n",
 								       "      <th>7</th>\n",
 								       "      <td>10</td>\n",
 								       "      <td>6</td>\n",
 								       "      <td>44683</td>\n",
 								       "      <td>44655</td>\n",
 								       "      <td>45657</td>\n",
 								       "      <td>44999</td>\n",
 								       "    </tr>\n",
 								       "    <tr>\n",
 								       "      <th>9</th>\n",
 								       "      <td>10</td>\n",
 								       "      <td>6</td>\n",
 								       "      <td>44683</td>\n",
 								       "      <td>44655</td>\n",
 								       "      <td>45657</td>\n",
 								       "      <td>44999</td>\n",
 								       "    </tr>\n",
 								       "  </tbody>\n",
 								       "</table>\n",
 								       "</div>"
 								      ],
 								      "text/plain": [
 								       "   n_clusters max_features  split0  split1  split2  mean_test_rmse\n",
 								       "12         15            6   43460   43919   44748           44042\n",
 								       "13         15            8   44132   44075   45010           44406\n",
 								       "14         15           10   44374   44286   45316           44659\n",
 								       "7          10            6   44683   44655   45657           44999\n",
 								       "9          10            6   44683   44655   45657           44999"
 								      ]
 								     },
 								     "execution_count": 136,
 								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "cv_res = pd.DataFrame(grid_search.cv_results_)\n",
 								    "cv_res.sort_values(by=\"mean_test_score\", ascending=False, inplace=True)\n",
-												Sync notebook code examples with book

											
										
										
											2021-11-03 03:53:04 +01:00
+								    "\n",
-												Sync notebook with book's code examples, and better identify extra code

											
										
										
											2022-02-19 06:17:36 +01:00
+								    "# extra code – these few lines of code just make the DataFrame look nicer\n",
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "cv_res = cv_res[[\"param_preprocessing__geo__n_clusters\",\n",
 								    "                 \"param_random_forest__max_features\", \"split0_test_score\",\n",
 								    "                 \"split1_test_score\", \"split2_test_score\", \"mean_test_score\"]]\n",
 								    "score_cols = [\"split0\", \"split1\", \"split2\", \"mean_test_rmse\"]\n",
 								    "cv_res.columns = [\"n_clusters\", \"max_features\"] + score_cols\n",
-												Replace np.round(a) with a.round(), and other similar changes

											
										
										
											2021-11-01 02:42:42 +01:00
+								    "cv_res[score_cols] = -cv_res[score_cols].round().astype(np.int64)\n",
-												Sync notebook code examples with book

											
										
										
											2021-11-03 03:53:04 +01:00
+								    "\n",
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "cv_res.head()"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
-												Merge PR #36 by lsshawn: adds helpful notes and fixes the null rows sampling code

											
										
										
											2017-06-08 17:11:08 +02:00
+								  {
 								   "cell_type": "markdown",
 								   "metadata": {},
 								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "## Randomized Search"
-												Merge PR #36 by lsshawn: adds helpful notes and fixes the null rows sampling code

											
										
										
											2017-06-08 17:11:08 +02:00
+								   ]
 								  },
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								  {
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								   "cell_type": "markdown",
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "source": [
-												Fix a few typos and deprecated np.object reference

											
										
										
											2022-05-09 10:31:09 +02:00
+								    "**Warning:** the following cell may take a few minutes to run:"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 137,
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "outputs": [],
 								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "from sklearn.experimental import enable_halving_search_cv\n",
 								    "from sklearn.model_selection import HalvingRandomSearchCV"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
-												Sync notebook code examples with book

											
										
										
											2021-11-03 03:53:04 +01:00
+								  {
 								   "cell_type": "markdown",
 								   "metadata": {},
 								   "source": [
 								    "Try 30 (`n_iter` × `cv`) random combinations of hyperparameters:"
 								   ]
 								  },
-												Merge PR #36 by lsshawn: adds helpful notes and fixes the null rows sampling code

											
										
										
											2017-06-08 17:11:08 +02:00
+								  {
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 138,
-												Merge PR #36 by lsshawn: adds helpful notes and fixes the null rows sampling code

											
										
										
											2017-06-08 17:11:08 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "text/plain": [
 								       "RandomizedSearchCV(cv=3,\n",
 								       "                   estimator=Pipeline(steps=[('preprocessing',\n",
 								       "                                              ColumnTransformer(remainder=Pipeline(steps=[('simpleimputer',\n",
 								       "                                                                                           SimpleImputer(strategy='median')),\n",
 								       "                                                                                          ('standardscaler',\n",
 								       "                                                                                           StandardScaler())]),\n",
 								       "                                                                transformers=[('bedrooms_ratio',\n",
 								       "                                                                               Pipeline(steps=[('simpleimputer',\n",
 								       "                                                                                                SimpleImputer(strategy='median')),\n",
 								       "                                                                                               ('functiontransformer',\n",
 								       "                                                                                                FunctionTransformer(feature_...\n",
 								       "                                                                               <sklearn.compose._column_transformer.make_column_selector object at 0x7f9b50613dc0>)])),\n",
 								       "                                             ('random_forest',\n",
 								       "                                              RandomForestRegressor(random_state=42))]),\n",
 								       "                   param_distributions={'preprocessing__geo__n_clusters': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f9b103bb760>,\n",
 								       "                                        'random_forest__max_features': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f9b410decd0>},\n",
 								       "                   random_state=42, scoring='neg_root_mean_squared_error')"
 								      ]
 								     },
 								     "execution_count": 138,
 								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
-												Merge PR #36 by lsshawn: adds helpful notes and fixes the null rows sampling code

											
										
										
											2017-06-08 17:11:08 +02:00
+								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "from sklearn.model_selection import RandomizedSearchCV\n",
 								    "from scipy.stats import randint\n",
 								    "\n",
 								    "param_distribs = {'preprocessing__geo__n_clusters': randint(low=3, high=50),\n",
 								    "                  'random_forest__max_features': randint(low=2, high=20)}\n",
 								    "\n",
 								    "rnd_search = RandomizedSearchCV(\n",
 								    "    full_pipeline, param_distributions=param_distribs, n_iter=10, cv=3,\n",
 								    "    scoring='neg_root_mean_squared_error', random_state=42)\n",
 								    "\n",
 								    "rnd_search.fit(housing, housing_labels)"
-												Merge PR #36 by lsshawn: adds helpful notes and fixes the null rows sampling code

											
										
										
											2017-06-08 17:11:08 +02:00
+								   ]
 								  },
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 139,
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "text/html": [
 								       "<div>\n",
 								       "<style scoped>\n",
 								       "    .dataframe tbody tr th:only-of-type {\n",
 								       "        vertical-align: middle;\n",
 								       "    }\n",
 								       "\n",
 								       "    .dataframe tbody tr th {\n",
 								       "        vertical-align: top;\n",
 								       "    }\n",
 								       "\n",
 								       "    .dataframe thead th {\n",
 								       "        text-align: right;\n",
 								       "    }\n",
 								       "</style>\n",
 								       "<table border=\"1\" class=\"dataframe\">\n",
 								       "  <thead>\n",
 								       "    <tr style=\"text-align: right;\">\n",
 								       "      <th></th>\n",
 								       "      <th>n_clusters</th>\n",
 								       "      <th>max_features</th>\n",
 								       "      <th>split0</th>\n",
 								       "      <th>split1</th>\n",
 								       "      <th>split2</th>\n",
 								       "      <th>mean_test_rmse</th>\n",
 								       "    </tr>\n",
 								       "  </thead>\n",
 								       "  <tbody>\n",
 								       "    <tr>\n",
 								       "      <th>1</th>\n",
 								       "      <td>45</td>\n",
 								       "      <td>9</td>\n",
 								       "      <td>41287</td>\n",
 								       "      <td>42150</td>\n",
 								       "      <td>42627</td>\n",
 								       "      <td>42021</td>\n",
 								       "    </tr>\n",
 								       "    <tr>\n",
 								       "      <th>8</th>\n",
 								       "      <td>32</td>\n",
 								       "      <td>7</td>\n",
 								       "      <td>41690</td>\n",
 								       "      <td>42542</td>\n",
 								       "      <td>43224</td>\n",
 								       "      <td>42485</td>\n",
 								       "    </tr>\n",
 								       "    <tr>\n",
 								       "      <th>0</th>\n",
 								       "      <td>41</td>\n",
 								       "      <td>16</td>\n",
 								       "      <td>42223</td>\n",
 								       "      <td>42959</td>\n",
 								       "      <td>43321</td>\n",
 								       "      <td>42834</td>\n",
 								       "    </tr>\n",
 								       "    <tr>\n",
 								       "      <th>5</th>\n",
 								       "      <td>42</td>\n",
 								       "      <td>4</td>\n",
 								       "      <td>41818</td>\n",
 								       "      <td>43094</td>\n",
 								       "      <td>43817</td>\n",
 								       "      <td>42910</td>\n",
 								       "    </tr>\n",
 								       "    <tr>\n",
 								       "      <th>2</th>\n",
 								       "      <td>23</td>\n",
 								       "      <td>8</td>\n",
 								       "      <td>42264</td>\n",
 								       "      <td>42996</td>\n",
 								       "      <td>43830</td>\n",
 								       "      <td>43030</td>\n",
 								       "    </tr>\n",
 								       "  </tbody>\n",
 								       "</table>\n",
 								       "</div>"
 								      ],
 								      "text/plain": [
 								       "  n_clusters max_features  split0  split1  split2  mean_test_rmse\n",
 								       "1         45            9   41287   42150   42627           42021\n",
 								       "8         32            7   41690   42542   43224           42485\n",
 								       "0         41           16   42223   42959   43321           42834\n",
 								       "5         42            4   41818   43094   43817           42910\n",
 								       "2         23            8   42264   42996   43830           43030"
 								      ]
 								     },
 								     "execution_count": 139,
 								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "source": [
-												Sync notebook with book's code examples, and better identify extra code

											
										
										
											2022-02-19 06:17:36 +01:00
+								    "# extra code – displays the random search results\n",
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "cv_res = pd.DataFrame(rnd_search.cv_results_)\n",
 								    "cv_res.sort_values(by=\"mean_test_score\", ascending=False, inplace=True)\n",
 								    "cv_res = cv_res[[\"param_preprocessing__geo__n_clusters\",\n",
 								    "                 \"param_random_forest__max_features\", \"split0_test_score\",\n",
 								    "                 \"split1_test_score\", \"split2_test_score\", \"mean_test_score\"]]\n",
 								    "cv_res.columns = [\"n_clusters\", \"max_features\"] + score_cols\n",
-												Replace np.round(a) with a.round(), and other similar changes

											
										
										
											2021-11-01 02:42:42 +01:00
+								    "cv_res[score_cols] = -cv_res[score_cols].round().astype(np.int64)\n",
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "cv_res.head()"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
 								  {
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								   "cell_type": "markdown",
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "**Bonus section: how to choose the sampling distribution for a hyperparameter**\n",
 								    "\n",
 								    "* `scipy.stats.randint(a, b+1)`: for hyperparameters with _discrete_ values that range from a to b, and all values in that range seem equally likely.\n",
 								    "* `scipy.stats.uniform(a, b)`: this is very similar, but for _continuous_ hyperparameters.\n",
 								    "* `scipy.stats.geom(1 / scale)`: for discrete values, when you want to sample roughly in a given scale. E.g., with scale=1000 most samples will be in this ballpark, but ~10% of all samples will be <100 and ~10% will be >2300.\n",
 								    "* `scipy.stats.expon(scale)`: this is the continuous equivalent of `geom`. Just set `scale` to the most likely value.\n",
 								    "* `scipy.stats.reciprocal(a, b)`: when you have almost no idea what the optimal hyperparameter value's scale is. If you set a=0.01 and b=100, then you're just as likely to sample a value between 0.01 and 0.1 as a value between 10 and 100.\n"
-												Upgrade chapter 2 to sklearn 0.18 and ensure python 2 and python 3 both work

											
										
										
											2016-11-03 23:47:11 +01:00
+								   ]
 								  },
-												Add some section headers

											
										
										
											2021-10-02 13:14:44 +02:00
+								  {
 								   "cell_type": "markdown",
 								   "metadata": {},
 								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "Here are plots of the probability mass functions (for discrete variables), and probability density functions (for continuous variables) for `randint()`, `uniform()`, `geom()` and `expon()`:"
-												Add some section headers

											
										
										
											2021-10-02 13:14:44 +02:00
+								   ]
 								  },
-												Upgrade chapter 2 to sklearn 0.18 and ensure python 2 and python 3 both work

											
										
										
											2016-11-03 23:47:11 +01:00
+								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 140,
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								   "metadata": {
 								    "tags": []
 								   },
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAuQAAAGxCAYAAAAqD6O8AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/MnkTPAAAACXBIWXMAAAsTAAALEwEAmpwYAABnCElEQVR4nO3deXxU5dn/8c9FCCQE2STsIOCGaAUxAor7UlFQ1MdW0VpRK1LFpVYf6erSPv60Wh+1bqCiohRrUR+polSsKMqOArIIskrYwqLse67fH3MSJ/sMZOZMku/79ZpX5txnu84Qrrlyzn3uY+6OiIiIiIiEo1bYAYiIiIiI1GQqyEVEREREQqSCXEREREQkRCrIRURERERCpIJcRERERCREKshFREREREKU1ILczHqb2UIzW2xmQ0qZf7WZzQlek8ysS0XrmlkTM/vQzL4JfjZO1vGIiFRnytkiIsmRtILczNKAp4ELgM5AfzPrXGyxZcAZ7n488CdgWAzrDgE+cvcjgY+CaREROQjK2SIiyZPMM+TdgcXuvtTd9wCvA/2iF3D3Se7+XTA5BWgTw7r9gFeC968AlyTuEEREagzlbBGRJKmdxH21BlZGTecCPcpZ/gbg/RjWbe7uawDcfY2ZNSttY2Y2EBgIkJWVdWKnTp3iPgARkbDNnDlzg7tnJ2FXytkiIpUglrydzILcSmnzUhc0O4tIcj813nXL4u7DCC6n5uTk+IwZM+JZXUQkJZjZimTtqpQ25WwRkTjFkreT2WUlF2gbNd0GWF18ITM7HngB6OfuG2NYd52ZtQzWbQnkVXLcIiI1kXK2iEiSJLMgnw4caWYdzKwOcCUwJnoBM2sHvAVc4+6LYlx3DHBt8P5a4J0EHoOISE2hnC0ikiRJ67Li7vvMbDAwDkgDhrv7PDMbFMx/DvgjcCjwjJkB7HP3nLLWDTb9EPCGmd0AfAv8JFnHJCJSXSlni4gkj7nH1a2vWlB/RBGpqsxsprvnhB1HMilni0hVFkve1pM6RURERERClMxRVqQGys/PJzc3l+3bt4cdikiVkJWVRZs2bahVS+dLpPrasmULeXl57N27N+xQRA5Keno6zZo1o0GDBge1HRXkklAbNmzAzDj66KNVYIhUID8/n1WrVrFhwwaaNSt1eG6RKm/Lli2sW7eO1q1bk5mZSXD/gUiV4+7s3LmTVatWARxUUa4KSRLq+++/p3nz5irGRWJQq1YtmjdvzubNm8MORSRh8vLyaN26NfXq1VMxLlWamVGvXj1at25NXt7BjeCqKkkSav/+/aSnp4cdhkiVkZ6ezr59+8IOQyRh9u7dS2ZmZthhiFSazMzMg+5+pYJcEk5nQERip/8vUhPo91yqk8r4fVZBLiIiIiISIhXkIiIiIiIh0igrknTth7yX1P0tf6hP0vY1YMAANmzYwLvvvpu0fSbScccdx+WXX859990HQPv27Rk8eDB33XXXQW977969HHvssbzwwgucfvrpB729quDyyy/nlFNO4c477ww7FJGUo++GcC1fvpwOHTowffp0cnIiz7D5/PPP+eUvf8nXX3/NKaecwoQJE5IeV5jfFe+++y6///3v+eKLLxI+OIXOkItUoieeeILXXnst7DASZvr06dx8880xL798+XLMjNKesjhs2DBat25dJMF+9913XHPNNTRs2JCGDRtyzTXX8P3338cV48svv4yZlfqaPn16XNuKx7x587j88svp2LEjZlb4R0y0e++9lz//+c8aRUWkhqkK3w1t27ZlzZo1dO3atbDt9ttvp0uXLixZsoS33norlLjC/K7o27cvaWlpjBw5sjIPqVQqyEUqUcOGDWnUqFHS9pfsh2pkZ2dTr169StnW3/72N2644YYibVdddRVffPEF77//Ph988AFffPEF11xzTVzbveKKK1izZk2R189+9jM6dOhQeNYnFmeeeSYvv/xyzMvv2LGD9u3
b8+c//5kOHTqUusyPfvQjOnbsmPJfzCJSuZL93XAg0tLSaNGiBbVr/9B5YvHixZx99tm0bduWJk2aHNB29+zZc1Bxhf1dcd111/Hkk08e1DHEQgW5SDGffvopPXv2pH79+jRs2JAePXowd+7cwvlTpkzh7LPPJisri4YNG3LOOeewevVqIHJZsm/fvoXLnnnmmQwaNIjbb7+dxo0b07hxY+6++27y8/MBeOCBBzjuuONKxNCrVy9uu+22Im0FZ5tHjRrF2WefTWZmJkOHDmXjxo3079+fNm3akJmZybHHHstLL71UZN0zzzyTm2++md/+9rc0bdqUZs2acddddxXGAZGxgfv160dmZiaHHXYYw4cPLxFX+/btefTRRwunzYxhw4bxk5/8hKysrBLFZkFhetJJJ2FmnHnmmQDMmDGDRYsWFfmsFixYwAcffMCwYcM45ZRTOPnkkxk6dCjvvvsuCxcuLONfq6TMzExatGhR+GrQoAH/+te/+MUvfpHQkR1OOukkHn30Ua666qpy/2i5+OKLGTVqVMLiEJHESPXvhuJXIs2M0aNHF1nmzTff5LzzzqNevXp07tyZDz/8sNTtFLzfvHkz119/PWZWeILi008/pUePHmRkZNC8eXN+9atfFSm6zzzzTH75y19y1113kZ2dTa9evZgwYQJmxvvvv8+JJ55IZmYmp512Grm5uXzyySd06dKF+vXr07dvXzZu3Fi4rVT4rrj44ouZMWMGixcvjnnbB0IFuUiUffv20a9fP0499VRmz57N1KlTuf3220lLSwNg9uzZnHXWWRxxxBF8/vnnTJkyhZ/+9Kfljhs9cuRI8vPzmTx5MkOHDmXYsGE8/vjjAFx//fV8/fXXTJs2rXD5hQsXMmnSpBJnBAr85je/4eabb2b+/Plccskl7Nq1i27duvHuu+8yb948br/9dm666SY++uijEnHUrl2bSZMm8dRTT/H444/zj3/8o3D+gAEDWLx4MePHj+f//u//GDFiBMuXL6/wM3vggQfo168fs2fP5oorruD6669nxYoVAIXH9cEHH7BmzZrCS54TJ07kiCOOKHLGaPLkydSvX59TTjmlsK1Xr15kZWUxadKkCuMoyxtvvMH27du57rrrDngblal79+5MmzaNnTt3hh2KiMSoKnw3xOJ3v/sdt912G7Nnz+akk07iyiuvZNu2bSWWK+i+Uq9ePR5//HHWrFnDFVdcwapVq7jgggs44YQT+PLLL3nxxRcZNWoUv/nNb4qs/9prr+HuTJw4kREjRhS233vvvTz++ONMnTqV7777jiuuuIIHHniAYcOGMWHCBObNm1eku18qfFe0a9eO5s2b88knnxzwtmOR1Js6zaw38ASQBrzg7g8Vm98JeAnoBvzO3R8N2o8G/hG1aEfgj+7+uJndB9wIrA/m/dbdxyb0QKTa2rJlC99//z0XXXQRhx9+OACdOnUqnP+Xv/yFLl26MGzYsMK2Y445ptxttmzZkieffBIzo1OnTixatIjHHnuMO++8kzZt2tC7d2+GDx9O9+7dARg+fDgnnngiXbp0KXV7t956K5dffnmRtrvvvrvw/cCBA/nPf/7DqFGjOOeccwrbO3fuzAMPPADAUUcdxfPPP89HH31E//79WbRoEe+//z6fffYZvXr1AuCVV16hY8eOFX5m11xzDT/72c8A+NOf/sQTTzzBxIkTOeyww8jOzgbg0EMPpUWLFoXrrFixgpYtWxbZztq1a8nOzi5yZsLMaNasGWvXrq0wjrIMGzaMvn37lthfWFq1asXevXtZvXp14e9YqlLOFomoCt8NsfjVr37FRRddBMCDDz7IiBEjmDVrFqeeemqR5Qq6r5gZDRs2LMzfzzzzDC1btuSZZ56hVq1aHHPMMTz00EPcdNNN/OlPfyq8OtihQwf++te/Fm5v3bp1QOQ74rTTTgNg0KBB3HrrrcycOZNu3boBcO211xae1YfU+a5o1apVTCeoDkbSzpCbWRr
wNHAB0Bnob2adiy22CbgNeDS60d0XuntXd+8KnAjsAN6OWuR/C+YrscvBaNKkCQMGDOD888+nT58+PPbYY6xcubJw/pdfflmkyI1Fz54
 								      "text/plain": [
 								       "<Figure size 864x504 with 4 Axes>"
 								      ]
 								     },
 								     "metadata": {
 								      "needs_background": "light"
 								     },
 								     "output_type": "display_data"
 								    }
 								   ],
-												Upgrade chapter 2 to sklearn 0.18 and ensure python 2 and python 3 both work

											
										
										
											2016-11-03 23:47:11 +01:00
+								   "source": [
-												Sync notebook with book's code examples, and better identify extra code

											
										
										
											2022-02-19 06:17:36 +01:00
+								    "# extra code – plots a few distributions you can use in randomized search\n",
-												Clarify the 'not in the book' comments

											
										
										
											2021-11-21 04:40:36 +01:00
+								    "\n",
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "from scipy.stats import randint, uniform, geom, expon\n",
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								    "\n",
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "xs1 = np.arange(0, 7 + 1)\n",
 								    "randint_distrib = randint(0, 7 + 1).pmf(xs1)\n",
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								    "\n",
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "xs2 = np.linspace(0, 7, 500)\n",
 								    "uniform_distrib = uniform(0, 7).pdf(xs2)\n",
 								    "\n",
 								    "xs3 = np.arange(0, 7 + 1)\n",
 								    "geom_distrib = geom(0.5).pmf(xs3)\n",
 								    "\n",
 								    "xs4 = np.linspace(0, 7, 500)\n",
 								    "expon_distrib = expon(scale=1).pdf(xs4)\n",
 								    "\n",
 								    "plt.figure(figsize=(12, 7))\n",
 								    "\n",
 								    "plt.subplot(2, 2, 1)\n",
 								    "plt.bar(xs1, randint_distrib, label=\"scipy.randint(0, 7 + 1)\")\n",
 								    "plt.ylabel(\"Probability\")\n",
 								    "plt.legend()\n",
 								    "plt.axis([-1, 8, 0, 0.2])\n",
 								    "\n",
 								    "plt.subplot(2, 2, 2)\n",
 								    "plt.fill_between(xs2, uniform_distrib, label=\"scipy.uniform(0, 7)\")\n",
 								    "plt.ylabel(\"PDF\")\n",
 								    "plt.legend()\n",
 								    "plt.axis([-1, 8, 0, 0.2])\n",
 								    "\n",
 								    "plt.subplot(2, 2, 3)\n",
 								    "plt.bar(xs3, geom_distrib, label=\"scipy.geom(0.5)\")\n",
 								    "plt.xlabel(\"Hyperparameter value\")\n",
 								    "plt.ylabel(\"Probability\")\n",
 								    "plt.legend()\n",
 								    "plt.axis([0, 7, 0, 1])\n",
 								    "\n",
 								    "plt.subplot(2, 2, 4)\n",
 								    "plt.fill_between(xs4, expon_distrib, label=\"scipy.expon(scale=1)\")\n",
 								    "plt.xlabel(\"Hyperparameter value\")\n",
 								    "plt.ylabel(\"PDF\")\n",
 								    "plt.legend()\n",
 								    "plt.axis([0, 7, 0, 1])\n",
 								    "\n",
 								    "plt.show()"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
 								  {
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								   "cell_type": "markdown",
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								   "source": [
 								    "Here are the PDF for `expon()` and `reciprocal()` (left column), as well as the PDF of log(X) (right column). The right column shows the distribution of hyperparameter _scales_. You can see that `expon()` favors hyperparameters with roughly the desired scale, with a longer tail towards the smaller scales. But `reciprocal()` does not favor any scale, they are all equally likely:"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 141,
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								   "metadata": {
 								    "tags": []
 								   },
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAuQAAAGxCAYAAAAqD6O8AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/MnkTPAAAACXBIWXMAAAsTAAALEwEAmpwYAACNVklEQVR4nOzdeXwV1f3/8dcnO4Rd9kUBBQWpUkTEukHdV9S2VksVsK2gUrUuFfVX96+7qKUKIoIg7oqVqtUqsqkg+77vEJawb4GEJJ/fH/fmmoSskOTem7yfj8d95M7MmZnP3OSe+eTMmTPm7oiIiIiISHjEhDsAEREREZGqTAm5iIiIiEgYKSEXEREREQkjJeQiIiIiImGkhFxEREREJIyUkIuIiIiIhFFEJ+RmNtzMUs1sQSHLzcz+aWYrzGyemXWq6BhFRCRAdbaIyJGJ6IQceAu4pIjllwJtgq9bgMEVEJOIiBTsLVRni4iUWkQn5O4+CdhRRJEewCgPmArUMbMmFROdiIjkpjpbROTIxIU7gKPUDFifa3pDcN6m/AXN7BYCLTIkJyefdtJJJ1VIgCIiZWnmzJnb3L1BuOM4QqqzRaTKKUm9He0JuRUwzwsq6O5DgaEAJ/2io8+YMaM84xIRKRdmtjbcMRyFI6qzO3furDpbRKJWSertiO6yUgIbgBa5ppsDG4tbaX96ZrkFJCIihTqiOltEpLKL9oR8LHBT8M79rsBudz/s0md++5SQi4iEwxHV2SIilV1Ed1kxs/eAbkB9M9sAPALEA7j7EOBL4DJgBZAG9CnJdg9kZLE/PZPkxIg+fBGRqFJedbaISGUX0Rmpu99QzHIHbi/1doHpa3bQ7cSGRxqaiIjkU151tohIZRftXVaO2I8rt4c7BBERERGRyG4hL0/fL98W7hCqvOzsbDZs2MD+/fvDHYpIxEhOTqZ58+bExFTZ9hKJEHv27CE1NZVDhw6FOxSRiBUfH0/Dhg2pVavWUW2nyibkizbtYcf+DOolJ4Q7lCpr27ZtmBknnniikg8RAv+kpqSksG3bNho2VJc6CZ89e/awZcsWmjVrRrVq1TAraMRKkarN3Tlw4AApKSkAR5WUV+ks6MeVaiUPp127dtGoUSMl4yJBMTExNGrUiN27d4c7FKniUlNTadasGdWrV1cyLlIIM6N69eo0a9aM1NTUo9pWlc6EflihfuThlJWVRXx8fLjDEIko8fHxZGZqaFYJr0OHDlGtWrVwhyESFapVq3bUXbuqdEL+/Yqt4Q6hylPLi0he+k5IpNDfokjJlMV3pUon5Ot3HGDtdt1QKCIiIiLhU6UTcoDJGm1FRERERMKoyo6ykmPSsq38setx4Q5DgloO+KJC97fmmcsrbF+9e/dm27ZtfP755xW2z0j26KOPsm7dOoYPH14h+zMzPvroI377299WyP4K0qVLFwYMGMC1114bthhEjkak19HlWc/27t2b1q1b8/DDD5d4nfnz53PJJZewbNkykpOTyyyWbt260aFDB/71r38VWuatt96if//+7Nu3r8z2K+WnyreQT1m5ncys7HCHIVXAK6+8wujRo8MdRkRITU1l4MCB/L//9//CHUqZmTRpEldddRXNmjXDzHjrrbcOK/OPf/yD+++/n+xs1Tki0WT+/Pl89tln3HXXXQCkpaXRtm1b/vrXv+Ypt2XLFurXr8+zzz4LwC9+8Qu6du3KwIEDyzSeMWPG8PTTT4emW7ZsyQsvvHDU2928eTP169fnxRdfzDN/4cKFJCUl8cEHHxz1PqRgVT4h35ueyZz1u8IdhlQBtWvXpk6dOuEOIyIMGzaMLl260Lp163CHUmb27dtHhw4deOWVVwodneKyyy5j7969/Pe//63g6ETkaAwaNIjf/OY3oXGmq1evzsiRIxkyZAjjxo0LlfvLX/5C27Ztuffee0Pz+vTpw+DBg8t09KR69epRs2bNMttejsaNG/P
aa6/x0EMPsWjRIiAw4s5NN91Ejx49+P3vf1/m+5SAKp+QQ6DbikhJTZo0ia5du1KjRg1q167NGWecwYIFCwCYOnUqv/71r0lOTqZ27dqcf/75bNy4EQhc7rziiitC2+nWrRv9+vXjzjvvpG7dutStW5f77rsv1Hr6+OOP06FDh8P2f9ZZZ3HHHXccNj8lJYXrr78+tK3LL7+c5cuXA7B161aaNGnC448/Hio/b948kpKS+Pjjj4FAF5IOHTowbNgwjj32WKpVq8bVV1/Ntm0/32eRnZ3NE088QYsWLUhMTOQXv/gFn332WWj5mjVrMDM++eQTLrzwQqpXr0779u355ptv8sT67rvvctVVV5X4cy3us/3qq68455xzqFu3LvXq1ePiiy9m8eLFhf4Oi/u8jsRll13GU089xW9/+9tCx9aPjY3lsssu47333jvi/YhIyaWnp3PXXXfRqFEjkpKS6Nq1K99//32eMl988QUnnngiSUlJnHvuubz//vuYGWvWrAECQ/R++OGHh9VZZ555Jvfccw99+vRh9+7dDB8+nHHjxjFq1ChiY2ND5S666CJ27NjBhAkTCo2zcePGeVqfzzrrLGrWrBlK4pcvX46ZhR5A061bN/r37x96v3btWu677z7M7LARP8aNG0eHDh1ITk6me/furF69usjP7LrrruPqq6/mpptuIjMzkyeeeIKNGzcyePDgItcrCyNGjKB9+/YkJSXRtm1bXnrppdA58YknnqBx48Z5xvu+4YYb6NSpExkZGUCga+K//vUvLr/8cqpXr85xxx132JXp+fPnc8EFF1CtWjXq1atH79698zz7Iedc/corr9CsWTPq1q1Lnz59SEtLK9djV0IOTFRCLiWUmZlJjx49OPvss5k7dy4//fQTd955J7GxscydO5fu3btzwgkn8MMPPzB16lSuu+66IltF3nnnHbKzs5kyZQqvv/46Q4cO5eWXXwbg5ptvZsmSJUybNi1UfunSpfz444/86U9/yrOdtLQ0unfvTlJSEhMnTmTKlCk0adKECy64gLS0NBo0aMBbb73Fk08+yZQpUzhw4AA33HADN9xwQ54+1WvWrGH06NF89tlnfPvttyxfvpybb745tPyVV17h+eef59lnn2X+/Plcc801XHvttcyZMydPPA899BB33HEHc+fO5fTTT+f6668P9WPcsWMHixYtonPnziX6XIFiP9v9+/dz1113MW3aNCZMmEDt2rW58sorQ5V0fsV9XgCTJ0+mRo0aRb6eeuqpQn+3henSpQsTJ04s9XoiUnp///vf+eCDDxg+fDizZ8/mF7/4BZdccgmbNm0CYN26dVx77bVcfvnlzJ07lzvuuIO///3vebYxb948du/enafOyvH4449Tu3ZtbrzxRv72t7/x/PPPc8IJJ+Qpk5CQQMeOHYv83p933nmMHz8eCNRPM2bMIDExkRkzZgAwYcIETjjhBJo1a3bYumPGjKF58+Y8/PDDbNq0KXRsEPiH5Omnn2b48OFMmTKFXbt20a9fv2I/t9dee42UlBR69uzJ008/zbBhw6hXr16x6xWnqO56b7zxBg8++CCPP/44ixcv5sUXX+TZZ5/ltddeA+DBBx+kTZs2oXPSqFGj+Oyzz3j33XdJSPj5qeuPPPIIV111FXPmzOGWW27hpptuCn2OaWlpXHLJJdSoUYNp06bx6aef8uOPP+Y5z0Gg/l+wYAHffvstH3zwAZ9++imvvPLKUR9/Uar8TZ0A81J2s31fOsfUSAx3KBLh9uzZw65du7jyyis5/vjjATjppJMA6NmzJ6eeeipDhw4NlW/Xrl2R22vSpAn//Oc/MTNOOukkli1bxsCBA7n77rtp3rw5l1xyCcOHD6dLly4ADB8+nNNOO41TTz01z3bef/993J0RI0aEWkdef/11GjZsyOeff851113HxRdfzG233UbPnj0577zzSE9PZ9CgQXm2c+DAAUaNGsWxxx4b2sY555zD8uXLadO
mDS+88AL33nsvf/jDH4DAyWjSpEm88MILeVoh/va3v3HllVcC8NRTTzFq1CjmzJnD2Wefzbp163B3mjRpUqLPFeC5554r8rP9zW9+k+c
 								      "text/plain": [
 								       "<Figure size 864x504 with 4 Axes>"
 								      ]
 								     },
 								     "metadata": {
 								      "needs_background": "light"
 								     },
 								     "output_type": "display_data"
 								    }
 								   ],
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "source": [
-												Sync notebook with book's code examples, and better identify extra code

											
										
										
											2022-02-19 06:17:36 +01:00
+								    "# extra code – shows the difference between expon and reciprocal\n",
-												Clarify the 'not in the book' comments

											
										
										
											2021-11-21 04:40:36 +01:00
+								    "\n",
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "from scipy.stats import reciprocal\n",
 								    "\n",
 								    "xs1 = np.linspace(0, 7, 500)\n",
 								    "expon_distrib = expon(scale=1).pdf(xs1)\n",
 								    "\n",
 								    "# if X ~ expon(1), then Y = log(X) has PDF f(y) = exp(y - exp(y))\n",
 								    "log_xs2 = np.linspace(-5, 3, 500)\n",
 								    "log_expon_distrib = np.exp(log_xs2 - np.exp(log_xs2))\n",
 								    "\n",
 								    "xs3 = np.linspace(0.001, 1000, 500)\n",
 								    "reciprocal_distrib = reciprocal(0.001, 1000).pdf(xs3)\n",
 								    "\n",
 								    "# if X ~ reciprocal(a, b), then log(X) is uniform on [log(a), log(b)].\n",
 								    "# scipy's uniform(loc, scale) spans [loc, loc + scale], so the second\n",
 								    "# argument must be the width log(b) - log(a), not the upper bound log(b).\n",
 								    "log_xs4 = np.linspace(np.log(0.001), np.log(1000), 500)\n",
 								    "log_reciprocal_distrib = uniform(\n",
 								    "    np.log(0.001), np.log(1000) - np.log(0.001)).pdf(log_xs4)\n",
 								    "\n",
 								    "plt.figure(figsize=(12, 7))\n",
 								    "\n",
 								    "plt.subplot(2, 2, 1)\n",
 								    "plt.fill_between(xs1, expon_distrib,\n",
 								    "                 label=\"scipy.expon(scale=1)\")\n",
 								    "plt.ylabel(\"PDF\")\n",
 								    "plt.legend()\n",
 								    "plt.axis([0, 7, 0, 1])\n",
 								    "\n",
 								    "plt.subplot(2, 2, 2)\n",
 								    "plt.fill_between(log_xs2, log_expon_distrib,\n",
 								    "                 label=\"log(X) with X ~ expon\")\n",
 								    "plt.legend()\n",
 								    "plt.axis([-5, 3, 0, 1])\n",
 								    "\n",
 								    "plt.subplot(2, 2, 3)\n",
 								    "plt.fill_between(xs3, reciprocal_distrib,\n",
 								    "                 label=\"scipy.reciprocal(0.001, 1000)\")\n",
 								    "plt.xlabel(\"Hyperparameter value\")\n",
 								    "plt.ylabel(\"PDF\")\n",
 								    "plt.legend()\n",
 								    "plt.axis([0.001, 1000, 0, 0.005])\n",
 								    "\n",
 								    "plt.subplot(2, 2, 4)\n",
 								    "plt.fill_between(log_xs4, log_reciprocal_distrib,\n",
 								    "                 label=\"log(X) with X ~ reciprocal\")\n",
 								    "plt.xlabel(\"Log of hyperparameter value\")\n",
 								    "plt.legend()\n",
 								    "# show the whole support [log(0.001), log(1000)] ~ [-6.9, 6.9]\n",
 								    "plt.axis([-8, 8, 0, 0.2])\n",
 								    "\n",
 								    "plt.show()"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
-												Add some section headers

											
										
										
											2021-10-02 13:14:44 +02:00
+								  {
 								   "cell_type": "markdown",
 								   "metadata": {},
 								   "source": [
 								    "## Analyze the Best Models and Their Errors"
 								   ]
 								  },
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 142,
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "text/plain": [
 								       "array([0.07, 0.05, 0.05, 0.01, 0.01, 0.01, 0.01, 0.19, 0.04, 0.01, 0.  ,\n",
 								       "       0.01, 0.01, 0.01, 0.01, 0.01, 0.  , 0.01, 0.01, 0.01, 0.  , 0.01,\n",
 								       "       0.01, 0.01, 0.01, 0.01, 0.  , 0.  , 0.02, 0.01, 0.01, 0.01, 0.02,\n",
 								       "       0.01, 0.  , 0.02, 0.03, 0.01, 0.01, 0.01, 0.01, 0.01, 0.02, 0.01,\n",
 								       "       0.01, 0.02, 0.01, 0.01, 0.01, 0.01, 0.01, 0.02, 0.01, 0.  , 0.07,\n",
 								       "       0.  , 0.  , 0.  , 0.01])"
 								      ]
 								     },
 								     "execution_count": 142,
 								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "source": [
-												Sync notebook code examples with book

											
										
										
											2021-11-03 03:53:04 +01:00
+								    "final_model = rnd_search.best_estimator_  # includes preprocessing\n",
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "feature_importances = final_model[\"random_forest\"].feature_importances_\n",
-												Replace np.round(a) with a.round(), and other similar changes

											
										
										
											2021-11-01 02:42:42 +01:00
+								    "feature_importances.round(2)"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 143,
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "text/plain": [
 								       "[(0.18694559869103852, 'log__median_income'),\n",
 								       " (0.0748194905715524, 'cat__ocean_proximity_INLAND'),\n",
 								       " (0.06926417748515576, 'bedrooms_ratio__bedrooms_ratio'),\n",
 								       " (0.05446998753775219, 'rooms_per_house__rooms_per_house'),\n",
 								       " (0.05262301809680712, 'people_per_house__people_per_house'),\n",
 								       " (0.03819415873915732, 'geo__Cluster 0 similarity'),\n",
 								       " (0.02879263999929514, 'geo__Cluster 28 similarity'),\n",
 								       " (0.023530192521380392, 'geo__Cluster 24 similarity'),\n",
 								       " (0.020544786346378206, 'geo__Cluster 27 similarity'),\n",
 								       " (0.019873052631077512, 'geo__Cluster 43 similarity'),\n",
 								       " (0.018597511022930273, 'geo__Cluster 34 similarity'),\n",
 								       " (0.017409085415656868, 'geo__Cluster 37 similarity'),\n",
 								       " (0.015546519677632162, 'geo__Cluster 20 similarity'),\n",
 								       " (0.014230331127504292, 'geo__Cluster 17 similarity'),\n",
 								       " (0.0141032216204026, 'geo__Cluster 39 similarity'),\n",
 								       " (0.014065768027447325, 'geo__Cluster 9 similarity'),\n",
 								       " (0.01354220782825315, 'geo__Cluster 4 similarity'),\n",
 								       " (0.01348963625822907, 'geo__Cluster 3 similarity'),\n",
 								       " (0.01338319626383868, 'geo__Cluster 38 similarity'),\n",
 								       " (0.012240533790212824, 'geo__Cluster 31 similarity'),\n",
 								       " (0.012089046542256785, 'geo__Cluster 7 similarity'),\n",
 								       " (0.01152326329703204, 'geo__Cluster 23 similarity'),\n",
 								       " (0.011397459905603558, 'geo__Cluster 40 similarity'),\n",
 								       " (0.011282340924816439, 'geo__Cluster 36 similarity'),\n",
 								       " (0.01104139770781063, 'remainder__housing_median_age'),\n",
 								       " (0.010671123191312802, 'geo__Cluster 44 similarity'),\n",
 								       " (0.010296376177202627, 'geo__Cluster 5 similarity'),\n",
 								       " (0.010184798445004483, 'geo__Cluster 42 similarity'),\n",
 								       " (0.010121853542225083, 'geo__Cluster 11 similarity'),\n",
 								       " (0.009795219101117579, 'geo__Cluster 35 similarity'),\n",
 								       " (0.00952581084310724, 'geo__Cluster 10 similarity'),\n",
 								       " (0.009433209165984823, 'geo__Cluster 13 similarity'),\n",
 								       " (0.00915075361116215, 'geo__Cluster 1 similarity'),\n",
 								       " (0.009021485619463173, 'geo__Cluster 30 similarity'),\n",
 								       " (0.00894936224917583, 'geo__Cluster 41 similarity'),\n",
 								       " (0.008901832702357514, 'geo__Cluster 25 similarity'),\n",
 								       " (0.008897504713401587, 'geo__Cluster 29 similarity'),\n",
 								       " (0.0086846298524955, 'geo__Cluster 21 similarity'),\n",
 								       " (0.008061104590483955, 'geo__Cluster 15 similarity'),\n",
 								       " (0.00786048176566994, 'geo__Cluster 16 similarity'),\n",
 								       " (0.007793633130749198, 'geo__Cluster 22 similarity'),\n",
 								       " (0.007501766442066527, 'log__total_rooms'),\n",
 								       " (0.0072024111938241275, 'geo__Cluster 32 similarity'),\n",
 								       " (0.006947156598995616, 'log__population'),\n",
 								       " (0.006800076770899128, 'log__households'),\n",
 								       " (0.006736105364684462, 'log__total_bedrooms'),\n",
 								       " (0.006315268213499131, 'geo__Cluster 33 similarity'),\n",
 								       " (0.005796398579893261, 'geo__Cluster 14 similarity'),\n",
 								       " (0.005234954623294958, 'geo__Cluster 6 similarity'),\n",
 								       " (0.0045514083468621595, 'geo__Cluster 12 similarity'),\n",
 								       " (0.004546042080216035, 'geo__Cluster 18 similarity'),\n",
 								       " (0.004314514641115755, 'geo__Cluster 2 similarity'),\n",
 								       " (0.003953528110719969, 'geo__Cluster 19 similarity'),\n",
 								       " (0.003297404747742136, 'geo__Cluster 26 similarity'),\n",
 								       " (0.00289453474290887, 'cat__ocean_proximity_<1H OCEAN'),\n",
 								       " (0.0016978863168109126, 'cat__ocean_proximity_NEAR OCEAN'),\n",
 								       " (0.0016391131530559377, 'geo__Cluster 8 similarity'),\n",
 								       " (0.00015061247730531558, 'cat__ocean_proximity_NEAR BAY'),\n",
 								       " (7.301686597099842e-05, 'cat__ocean_proximity_ISLAND')]"
 								      ]
 								     },
 								     "execution_count": 143,
 								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "sorted(zip(feature_importances,\n",
 								    "           final_model[\"preprocessing\"].get_feature_names_out()),\n",
 								    "           reverse=True)"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
-												Add some section headers

											
										
										
											2021-10-02 13:14:44 +02:00
+								  {
 								   "cell_type": "markdown",
 								   "metadata": {},
 								   "source": [
 								    "## Evaluate Your System on the Test Set"
 								   ]
 								  },
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 144,
-												Fix hyperparameter search and comment at the end of the solution of exercise 5, chapter 2

											
										
										
											2018-01-14 09:11:47 +01:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "name": "stdout",
 								     "output_type": "stream",
 								     "text": [
 								      "41424.40026462184\n"
 								     ]
 								    }
 								   ],
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "source": [
 								    "X_test = strat_test_set.drop(\"median_house_value\", axis=1)\n",
 								    "y_test = strat_test_set[\"median_house_value\"].copy()\n",
 								    "\n",
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "final_predictions = final_model.predict(X_test)\n",
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								    "\n",
-												Sync notebook code examples with book

											
										
										
											2021-11-03 03:53:04 +01:00
+								    "# take the square root ourselves: the `squared` argument of\n",
+								    "# mean_squared_error() is deprecated since Scikit-Learn 1.4\n",
+								    "final_rmse = np.sqrt(mean_squared_error(y_test, final_predictions))\n",
 								    "print(final_rmse)"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
-												Add code to compute a confidence interval

											
										
										
											2018-05-08 19:41:47 +02:00
+								  {
 								   "cell_type": "markdown",
 								   "metadata": {},
 								   "source": [
 								    "We can compute a 95% confidence interval for the test RMSE:"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 145,
-												Add code to compute a confidence interval

											
										
										
											2018-05-08 19:41:47 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "text/plain": [
 								       "array([39275.40861216, 43467.27680583])"
 								      ]
 								     },
 								     "execution_count": 145,
 								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
-												Add code to compute a confidence interval

											
										
										
											2018-05-08 19:41:47 +02:00
+								   "source": [
-												Update all notebooks assuming we are all in the future now: sklearn 0.20+, python 3.5+, TF 2.0 preview

											
										
										
											2019-01-18 16:08:37 +01:00
+								    "from scipy import stats\n",
 								    "\n",
-												Add code to compute a confidence interval

											
										
										
											2018-05-08 19:41:47 +02:00
+								    "confidence = 0.95\n",
 								    "squared_errors = (final_predictions - y_test) ** 2\n",
-												Update all notebooks assuming we are all in the future now: sklearn 0.20+, python 3.5+, TF 2.0 preview

											
										
										
											2019-01-18 16:08:37 +01:00
+								    "np.sqrt(stats.t.interval(confidence, len(squared_errors) - 1,\n",
 								    "                         loc=squared_errors.mean(),\n",
-												Add code to compute a confidence interval

											
										
										
											2018-05-08 19:41:47 +02:00
+								    "                         scale=stats.sem(squared_errors)))"
 								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
 								   "metadata": {},
 								   "source": [
 								    "We could compute the interval manually like this:"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 146,
-												Add code to compute a confidence interval

											
										
										
											2018-05-08 19:41:47 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "text/plain": [
 								       "(39275.40861216077, 43467.2768058342)"
 								      ]
 								     },
 								     "execution_count": 146,
 								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
-												Add code to compute a confidence interval

											
										
										
											2018-05-08 19:41:47 +02:00
+								   "source": [
-												Sync notebook with book's code examples, and better identify extra code

											
										
										
											2022-02-19 06:17:36 +01:00
+								    "# extra code – shows how to compute a confidence interval for the RMSE\n",
-												Update all notebooks assuming we are all in the future now: sklearn 0.20+, python 3.5+, TF 2.0 preview

											
										
										
											2019-01-18 16:08:37 +01:00
+								    "m = len(squared_errors)\n",
 								    "mean = squared_errors.mean()\n",
-												Add code to compute a confidence interval

											
										
										
											2018-05-08 19:41:47 +02:00
+								    "tscore = stats.t.ppf((1 + confidence) / 2, df=m - 1)\n",
 								    "tmargin = tscore * squared_errors.std(ddof=1) / np.sqrt(m)\n",
 								    "np.sqrt(mean - tmargin), np.sqrt(mean + tmargin)"
 								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
 								   "metadata": {},
 								   "source": [
-												Fix a few typos and deprecated np.object reference

											
										
										
											2022-05-09 10:31:09 +02:00
+								    "Alternatively, we could use a z-score rather than a t-score. Since the test set is not too small, it won't make a big difference:"
-												Add code to compute a confidence interval

											
										
										
											2018-05-08 19:41:47 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 147,
-												Add code to compute a confidence interval

											
										
										
											2018-05-08 19:41:47 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "text/plain": [
 								       "(39276.05610140007, 43466.691749969636)"
 								      ]
 								     },
 								     "execution_count": 147,
 								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
-												Add code to compute a confidence interval

											
										
										
											2018-05-08 19:41:47 +02:00
+								   "source": [
-												Fix a few typos and deprecated np.object reference

											
										
										
											2022-05-09 10:31:09 +02:00
+								    "# extra code – computes a confidence interval again using a z-score\n",
-												Add code to compute a confidence interval

											
										
										
											2018-05-08 19:41:47 +02:00
+								    "zscore = stats.norm.ppf((1 + confidence) / 2)\n",
 								    "zmargin = zscore * squared_errors.std(ddof=1) / np.sqrt(m)\n",
 								    "np.sqrt(mean - zmargin), np.sqrt(mean + zmargin)"
 								   ]
 								  },
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								  {
 								   "cell_type": "markdown",
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "## Model persistence using joblib"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "Save the final model:"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 148,
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "text/plain": [
 								       "['my_california_housing_model.pkl']"
 								      ]
 								     },
 								     "execution_count": 148,
 								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "import joblib\n",
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								    "\n",
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "joblib.dump(final_model, \"my_california_housing_model.pkl\")"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "source": [
-												Clarify the 'not in the book' comments

											
										
										
											2021-11-21 04:40:36 +01:00
+								    "Now you can deploy this model to production. For example, the following code could be a script that would run in production:"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 149,
-												Fix hyperparameter search and comment at the end of the solution of exercise 5, chapter 2

											
										
										
											2018-01-14 09:11:47 +01:00
+								   "metadata": {},
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "outputs": [],
 								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "import joblib\n",
-												Sync notebook code examples with book

											
										
										
											2021-11-03 03:53:04 +01:00
+								    "\n",
-												Sync notebook with book's code examples, and better identify extra code

											
										
										
											2022-02-19 06:17:36 +01:00
+								    "# extra code – excluded for conciseness\n",
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "from sklearn.cluster import KMeans\n",
 								    "from sklearn.base import BaseEstimator, TransformerMixin\n",
 								    "from sklearn.metrics.pairwise import rbf_kernel\n",
 								    "\n",
 								    "def column_ratio(X):\n",
 								    "    return X[:, [0]] / X[:, [1]]\n",
 								    "\n",
-												Sync notebook code examples with book

											
										
										
											2021-11-03 03:53:04 +01:00
+								    "#class ClusterSimilarity(BaseEstimator, TransformerMixin):\n",
 								    "#    [...]\n",
 								    "\n",
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "final_model_reloaded = joblib.load(\"my_california_housing_model.pkl\")\n",
 								    "\n",
 								    "new_data = housing.iloc[:5]  # pretend these are new districts\n",
 								    "predictions = final_model_reloaded.predict(new_data)"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 150,
-												Fix hyperparameter search and comment at the end of the solution of exercise 5, chapter 2

											
										
										
											2018-01-14 09:11:47 +01:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "text/plain": [
 								       "array([442737.15, 457566.06, 105965.  ,  98462.  , 332992.01])"
 								      ]
 								     },
 								     "execution_count": 150,
 								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "predictions"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "You could use pickle instead, but joblib is more efficient for objects that contain large NumPy arrays."
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   ]
-												Update chapters 1, 2 and 4

											
										
										
											2016-09-27 16:39:16 +02:00
+								  },
 								  {
 								   "cell_type": "markdown",
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Update chapters 1, 2 and 4

											
										
										
											2016-09-27 16:39:16 +02:00
+								   "source": [
 								    "# Exercise solutions"
 								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Update chapters 1, 2 and 4

											
										
										
											2016-09-27 16:39:16 +02:00
+								   "source": [
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								    "## 1."
 								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
-												Update notebooks to latest nbformat

											
										
										
											2020-04-06 09:13:12 +02:00
+								   "metadata": {},
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   "source": [
-												Move StandardScalerClone inverse_transform and get_feature_names_out to exercise

											
										
										
											2021-11-15 05:45:26 +01:00
+								    "Exercise: _Try a Support Vector Machine regressor (`sklearn.svm.SVR`) with various hyperparameters, such as `kernel=\"linear\"` (with various values for the `C` hyperparameter) or `kernel=\"rbf\"` (with various values for the `C` and `gamma` hyperparameters). Note that SVMs don't scale well to large datasets, so you should probably train your model on just the first 5,000 instances of the training set and use only 3-fold cross-validation, or else it will take hours. Don't worry about what the hyperparameters mean for now (see the SVM notebook if you're interested). How does the best `SVR` predictor perform?_"
-												Update libraries to latest version, including TensorFlow 2.4.1 and Scikit-Learn 0.24.1

											
										
										
											2021-02-14 03:02:09 +01:00
+								   ]
 								  },
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 151,
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "text/plain": [
 								       "GridSearchCV(cv=3,\n",
 								       "             estimator=Pipeline(steps=[('preprocessing',\n",
 								       "                                        ColumnTransformer(remainder=Pipeline(steps=[('simpleimputer',\n",
 								       "                                                                                     SimpleImputer(strategy='median')),\n",
 								       "                                                                                    ('standardscaler',\n",
 								       "                                                                                     StandardScaler())]),\n",
 								       "                                                          transformers=[('bedrooms_ratio',\n",
 								       "                                                                         Pipeline(steps=[('simpleimputer',\n",
 								       "                                                                                          SimpleImputer(strategy='median')),\n",
 								       "                                                                                         ('functiontransformer',\n",
 								       "                                                                                          FunctionTransformer(feature_names_...\n",
 								       "                                                                         <sklearn.compose._column_transformer.make_column_selector object at 0x7f9b50613dc0>)])),\n",
 								       "                                       ('svr', SVR())]),\n",
 								       "             param_grid=[{'svr__C': [10.0, 30.0, 100.0, 300.0, 1000.0, 3000.0,\n",
 								       "                                     10000.0, 30000.0],\n",
 								       "                          'svr__kernel': ['linear']},\n",
 								       "                         {'svr__C': [1.0, 3.0, 10.0, 30.0, 100.0, 300.0,\n",
 								       "                                     1000.0],\n",
 								       "                          'svr__gamma': [0.01, 0.03, 0.1, 0.3, 1.0, 3.0],\n",
 								       "                          'svr__kernel': ['rbf']}],\n",
 								       "             scoring='neg_root_mean_squared_error')"
 								      ]
 								     },
 								     "execution_count": 151,
 								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   "source": [
 								    "from sklearn.model_selection import GridSearchCV\n",
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "from sklearn.svm import SVR\n",
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								    "\n",
 								    "param_grid = [\n",
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "        {'svr__kernel': ['linear'], 'svr__C': [10., 30., 100., 300., 1000.,\n",
 								    "                                               3000., 10000., 30000.0]},\n",
 								    "        {'svr__kernel': ['rbf'], 'svr__C': [1.0, 3.0, 10., 30., 100., 300.,\n",
 								    "                                            1000.0],\n",
 								    "         'svr__gamma': [0.01, 0.03, 0.1, 0.3, 1.0, 3.0]},\n",
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								    "    ]\n",
 								    "\n",
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "svr_pipeline = Pipeline([(\"preprocessing\", preprocessing), (\"svr\", SVR())])\n",
 								    "grid_search = GridSearchCV(svr_pipeline, param_grid, cv=3,\n",
 								    "                           scoring='neg_root_mean_squared_error')\n",
 								    "grid_search.fit(housing.iloc[:5000], housing_labels.iloc[:5000])"
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "The best model achieves the following score (evaluated using 3-fold cross validation):"
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 152,
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "text/plain": [
 								       "69814.13889867254"
 								      ]
 								     },
 								     "execution_count": 152,
 								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "svr_grid_search_rmse = -grid_search.best_score_\n",
 								    "svr_grid_search_rmse"
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "That's much worse than the `RandomForestRegressor` (but to be fair, we trained the model on much less data). Let's check the best hyperparameters found:"
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 153,
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "text/plain": [
 								       "{'svr__C': 10000.0, 'svr__kernel': 'linear'}"
 								      ]
 								     },
 								     "execution_count": 153,
 								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   "source": [
 								    "grid_search.best_params_"
 								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   "source": [
 								    "The linear kernel seems better than the RBF kernel. Notice that the value of `C` is the maximum tested value. When this happens you definitely want to launch the grid search again with higher values for `C` (removing the smallest values), because it is likely that higher values of `C` will be better."
 								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   "source": [
 								    "## 2."
 								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   "source": [
-												Move StandardScalerClone inverse_transform and get_feature_names_out to exercise

											
										
										
											2021-11-15 05:45:26 +01:00
+								    "Exercise: _Try replacing the `GridSearchCV` with a `RandomizedSearchCV`._"
-												Update libraries to latest version, including TensorFlow 2.4.1 and Scikit-Learn 0.24.1

											
										
										
											2021-02-14 03:02:09 +01:00
+								   ]
 								  },
-												Clarify a few messages

											
										
										
											2021-10-28 05:02:31 +02:00
+								  {
 								   "cell_type": "markdown",
 								   "metadata": {},
 								   "source": [
 								    "**Warning:** the following cell will take several minutes to run. You can specify `verbose=2` when creating the `RandomizedSearchCV` if you want to see the training details."
 								   ]
 								  },
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 154,
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "text/plain": [
 								       "RandomizedSearchCV(cv=3,\n",
 								       "                   estimator=Pipeline(steps=[('preprocessing',\n",
 								       "                                              ColumnTransformer(remainder=Pipeline(steps=[('simpleimputer',\n",
 								       "                                                                                           SimpleImputer(strategy='median')),\n",
 								       "                                                                                          ('standardscaler',\n",
 								       "                                                                                           StandardScaler())]),\n",
 								       "                                                                transformers=[('bedrooms_ratio',\n",
 								       "                                                                               Pipeline(steps=[('simpleimputer',\n",
 								       "                                                                                                SimpleImputer(strategy='median')),\n",
 								       "                                                                                               ('functiontransformer',\n",
 								       "                                                                                                FunctionTransformer(feature_...\n",
 								       "                                                                               <sklearn.compose._column_transformer.make_column_selector object at 0x7f9b50613dc0>)])),\n",
 								       "                                             ('svr', SVR())]),\n",
 								       "                   n_iter=50,\n",
 								       "                   param_distributions={'svr__C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f9ae254b9d0>,\n",
 								       "                                        'svr__gamma': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f9b734dbe50>,\n",
 								       "                                        'svr__kernel': ['linear', 'rbf']},\n",
 								       "                   random_state=42, scoring='neg_root_mean_squared_error')"
 								      ]
 								     },
 								     "execution_count": 154,
 								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   "source": [
 								    "from sklearn.model_selection import RandomizedSearchCV\n",
-												Explain the expon() and reciprocal() distributions

											
										
										
											2017-05-03 19:46:23 +02:00
+								    "from scipy.stats import expon, reciprocal\n",
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								    "\n",
-												Add code to compute a confidence interval

											
										
										
											2018-05-08 19:41:47 +02:00
+								    "# see https://docs.scipy.org/doc/scipy/reference/stats.html\n",
-												Explain the expon() and reciprocal() distributions

											
										
										
											2017-05-03 19:46:23 +02:00
+								    "# for `expon()` and `reciprocal()` documentation and more probability distribution functions.\n",
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								    "\n",
 								    "# Note: gamma is ignored when kernel is \"linear\"\n",
 								    "param_distribs = {\n",
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "        'svr__kernel': ['linear', 'rbf'],\n",
 								    "        'svr__C': reciprocal(20, 200_000),\n",
 								    "        'svr__gamma': expon(scale=1.0),\n",
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								    "    }\n",
 								    "\n",
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "rnd_search = RandomizedSearchCV(svr_pipeline,\n",
 								    "                                param_distributions=param_distribs,\n",
 								    "                                n_iter=50, cv=3,\n",
 								    "                                scoring='neg_root_mean_squared_error',\n",
 								    "                                random_state=42)\n",
 								    "rnd_search.fit(housing.iloc[:5000], housing_labels.iloc[:5000])"
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "The best model achieves the following score (evaluated using 3-fold cross validation):"
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 155,
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "text/plain": [
 								       "55853.88100300133"
 								      ]
 								     },
 								     "execution_count": 155,
 								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "svr_rnd_search_rmse = -rnd_search.best_score_\n",
 								    "svr_rnd_search_rmse"
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "Now that's really much better, but still far from the `RandomForestRegressor`'s performance. Let's check the best hyperparameters found:"
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 156,
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "text/plain": [
 								       "{'svr__C': 157055.10989448498,\n",
 								       " 'svr__gamma': 0.26497040005002437,\n",
 								       " 'svr__kernel': 'rbf'}"
 								      ]
 								     },
 								     "execution_count": 156,
 								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   "source": [
 								    "rnd_search.best_params_"
 								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   "source": [
 								    "This time the search found a good set of hyperparameters for the RBF kernel. Randomized search tends to find better hyperparameters than grid search in the same amount of time."
 								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "Note that we used the `expon()` distribution for `gamma`, with a scale of 1, so `RandomizedSearchCV` mostly searched for values roughly of that scale: about 80% of the samples were between 0.1 and 2.3 (roughly 10% were smaller and 10% were larger):"
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 157,
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "text/plain": [
 								       "0.80066"
 								      ]
 								     },
 								     "execution_count": 157,
 								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "np.random.seed(42)\n",
 								    "\n",
 								    "s = expon(scale=1).rvs(100_000)  # get 100,000 samples\n",
-												Replace np.round(a) with a.round(), and other similar changes

											
										
										
											2021-11-01 02:42:42 +01:00
+								    "((s > 0.105) & (s < 2.29)).sum() / 100_000"
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "We used the `reciprocal()` distribution for `C`, meaning we did not have a clue what the optimal scale of `C` was before running the random search. It explored the range from 20 to 200 just as much as the range from 2,000 to 20,000 or from 20,000 to 200,000."
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   ]
 								  },
-												Explain the expon() and reciprocal() distributions

											
										
										
											2017-05-03 19:46:23 +02:00
+								  {
 								   "cell_type": "markdown",
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Explain the expon() and reciprocal() distributions

											
										
										
											2017-05-03 19:46:23 +02:00
+								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "## 3."
-												Explain the expon() and reciprocal() distributions

											
										
										
											2017-05-03 19:46:23 +02:00
+								   ]
 								  },
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								  {
 								   "cell_type": "markdown",
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   "source": [
-												Move StandardScalerClone inverse_transform and get_feature_names_out to exercise

											
										
										
											2021-11-15 05:45:26 +01:00
+								    "Exercise: _Try adding a `SelectFromModel` transformer in the preparation pipeline to select only the most important attributes._"
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "Let's create a new pipeline that runs the previously defined preparation pipeline, and adds a `SelectFromModel` transformer based on a `RandomForestRegressor` before the final regressor:"
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 158,
-												Fix hyperparameter search and comment at the end of the solution of exercise 5, chapter 2

											
										
										
											2018-01-14 09:11:47 +01:00
+								   "metadata": {},
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   "outputs": [],
 								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "from sklearn.feature_selection import SelectFromModel\n",
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								    "\n",
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "selector_pipeline = Pipeline([\n",
 								    "    ('preprocessing', preprocessing),\n",
 								    "    ('selector', SelectFromModel(RandomForestRegressor(random_state=42),\n",
 								    "                                 threshold=0.005)),  # min feature importance\n",
 								    "    ('svr', SVR(C=rnd_search.best_params_[\"svr__C\"],\n",
 								    "                gamma=rnd_search.best_params_[\"svr__gamma\"],\n",
 								    "                kernel=rnd_search.best_params_[\"svr__kernel\"])),\n",
 								    "])"
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 159,
-												Fix hyperparameter search and comment at the end of the solution of exercise 5, chapter 2

											
										
										
											2018-01-14 09:11:47 +01:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "text/plain": [
 								       "count        3.000000\n",
 								       "mean     56211.362086\n",
 								       "std       1922.002802\n",
 								       "min      54150.008629\n",
 								       "25%      55339.929909\n",
 								       "50%      56529.851189\n",
 								       "75%      57242.038815\n",
 								       "max      57954.226441\n",
 								       "dtype: float64"
 								      ]
 								     },
 								     "execution_count": 159,
 								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "selector_rmses = -cross_val_score(selector_pipeline,\n",
 								    "                                  housing.iloc[:5000],\n",
 								    "                                  housing_labels.iloc[:5000],\n",
 								    "                                  scoring=\"neg_root_mean_squared_error\",\n",
 								    "                                  cv=3)\n",
 								    "pd.Series(selector_rmses).describe()"
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "Oh well, feature selection does not seem to help. But maybe that's just because the threshold we used was not optimal. Perhaps try tuning it using random search or grid search?"
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   ]
 								  },
 								  {
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								   "cell_type": "markdown",
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "## 4."
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   ]
 								  },
 								  {
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								   "cell_type": "markdown",
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   "source": [
-												Move StandardScalerClone inverse_transform and get_feature_names_out to exercise

											
										
										
											2021-11-15 05:45:26 +01:00
+								    "Exercise: _Try creating a custom transformer that trains a k-Nearest Neighbors regressor (`sklearn.neighbors.KNeighborsRegressor`) in its `fit()` method, and outputs the model's predictions in its `transform()` method. Then add this feature to the preprocessing pipeline, using latitude and longitude as the inputs to this transformer. This will add a feature in the model that corresponds to the housing median price of the nearest districts._"
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   "source": [
-												Fix a few typos and deprecated np.object reference

											
										
										
											2022-05-09 10:31:09 +02:00
+								    "Rather than restrict ourselves to k-Nearest Neighbors regressors, let's create a transformer that accepts any regressor. For this, we can extend the `MetaEstimatorMixin` and have a required `estimator` argument in the constructor. The `fit()` method must work on a clone of this estimator, and it must also save `feature_names_in_`. The `MetaEstimatorMixin` will ensure that `estimator` is listed as a required parameter, and it will update `get_params()` and `set_params()` to make the estimator's hyperparameters available for tuning. Lastly, we create a `get_feature_names_out()` method: the output column name is the ..."
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 160,
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   "outputs": [],
 								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "from sklearn.neighbors import KNeighborsRegressor\n",
 								    "from sklearn.base import MetaEstimatorMixin, clone\n",
 								    "\n",
 								    "class FeatureFromRegressor(MetaEstimatorMixin, BaseEstimator, TransformerMixin):\n",
 								    "    def __init__(self, estimator):\n",
 								    "        self.estimator = estimator\n",
 								    "\n",
 								    "    def fit(self, X, y=None):\n",
 								    "        estimator_ = clone(self.estimator)\n",
 								    "        estimator_.fit(X, y)\n",
 								    "        self.estimator_ = estimator_\n",
 								    "        self.n_features_in_ = self.estimator_.n_features_in_\n",
 								    "        if hasattr(self.estimator, \"feature_names_in_\"):\n",
 								    "            self.feature_names_in_ = self.estimator.feature_names_in_\n",
 								    "        return self  # always return self!\n",
 								    "    \n",
 								    "    def transform(self, X):\n",
 								    "        check_is_fitted(self)\n",
 								    "        predictions = self.estimator_.predict(X)\n",
 								    "        if predictions.ndim == 1:\n",
 								    "            predictions = predictions.reshape(-1, 1)\n",
 								    "        return predictions\n",
 								    "\n",
 								    "    def get_feature_names_out(self, names=None):\n",
 								    "        check_is_fitted(self)\n",
 								    "        n_outputs = getattr(self.estimator_, \"n_outputs_\", 1)\n",
 								    "        estimator_class_name = self.estimator_.__class__.__name__\n",
 								    "        estimator_short_name = estimator_class_name.lower().replace(\"_\", \"\")\n",
 								    "        return [f\"{estimator_short_name}_prediction_{i}\"\n",
 								    "                for i in range(n_outputs)]"
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "Let's ensure it complies with Scikit-Learn's API:"
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 161,
-												Fix hyperparameter search and comment at the end of the solution of exercise 5, chapter 2

											
										
										
											2018-01-14 09:11:47 +01:00
+								   "metadata": {},
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   "outputs": [],
 								   "source": [
-												Fix people_per_house feature name, and fix solution to last exercise

											
										
										
											2021-11-15 08:56:11 +01:00
+								    "from sklearn.utils.estimator_checks import check_estimator\n",
 								    "\n",
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "check_estimator(FeatureFromRegressor(KNeighborsRegressor()))"
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "Good! Now let's test it:"
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 162,
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "text/plain": [
 								       "array([[456667.        ],\n",
 								       "       [435250.        ],\n",
 								       "       [105100.        ],\n",
 								       "       ...,\n",
 								       "       [148800.        ],\n",
 								       "       [500001.        ],\n",
 								       "       [234333.33333333]])"
 								      ]
 								     },
 								     "execution_count": 162,
 								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "knn_reg = KNeighborsRegressor(n_neighbors=3, weights=\"distance\")\n",
 								    "knn_transformer = FeatureFromRegressor(knn_reg)\n",
 								    "geo_features = housing[[\"latitude\", \"longitude\"]]\n",
 								    "knn_transformer.fit_transform(geo_features, housing_labels)"
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "And what does its output feature name look like?"
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 163,
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "text/plain": [
 								       "['kneighborsregressor_prediction_0']"
 								      ]
 								     },
 								     "execution_count": 163,
 								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "knn_transformer.get_feature_names_out()"
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "Okay, now let's include this transformer in our preprocessing pipeline:"
-												Update chapters 1, 2 and 4

											
										
										
											2016-09-27 16:39:16 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 164,
-												Fix hyperparameter search and comment at the end of the solution of exercise 5, chapter 2

											
										
										
											2018-01-14 09:11:47 +01:00
+								   "metadata": {},
-												Update chapters 1, 2 and 4

											
										
										
											2016-09-27 16:39:16 +02:00
+								   "outputs": [],
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "from sklearn.base import clone\n",
 								    "\n",
 								    "transformers = [(name, clone(transformer), columns)\n",
 								    "                for name, transformer, columns in preprocessing.transformers]\n",
 								    "geo_index = [name for name, _, _ in transformers].index(\"geo\")\n",
 								    "transformers[geo_index] = (\"geo\", knn_transformer, [\"latitude\", \"longitude\"])\n",
 								    "\n",
 								    "new_geo_preprocessing = ColumnTransformer(transformers)"
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 165,
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   "outputs": [],
 								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "new_geo_pipeline = Pipeline([\n",
 								    "    ('preprocessing', new_geo_preprocessing),\n",
 								    "    ('svr', SVR(C=rnd_search.best_params_[\"svr__C\"],\n",
 								    "                gamma=rnd_search.best_params_[\"svr__gamma\"],\n",
 								    "                kernel=rnd_search.best_params_[\"svr__kernel\"])),\n",
 								    "])"
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 166,
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "text/plain": [
 								       "count         3.000000\n",
 								       "mean     104992.095758\n",
 								       "std        3112.486560\n",
 								       "min      101550.880533\n",
 								       "25%      103682.876337\n",
 								       "50%      105814.872141\n",
 								       "75%      106712.703370\n",
 								       "max      107610.534600\n",
 								       "dtype: float64"
 								      ]
 								     },
 								     "execution_count": 166,
 								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "new_pipe_rmses = -cross_val_score(new_geo_pipeline,\n",
 								    "                                  housing.iloc[:5000],\n",
 								    "                                  housing_labels.iloc[:5000],\n",
 								    "                                  scoring=\"neg_root_mean_squared_error\",\n",
 								    "                                  cv=3)\n",
 								    "pd.Series(new_pipe_rmses).describe()"
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "Yikes, that's terrible! Apparently the cluster similarity features were much better. But perhaps we should tune the `KNeighborsRegressor`'s hyperparameters? That's what the next exercise is about."
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   "source": [
 								    "## 5."
 								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   "source": [
-												Move StandardScalerClone inverse_transform and get_feature_names_out to exercise

											
										
										
											2021-11-15 05:45:26 +01:00
+								    "Exercise: _Automatically explore some preparation options using `RandomizedSearchCV`._"
-												Update libraries to latest version, including TensorFlow 2.4.1 and Scikit-Learn 0.24.1

											
										
										
											2021-02-14 03:02:09 +01:00
+								   ]
 								  },
-												Set OneHotEncoder's handle_unknown='ignore' to avoid warnings

											
										
										
											2021-10-11 09:51:34 +02:00
+								  {
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 167,
-												Set OneHotEncoder's handle_unknown='ignore' to avoid warnings

											
										
										
											2021-10-11 09:51:34 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "text/plain": [
 								       "RandomizedSearchCV(cv=3,\n",
 								       "                   estimator=Pipeline(steps=[('preprocessing',\n",
 								       "                                              ColumnTransformer(transformers=[('bedrooms_ratio',\n",
 								       "                                                                               Pipeline(steps=[('simpleimputer',\n",
 								       "                                                                                                SimpleImputer(strategy='median')),\n",
 								       "                                                                                               ('functiontransformer',\n",
 								       "                                                                                                FunctionTransformer(feature_names_out=['bedrooms_ratio'],\n",
 								       "                                                                                                                    func=<function column_ratio at 0x7f9b505e5670>)),\n",
 								       "                                                                                               ('standardscaler',\n",
 								       "                                                                                                StandardScaler())]),\n",
 								       "                                                                               ['...\n",
 								       "                   param_distributions={'preprocessing__geo__estimator__n_neighbors': range(1, 30),\n",
 								       "                                        'preprocessing__geo__estimator__weights': ['distance',\n",
 								       "                                                                                   'uniform'],\n",
 								       "                                        'svr__C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f9acb940bb0>,\n",
 								       "                                        'svr__gamma': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f9acb940a30>},\n",
 								       "                   random_state=42, scoring='neg_root_mean_squared_error')"
 								      ]
 								     },
 								     "execution_count": 167,
 								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
-												Set OneHotEncoder's handle_unknown='ignore' to avoid warnings

											
										
										
											2021-10-11 09:51:34 +02:00
+								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "param_distribs = {\n",
 								    "    \"preprocessing__geo__estimator__n_neighbors\": range(1, 30),\n",
 								    "    \"preprocessing__geo__estimator__weights\": [\"distance\", \"uniform\"],\n",
 								    "    \"svr__C\": reciprocal(20, 200_000),\n",
 								    "    \"svr__gamma\": expon(scale=1.0),\n",
 								    "}\n",
 								    "\n",
 								    "new_geo_rnd_search = RandomizedSearchCV(new_geo_pipeline,\n",
 								    "                                        param_distributions=param_distribs,\n",
 								    "                                        n_iter=50,\n",
 								    "                                        cv=3,\n",
 								    "                                        scoring='neg_root_mean_squared_error',\n",
 								    "                                        random_state=42)\n",
 								    "new_geo_rnd_search.fit(housing.iloc[:5000], housing_labels.iloc[:5000])"
-												Set OneHotEncoder's handle_unknown='ignore' to avoid warnings

											
										
										
											2021-10-11 09:51:34 +02:00
+								   ]
 								  },
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 168,
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Add notebook outputs

											
										
										
											2022-02-19 10:24:54 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "text/plain": [
 								       "106775.63787128967"
 								      ]
 								     },
 								     "execution_count": 168,
 								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   "source": [
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								    "new_geo_rnd_search_rmse = -new_geo_rnd_search.best_score_\n",
 								    "new_geo_rnd_search_rmse"
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   ]
 								  },
 								  {
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								   "cell_type": "markdown",
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   "source": [
-												Clarify a few messages

											
										
										
											2021-10-28 05:02:31 +02:00
+								    "Oh well... at least we tried! It looks like the cluster similarity features are definitely better than the KNN feature. But perhaps you could try having both? And maybe training on the full training set would help as well."
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   "source": [
-												Move StandardScalerClone inverse_transform and get_feature_names_out to exercise

											
										
										
											2021-11-15 05:45:26 +01:00
+								    "## 6."
 								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
 								   "metadata": {},
 								   "source": [
 								    "Exercise: _Try to implement the `StandardScalerClone` class again from scratch, then add support for the `inverse_transform()` method: executing `scaler.inverse_transform(scaler.fit_transform(X))` should return an array very close to `X`. Then add support for feature names: set `feature_names_in_` in the `fit()` method if the input is a DataFrame. This attribute should be a NumPy array of column names. Lastly, implement the `get_feature_names_out()` method: it should have one optional `input_features=None` argument. If passed, the method should check that its length matches `n_features_in_`, and it should match `feature_names_in_` if it is defined, then `input_features` should be returned. If `input_features` is `None`, then the method should return `feature_names_in_` if it is defined or `np.array([\"x0\", \"x1\", ...])` with length `n_features_in_` otherwise._"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 169,
-												Move StandardScalerClone inverse_transform and get_feature_names_out to exercise

											
										
										
											2021-11-15 05:45:26 +01:00
+								   "metadata": {},
 								   "outputs": [],
 								   "source": [
 								    "from sklearn.base import BaseEstimator, TransformerMixin\n",
 								    "from sklearn.utils.validation import check_array, check_is_fitted\n",
 								    "\n",
 								    "class StandardScalerClone(BaseEstimator, TransformerMixin):\n",
 								    "    def __init__(self, with_mean=True):  # no *args or **kwargs!\n",
 								    "        self.with_mean = with_mean\n",
 								    "\n",
 								    "    def fit(self, X, y=None):  # y is required even though we don't use it\n",
-												Fix people_per_house feature name, and fix solution to last exercise

											
										
										
											2021-11-15 08:56:11 +01:00
+								    "        X_orig = X\n",
-												Move StandardScalerClone inverse_transform and get_feature_names_out to exercise

											
										
										
											2021-11-15 05:45:26 +01:00
+								    "        X = check_array(X)  # checks that X is an array with finite float values\n",
 								    "        self.mean_ = X.mean(axis=0)\n",
 								    "        self.scale_ = X.std(axis=0)\n",
 								    "        self.n_features_in_ = X.shape[1]  # every estimator stores this in fit()\n",
-												Fix people_per_house feature name, and fix solution to last exercise

											
										
										
											2021-11-15 08:56:11 +01:00
+								    "        if hasattr(X_orig, \"columns\"):\n",
-												Fix a few typos and deprecated np.object reference

											
										
										
											2022-05-09 10:31:09 +02:00
+								    "            self.feature_names_in_ = np.array(X_orig.columns, dtype=object)\n",
-												Move StandardScalerClone inverse_transform and get_feature_names_out to exercise

											
										
										
											2021-11-15 05:45:26 +01:00
+								    "        return self  # always return self!\n",
 								    "\n",
 								    "    def transform(self, X):\n",
 								    "        check_is_fitted(self)  # looks for learned attributes (with trailing _)\n",
 								    "        X = check_array(X)\n",
 								    "        if self.n_features_in_ != X.shape[1]:\n",
 								    "            raise ValueError(\"Unexpected number of features\")\n",
 								    "        if self.with_mean:\n",
 								    "            X = X - self.mean_\n",
 								    "        return X / self.scale_\n",
 								    "    \n",
 								    "    def inverse_transform(self, X):\n",
 								    "        check_is_fitted(self)\n",
 								    "        X = check_array(X)\n",
 								    "        if self.n_features_in_ != X.shape[1]:\n",
 								    "            raise ValueError(\"Unexpected number of features\")\n",
 								    "        X = X * self.scale_\n",
 								    "        return X + self.mean_ if self.with_mean else X\n",
 								    "    \n",
 								    "    def get_feature_names_out(self, input_features=None):\n",
 								    "        if input_features is None:\n",
 								    "            return getattr(self, \"feature_names_in_\",\n",
 								    "                           [f\"x{i}\" for i in range(self.n_features_in_)])\n",
 								    "        else:\n",
 								    "            if len(input_features) != self.n_features_in_:\n",
 								    "                raise ValueError(\"Invalid number of features\")\n",
 								    "            if hasattr(self, \"feature_names_in_\") and not np.all(\n",
 								    "                self.feature_names_in_ == input_features\n",
 								    "            ):\n",
 								    "                raise ValueError(\"input_features ≠ feature_names_in_\")\n",
 								    "            return input_features"
 								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
 								   "metadata": {},
 								   "source": [
 								    "Let's test our custom transformer:"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 170,
-												Move StandardScalerClone inverse_transform and get_feature_names_out to exercise

											
										
										
											2021-11-15 05:45:26 +01:00
+								   "metadata": {},
 								   "outputs": [],
 								   "source": [
 								    "from sklearn.utils.estimator_checks import check_estimator\n",
 								    " \n",
 								    "check_estimator(StandardScalerClone())"
 								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
 								   "metadata": {},
 								   "source": [
 								    "No errors, that's a great start, we respect the Scikit-Learn API."
 								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
 								   "metadata": {},
 								   "source": [
-												Fix a few typos and deprecated np.object reference

											
										
										
											2022-05-09 10:31:09 +02:00
+								    "Now let's ensure the transformation works as expected:"
-												Move StandardScalerClone inverse_transform and get_feature_names_out to exercise

											
										
										
											2021-11-15 05:45:26 +01:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 171,
-												Move StandardScalerClone inverse_transform and get_feature_names_out to exercise

											
										
										
											2021-11-15 05:45:26 +01:00
+								   "metadata": {},
 								   "outputs": [],
 								   "source": [
 								    "np.random.seed(42)\n",
 								    "X = np.random.rand(1000, 3)\n",
 								    "\n",
 								    "scaler = StandardScalerClone()\n",
 								    "X_scaled = scaler.fit_transform(X)\n",
 								    "\n",
 								    "assert np.allclose(X_scaled, (X - X.mean(axis=0)) / X.std(axis=0))"
 								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
 								   "metadata": {},
 								   "source": [
 								    "How about setting `with_mean=False`?"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 172,
-												Move StandardScalerClone inverse_transform and get_feature_names_out to exercise

											
										
										
											2021-11-15 05:45:26 +01:00
+								   "metadata": {},
 								   "outputs": [],
 								   "source": [
 								    "scaler = StandardScalerClone(with_mean=False)\n",
 								    "X_scaled_uncentered = scaler.fit_transform(X)\n",
 								    "\n",
 								    "assert np.allclose(X_scaled_uncentered, X / X.std(axis=0))"
 								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
 								   "metadata": {},
 								   "source": [
 								    "And does the inverse work?"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 173,
-												Move StandardScalerClone inverse_transform and get_feature_names_out to exercise

											
										
										
											2021-11-15 05:45:26 +01:00
+								   "metadata": {},
 								   "outputs": [],
 								   "source": [
 								    "scaler = StandardScalerClone()\n",
 								    "X_back = scaler.inverse_transform(scaler.fit_transform(X))\n",
 								    "assert np.allclose(X, X_back)"
 								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
 								   "metadata": {},
 								   "source": [
 								    "How about the feature names out?"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 174,
-												Move StandardScalerClone inverse_transform and get_feature_names_out to exercise

											
										
										
											2021-11-15 05:45:26 +01:00
+								   "metadata": {},
 								   "outputs": [],
 								   "source": [
 								    "assert np.all(scaler.get_feature_names_out() == [\"x0\", \"x1\", \"x2\"])\n",
 								    "assert np.all(scaler.get_feature_names_out([\"a\", \"b\", \"c\"]) == [\"a\", \"b\", \"c\"])"
 								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
 								   "metadata": {},
 								   "source": [
 								    "And if we fit a DataFrame, are the feature names in and out ok?"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Load plt before setting rcParams

											
										
										
											2021-11-27 01:43:11 +01:00
+								   "execution_count": 175,
-												Move StandardScalerClone inverse_transform and get_feature_names_out to exercise

											
										
										
											2021-11-15 05:45:26 +01:00
+								   "metadata": {},
 								   "outputs": [],
 								   "source": [
 								    "df = pd.DataFrame({\"a\": np.random.rand(100), \"b\": np.random.rand(100)})\n",
 								    "scaler = StandardScalerClone()\n",
 								    "X_scaled = scaler.fit_transform(df)\n",
 								    "\n",
-												Fix people_per_house feature name, and fix solution to last exercise

											
										
										
											2021-11-15 08:56:11 +01:00
+								    "assert np.all(scaler.feature_names_in_ == [\"a\", \"b\"])\n",
 								    "assert np.all(scaler.get_feature_names_out() == [\"a\", \"b\"])"
-												Move StandardScalerClone inverse_transform and get_feature_names_out to exercise

											
										
										
											2021-11-15 05:45:26 +01:00
+								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
 								   "metadata": {},
 								   "source": [
 								    "All good! That's all for today! 😀"
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
-												Make fetch_housing_data() work on Windows

											
										
										
											2017-06-18 13:52:10 +02:00
+								   "metadata": {},
-												Add exercise solutions

											
										
										
											2017-04-30 17:32:46 +02:00
+								   "source": [
 								    "Congratulations! You already know quite a lot about Machine Learning. :)"
 								   ]
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								  }
 								 ],
 								 "metadata": {
 								  "kernelspec": {
-												BIG UPDATE: rewrote in large part for 3rd edition

											
										
										
											2021-10-28 03:55:10 +02:00
+								   "display_name": "Python 3",
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "language": "python",
 								   "name": "python3"
 								  },
 								  "language_info": {
 								   "codemirror_mode": {
 								    "name": "ipython",
 								    "version": 3
 								   },
 								   "file_extension": ".py",
 								   "mimetype": "text/x-python",
 								   "name": "python",
 								   "nbconvert_exporter": "python",
 								   "pygments_lexer": "ipython3",
-												Require and upgrade to Python 3.8

											
										
										
											2021-10-17 03:27:34 +02:00
+								   "version": "3.8.12"
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								  },
-												Update chapters 1, 2 and 4

											
										
										
											2016-09-27 16:39:16 +02:00
+								  "nav_menu": {
 								   "height": "279px",
 								   "width": "309px"
 								  },
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								  "toc": {
-												Chapter 2:  Improvement of code in "Setup" & "Get the data"

											
										
										
											2017-10-15 23:14:05 +02:00
+								   "nav_menu": {},
-												Update chapters 1, 2 and 4

											
										
										
											2016-09-27 16:39:16 +02:00
+								   "number_sections": true,
 								   "sideBar": true,
-												Chapter 2:  Improvement of code in "Setup" & "Get the data"

											
										
										
											2017-10-15 23:14:05 +02:00
+								   "skip_h1_title": false,
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "toc_cell": false,
-												Chapter 2:  Improvement of code in "Setup" & "Get the data"

											
										
										
											2017-10-15 23:14:05 +02:00
+								   "toc_position": {},
-												Update chapters 1, 2 and 4

											
										
										
											2016-09-27 16:39:16 +02:00
+								   "toc_section_display": "block",
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								   "toc_window_display": false
 								  }
 								 },
 								 "nbformat": 4,
-												Update notebooks to latest nbformat

											
										
										
											2020-04-06 09:13:12 +02:00
+								 "nbformat_minor": 4
-												Many small fixes in end_to_end_project.ipynb

											
										
										
											2016-05-07 17:41:41 +02:00
+								}