handson-ml/02_end_to_end_machine_learn...

1576 lines
38 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"**Chapter 2 End-to-end Machine Learning project**\n",
"\n",
"*Welcome to Machine Learning Housing Corp.! Your task is to predict median house values in Californian districts, given a number of features from these districts.*\n",
"\n",
"*This notebook contains all the sample code and solutions to the exercices in chapter 2.*"
]
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"# Setup"
]
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"First, let's make sure this notebook works well in both python 2 and 3, import a few common modules, ensure MatplotLib plots figures inline and prepare a function to save the figures:"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"# To support both python 2 and python 3\n",
"from __future__ import division, print_function, unicode_literals\n",
"\n",
"# Common imports\n",
"import numpy as np\n",
"import numpy.random as rnd\n",
"import os\n",
"\n",
"# to make this notebook's output stable across runs\n",
"rnd.seed(42)\n",
"\n",
"# To plot pretty figures\n",
"%matplotlib inline\n",
"import matplotlib\n",
"import matplotlib.pyplot as plt\n",
"plt.rcParams['axes.labelsize'] = 14\n",
"plt.rcParams['xtick.labelsize'] = 12\n",
"plt.rcParams['ytick.labelsize'] = 12\n",
"\n",
"# Where to save the figures\n",
"PROJECT_ROOT_DIR = \".\"\n",
"CHAPTER_ID = \"end_to_end_project\"\n",
"\n",
"def save_fig(fig_id, tight_layout=True):\n",
" path = os.path.join(PROJECT_ROOT_DIR, \"images\", CHAPTER_ID, fig_id + \".png\")\n",
" print(\"Saving figure\", fig_id)\n",
" if tight_layout:\n",
" plt.tight_layout()\n",
" plt.savefig(path, format='png', dpi=300)"
]
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"# Get the data"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"DATASETS_URL = \"https://github.com/ageron/handson-ml/raw/master/datasets\""
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"import os\n",
"import tarfile\n",
"from six.moves import urllib\n",
"\n",
"HOUSING_PATH = \"datasets/housing\"\n",
"HOUSING_URL = DATASETS_URL + \"/housing/housing.tgz\"\n",
"\n",
"def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):\n",
" if not os.path.exists(housing_path):\n",
" os.makedirs(housing_path)\n",
" tgz_path = os.path.join(housing_path, \"housing.tgz\")\n",
" urllib.request.urlretrieve(housing_url, tgz_path)\n",
" housing_tgz = tarfile.open(tgz_path)\n",
" housing_tgz.extractall(path=housing_path)\n",
" housing_tgz.close()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"fetch_housing_data()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"def load_housing_data(housing_path=HOUSING_PATH):\n",
" csv_path = os.path.join(housing_path, \"housing.csv\")\n",
" return pd.read_csv(csv_path)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"housing = load_housing_data()\n",
"housing.head()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"housing.info()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"housing[\"ocean_proximity\"].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"print(housing.describe())"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"%matplotlib inline\n",
"import matplotlib.pyplot as plt\n",
"housing.hist(bins=50, figsize=(11,8))\n",
"save_fig(\"attribute_histogram_plots\")\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"import numpy as np\n",
"import numpy.random as rnd\n",
"rnd.seed(42) # to make this notebook's output identical at every run\n",
"\n",
"def split_train_test(data, test_ratio):\n",
" shuffled_indices = rnd.permutation(len(data))\n",
" test_set_size = int(len(data) * test_ratio)\n",
" test_indices = shuffled_indices[:test_set_size]\n",
" train_indices = shuffled_indices[test_set_size:]\n",
" return data.iloc[train_indices], data.iloc[test_indices]"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"train_set, test_set = split_train_test(housing, 0.2)\n",
"print(len(train_set), len(test_set))"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"import hashlib\n",
"\n",
"def test_set_check(identifier, test_ratio, hash):\n",
" return bytearray(hash(np.int64(identifier)).digest())[-1] < 256 * test_ratio\n",
"\n",
"def split_train_test_by_id(data, test_ratio, id_column, hash=hashlib.md5):\n",
" ids = data[id_column]\n",
" in_test_set = ids.apply(lambda id_: test_set_check(id_, test_ratio, hash))\n",
" return data.loc[~in_test_set], data.loc[in_test_set]"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"housing_with_id = housing.reset_index() # adds an `index` column\n",
"train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, \"index\")\n",
"test_set.head()"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split\n",
"\n",
"train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)\n",
"test_set.head()"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"housing[\"median_income\"].hist()"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"housing[\"income_cat\"] = np.ceil(housing[\"median_income\"] / 1.5)\n",
"housing[\"income_cat\"].where(housing[\"income_cat\"] < 5, 5.0, inplace=True)\n",
"housing[\"income_cat\"].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"from sklearn.model_selection import StratifiedShuffleSplit\n",
"\n",
"split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)\n",
"for train_index, test_index in split.split(housing, housing[\"income_cat\"]):\n",
" strat_train_set = housing.loc[train_index]\n",
" strat_test_set = housing.loc[test_index]"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"def income_cat_proportions(data):\n",
" return data[\"income_cat\"].value_counts() / len(data)\n",
"\n",
"train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)\n",
"\n",
"compare_props = pd.DataFrame({\n",
" \"Overall\": income_cat_proportions(housing),\n",
" \"Stratified\": income_cat_proportions(strat_test_set),\n",
" \"Random\": income_cat_proportions(test_set),\n",
"}).sort_index()\n",
"compare_props[\"Rand. %error\"] = 100 * compare_props[\"Random\"] / compare_props[\"Overall\"] - 100\n",
"compare_props[\"Strat. %error\"] = 100 * compare_props[\"Stratified\"] / compare_props[\"Overall\"] - 100"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"compare_props"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"for set in (strat_train_set, strat_test_set):\n",
" set.drop(\"income_cat\", axis=1, inplace=True)"
]
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"# Discover and visualize the data to gain insights"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"housing = strat_train_set.copy()"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"housing.plot(kind=\"scatter\", x=\"longitude\", y=\"latitude\")\n",
"save_fig(\"bad_visualization_plot\")"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"housing.plot(kind=\"scatter\", x=\"longitude\", y=\"latitude\", alpha=0.1)\n",
"save_fig(\"better_visualization_plot\")"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true,
"scrolled": true
},
"outputs": [],
"source": [
"housing.plot(kind=\"scatter\", x=\"longitude\", y=\"latitude\",\n",
" s=housing['population']/100, label=\"population\",\n",
" c=\"median_house_value\", cmap=plt.get_cmap(\"jet\"),\n",
" colorbar=True, alpha=0.4, figsize=(10,7),\n",
")\n",
"plt.legend()\n",
"save_fig(\"housing_prices_scatterplot\")\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"import matplotlib.image as mpimg\n",
"california_img=mpimg.imread(PROJECT_ROOT_DIR + '/images/end_to_end_project/california.png')\n",
"ax = housing.plot(kind=\"scatter\", x=\"longitude\", y=\"latitude\", figsize=(10,7),\n",
" s=housing['population']/100, label=\"Population\",\n",
" c=\"median_house_value\", cmap=plt.get_cmap(\"jet\"),\n",
" colorbar=False, alpha=0.4,\n",
" )\n",
"plt.imshow(california_img, extent=[-124.55, -113.80, 32.45, 42.05], alpha=0.5)\n",
"plt.ylabel(\"Latitude\", fontsize=14)\n",
"plt.xlabel(\"Longitude\", fontsize=14)\n",
"\n",
"prices = housing[\"median_house_value\"]\n",
"tick_values = np.linspace(prices.min(), prices.max(), 11)\n",
"cbar = plt.colorbar()\n",
"cbar.ax.set_yticklabels([\"$%dk\"%(round(v/1000)) for v in tick_values], fontsize=14)\n",
"cbar.set_label('Median House Value', fontsize=16)\n",
"\n",
"plt.legend(fontsize=16)\n",
"save_fig(\"california_housing_prices_plot\")\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"corr_matrix = housing.corr()\n",
"corr_matrix[\"median_house_value\"].sort_values(ascending=False)"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"housing.plot(kind=\"scatter\", x=\"median_income\", y=\"median_house_value\",\n",
" alpha=0.3)\n",
"plt.axis([0, 16, 0, 550000])\n",
"save_fig(\"income_vs_house_value_scatterplot\")\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"from pandas.tools.plotting import scatter_matrix\n",
"\n",
"attributes = [\"median_house_value\", \"median_income\", \"total_rooms\", \"housing_median_age\"]\n",
"scatter_matrix(housing[attributes], figsize=(11, 8))\n",
"save_fig(\"scatter_matrix_plot\")\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"housing[\"rooms_per_household\"] = housing[\"total_rooms\"] / housing[\"population\"]\n",
"housing[\"bedrooms_per_room\"] = housing[\"total_bedrooms\"] / housing[\"total_rooms\"]\n",
"housing[\"population_per_household\"] = housing[\"population\"] / housing[\"households\"]"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"corr_matrix = housing.corr()\n",
"corr_matrix[\"median_house_value\"].sort_values(ascending=False)"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"housing.plot(kind=\"scatter\", x=\"rooms_per_household\", y=\"median_house_value\",\n",
" alpha=0.2)\n",
"plt.axis([0, 5, 0, 520000])\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"housing.describe()"
]
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"# Prepare the data for Machine Learning algorithms"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"housing = strat_train_set.drop(\"median_house_value\", axis=1)\n",
"housing_labels = strat_train_set[\"median_house_value\"].copy()"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"housing_copy = housing.copy().iloc[21:24]\n",
"housing_copy"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"housing_copy.dropna(subset=[\"total_bedrooms\"]) # option 1"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"housing_copy = housing.copy().iloc[21:24]\n",
"housing_copy.drop(\"total_bedrooms\", axis=1) # option 2"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"housing_copy = housing.copy().iloc[21:24]\n",
"median = housing_copy[\"total_bedrooms\"].median()\n",
"housing_copy[\"total_bedrooms\"].fillna(median, inplace=True) # option 3\n",
"housing_copy"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"from sklearn.preprocessing import Imputer\n",
"\n",
"imputer = Imputer(strategy='median')\n",
"housing_num = housing.drop(\"ocean_proximity\", axis=1)\n",
"imputer.fit(housing_num)\n",
"X = imputer.transform(housing_num)\n",
"housing_tr = pd.DataFrame(X, columns=housing_num.columns)\n",
"housing_tr.iloc[21:24]"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"imputer.statistics_"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"housing_num.median().values"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"imputer.strategy"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"housing_tr = pd.DataFrame(X, columns=housing_num.columns)\n",
"housing_tr.head()"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"from sklearn.preprocessing import LabelEncoder\n",
"\n",
"encoder = LabelEncoder()\n",
"housing_cat = housing[\"ocean_proximity\"]\n",
"housing_cat_encoded = encoder.fit_transform(housing_cat)\n",
"housing_cat_encoded"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"print(encoder.classes_)"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"from sklearn.preprocessing import OneHotEncoder\n",
"\n",
"encoder = OneHotEncoder()\n",
"housing_cat_1hot = encoder.fit_transform(housing_cat_encoded.reshape(-1,1))\n",
"housing_cat_1hot"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"housing_cat_1hot.toarray()"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"from sklearn.preprocessing import LabelBinarizer\n",
"\n",
"encoder = LabelBinarizer()\n",
"encoder.fit_transform(housing_cat)"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"from sklearn.base import BaseEstimator, TransformerMixin\n",
"\n",
"rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6\n",
"\n",
"class CombinedAttributesAdder(BaseEstimator, TransformerMixin):\n",
" def __init__(self, add_bedrooms_per_room = True): # no *args or **kargs\n",
" self.add_bedrooms_per_room = add_bedrooms_per_room\n",
" def fit(self, X, y=None):\n",
" return self # nothing else to do\n",
" def transform(self, X, y=None):\n",
" rooms_per_household = X[:, rooms_ix] / X[:, household_ix]\n",
" population_per_household = X[:, population_ix] / X[:, household_ix]\n",
" if self.add_bedrooms_per_room:\n",
" bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]\n",
" return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]\n",
" else:\n",
" return np.c_[X, rooms_per_household, population_per_household]\n",
"\n",
"attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)\n",
"housing_extra_attribs = attr_adder.transform(housing.values)\n",
"\n",
"housing_extra_attribs = pd.DataFrame(housing_extra_attribs, columns=list(housing.columns)+[\"rooms_per_household\", \"population_per_household\"])\n",
"housing_extra_attribs.head()"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"from sklearn.pipeline import Pipeline\n",
"from sklearn.preprocessing import StandardScaler\n",
"\n",
"num_pipeline = Pipeline([\n",
" ('imputer', Imputer(strategy=\"median\")),\n",
" ('attribs_adder', CombinedAttributesAdder()),\n",
" ('std_scaler', StandardScaler()),\n",
" ])\n",
"\n",
"num_pipeline.fit_transform(housing_num)"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"from sklearn.pipeline import FeatureUnion\n",
"\n",
"class DataFrameSelector(BaseEstimator, TransformerMixin):\n",
" def __init__(self, attribute_names):\n",
" self.attribute_names = attribute_names\n",
" def fit(self, X, y=None):\n",
" return self\n",
" def transform(self, X):\n",
" return X[self.attribute_names].values\n",
"\n",
"num_attribs = list(housing_num)\n",
"cat_attribs = [\"ocean_proximity\"]\n",
"\n",
"num_pipeline = Pipeline([\n",
" ('selector', DataFrameSelector(num_attribs)),\n",
" ('imputer', Imputer(strategy=\"median\")),\n",
" ('attribs_adder', CombinedAttributesAdder()),\n",
" ('std_scaler', StandardScaler()),\n",
" ])\n",
"\n",
"cat_pipeline = Pipeline([\n",
" ('selector', DataFrameSelector(cat_attribs)),\n",
" ('label_binarizer', LabelBinarizer()),\n",
" ])\n",
"\n",
"preparation_pipeline = FeatureUnion(transformer_list=[\n",
" (\"num_pipeline\", num_pipeline),\n",
" (\"cat_pipeline\", cat_pipeline),\n",
" ])\n"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"housing_prepared = preparation_pipeline.fit_transform(housing)\n",
"housing_prepared"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"housing_prepared.shape"
]
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"# Prepare the data for Machine Learning algorithms"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"from sklearn.linear_model import LinearRegression\n",
"\n",
"lin_reg = LinearRegression()\n",
"lin_reg.fit(housing_prepared, housing_labels)"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"# let's try the full pipeline on a few training instances\n",
"some_data = housing.iloc[:5]\n",
"some_labels = housing_labels.iloc[:5]\n",
"some_data_prepared = preparation_pipeline.transform(some_data)\n",
"\n",
"print(\"Predictions:\\t\", lin_reg.predict(some_data_prepared))\n",
"print(\"Labels:\\t\\t\", list(some_labels))"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"from sklearn.metrics import mean_squared_error\n",
"\n",
"housing_predictions = lin_reg.predict(housing_prepared)\n",
"lin_mse = mean_squared_error(housing_labels, housing_predictions)\n",
"lin_rmse = np.sqrt(lin_mse)\n",
"lin_rmse"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"from sklearn.metrics import mean_absolute_error\n",
"\n",
"lin_mae = mean_absolute_error(housing_labels, housing_predictions)\n",
"lin_mae"
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"from sklearn.tree import DecisionTreeRegressor\n",
"\n",
"tree_reg = DecisionTreeRegressor()\n",
"tree_reg.fit(housing_prepared, housing_labels)\n",
"housing_predictions = tree_reg.predict(housing_prepared)\n",
"tree_mse = mean_squared_error(housing_labels, housing_predictions)\n",
"tree_rmse = np.sqrt(tree_mse)\n",
"tree_rmse"
]
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"# Fine-tune your model"
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"from sklearn.model_selection import cross_val_score\n",
"\n",
"tree_scores = cross_val_score(tree_reg, housing_prepared, housing_labels,\n",
" scoring=\"neg_mean_squared_error\", cv=10)\n",
"tree_rmse_scores = np.sqrt(-tree_scores)"
]
},
{
"cell_type": "code",
"execution_count": 60,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"def display_scores(scores):\n",
" print(\"Scores:\", scores)\n",
" print(\"Mean:\", scores.mean())\n",
" print(\"Standard deviation:\", scores.std())\n",
"\n",
"display_scores(tree_rmse_scores)"
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"lin_scores = cross_val_score(lin_reg, housing_prepared, housing_labels,\n",
" scoring=\"neg_mean_squared_error\", cv=10)\n",
"lin_rmse_scores = np.sqrt(-lin_scores)\n",
"display_scores(lin_rmse_scores)"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"from sklearn.ensemble import RandomForestRegressor\n",
"\n",
"forest_reg = RandomForestRegressor()\n",
"forest_reg.fit(housing_prepared, housing_labels)\n",
"housing_predictions = forest_reg.predict(housing_prepared)\n",
"forest_mse = mean_squared_error(housing_labels, housing_predictions)\n",
"forest_rmse = np.sqrt(forest_mse)\n",
"forest_rmse"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"from sklearn.model_selection import cross_val_score\n",
"\n",
"forest_scores = cross_val_score(forest_reg, housing_prepared, housing_labels,\n",
" scoring=\"neg_mean_squared_error\", cv=10)\n",
"forest_rmse_scores = np.sqrt(-forest_scores)\n",
"display_scores(forest_rmse_scores)"
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"scores = cross_val_score(lin_reg, housing_prepared, housing_labels, scoring=\"neg_mean_squared_error\", cv=10)\n",
"pd.Series(np.sqrt(-scores)).describe()"
]
},
{
"cell_type": "code",
"execution_count": 65,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"from sklearn.svm import SVR\n",
"\n",
"svm_reg = SVR(kernel=\"linear\")\n",
"svm_reg.fit(housing_prepared, housing_labels)\n",
"housing_predictions = svm_reg.predict(housing_prepared)\n",
"svm_mse = mean_squared_error(housing_labels, housing_predictions)\n",
"svm_rmse = np.sqrt(svm_mse)\n",
"svm_rmse"
]
},
{
"cell_type": "code",
"execution_count": 66,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"from sklearn.model_selection import GridSearchCV\n",
"\n",
"param_grid = [\n",
" {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},\n",
" {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},\n",
" ]\n",
"\n",
"forest_reg = RandomForestRegressor()\n",
"grid_search = GridSearchCV(forest_reg, param_grid, cv=5, scoring='neg_mean_squared_error')\n",
"grid_search.fit(housing_prepared, housing_labels)"
]
},
{
"cell_type": "code",
"execution_count": 67,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"grid_search.best_params_"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"grid_search.best_estimator_"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"cvres = grid_search.cv_results_\n",
"for mean_score, params in zip(cvres[\"mean_test_score\"], cvres[\"params\"]):\n",
" print(np.sqrt(-mean_score), params)"
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"pd.DataFrame(grid_search.cv_results_)"
]
},
{
"cell_type": "code",
"execution_count": 71,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"from sklearn.model_selection import RandomizedSearchCV\n",
"from scipy.stats import randint\n",
"\n",
"param_distribs = {\n",
" 'n_estimators': randint(low=1, high=200),\n",
" 'max_features': randint(low=1, high=8),\n",
" }\n",
"\n",
"forest_reg = RandomForestRegressor()\n",
"rnd_search = RandomizedSearchCV(forest_reg, param_distributions=param_distribs,\n",
" n_iter=10, cv=5, scoring='neg_mean_squared_error')\n",
"rnd_search.fit(housing_prepared, housing_labels)"
]
},
{
"cell_type": "code",
"execution_count": 72,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"cvres = rnd_search.cv_results_\n",
"for mean_score, params in zip(cvres[\"mean_test_score\"], cvres[\"params\"]):\n",
" print(np.sqrt(-mean_score), params)"
]
},
{
"cell_type": "code",
"execution_count": 73,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"feature_importances = grid_search.best_estimator_.feature_importances_\n",
"feature_importances"
]
},
{
"cell_type": "code",
"execution_count": 74,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"extra_attribs = [\"rooms_per_household\", \"population_per_household\", \"bedrooms_per_room\"]\n",
"cat_one_hot_attribs = list(encoder.classes_)\n",
"attributes = num_attribs + extra_attribs + cat_one_hot_attribs\n",
"sorted(zip(feature_importances, attributes), reverse=True)"
]
},
{
"cell_type": "code",
"execution_count": 75,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"final_model = grid_search.best_estimator_\n",
"\n",
"X_test = strat_test_set.drop(\"median_house_value\", axis=1)\n",
"y_test = strat_test_set[\"median_house_value\"].copy()\n",
"\n",
"X_test_transformed = preparation_pipeline.transform(X_test)\n",
"final_predictions = final_model.predict(X_test_transformed)\n",
"\n",
"final_mse = mean_squared_error(y_test, final_predictions)\n",
"final_rmse = np.sqrt(final_mse)\n",
"final_rmse"
]
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"# Extra material"
]
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"## Label Binarizer hack\n",
"`LabelBinarizer`'s `fit_transform()` method only accepts one parameter `y` (because it was meant for labels, not predictors), so it does not work in a pipeline where the final estimator is a supervised estimator because in this case its `fit()` method takes two parameters `X` and `y`.\n",
"\n",
"This hack creates a supervision-friendly `LabelBinarizer`."
]
},
{
"cell_type": "code",
"execution_count": 76,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"class SupervisionFriendlyLabelBinarizer(LabelBinarizer):\n",
" def fit_transform(self, X, y=None):\n",
" return super(SupervisionFriendlyLabelBinarizer, self).fit_transform(X)\n",
"\n",
"# Replace the Labelbinarizer with a SupervisionFriendlyLabelBinarizer\n",
"cat_pipeline.steps[1] = (\"label_binarizer\", SupervisionFriendlyLabelBinarizer())\n",
"\n",
"# Now you can create a full pipeline with a supervised predictor at the end.\n",
"full_pipeline = Pipeline([\n",
" (\"preparation\", preparation_pipeline),\n",
" (\"linear\", LinearRegression())\n",
" ])\n",
"\n",
"full_pipeline.fit(housing, housing_labels)\n",
"full_pipeline.predict(some_data)"
]
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"## Model persistence using joblib"
]
},
{
"cell_type": "code",
"execution_count": 77,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"from sklearn.externals import joblib"
]
},
{
"cell_type": "code",
"execution_count": 78,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"joblib.dump(final_model, \"my_random_forest_regressor.pkl\")"
]
},
{
"cell_type": "code",
"execution_count": 79,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"final_model_loaded = joblib.load(\"my_random_forest_regressor.pkl\")\n",
"final_model_loaded"
]
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"## Example SciPy distributions for `RandomizedSearchCV`"
]
},
{
"cell_type": "code",
"execution_count": 80,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"from scipy.stats import geom, expon\n",
"geom_distrib=geom(0.5).rvs(10000)\n",
"expon_distrib=expon(scale=1).rvs(10000)\n",
"plt.hist(geom_distrib, bins=50)\n",
"plt.show()\n",
"plt.hist(expon_distrib, bins=50)\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"# Exercise solutions"
]
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"**Coming soon**"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2+"
},
"nav_menu": {
"height": "279px",
"width": "309px"
},
"toc": {
"navigate_menu": true,
"number_sections": true,
"sideBar": true,
"threshold": 6,
"toc_cell": false,
"toc_section_display": "block",
"toc_window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 0
}