handson-ml/end_to_end_project.ipynb

1313 lines
32 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Chapter 2 – End-to-end Machine Learning project**\n",
"\n",
"*Welcome to Machine Learning Housing Corp.! Your task is to predict median house values in Californian districts, given a number of features from these districts.*"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Setup"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"First, let's make sure this notebook works well in both Python 2 and 3:"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from __future__ import division, print_function, unicode_literals\n",
"\n",
"import os\n",
"\n",
"%matplotlib inline\n",
"import matplotlib.pyplot as plt\n",
"plt.rcParams['axes.labelsize'] = 14\n",
"plt.rcParams['xtick.labelsize'] = 12\n",
"plt.rcParams['ytick.labelsize'] = 12\n",
"\n",
"PROJECT_ROOT_DIR = \".\"\n",
"CHAPTER_ID = \"end_to_end_project\"\n",
"\n",
"def save_fig(fig_id):\n",
"    \"\"\"Save the current matplotlib figure as images/<CHAPTER_ID>/<fig_id>.png at 300 dpi.\"\"\"\n",
"    images_dir = os.path.join(PROJECT_ROOT_DIR, \"images\", CHAPTER_ID)\n",
"    # savefig() does not create missing directories, so make sure the target exists.\n",
"    os.makedirs(images_dir, exist_ok=True)\n",
"    path = os.path.join(images_dir, fig_id + \".png\")\n",
"    print(\"Saving figure\", fig_id)\n",
"    plt.tight_layout()\n",
"    plt.savefig(path, format='png', dpi=300)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Get the data"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"DATASETS_URL = \"https://github.com/ageron/ml-notebooks/raw/master/datasets\""
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import os\n",
"import tarfile\n",
"import urllib.request\n",
"\n",
"HOUSING_PATH = \"datasets/housing\"\n",
"HOUSING_URL = DATASETS_URL + \"/housing/housing.tgz\"\n",
"\n",
"def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):\n",
"    \"\"\"Download the housing tarball and unpack it into `housing_path`.\n",
"\n",
"    housing_url  -- URL of the .tgz archive to download.\n",
"    housing_path -- local directory (created if missing) that receives both\n",
"                    the downloaded archive and its extracted contents.\n",
"    \"\"\"\n",
"    os.makedirs(housing_path, exist_ok=True)\n",
"    tgz_path = os.path.join(housing_path, \"housing.tgz\")\n",
"    urllib.request.urlretrieve(housing_url, tgz_path)\n",
"    # The context manager closes the archive even if extraction fails.\n",
"    # NOTE(review): extractall() trusts member paths; only use it on a trusted archive.\n",
"    with tarfile.open(tgz_path) as housing_tgz:\n",
"        housing_tgz.extractall(path=housing_path)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"fetch_housing_data()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"def load_housing_data(housing_path=HOUSING_PATH):\n",
"    \"\"\"Read `housing.csv` from the `housing_path` directory into a DataFrame.\"\"\"\n",
"    return pd.read_csv(os.path.join(housing_path, \"housing.csv\"))"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"housing = load_housing_data()\n",
"housing.head()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"housing.info()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"housing[\"ocean_proximity\"].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"housing.describe()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"%matplotlib inline\n",
"import matplotlib.pyplot as plt\n",
"housing.hist(bins=50, figsize=(11,8))\n",
"save_fig(\"attribute_histogram_plots\")\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import numpy as np\n",
"import numpy.random as rnd\n",
"rnd.seed(42) # to make this notebook's output identical at every run\n",
"\n",
"def split_train_test(data, test_ratio):\n",
"    \"\"\"Randomly partition `data` into (train, test), with a `test_ratio` share of rows in test.\"\"\"\n",
"    n_rows = len(data)\n",
"    order = rnd.permutation(n_rows)\n",
"    n_test = int(n_rows * test_ratio)\n",
"    return data.iloc[order[n_test:]], data.iloc[order[:n_test]]"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"train_set, test_set = split_train_test(housing, 0.2)\n",
"print(len(train_set), len(test_set))"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import hashlib\n",
"\n",
"import numpy as np\n",
"\n",
"def test_set_check(identifier, test_ratio, hash):\n",
"    \"\"\"Return True when the last digest byte of `identifier` is below 256 * `test_ratio`.\n",
"\n",
"    The decision depends only on the identifier, so an instance keeps its\n",
"    train/test assignment across runs.\n",
"    \"\"\"\n",
"    # np.int64 hands the hash function a fixed 8-byte buffer (hashlib rejects a\n",
"    # plain int); bytearray makes the last byte an int on both Python 2 and 3.\n",
"    last_byte = bytearray(hash(np.int64(identifier)).digest())[-1]\n",
"    return last_byte < 256 * test_ratio\n",
"\n",
"def split_train_test_by_id(data, test_ratio, id_column, hash=hashlib.md5):\n",
"    \"\"\"Split `data` into (train, test) by hashing the values of `id_column`.\"\"\"\n",
"    ids = data[id_column]\n",
"    in_test_set = ids.apply(lambda id_: test_set_check(id_, test_ratio, hash))\n",
"    return data.loc[~in_test_set], data.loc[in_test_set]"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"housing_with_id = housing.reset_index() # adds an `index` column\n",
"train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, \"index\")\n",
"test_set.head()"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"try:\n",
"    from sklearn.model_selection import train_test_split\n",
"except ImportError: # scikit-learn < 0.18 only has the old module\n",
"    from sklearn.cross_validation import train_test_split\n",
"\n",
"train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)\n",
"test_set.head()"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"housing[\"median_income\"].hist()"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"housing[\"income_cat\"] = np.ceil(housing[\"median_income\"] / 1.5)\n",
"# Cap the category at 5. Assign the result back instead of mutating the column\n",
"# through a chained inplace call, which pandas does not guarantee to propagate.\n",
"housing[\"income_cat\"] = housing[\"income_cat\"].where(housing[\"income_cat\"] < 5, 5.0)\n",
"housing[\"income_cat\"].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from sklearn.cross_validation import StratifiedShuffleSplit\n",
"\n",
"split = StratifiedShuffleSplit(housing[\"income_cat\"], test_size=0.2, random_state=42)\n",
"train_index, test_index = next(iter(split))\n",
"# The splitter yields positional indices, so select rows by position (.iloc);\n",
"# label-based .loc only matches while the frame keeps its default 0..n-1 index.\n",
"strat_train_set = housing.iloc[train_index]\n",
"strat_test_set = housing.iloc[test_index]"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def income_cat_proportions(data):\n",
" return data[\"income_cat\"].value_counts() / len(data)\n",
"\n",
"train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)\n",
"\n",
"compare_props = pd.DataFrame({\n",
" \"Overall\": income_cat_proportions(housing),\n",
" \"Stratified\": income_cat_proportions(strat_test_set),\n",
" \"Random\": income_cat_proportions(test_set),\n",
"}).sort_index()\n",
"compare_props[\"Rand. %error\"] = 100 * compare_props[\"Random\"] / compare_props[\"Overall\"] - 100\n",
"compare_props[\"Strat. %error\"] = 100 * compare_props[\"Stratified\"] / compare_props[\"Overall\"] - 100"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"compare_props"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Named `subset` rather than `set`, so the built-in type is not shadowed in later cells.\n",
"for subset in (strat_train_set, strat_test_set):\n",
"    # errors=\"ignore\" keeps the cell re-runnable once the column is already gone.\n",
"    subset.drop(\"income_cat\", axis=1, inplace=True, errors=\"ignore\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Discover and visualize the data to gain insights"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"housing = strat_train_set.copy()"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"housing.plot(kind=\"scatter\", x=\"longitude\", y=\"latitude\")\n",
"save_fig(\"bad_visualization\")"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"housing.plot(kind=\"scatter\", x=\"longitude\", y=\"latitude\", alpha=0.1)\n",
"save_fig(\"better_visualization\")"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [],
"source": [
"housing.plot(kind=\"scatter\", x=\"longitude\", y=\"latitude\",\n",
" s=housing['population']/100, label=\"population\",\n",
" c=\"median_house_value\", cmap=plt.get_cmap(\"jet\"),\n",
" colorbar=True, alpha=0.4, figsize=(10,7),\n",
")\n",
"plt.legend()\n",
"save_fig(\"housing_prices_scatterplot\")\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import matplotlib.image as mpimg\n",
"california_img=mpimg.imread(PROJECT_ROOT_DIR + '/images/end_to_end_project/california.png')\n",
"ax = housing.plot(kind=\"scatter\", x=\"longitude\", y=\"latitude\", figsize=(10,7),\n",
" s=housing['population']/100, label=\"Population\",\n",
" c=\"median_house_value\", cmap=plt.get_cmap(\"jet\"),\n",
" colorbar=False, alpha=0.4,\n",
" )\n",
"plt.imshow(california_img, extent=[-124.55, -113.80, 32.45, 42.05], alpha=0.5)\n",
"plt.ylabel(\"Latitude\", fontsize=14)\n",
"plt.xlabel(\"Longitude\", fontsize=14)\n",
"\n",
"prices = housing[\"median_house_value\"]\n",
"tick_values = np.linspace(prices.min(), prices.max(), 11)\n",
"cbar = plt.colorbar()\n",
"cbar.ax.set_yticklabels([\"$%dk\"%(round(v/1000)) for v in tick_values], fontsize=14)\n",
"cbar.set_label('Median House Value', fontsize=16)\n",
"\n",
"plt.legend(fontsize=16)\n",
"save_fig(\"california_housing_prices\")\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"corr_matrix = housing.corr()\n",
"corr_matrix[\"median_house_value\"].sort_values(ascending=False)"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"housing.plot(kind=\"scatter\", x=\"median_income\", y=\"median_house_value\",\n",
" alpha=0.3)\n",
"plt.axis([0, 16, 0, 550000])\n",
"save_fig(\"income_vs_house_value_scatterplot\")\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"try:\n",
"    from pandas.plotting import scatter_matrix\n",
"except ImportError: # pandas < 0.20 only has the old location\n",
"    from pandas.tools.plotting import scatter_matrix\n",
"\n",
"attributes = [\"median_house_value\", \"median_income\", \"total_rooms\", \"housing_median_age\"]\n",
"scatter_matrix(housing[attributes], figsize=(11, 8))\n",
"save_fig(\"scatter_matrix_plot\")\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Rooms are divided by the number of households (not by population), matching\n",
"# both the column name and the ratio CombinedAttributesAdder computes later.\n",
"housing[\"rooms_per_household\"] = housing[\"total_rooms\"] / housing[\"households\"]\n",
"housing[\"bedrooms_per_room\"] = housing[\"total_bedrooms\"] / housing[\"total_rooms\"]\n",
"housing[\"population_per_household\"] = housing[\"population\"] / housing[\"households\"]"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"corr_matrix = housing.corr()\n",
"corr_matrix[\"median_house_value\"].sort_values(ascending=False)"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"housing.plot(kind=\"scatter\", x=\"rooms_per_household\", y=\"median_house_value\",\n",
" alpha=0.2)\n",
"plt.axis([0, 5, 0, 520000])\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"housing.describe()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Prepare the data for Machine Learning algorithms"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"housing = strat_train_set.drop(\"median_house_value\", axis=1)\n",
"housing_labels = strat_train_set[\"median_house_value\"].copy()"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"housing_copy = housing.copy().iloc[21:24]\n",
"housing_copy"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"housing_copy.dropna(subset=[\"total_bedrooms\"]) # option 1"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"housing_copy = housing.copy().iloc[21:24]\n",
"housing_copy.drop(\"total_bedrooms\", axis=1) # option 2"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"housing_copy = housing.iloc[21:24].copy()\n",
"# Impute with the median of the whole training set, not of the 3-row sample.\n",
"median = housing[\"total_bedrooms\"].median()\n",
"housing_copy[\"total_bedrooms\"] = housing_copy[\"total_bedrooms\"].fillna(median) # option 3\n",
"housing_copy"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from sklearn.preprocessing import Imputer\n",
"\n",
"imputer = Imputer(strategy='median')\n",
"housing_num = housing.drop(\"ocean_proximity\", axis=1)\n",
"imputer.fit(housing_num)\n",
"X = imputer.transform(housing_num)\n",
"housing_tr = pd.DataFrame(X, columns=housing_num.columns)\n",
"housing_tr.iloc[21:24]"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"imputer.statistics_"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"housing_num.median().values"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"imputer.strategy"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"housing_tr = pd.DataFrame(X, columns=housing_num.columns)\n",
"housing_tr.head()"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from sklearn.preprocessing import LabelEncoder\n",
"\n",
"encoder = LabelEncoder()\n",
"housing_cat = housing[\"ocean_proximity\"]\n",
"housing_cat_encoded = encoder.fit_transform(housing_cat)\n",
"housing_cat_encoded"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"print(encoder.classes_)"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from sklearn.preprocessing import OneHotEncoder\n",
"\n",
"encoder = OneHotEncoder()\n",
"housing_cat_1hot = encoder.fit_transform(housing_cat_encoded.reshape(-1,1))\n",
"housing_cat_1hot"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"housing_cat_1hot.toarray()"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from sklearn.preprocessing import LabelBinarizer\n",
"\n",
"encoder = LabelBinarizer()\n",
"encoder.fit_transform(housing_cat)"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from sklearn.base import BaseEstimator, TransformerMixin\n",
"\n",
"rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6\n",
"\n",
"class CombinedAttributesAdder(BaseEstimator, TransformerMixin):\n",
"    \"\"\"Append ratio columns built from the rooms, bedrooms, population and household columns.\n",
"\n",
"    Columns are addressed by the positions `rooms_ix`, `bedrooms_ix`,\n",
"    `population_ix` and `household_ix` defined above, so `X` must be a 2-D\n",
"    array that supports `X[:, i]` indexing.\n",
"    \"\"\"\n",
"    def __init__(self, add_bedrooms_per_room = True): # no *args or **kargs\n",
"        self.add_bedrooms_per_room = add_bedrooms_per_room\n",
"    def fit(self, X, y=None):\n",
"        return self # nothing else to do\n",
"    def transform(self, X, y=None):\n",
"        \"\"\"Return `X` with the ratio columns appended on the right.\"\"\"\n",
"        rooms_per_household = X[:, rooms_ix] / X[:, household_ix]\n",
"        population_per_household = X[:, population_ix] / X[:, household_ix]\n",
"        if self.add_bedrooms_per_room:\n",
"            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]\n",
"            return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]\n",
"        else:\n",
"            return np.c_[X, rooms_per_household, population_per_household]\n",
"\n",
"attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)\n",
"housing_extra_attribs = attr_adder.transform(housing.values)\n",
"\n",
"housing_extra_attribs = pd.DataFrame(housing_extra_attribs, columns=list(housing.columns)+[\"rooms_per_household\", \"population_per_household\"])\n",
"housing_extra_attribs.head()"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from sklearn.pipeline import Pipeline\n",
"from sklearn.preprocessing import StandardScaler\n",
"\n",
"num_pipeline = Pipeline([\n",
" ('imputer', Imputer(strategy=\"median\")),\n",
" ('attribs_adder', CombinedAttributesAdder()),\n",
" ('std_scaler', StandardScaler()),\n",
" ])\n",
"\n",
"num_pipeline.fit_transform(housing_num)"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from sklearn.pipeline import FeatureUnion\n",
"\n",
"class DataFrameSelector(BaseEstimator, TransformerMixin):\n",
"    \"\"\"Select the columns named in `attribute_names` and return their `.values`.\"\"\"\n",
"    def __init__(self, attribute_names):\n",
"        self.attribute_names = attribute_names\n",
"    def fit(self, X, y=None):\n",
"        return self\n",
"    def transform(self, X):\n",
"        return X[self.attribute_names].values\n",
"\n",
"num_attribs = list(housing_num)\n",
"cat_attribs = [\"ocean_proximity\"]\n",
"\n",
"num_pipeline = Pipeline([\n",
"        ('selector', DataFrameSelector(num_attribs)),\n",
"        ('imputer', Imputer(strategy=\"median\")),\n",
"        ('attribs_adder', CombinedAttributesAdder()),\n",
"        ('std_scaler', StandardScaler()),\n",
"    ])\n",
"\n",
"cat_pipeline = Pipeline([\n",
"        ('selector', DataFrameSelector(cat_attribs)),\n",
"        ('label_binarizer', LabelBinarizer()),\n",
"    ])\n",
"\n",
"preparation_pipeline = FeatureUnion(transformer_list=[\n",
"        (\"num_pipeline\", num_pipeline),\n",
"        (\"cat_pipeline\", cat_pipeline),\n",
"    ])\n"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"housing_prepared = preparation_pipeline.fit_transform(housing)\n",
"housing_prepared"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"housing_prepared.shape"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Select and train a model"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from sklearn.linear_model import LinearRegression\n",
"\n",
"lin_reg = LinearRegression()\n",
"lin_reg.fit(housing_prepared, housing_labels)"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# let's try the full pipeline on a few training instances\n",
"some_data = housing.iloc[:5]\n",
"some_labels = housing_labels.iloc[:5]\n",
"some_data_prepared = preparation_pipeline.transform(some_data)\n",
"\n",
"print(\"Predictions:\\t\", lin_reg.predict(some_data_prepared))\n",
"print(\"Labels:\\t\\t\", list(some_labels))"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from sklearn.metrics import mean_squared_error\n",
"\n",
"housing_predictions = lin_reg.predict(housing_prepared)\n",
"lin_mse = mean_squared_error(housing_labels, housing_predictions)\n",
"lin_rmse = np.sqrt(lin_mse)\n",
"lin_rmse"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from sklearn.metrics import mean_absolute_error\n",
"\n",
"lin_mae = mean_absolute_error(housing_labels, housing_predictions)\n",
"lin_mae"
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from sklearn.tree import DecisionTreeRegressor\n",
"\n",
"tree_reg = DecisionTreeRegressor()\n",
"tree_reg.fit(housing_prepared, housing_labels)\n",
"housing_predictions = tree_reg.predict(housing_prepared)\n",
"tree_mse = mean_squared_error(housing_labels, housing_predictions)\n",
"tree_rmse = np.sqrt(tree_mse)\n",
"tree_rmse"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Fine-tune your model"
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from sklearn.cross_validation import cross_val_score\n",
"\n",
"tree_scores = cross_val_score(tree_reg, housing_prepared, housing_labels,\n",
" scoring=\"mean_squared_error\", cv=10)\n",
"tree_rmse_scores = np.sqrt(-tree_scores)"
]
},
{
"cell_type": "code",
"execution_count": 60,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def display_scores(scores):\n",
"    \"\"\"Print the raw scores followed by their mean and standard deviation.\"\"\"\n",
"    summary = [\n",
"        (\"Scores:\", scores),\n",
"        (\"Mean:\", scores.mean()),\n",
"        (\"Standard deviation:\", scores.std()),\n",
"    ]\n",
"    for label, value in summary:\n",
"        print(label, value)\n",
"\n",
"display_scores(tree_rmse_scores)"
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"lin_scores = cross_val_score(lin_reg, housing_prepared, housing_labels,\n",
" scoring=\"mean_squared_error\", cv=10)\n",
"lin_rmse_scores = np.sqrt(-lin_scores)\n",
"display_scores(lin_rmse_scores)"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from sklearn.ensemble import RandomForestRegressor\n",
"\n",
"forest_reg = RandomForestRegressor()\n",
"forest_reg.fit(housing_prepared, housing_labels)\n",
"housing_predictions = forest_reg.predict(housing_prepared)\n",
"forest_mse = mean_squared_error(housing_labels, housing_predictions)\n",
"forest_rmse = np.sqrt(forest_mse)\n",
"forest_rmse"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from sklearn.cross_validation import cross_val_score\n",
"\n",
"forest_scores = cross_val_score(forest_reg, housing_prepared, housing_labels,\n",
" scoring=\"mean_squared_error\", cv=10)\n",
"forest_rmse_scores = np.sqrt(-forest_scores)\n",
"display_scores(forest_rmse_scores)"
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"scores = cross_val_score(lin_reg, housing_prepared, housing_labels, scoring=\"mean_squared_error\", cv=10)\n",
"pd.Series(np.sqrt(-scores)).describe()"
]
},
{
"cell_type": "code",
"execution_count": 65,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from sklearn.svm import SVR\n",
"\n",
"svm_reg = SVR(kernel=\"linear\")\n",
"svm_reg.fit(housing_prepared, housing_labels)\n",
"housing_predictions = svm_reg.predict(housing_prepared)\n",
"svm_mse = mean_squared_error(housing_labels, housing_predictions)\n",
"svm_rmse = np.sqrt(svm_mse)\n",
"svm_rmse"
]
},
{
"cell_type": "code",
"execution_count": 66,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from sklearn.grid_search import GridSearchCV\n",
"\n",
"param_grid = [\n",
" {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},\n",
" {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},\n",
" ]\n",
"\n",
"forest_reg = RandomForestRegressor()\n",
"grid_search = GridSearchCV(forest_reg, param_grid, cv=5, scoring='mean_squared_error')\n",
"grid_search.fit(housing_prepared, housing_labels)"
]
},
{
"cell_type": "code",
"execution_count": 67,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"grid_search.best_params_"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"grid_search.best_estimator_"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"for params, mean_score, scores in grid_search.grid_scores_:\n",
" print(np.sqrt(-mean_score), np.sqrt(-scores).std(), params)"
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from sklearn.grid_search import RandomizedSearchCV\n",
"from scipy.stats import randint\n",
"\n",
"param_distribs = {\n",
" 'n_estimators': randint(low=1, high=200),\n",
" 'max_features': randint(low=1, high=8),\n",
" }\n",
"\n",
"forest_reg = RandomForestRegressor()\n",
"rnd_search = RandomizedSearchCV(forest_reg, param_distributions=param_distribs,\n",
" n_iter=10, cv=5, scoring='mean_squared_error')\n",
"rnd_search.fit(housing_prepared, housing_labels)"
]
},
{
"cell_type": "code",
"execution_count": 71,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"for params, mean_score, scores in rnd_search.grid_scores_:\n",
" print(np.sqrt(-mean_score), np.sqrt(-scores).std(), params)"
]
},
{
"cell_type": "code",
"execution_count": 72,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"feature_importances = grid_search.best_estimator_.feature_importances_\n",
"feature_importances"
]
},
{
"cell_type": "code",
"execution_count": 73,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"extra_attribs = [\"rooms_per_household\", \"population_per_household\", \"bedrooms_per_room\"]\n",
"# Read the category names from the binarizer that actually produced the one-hot\n",
"# columns, rather than from whatever the global `encoder` was last bound to.\n",
"fitted_cat_pipeline = dict(preparation_pipeline.transformer_list)[\"cat_pipeline\"]\n",
"cat_one_hot_attribs = list(fitted_cat_pipeline.named_steps[\"label_binarizer\"].classes_)\n",
"attributes = num_attribs + extra_attribs + cat_one_hot_attribs\n",
"sorted(zip(feature_importances, attributes), reverse=True)"
]
},
{
"cell_type": "code",
"execution_count": 74,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"final_model = grid_search.best_estimator_\n",
"\n",
"X_test = strat_test_set.drop(\"median_house_value\", axis=1)\n",
"y_test = strat_test_set[\"median_house_value\"].copy()\n",
"\n",
"X_test_transformed = preparation_pipeline.transform(X_test)\n",
"final_predictions = final_model.predict(X_test_transformed)\n",
"\n",
"final_mse = mean_squared_error(y_test, final_predictions)\n",
"final_rmse = np.sqrt(final_mse)\n",
"final_rmse"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Extra material"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Label Binarizer hack\n",
"`LabelBinarizer`'s `fit_transform()` method only accepts one parameter `y` (because it was meant for labels, not predictors), so it does not work in a pipeline where the final estimator is a supervised estimator because in this case its `fit()` method takes two parameters `X` and `y`.\n",
"\n",
"This hack creates a supervision-friendly `LabelBinarizer`."
]
},
{
"cell_type": "code",
"execution_count": 75,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"class SupervisionFriendlyLabelBinarizer(LabelBinarizer):\n",
"    \"\"\"`LabelBinarizer` whose `fit_transform()` accepts, and ignores, a `y` argument.\"\"\"\n",
"    def fit_transform(self, X, y=None):\n",
"        return super(SupervisionFriendlyLabelBinarizer, self).fit_transform(X)\n",
"\n",
"# Replace the LabelBinarizer with a SupervisionFriendlyLabelBinarizer\n",
"cat_pipeline.steps[1] = (\"label_binarizer\", SupervisionFriendlyLabelBinarizer())\n",
"\n",
"# Now you can create a full pipeline with a supervised predictor at the end.\n",
"full_pipeline = Pipeline([\n",
"        (\"preparation\", preparation_pipeline),\n",
"        (\"linear\", LinearRegression())\n",
"    ])\n",
"\n",
"full_pipeline.fit(housing, housing_labels)\n",
"full_pipeline.predict(some_data)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Model persistence using joblib"
]
},
{
"cell_type": "code",
"execution_count": 76,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"try:\n",
"    import joblib\n",
"except ImportError: # fall back to the copy bundled with older scikit-learn\n",
"    from sklearn.externals import joblib"
]
},
{
"cell_type": "code",
"execution_count": 77,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"joblib.dump(final_model, \"my_random_forest_regressor.pkl\")"
]
},
{
"cell_type": "code",
"execution_count": 78,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"final_model_loaded = joblib.load(\"my_random_forest_regressor.pkl\")\n",
"final_model_loaded"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Example SciPy distributions for `RandomizedSearchCV`"
]
},
{
"cell_type": "code",
"execution_count": 79,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from scipy.stats import geom, expon\n",
"geom_distrib=geom(0.5).rvs(10000)\n",
"expon_distrib=expon(scale=1).rvs(10000)\n",
"plt.hist(geom_distrib, bins=50)\n",
"plt.show()\n",
"plt.hist(expon_distrib, bins=50)\n",
"plt.show()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.1"
},
"toc": {
"toc_cell": false,
"toc_number_sections": true,
"toc_threshold": 6,
"toc_window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 0
}