{ "cells": [
{ "cell_type": "markdown", "metadata": {}, "source": [
 "**Chapter 2 – End to end Machine Learning project**\n",
 "\n",
 "*Welcome to Machine Learning Housing Corp.! Your task is to predict median house values in Californian districts, given a number of features from these districts.*" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "# Setup" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "First, let's make sure this notebook works well in both python 2 and 3:" ] },
{ "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [], "source": [
 "from __future__ import division, print_function, unicode_literals\n",
 "\n",
 "import os\n",
 "\n",
 "%matplotlib inline\n",
 "import matplotlib.pyplot as plt\n",
 "plt.rcParams['axes.labelsize'] = 14\n",
 "plt.rcParams['xtick.labelsize'] = 12\n",
 "plt.rcParams['ytick.labelsize'] = 12\n",
 "\n",
 "PROJECT_ROOT_DIR = \".\"\n",
 "CHAPTER_ID = \"end_to_end_project\"\n",
 "\n",
 "def save_fig(fig_id):\n",
 "    \"\"\"Save the current matplotlib figure as a 300 dpi PNG.\n",
 "\n",
 "    The file is written to PROJECT_ROOT_DIR/images/CHAPTER_ID/ and named\n",
 "    fig_id + \".png\"; the directory is created when it does not exist yet.\n",
 "    \"\"\"\n",
 "    path = os.path.join(PROJECT_ROOT_DIR, \"images\", CHAPTER_ID, fig_id + \".png\")\n",
 "    # savefig() does not create parent directories, so make sure the target exists.\n",
 "    os.makedirs(os.path.dirname(path), exist_ok=True)\n",
 "    print(\"Saving figure\", fig_id)\n",
 "    plt.tight_layout()\n",
 "    plt.savefig(path, format='png', dpi=300)" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "# Get the data" ] },
{ "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "DATASETS_URL = \"https://github.com/ageron/ml-notebooks/raw/master/datasets\"" ] },
{ "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [], "source": [
 "import os\n",
 "import tarfile\n",
 "import urllib.request\n",
 "\n",
 "HOUSING_PATH = \"datasets/housing\"\n",
 "HOUSING_URL = DATASETS_URL + \"/housing/housing.tgz\"\n",
 "\n",
 "def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):\n",
 "    \"\"\"Download the archive at housing_url and extract it into housing_path.\n",
 "\n",
 "    housing_path is created if needed; the archive is saved there as housing.tgz.\n",
 "    \"\"\"\n",
 "    os.makedirs(housing_path, exist_ok=True)\n",
 "    tgz_path = os.path.join(housing_path, \"housing.tgz\")\n",
 "    urllib.request.urlretrieve(housing_url, tgz_path)\n",
 "    # The with-block closes the archive even when extraction raises.\n",
 "    with tarfile.open(tgz_path) as housing_tgz:\n",
 "        housing_tgz.extractall(path=housing_path)" ] },
{ "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [], "source": [ "fetch_housing_data()" ] },
{ "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": true }, "outputs": [], "source": [
 "import pandas as pd\n",
 "\n",
 "def load_housing_data(housing_path=HOUSING_PATH):\n",
 "    \"\"\"Read housing.csv from housing_path and return it as a DataFrame.\"\"\"\n",
 "    csv_path = os.path.join(housing_path, \"housing.csv\")\n",
 "    return pd.read_csv(csv_path)" ] },
{ "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [], "source": [
 "housing = load_housing_data()\n",
 "housing.head()" ] },
{ "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [], "source": [ "housing.info()" ] },
{ "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [], "source": [ "housing[\"ocean_proximity\"].value_counts()" ] },
{ "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [], "source": [ "housing.describe()" ] },
{ "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [], "source": [
 "%matplotlib inline\n",
 "import matplotlib.pyplot as plt\n",
 "housing.hist(bins=50, figsize=(11,8))\n",
 "save_fig(\"attribute_histogram_plots\")\n",
 "plt.show()" ] },
{ "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [], "source": [
 "import numpy as np\n",
 "import numpy.random as rnd\n",
 "rnd.seed(42) # to make this notebook's output identical at every run\n",
 "\n",
 "def split_train_test(data, test_ratio):\n",
 "    \"\"\"Randomly split data into (train, test) DataFrames.\n",
 "\n",
 "    The first int(len(data) * test_ratio) rows of a random permutation form\n",
 "    the test set; every remaining row goes to the training set.\n",
 "    \"\"\"\n",
 "    shuffled_indices = rnd.permutation(len(data))\n",
 "    test_set_size = int(len(data) * test_ratio)\n",
 "    test_indices = shuffled_indices[:test_set_size]\n",
 "    train_indices = shuffled_indices[test_set_size:]\n",
 "    return data.iloc[train_indices], data.iloc[test_indices]" ] },
{
"cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [], "source": [
 "train_set, test_set = split_train_test(housing, 0.2)\n",
 "print(len(train_set), len(test_set))" ] },
{ "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": false }, "outputs": [], "source": [
 "import hashlib\n",
 "\n",
 "import numpy as np\n",
 "\n",
 "def test_set_check(identifier, test_ratio, hash):\n",
 "    \"\"\"Return True when the instance with this identifier belongs to the test set.\n",
 "\n",
 "    The decision is the last byte of hash(identifier) compared with\n",
 "    256 * test_ratio, so a given identifier always lands on the same side.\n",
 "    \"\"\"\n",
 "    # Hash constructors need a bytes-like object; a bare Python int is rejected,\n",
 "    # so wrap the identifier in np.int64, which exposes its bytes.\n",
 "    return hash(np.int64(identifier)).digest()[-1] < 256 * test_ratio\n",
 "\n",
 "def split_train_test_by_id(data, test_ratio, id_column, hash=hashlib.md5):\n",
 "    \"\"\"Split data into (train, test) by hashing the values of id_column.\"\"\"\n",
 "    ids = data[id_column]\n",
 "    in_test_set = ids.apply(lambda id_: test_set_check(id_, test_ratio, hash))\n",
 "    return data.loc[~in_test_set], data.loc[in_test_set]" ] },
{ "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false }, "outputs": [], "source": [
 "housing_with_id = housing.reset_index()  # adds an `index` column\n",
 "train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, \"index\")\n",
 "test_set.head()" ] },
{ "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false }, "outputs": [], "source": [
 "from sklearn.cross_validation import train_test_split\n",
 "\n",
 "train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)\n",
 "test_set.head()" ] },
{ "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": false }, "outputs": [], "source": [ "housing[\"median_income\"].hist()" ] },
{ "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": false }, "outputs": [], "source": [
 "# Bucket median_income into categories 1..5: ceil(income / 1.5), with everything from 5 upwards merged into 5.\n",
 "housing[\"income_cat\"] = np.ceil(housing[\"median_income\"] / 1.5)\n",
 "housing[\"income_cat\"].where(housing[\"income_cat\"] < 5, 5.0, inplace=True)\n",
 "housing[\"income_cat\"].value_counts()" ] },
{ "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": false }, "outputs": [], "source": [
 "from sklearn.cross_validation import StratifiedShuffleSplit\n",
 "\n",
 "split = StratifiedShuffleSplit(housing[\"income_cat\"], test_size=0.2, 
random_state=42)\n",
 "train_index, test_index = next(iter(split))\n",
 "strat_train_set = housing.loc[train_index]\n",
 "strat_test_set = housing.loc[test_index]" ] },
{ "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": false }, "outputs": [], "source": [
 "def income_cat_proportions(data):\n",
 "    \"\"\"Return the share of rows of data that falls in each income_cat value.\"\"\"\n",
 "    return data[\"income_cat\"].value_counts() / len(data)\n",
 "\n",
 "train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)\n",
 "\n",
 "compare_props = pd.DataFrame({\n",
 "    \"Overall\": income_cat_proportions(housing),\n",
 "    \"Stratified\": income_cat_proportions(strat_test_set),\n",
 "    \"Random\": income_cat_proportions(test_set),\n",
 "}).sort_index()\n",
 "compare_props[\"Rand. %error\"] = 100 * compare_props[\"Random\"] / compare_props[\"Overall\"] - 100\n",
 "compare_props[\"Strat. %error\"] = 100 * compare_props[\"Stratified\"] / compare_props[\"Overall\"] - 100" ] },
{ "cell_type": "code", "execution_count": 20, "metadata": { "collapsed": false }, "outputs": [], "source": [ "compare_props" ] },
{ "cell_type": "code", "execution_count": 21, "metadata": { "collapsed": false }, "outputs": [], "source": [
 "# The loop variable outlives the loop, so it must not be called `set`:\n",
 "# that would replace the builtin with a DataFrame for the rest of the session.\n",
 "for set_ in (strat_train_set, strat_test_set):\n",
 "    set_.drop(\"income_cat\", axis=1, inplace=True)" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "# Discover and visualize the data to gain insights" ] },
{ "cell_type": "code", "execution_count": 22, "metadata": { "collapsed": true }, "outputs": [], "source": [ "housing = strat_train_set.copy()" ] },
{ "cell_type": "code", "execution_count": 23, "metadata": { "collapsed": false }, "outputs": [], "source": [
 "housing.plot(kind=\"scatter\", x=\"longitude\", y=\"latitude\")\n",
 "save_fig(\"bad_visualization\")" ] },
{ "cell_type": "code", "execution_count": 24, "metadata": { "collapsed": false }, "outputs": [], "source": [
 "housing.plot(kind=\"scatter\", x=\"longitude\", y=\"latitude\", alpha=0.1)\n",
 "save_fig(\"better_visualization\")" ] },
{ "cell_type": "code", 
"execution_count": 25, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [], "source": [ "# Marker size follows the district's population (divided by 100); marker colour follows median_house_value.\n", "housing.plot(kind=\"scatter\", x=\"longitude\", y=\"latitude\",\n", " s=housing['population']/100, label=\"population\",\n", " c=\"median_house_value\", cmap=plt.get_cmap(\"jet\"),\n", " colorbar=True, alpha=0.4, figsize=(10,7),\n", ")\n", "plt.legend()\n", "save_fig(\"housing_prices_scatterplot\")\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import matplotlib.image as mpimg\n", "california_img=mpimg.imread(PROJECT_ROOT_DIR + '/images/end_to_end_project/california.png')\n", "# Same scatter plot as the previous cell, but with pandas' own colorbar switched off; one is added by hand below.\n", "ax = housing.plot(kind=\"scatter\", x=\"longitude\", y=\"latitude\", figsize=(10,7),\n", " s=housing['population']/100, label=\"Population\",\n", " c=\"median_house_value\", cmap=plt.get_cmap(\"jet\"),\n", " colorbar=False, alpha=0.4,\n", " )\n", "# Draw the map image on the same axes; extent pins it to the given longitude/latitude bounds.\n", "plt.imshow(california_img, extent=[-124.55, -113.80, 32.45, 42.05], alpha=0.5)\n", "plt.ylabel(\"Latitude\", fontsize=14)\n", "plt.xlabel(\"Longitude\", fontsize=14)\n", "\n", "# 11 evenly spaced prices from the lowest to the highest median house value, used as colorbar labels.\n", "prices = housing[\"median_house_value\"]\n", "tick_values = np.linspace(prices.min(), prices.max(), 11)\n", "cbar = plt.colorbar()\n", "# NOTE(review): the labels are assigned to the colorbar ticks by position; confirm the colorbar really has 11 ticks, otherwise the labels do not line up with the colours.\n", "cbar.ax.set_yticklabels([\"$%dk\"%(round(v/1000)) for v in tick_values], fontsize=14)\n", "cbar.set_label('Median House Value', fontsize=16)\n", "\n", "plt.legend(fontsize=16)\n", "save_fig(\"california_housing_prices\")\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Correlation of each attribute with median_house_value, sorted in descending order.\n", "corr_matrix = housing.corr()\n", "corr_matrix[\"median_house_value\"].sort_values(ascending=False)" ] }, { "cell_type": "code", "execution_count": 28, "metadata": { "collapsed": false }, "outputs": [], "source": [ "housing.plot(kind=\"scatter\", x=\"median_income\", y=\"median_house_value\",\n", " alpha=0.3)\n", "# plt.axis takes [xmin, xmax, ymin, ymax]: income 0 to 16, house value 0 to 550000.\n", "plt.axis([0, 16, 0, 550000])\n", "save_fig(\"income_vs_house_value_scatterplot\")\n", "plt.show()" 
] },
{ "cell_type": "code", "execution_count": 29, "metadata": { "collapsed": false }, "outputs": [], "source": [
 "from pandas.tools.plotting import scatter_matrix\n",
 "\n",
 "attributes = [\"median_house_value\", \"median_income\", \"total_rooms\", \"housing_median_age\"]\n",
 "scatter_matrix(housing[attributes], figsize=(11, 8))\n",
 "save_fig(\"scatter_matrix_plot\")\n",
 "plt.show()" ] },
{ "cell_type": "code", "execution_count": 30, "metadata": { "collapsed": true }, "outputs": [], "source": [
 "# Derived ratio attributes. rooms_per_household divides by the number of households\n",
 "# (not by the population), the same definition CombinedAttributesAdder uses further down.\n",
 "housing[\"rooms_per_household\"] = housing[\"total_rooms\"] / housing[\"households\"]\n",
 "housing[\"bedrooms_per_room\"] = housing[\"total_bedrooms\"] / housing[\"total_rooms\"]\n",
 "housing[\"population_per_household\"] = housing[\"population\"] / housing[\"households\"]" ] },
{ "cell_type": "code", "execution_count": 31, "metadata": { "collapsed": false }, "outputs": [], "source": [
 "corr_matrix = housing.corr()\n",
 "corr_matrix[\"median_house_value\"].sort_values(ascending=False)" ] },
{ "cell_type": "code", "execution_count": 32, "metadata": { "collapsed": false }, "outputs": [], "source": [
 "housing.plot(kind=\"scatter\", x=\"rooms_per_household\", y=\"median_house_value\",\n",
 "             alpha=0.2)\n",
 "plt.axis([0, 5, 0, 520000])\n",
 "plt.show()" ] },
{ "cell_type": "code", "execution_count": 33, "metadata": { "collapsed": false }, "outputs": [], "source": [ "housing.describe()" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "# Prepare the data for Machine Learning algorithms" ] },
{ "cell_type": "code", "execution_count": 34, "metadata": { "collapsed": true }, "outputs": [], "source": [
 "# Separate the predictors from the label; drop() returns a new frame, so strat_train_set is left intact.\n",
 "housing = strat_train_set.drop(\"median_house_value\", axis=1)\n",
 "housing_labels = strat_train_set[\"median_house_value\"].copy()" ] },
{ "cell_type": "code", "execution_count": 35, "metadata": { "collapsed": false }, "outputs": [], "source": [
 "housing_copy = housing.copy().iloc[21:24]\n",
 "housing_copy" ] },
{ "cell_type": "code", "execution_count": 36, "metadata": { 
"collapsed": false }, "outputs": [], "source": [ "housing_copy.dropna(subset=[\"total_bedrooms\"]) # option 1: drop the rows where total_bedrooms is missing" ] }, { "cell_type": "code", "execution_count": 37, "metadata": { "collapsed": false }, "outputs": [], "source": [ "housing_copy = housing.copy().iloc[21:24]\n", "housing_copy.drop(\"total_bedrooms\", axis=1) # option 2: drop the total_bedrooms column altogether" ] }, { "cell_type": "code", "execution_count": 38, "metadata": { "collapsed": false }, "outputs": [], "source": [ "housing_copy = housing.copy().iloc[21:24]\n", "# NOTE: the median is taken over this 3-row slice only, not over the whole of `housing`.\n", "median = housing_copy[\"total_bedrooms\"].median()\n", "housing_copy[\"total_bedrooms\"].fillna(median, inplace=True) # option 3: fill the missing values with the median\n", "housing_copy" ] }, { "cell_type": "code", "execution_count": 39, "metadata": { "collapsed": false }, "outputs": [], "source": [ "from sklearn.preprocessing import Imputer\n", "\n", "imputer = Imputer(strategy='median')\n", "# The ocean_proximity column is dropped before the median imputer is fitted.\n", "housing_num = housing.drop(\"ocean_proximity\", axis=1)\n", "imputer.fit(housing_num)\n", "X = imputer.transform(housing_num)\n", "# No index= is passed, so housing_tr gets a fresh 0..n-1 index instead of housing's row labels.\n", "housing_tr = pd.DataFrame(X, columns=housing_num.columns)\n", "housing_tr.iloc[21:24]" ] }, { "cell_type": "code", "execution_count": 40, "metadata": { "collapsed": false }, "outputs": [], "source": [ "imputer.statistics_" ] }, { "cell_type": "code", "execution_count": 41, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Per-column medians computed with pandas, for comparison with imputer.statistics_ in the previous cell.\n", "housing_num.median().values" ] }, { "cell_type": "code", "execution_count": 42, "metadata": { "collapsed": false }, "outputs": [], "source": [ "imputer.strategy" ] }, { "cell_type": "code", "execution_count": 43, "metadata": { "collapsed": false }, "outputs": [], "source": [ "housing_tr = pd.DataFrame(X, columns=housing_num.columns)\n", "housing_tr.head()" ] }, { "cell_type": "code", "execution_count": 44, "metadata": { "collapsed": false }, "outputs": [], "source": [ "from sklearn.preprocessing import LabelEncoder\n", "\n", "# Encode the ocean_proximity text categories as integer codes.\n", "encoder = LabelEncoder()\n", "housing_cat = housing[\"ocean_proximity\"]\n", "housing_cat_encoded = encoder.fit_transform(housing_cat)\n", 
"housing_cat_encoded" ] },
{ "cell_type": "code", "execution_count": 45, "metadata": { "collapsed": false }, "outputs": [], "source": [ "print(encoder.classes_)" ] },
{ "cell_type": "code", "execution_count": 46, "metadata": { "collapsed": false }, "outputs": [], "source": [
 "from sklearn.preprocessing import OneHotEncoder\n",
 "\n",
 "encoder = OneHotEncoder()\n",
 "housing_cat_1hot = encoder.fit_transform(housing_cat_encoded.reshape(-1,1))\n",
 "housing_cat_1hot" ] },
{ "cell_type": "code", "execution_count": 47, "metadata": { "collapsed": false }, "outputs": [], "source": [ "housing_cat_1hot.toarray()" ] },
{ "cell_type": "code", "execution_count": 48, "metadata": { "collapsed": false }, "outputs": [], "source": [
 "from sklearn.preprocessing import LabelBinarizer\n",
 "\n",
 "encoder = LabelBinarizer()\n",
 "encoder.fit_transform(housing_cat)" ] },
{ "cell_type": "code", "execution_count": 49, "metadata": { "collapsed": false }, "outputs": [], "source": [
 "from sklearn.base import BaseEstimator, TransformerMixin\n",
 "\n",
 "# Look the column positions up by name instead of hard-coding 3, 4, 5, 6,\n",
 "# so the transformer keeps working if the column order of `housing` changes.\n",
 "rooms_ix, bedrooms_ix, population_ix, household_ix = [\n",
 "    list(housing.columns).index(col)\n",
 "    for col in (\"total_rooms\", \"total_bedrooms\", \"population\", \"households\")]\n",
 "\n",
 "class CombinedAttributesAdder(BaseEstimator, TransformerMixin):\n",
 "    \"\"\"Append ratio features to a 2-D array of housing attributes.\n",
 "\n",
 "    transform() adds rooms_per_household and population_per_household and,\n",
 "    when add_bedrooms_per_room is True, bedrooms_per_room. The source columns\n",
 "    are read at the module-level positions rooms_ix, bedrooms_ix,\n",
 "    population_ix and household_ix.\n",
 "    \"\"\"\n",
 "    def __init__(self, add_bedrooms_per_room = True): # no *args or **kargs\n",
 "        self.add_bedrooms_per_room = add_bedrooms_per_room\n",
 "    def fit(self, X, y=None):\n",
 "        return self  # nothing else to do\n",
 "    def transform(self, X, y=None):\n",
 "        rooms_per_household = X[:, rooms_ix] / X[:, household_ix]\n",
 "        population_per_household = X[:, population_ix] / X[:, household_ix]\n",
 "        if self.add_bedrooms_per_room:\n",
 "            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]\n",
 "            return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]\n",
 "        else:\n",
 "            return np.c_[X, rooms_per_household, population_per_household]\n",
 "\n",
 "attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)\n",
 "housing_extra_attribs = attr_adder.transform(housing.values)\n",
 "\n",
 "housing_extra_attribs 
= pd.DataFrame(housing_extra_attribs, columns=list(housing.columns)+[\"rooms_per_household\", \"population_per_household\"])\n",
 "housing_extra_attribs.head()" ] },
{ "cell_type": "code", "execution_count": 50, "metadata": { "collapsed": false }, "outputs": [], "source": [
 "from sklearn.pipeline import Pipeline\n",
 "from sklearn.preprocessing import StandardScaler\n",
 "\n",
 "# Numeric preparation: fill gaps with the median, add the ratio features, then standardise.\n",
 "num_pipeline = Pipeline(steps=[\n",
 "    (\"imputer\", Imputer(strategy=\"median\")),\n",
 "    (\"attribs_adder\", CombinedAttributesAdder()),\n",
 "    (\"std_scaler\", StandardScaler()),\n",
 "])\n",
 "\n",
 "num_pipeline.fit_transform(housing_num)" ] },
{ "cell_type": "code", "execution_count": 51, "metadata": { "collapsed": false }, "outputs": [], "source": [
 "from sklearn.pipeline import FeatureUnion\n",
 "\n",
 "class DataFrameSelector(BaseEstimator, TransformerMixin):\n",
 "    \"\"\"Pick the columns named in attribute_names out of a DataFrame and return their values.\"\"\"\n",
 "\n",
 "    def __init__(self, attribute_names):\n",
 "        self.attribute_names = attribute_names\n",
 "\n",
 "    def fit(self, X, y=None):\n",
 "        # Stateless: there is nothing to learn from the data.\n",
 "        return self\n",
 "\n",
 "    def transform(self, X):\n",
 "        selected = X[self.attribute_names]\n",
 "        return selected.values\n",
 "\n",
 "num_attribs = housing_num.columns.tolist()\n",
 "cat_attribs = [\"ocean_proximity\"]\n",
 "\n",
 "num_pipeline = Pipeline(steps=[\n",
 "    (\"selector\", DataFrameSelector(num_attribs)),\n",
 "    (\"imputer\", Imputer(strategy=\"median\")),\n",
 "    (\"attribs_adder\", CombinedAttributesAdder()),\n",
 "    (\"std_scaler\", StandardScaler()),\n",
 "])\n",
 "\n",
 "cat_pipeline = Pipeline(steps=[\n",
 "    (\"selector\", DataFrameSelector(cat_attribs)),\n",
 "    (\"label_binarizer\", LabelBinarizer()),\n",
 "])\n",
 "\n",
 "# Run both branches on the same input and concatenate their outputs column-wise.\n",
 "preparation_pipeline = FeatureUnion([\n",
 "    (\"num_pipeline\", num_pipeline),\n",
 "    (\"cat_pipeline\", cat_pipeline),\n",
 "])\n" ] },
{ "cell_type": "code", "execution_count": 52, "metadata": { "collapsed": false }, "outputs": [], "source": [
 "housing_prepared = preparation_pipeline.fit_transform(housing)\n",
 "housing_prepared" ] },
{ "cell_type": "code", "execution_count": 53, "metadata": { "collapsed": false }, "outputs": [], "source": [ 
"housing_prepared.shape" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "# Select and train a model" ] },
{ "cell_type": "code", "execution_count": 54, "metadata": { "collapsed": false }, "outputs": [], "source": [
 "from sklearn.linear_model import LinearRegression\n",
 "\n",
 "lin_reg = LinearRegression()\n",
 "lin_reg.fit(housing_prepared, housing_labels)" ] },
{ "cell_type": "code", "execution_count": 55, "metadata": { "collapsed": false }, "outputs": [], "source": [
 "# let's try the full pipeline on a few training instances\n",
 "some_data = housing.iloc[:5]\n",
 "some_labels = housing_labels.iloc[:5]\n",
 "some_data_prepared = preparation_pipeline.transform(some_data)\n",
 "\n",
 "print(\"Predictions:\\t\", lin_reg.predict(some_data_prepared))\n",
 "print(\"Labels:\\t\\t\", list(some_labels))" ] },
{ "cell_type": "code", "execution_count": 56, "metadata": { "collapsed": false }, "outputs": [], "source": [
 "from sklearn.metrics import mean_squared_error\n",
 "\n",
 "# RMSE measured on the training set itself, so it says nothing about generalisation.\n",
 "housing_predictions = lin_reg.predict(housing_prepared)\n",
 "lin_mse = mean_squared_error(housing_labels, housing_predictions)\n",
 "lin_rmse = np.sqrt(lin_mse)\n",
 "lin_rmse" ] },
{ "cell_type": "code", "execution_count": 57, "metadata": { "collapsed": false }, "outputs": [], "source": [
 "from sklearn.metrics import mean_absolute_error\n",
 "\n",
 "lin_mae = mean_absolute_error(housing_labels, housing_predictions)\n",
 "lin_mae" ] },
{ "cell_type": "code", "execution_count": 58, "metadata": { "collapsed": false }, "outputs": [], "source": [
 "from sklearn.tree import DecisionTreeRegressor\n",
 "\n",
 "# Seed the estimator itself: the global rnd.seed(42) only guarantees identical output\n",
 "# for a full in-order run, not when this cell is re-executed on its own.\n",
 "tree_reg = DecisionTreeRegressor(random_state=42)\n",
 "tree_reg.fit(housing_prepared, housing_labels)\n",
 "housing_predictions = tree_reg.predict(housing_prepared)\n",
 "tree_mse = mean_squared_error(housing_labels, housing_predictions)\n",
 "tree_rmse = np.sqrt(tree_mse)\n",
 "tree_rmse" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "# Fine-tune your model" ] },
{ "cell_type": "code", 
"execution_count": 59, "metadata": { "collapsed": false }, "outputs": [], "source": [
 "from sklearn.cross_validation import cross_val_score\n",
 "\n",
 "# The scorer returns negated MSE values, hence the minus sign before the square root.\n",
 "tree_scores = cross_val_score(tree_reg, housing_prepared, housing_labels,\n",
 "                              scoring=\"mean_squared_error\", cv=10)\n",
 "tree_rmse_scores = np.sqrt(-tree_scores)" ] },
{ "cell_type": "code", "execution_count": 60, "metadata": { "collapsed": false }, "outputs": [], "source": [
 "def display_scores(scores):\n",
 "    \"\"\"Print the given array of scores followed by its mean and standard deviation.\"\"\"\n",
 "    print(\"Scores:\", scores)\n",
 "    print(\"Mean:\", scores.mean())\n",
 "    print(\"Standard deviation:\", scores.std())\n",
 "\n",
 "display_scores(tree_rmse_scores)" ] },
{ "cell_type": "code", "execution_count": 61, "metadata": { "collapsed": false }, "outputs": [], "source": [
 "lin_scores = cross_val_score(lin_reg, housing_prepared, housing_labels,\n",
 "                             scoring=\"mean_squared_error\", cv=10)\n",
 "lin_rmse_scores = np.sqrt(-lin_scores)\n",
 "display_scores(lin_rmse_scores)" ] },
{ "cell_type": "code", "execution_count": 62, "metadata": { "collapsed": false }, "outputs": [], "source": [
 "from sklearn.ensemble import RandomForestRegressor\n",
 "\n",
 "# Seed the forest itself so that re-running this cell alone reproduces the same model;\n",
 "# the global rnd.seed(42) only covers a full in-order run.\n",
 "forest_reg = RandomForestRegressor(random_state=42)\n",
 "forest_reg.fit(housing_prepared, housing_labels)\n",
 "housing_predictions = forest_reg.predict(housing_prepared)\n",
 "forest_mse = mean_squared_error(housing_labels, housing_predictions)\n",
 "forest_rmse = np.sqrt(forest_mse)\n",
 "forest_rmse" ] },
{ "cell_type": "code", "execution_count": 63, "metadata": { "collapsed": false }, "outputs": [], "source": [
 "from sklearn.cross_validation import cross_val_score\n",
 "\n",
 "forest_scores = cross_val_score(forest_reg, housing_prepared, housing_labels,\n",
 "                                scoring=\"mean_squared_error\", cv=10)\n",
 "forest_rmse_scores = np.sqrt(-forest_scores)\n",
 "display_scores(forest_rmse_scores)" ] },
{ "cell_type": "code", "execution_count": 64, "metadata": { "collapsed": false }, "outputs": [], "source": [
 "scores = cross_val_score(lin_reg, housing_prepared, housing_labels, 
scoring=\"mean_squared_error\", cv=10)\n",
 "pd.Series(np.sqrt(-scores)).describe()" ] },
{ "cell_type": "code", "execution_count": 65, "metadata": { "collapsed": false }, "outputs": [], "source": [
 "from sklearn.svm import SVR\n",
 "\n",
 "svm_reg = SVR(kernel=\"linear\")\n",
 "svm_reg.fit(housing_prepared, housing_labels)\n",
 "housing_predictions = svm_reg.predict(housing_prepared)\n",
 "svm_mse = mean_squared_error(housing_labels, housing_predictions)\n",
 "svm_rmse = np.sqrt(svm_mse)\n",
 "svm_rmse" ] },
{ "cell_type": "code", "execution_count": 66, "metadata": { "collapsed": false }, "outputs": [], "source": [
 "from sklearn.grid_search import GridSearchCV\n",
 "\n",
 "# Two grids: 3 x 4 combinations with the default bootstrap setting,\n",
 "# then 2 x 3 combinations with bootstrap switched off.\n",
 "param_grid = [\n",
 "    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},\n",
 "    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},\n",
 "]\n",
 "\n",
 "# Seeded so that re-running this cell alone picks the same best parameters.\n",
 "forest_reg = RandomForestRegressor(random_state=42)\n",
 "grid_search = GridSearchCV(forest_reg, param_grid, cv=5, scoring='mean_squared_error')\n",
 "grid_search.fit(housing_prepared, housing_labels)" ] },
{ "cell_type": "code", "execution_count": 67, "metadata": { "collapsed": false }, "outputs": [], "source": [ "grid_search.best_params_" ] },
{ "cell_type": "code", "execution_count": 68, "metadata": { "collapsed": false }, "outputs": [], "source": [ "grid_search.best_estimator_" ] },
{ "cell_type": "code", "execution_count": 69, "metadata": { "collapsed": false }, "outputs": [], "source": [
 "for params, mean_score, scores in grid_search.grid_scores_:\n",
 "    print(np.sqrt(-mean_score), np.sqrt(-scores).std(), params)" ] },
{ "cell_type": "code", "execution_count": 70, "metadata": { "collapsed": false }, "outputs": [], "source": [
 "from sklearn.grid_search import RandomizedSearchCV\n",
 "from scipy.stats import randint\n",
 "\n",
 "param_distribs = {\n",
 "    'n_estimators': randint(low=1, high=200),\n",
 "    'max_features': randint(low=1, high=8),\n",
 "}\n",
 "\n",
 "# Seeded so that re-running this cell alone trains the same forests.\n",
 "forest_reg = RandomForestRegressor(random_state=42)\n",
 "rnd_search = RandomizedSearchCV(forest_reg, 
param_distributions=param_distribs,\n",
 "                                n_iter=10, cv=5, scoring='mean_squared_error',\n",
 "                                random_state=42)  # fixes which parameter combinations get sampled\n",
 "rnd_search.fit(housing_prepared, housing_labels)" ] },
{ "cell_type": "code", "execution_count": 71, "metadata": { "collapsed": false }, "outputs": [], "source": [
 "for params, mean_score, scores in rnd_search.grid_scores_:\n",
 "    print(np.sqrt(-mean_score), np.sqrt(-scores).std(), params)" ] },
{ "cell_type": "code", "execution_count": 72, "metadata": { "collapsed": false }, "outputs": [], "source": [
 "feature_importances = grid_search.best_estimator_.feature_importances_\n",
 "feature_importances" ] },
{ "cell_type": "code", "execution_count": 73, "metadata": { "collapsed": false }, "outputs": [], "source": [
 "extra_attribs = [\"rooms_per_household\", \"population_per_household\", \"bedrooms_per_room\"]\n",
 "# `encoder` is the LabelBinarizer fitted on housing_cat earlier in the notebook;\n",
 "# its classes_ supply the names of the one-hot columns.\n",
 "cat_one_hot_attribs = list(encoder.classes_)\n",
 "attributes = num_attribs + extra_attribs + cat_one_hot_attribs\n",
 "sorted(zip(feature_importances, attributes), reverse=True)" ] },
{ "cell_type": "code", "execution_count": 74, "metadata": { "collapsed": false }, "outputs": [], "source": [
 "final_model = grid_search.best_estimator_\n",
 "\n",
 "X_test = strat_test_set.drop(\"median_house_value\", axis=1)\n",
 "y_test = strat_test_set[\"median_house_value\"].copy()\n",
 "\n",
 "# transform() only: the preparation pipeline was fitted on the training set and is reused as is.\n",
 "X_test_transformed = preparation_pipeline.transform(X_test)\n",
 "final_predictions = final_model.predict(X_test_transformed)\n",
 "\n",
 "final_mse = mean_squared_error(y_test, final_predictions)\n",
 "final_rmse = np.sqrt(final_mse)\n",
 "final_rmse" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "# Extra material" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [
 "## Label Binarizer hack\n",
 "`LabelBinarizer`'s `fit_transform()` method only accepts one parameter `y` (because it was meant for labels, not predictors), so it does not work in a pipeline where the final estimator is a supervised estimator because in this case its `fit()` method takes two parameters `X` and `y`.\n", 
"\n",
 "This hack creates a supervision-friendly `LabelBinarizer`." ] },
{ "cell_type": "code", "execution_count": 75, "metadata": { "collapsed": false }, "outputs": [], "source": [
 "class SupervisionFriendlyLabelBinarizer(LabelBinarizer):\n",
 "    \"\"\"LabelBinarizer whose fit_transform() tolerates the extra `y` a pipeline passes in.\"\"\"\n",
 "\n",
 "    def fit_transform(self, X, y=None):\n",
 "        # `y` is accepted and ignored; only X is forwarded to the parent class.\n",
 "        return LabelBinarizer.fit_transform(self, X)\n",
 "\n",
 "# Swap the last step of the categorical pipeline for the supervision-friendly variant\n",
 "cat_pipeline.steps[-1] = (\"label_binarizer\", SupervisionFriendlyLabelBinarizer())\n",
 "\n",
 "# With that in place, a supervised predictor can sit at the end of a full pipeline.\n",
 "full_pipeline = Pipeline(steps=[\n",
 "    (\"preparation\", preparation_pipeline),\n",
 "    (\"linear\", LinearRegression()),\n",
 "])\n",
 "\n",
 "full_pipeline.fit(housing, housing_labels)\n",
 "full_pipeline.predict(some_data)" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "## Model persistence using joblib" ] },
{ "cell_type": "code", "execution_count": 76, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from sklearn.externals import joblib" ] },
{ "cell_type": "code", "execution_count": 77, "metadata": { "collapsed": false }, "outputs": [], "source": [ "joblib.dump(final_model, \"my_random_forest_regressor.pkl\")" ] },
{ "cell_type": "code", "execution_count": 78, "metadata": { "collapsed": false }, "outputs": [], "source": [
 "final_model_loaded = joblib.load(\"my_random_forest_regressor.pkl\")\n",
 "final_model_loaded" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "## Example SciPy distributions for `RandomizedSearchCV`" ] },
{ "cell_type": "code", "execution_count": 79, "metadata": { "collapsed": false }, "outputs": [], "source": [
 "from scipy.stats import geom, expon\n",
 "\n",
 "# Draw 10000 samples from each distribution, geometric first.\n",
 "geom_distrib = geom(0.5).rvs(10000)\n",
 "expon_distrib = expon(scale=1).rvs(10000)\n",
 "\n",
 "# One 50-bin histogram per sample, each shown as its own figure.\n",
 "for sample in (geom_distrib, expon_distrib):\n",
 "    plt.hist(sample, bins=50)\n",
 "    plt.show()" ] } ],
"metadata": { "kernelspec": { "display_name": "Python 3", 
"language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.1" }, "toc": { "toc_cell": false, "toc_number_sections": true, "toc_threshold": 6, "toc_window_display": false } }, "nbformat": 4, "nbformat_minor": 0 }