1313 lines
32 KiB
Plaintext
1313 lines
32 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"**Chapter 2 – End-to-end Machine Learning project**\n",
|
||
"\n",
|
||
"*Welcome to Machine Learning Housing Corp.! Your task is to predict median house values in Californian districts, given a number of features from these districts.*"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"# Setup"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"First, let's make sure this notebook works well in both Python 2 and 3:"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 1,
|
||
"metadata": {
|
||
"collapsed": false
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"from __future__ import division, print_function, unicode_literals\n",
|
||
"\n",
|
||
"%matplotlib inline\n",
|
||
"import matplotlib.pyplot as plt\n",
|
||
"plt.rcParams['axes.labelsize'] = 14\n",
|
||
"plt.rcParams['xtick.labelsize'] = 12\n",
|
||
"plt.rcParams['ytick.labelsize'] = 12\n",
|
||
"\n",
|
||
"PROJECT_ROOT_DIR = \".\"\n",
|
||
"CHAPTER_ID = \"end_to_end_project\"\n",
|
||
"\n",
|
||
"def save_fig(fig_id):\n",
"    \"\"\"Save the current matplotlib figure as a 300 dpi PNG.\n",
"\n",
"    The file is written to PROJECT_ROOT_DIR/images/CHAPTER_ID/fig_id.png;\n",
"    the target directory is created first if it does not exist yet.\n",
"\n",
"    fig_id: file name of the figure, without the .png extension.\n",
"    \"\"\"\n",
"    import os  # imported here: this cell runs before the notebook's `import os`\n",
"\n",
"    path = os.path.join(PROJECT_ROOT_DIR, \"images\", CHAPTER_ID, fig_id + \".png\")\n",
"    os.makedirs(os.path.dirname(path), exist_ok=True)\n",
"    print(\"Saving figure\", fig_id)\n",
"    plt.tight_layout()\n",
"    plt.savefig(path, format='png', dpi=300)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"# Get the data"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 2,
|
||
"metadata": {
|
||
"collapsed": true
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"DATASETS_URL = \"https://github.com/ageron/ml-notebooks/raw/master/datasets\""
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 3,
|
||
"metadata": {
|
||
"collapsed": false
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"import os\n",
|
||
"import tarfile\n",
|
||
"import urllib.request\n",
|
||
"\n",
|
||
"HOUSING_PATH = \"datasets/housing\"\n",
|
||
"HOUSING_URL = DATASETS_URL + \"/housing/housing.tgz\"\n",
|
||
"\n",
|
||
"def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):\n",
"    \"\"\"Download the housing tarball and unpack it into `housing_path`.\n",
"\n",
"    housing_url: URL of the .tgz archive to fetch.\n",
"    housing_path: directory that receives both the archive and its extracted\n",
"    contents; it is created if missing.\n",
"    \"\"\"\n",
"    os.makedirs(housing_path, exist_ok=True)\n",
"    tgz_path = os.path.join(housing_path, \"housing.tgz\")\n",
"    urllib.request.urlretrieve(housing_url, tgz_path)\n",
"    # The context manager closes the archive even if extraction raises.\n",
"    # NOTE(review): extractall() trusts the member paths of the downloaded archive.\n",
"    with tarfile.open(tgz_path) as housing_tgz:\n",
"        housing_tgz.extractall(path=housing_path)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 4,
|
||
"metadata": {
|
||
"collapsed": false
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"fetch_housing_data()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 5,
|
||
"metadata": {
|
||
"collapsed": true
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"import pandas as pd\n",
|
||
"\n",
|
||
"def load_housing_data(housing_path=HOUSING_PATH):\n",
"    \"\"\"Read housing.csv from `housing_path` and return it as a DataFrame.\"\"\"\n",
"    return pd.read_csv(os.path.join(housing_path, \"housing.csv\"))"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 6,
|
||
"metadata": {
|
||
"collapsed": false
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"housing = load_housing_data()\n",
|
||
"housing.head()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 7,
|
||
"metadata": {
|
||
"collapsed": false
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"housing.info()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 8,
|
||
"metadata": {
|
||
"collapsed": false
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"housing[\"ocean_proximity\"].value_counts()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 9,
|
||
"metadata": {
|
||
"collapsed": false
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"print(housing.describe())"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 10,
|
||
"metadata": {
|
||
"collapsed": false
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"%matplotlib inline\n",
|
||
"import matplotlib.pyplot as plt\n",
|
||
"housing.hist(bins=50, figsize=(11,8))\n",
|
||
"save_fig(\"attribute_histogram_plots\")\n",
|
||
"plt.show()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 11,
|
||
"metadata": {
|
||
"collapsed": false
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"import numpy as np\n",
|
||
"import numpy.random as rnd\n",
|
||
"rnd.seed(42) # to make this notebook's output identical at every run\n",
|
||
"\n",
|
||
"def split_train_test(data, test_ratio):\n",
"    \"\"\"Randomly split `data` into a (train, test) pair of row subsets.\n",
"\n",
"    A single permutation of the row positions is drawn from the notebook's\n",
"    `rnd` generator; the first int(len(data) * test_ratio) shuffled positions\n",
"    form the test set and the remainder the training set.\n",
"    \"\"\"\n",
"    n_rows = len(data)\n",
"    order = rnd.permutation(n_rows)\n",
"    n_test = int(n_rows * test_ratio)\n",
"    return data.iloc[order[n_test:]], data.iloc[order[:n_test]]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 12,
|
||
"metadata": {
|
||
"collapsed": false
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"train_set, test_set = split_train_test(housing, 0.2)\n",
|
||
"print(len(train_set), len(test_set))"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 13,
|
||
"metadata": {
|
||
"collapsed": false
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"import hashlib\n",
|
||
"\n",
|
||
"def test_set_check(identifier, test_ratio, hash):\n",
"    \"\"\"Return True when `identifier` belongs to the test set.\n",
"\n",
"    The identifier is hashed with `hash` (a hashlib-style constructor) and is\n",
"    assigned to the test set when the last digest byte is below 256 * test_ratio.\n",
"    \"\"\"\n",
"    # hashlib constructors only accept bytes-like input, and Series.apply may hand\n",
"    # over plain Python ints; np.int64 exposes a fixed-size buffer that can be hashed.\n",
"    if isinstance(identifier, int):\n",
"        identifier = np.int64(identifier)\n",
"    return hash(identifier).digest()[-1] < 256 * test_ratio\n",
"\n",
"def split_train_test_by_id(data, test_ratio, id_column, hash=hashlib.md5):\n",
"    \"\"\"Split `data` into (train, test) according to a hash of each row's id.\n",
"\n",
"    data: DataFrame to split.\n",
"    test_ratio: threshold passed to test_set_check for every id.\n",
"    id_column: name of the column holding the row identifiers.\n",
"    hash: hashlib-style constructor used to hash the identifiers.\n",
"    \"\"\"\n",
"    ids = data[id_column]\n",
"    in_test_set = ids.apply(lambda id_: test_set_check(id_, test_ratio, hash))\n",
"    return data.loc[~in_test_set], data.loc[in_test_set]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 14,
|
||
"metadata": {
|
||
"collapsed": false
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"housing_with_id = housing.reset_index() # adds an `index` column\n",
|
||
"train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, \"index\")\n",
|
||
"test_set.head()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 15,
|
||
"metadata": {
|
||
"collapsed": false
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"from sklearn.cross_validation import train_test_split\n",
|
||
"\n",
|
||
"train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)\n",
|
||
"test_set.head()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 16,
|
||
"metadata": {
|
||
"collapsed": false
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"housing[\"median_income\"].hist()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 17,
|
||
"metadata": {
|
||
"collapsed": false
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"housing[\"income_cat\"] = np.ceil(housing[\"median_income\"] / 1.5)\n",
"# Cap the category at 5. The result is assigned back to the column instead of\n",
"# calling where(..., inplace=True) on the selected Series, so the update does\n",
"# not depend on that Series being a view of the frame.\n",
"housing[\"income_cat\"] = housing[\"income_cat\"].where(housing[\"income_cat\"] < 5, 5.0)\n",
"housing[\"income_cat\"].value_counts()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 18,
|
||
"metadata": {
|
||
"collapsed": false
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"from sklearn.cross_validation import StratifiedShuffleSplit\n",
|
||
"\n",
|
||
"split = StratifiedShuffleSplit(housing[\"income_cat\"], test_size=0.2, random_state=42)\n",
|
||
"train_index, test_index = next(iter(split))\n",
|
||
"strat_train_set = housing.loc[train_index]\n",
|
||
"strat_test_set = housing.loc[test_index]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 19,
|
||
"metadata": {
|
||
"collapsed": false
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"def income_cat_proportions(data):\n",
"    \"\"\"Return the fraction of the rows of `data` found in each `income_cat` value.\"\"\"\n",
"    category_counts = data[\"income_cat\"].value_counts()\n",
"    return category_counts / len(data)\n",
|
||
"\n",
|
||
"train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)\n",
|
||
"\n",
|
||
"compare_props = pd.DataFrame({\n",
|
||
" \"Overall\": income_cat_proportions(housing),\n",
|
||
" \"Stratified\": income_cat_proportions(strat_test_set),\n",
|
||
" \"Random\": income_cat_proportions(test_set),\n",
|
||
"}).sort_index()\n",
|
||
"compare_props[\"Rand. %error\"] = 100 * compare_props[\"Random\"] / compare_props[\"Overall\"] - 100\n",
|
||
"compare_props[\"Strat. %error\"] = 100 * compare_props[\"Stratified\"] / compare_props[\"Overall\"] - 100"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 20,
|
||
"metadata": {
|
||
"collapsed": false
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"compare_props"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 21,
|
||
"metadata": {
|
||
"collapsed": false
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Loop variable is `set_`, not `set`, so the built-in set type is not shadowed in later cells.\n",
"for set_ in (strat_train_set, strat_test_set):\n",
"    set_.drop(\"income_cat\", axis=1, inplace=True)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"# Discover and visualize the data to gain insights"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 22,
|
||
"metadata": {
|
||
"collapsed": true
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"housing = strat_train_set.copy()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 23,
|
||
"metadata": {
|
||
"collapsed": false
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"housing.plot(kind=\"scatter\", x=\"longitude\", y=\"latitude\")\n",
|
||
"save_fig(\"bad_visualization\")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 24,
|
||
"metadata": {
|
||
"collapsed": false
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"housing.plot(kind=\"scatter\", x=\"longitude\", y=\"latitude\", alpha=0.1)\n",
|
||
"save_fig(\"better_visualization\")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 25,
|
||
"metadata": {
|
||
"collapsed": false,
|
||
"scrolled": true
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"housing.plot(kind=\"scatter\", x=\"longitude\", y=\"latitude\",\n",
|
||
" s=housing['population']/100, label=\"population\",\n",
|
||
" c=\"median_house_value\", cmap=plt.get_cmap(\"jet\"),\n",
|
||
" colorbar=True, alpha=0.4, figsize=(10,7),\n",
|
||
")\n",
|
||
"plt.legend()\n",
|
||
"save_fig(\"housing_prices_scatterplot\")\n",
|
||
"plt.show()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 26,
|
||
"metadata": {
|
||
"collapsed": false
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"import matplotlib.image as mpimg\n",
|
||
"california_img=mpimg.imread(PROJECT_ROOT_DIR + '/images/end_to_end_project/california.png')\n",
|
||
"ax = housing.plot(kind=\"scatter\", x=\"longitude\", y=\"latitude\", figsize=(10,7),\n",
|
||
" s=housing['population']/100, label=\"Population\",\n",
|
||
" c=\"median_house_value\", cmap=plt.get_cmap(\"jet\"),\n",
|
||
" colorbar=False, alpha=0.4,\n",
|
||
" )\n",
|
||
"plt.imshow(california_img, extent=[-124.55, -113.80, 32.45, 42.05], alpha=0.5)\n",
|
||
"plt.ylabel(\"Latitude\", fontsize=14)\n",
|
||
"plt.xlabel(\"Longitude\", fontsize=14)\n",
|
||
"\n",
|
||
"prices = housing[\"median_house_value\"]\n",
|
||
"tick_values = np.linspace(prices.min(), prices.max(), 11)\n",
|
||
"cbar = plt.colorbar()\n",
|
||
"cbar.ax.set_yticklabels([\"$%dk\"%(round(v/1000)) for v in tick_values], fontsize=14)\n",
|
||
"cbar.set_label('Median House Value', fontsize=16)\n",
|
||
"\n",
|
||
"plt.legend(fontsize=16)\n",
|
||
"save_fig(\"california_housing_prices\")\n",
|
||
"plt.show()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 27,
|
||
"metadata": {
|
||
"collapsed": false
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"corr_matrix = housing.corr()\n",
|
||
"corr_matrix[\"median_house_value\"].sort_values(ascending=False)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 28,
|
||
"metadata": {
|
||
"collapsed": false
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"housing.plot(kind=\"scatter\", x=\"median_income\", y=\"median_house_value\",\n",
|
||
" alpha=0.3)\n",
|
||
"plt.axis([0, 16, 0, 550000])\n",
|
||
"save_fig(\"income_vs_house_value_scatterplot\")\n",
|
||
"plt.show()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 29,
|
||
"metadata": {
|
||
"collapsed": false
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"from pandas.tools.plotting import scatter_matrix\n",
|
||
"\n",
|
||
"attributes = [\"median_house_value\", \"median_income\", \"total_rooms\", \"housing_median_age\"]\n",
|
||
"scatter_matrix(housing[attributes], figsize=(11, 8))\n",
|
||
"save_fig(\"scatter_matrix_plot\")\n",
|
||
"plt.show()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 30,
|
||
"metadata": {
|
||
"collapsed": true
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Rooms per household: divide by the number of households, not by the population.\n",
"housing[\"rooms_per_household\"] = housing[\"total_rooms\"] / housing[\"households\"]\n",
"housing[\"bedrooms_per_room\"] = housing[\"total_bedrooms\"] / housing[\"total_rooms\"]\n",
"housing[\"population_per_household\"] = housing[\"population\"] / housing[\"households\"]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 31,
|
||
"metadata": {
|
||
"collapsed": false
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"corr_matrix = housing.corr()\n",
|
||
"corr_matrix[\"median_house_value\"].sort_values(ascending=False)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 32,
|
||
"metadata": {
|
||
"collapsed": false
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"housing.plot(kind=\"scatter\", x=\"rooms_per_household\", y=\"median_house_value\",\n",
|
||
" alpha=0.2)\n",
|
||
"plt.axis([0, 5, 0, 520000])\n",
|
||
"plt.show()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 33,
|
||
"metadata": {
|
||
"collapsed": false
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"housing.describe()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"# Prepare the data for Machine Learning algorithms"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 34,
|
||
"metadata": {
|
||
"collapsed": true
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"housing = strat_train_set.drop(\"median_house_value\", axis=1)\n",
|
||
"housing_labels = strat_train_set[\"median_house_value\"].copy()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 35,
|
||
"metadata": {
|
||
"collapsed": false
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"housing_copy = housing.copy().iloc[21:24]\n",
|
||
"housing_copy"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 36,
|
||
"metadata": {
|
||
"collapsed": false
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"housing_copy.dropna(subset=[\"total_bedrooms\"]) # option 1"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 37,
|
||
"metadata": {
|
||
"collapsed": false
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"housing_copy = housing.copy().iloc[21:24]\n",
|
||
"housing_copy.drop(\"total_bedrooms\", axis=1) # option 2"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 38,
|
||
"metadata": {
|
||
"collapsed": false
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"housing_copy = housing.iloc[21:24].copy()  # independent frame, safe to modify\n",
"median = housing_copy[\"total_bedrooms\"].median()\n",
"# Assign the filled column back rather than relying on a chained in-place fillna.\n",
"housing_copy[\"total_bedrooms\"] = housing_copy[\"total_bedrooms\"].fillna(median) # option 3\n",
"housing_copy"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 39,
|
||
"metadata": {
|
||
"collapsed": false
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"from sklearn.preprocessing import Imputer\n",
|
||
"\n",
|
||
"imputer = Imputer(strategy='median')\n",
|
||
"housing_num = housing.drop(\"ocean_proximity\", axis=1)\n",
|
||
"imputer.fit(housing_num)\n",
|
||
"X = imputer.transform(housing_num)\n",
|
||
"housing_tr = pd.DataFrame(X, columns=housing_num.columns)\n",
|
||
"housing_tr.iloc[21:24]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 40,
|
||
"metadata": {
|
||
"collapsed": false
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"imputer.statistics_"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 41,
|
||
"metadata": {
|
||
"collapsed": false
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"housing_num.median().values"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 42,
|
||
"metadata": {
|
||
"collapsed": false
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"imputer.strategy"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 43,
|
||
"metadata": {
|
||
"collapsed": false
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"housing_tr = pd.DataFrame(X, columns=housing_num.columns)\n",
|
||
"housing_tr.head()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 44,
|
||
"metadata": {
|
||
"collapsed": false
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"from sklearn.preprocessing import LabelEncoder\n",
|
||
"\n",
|
||
"encoder = LabelEncoder()\n",
|
||
"housing_cat = housing[\"ocean_proximity\"]\n",
|
||
"housing_cat_encoded = encoder.fit_transform(housing_cat)\n",
|
||
"housing_cat_encoded"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 45,
|
||
"metadata": {
|
||
"collapsed": false
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"print(encoder.classes_)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 46,
|
||
"metadata": {
|
||
"collapsed": false
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"from sklearn.preprocessing import OneHotEncoder\n",
|
||
"\n",
|
||
"encoder = OneHotEncoder()\n",
|
||
"housing_cat_1hot = encoder.fit_transform(housing_cat_encoded.reshape(-1,1))\n",
|
||
"housing_cat_1hot"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 47,
|
||
"metadata": {
|
||
"collapsed": false
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"housing_cat_1hot.toarray()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 48,
|
||
"metadata": {
|
||
"collapsed": false
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"from sklearn.preprocessing import LabelBinarizer\n",
|
||
"\n",
|
||
"encoder = LabelBinarizer()\n",
|
||
"encoder.fit_transform(housing_cat)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 49,
|
||
"metadata": {
|
||
"collapsed": false
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"from sklearn.base import BaseEstimator, TransformerMixin\n",
|
||
"\n",
|
||
"rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6\n",
|
||
"\n",
|
||
"class CombinedAttributesAdder(BaseEstimator, TransformerMixin):\n",
"    \"\"\"Transformer that appends ratio features to a 2-D array.\n",
"\n",
"    Columns are addressed through the module-level indices rooms_ix,\n",
"    bedrooms_ix, population_ix and household_ix.\n",
"    \"\"\"\n",
"\n",
"    def __init__(self, add_bedrooms_per_room = True): # no *args or **kargs\n",
"        self.add_bedrooms_per_room = add_bedrooms_per_room\n",
"\n",
"    def fit(self, X, y=None):\n",
"        \"\"\"Nothing is learned from the data; return the transformer itself.\"\"\"\n",
"        return self\n",
"\n",
"    def transform(self, X, y=None):\n",
"        \"\"\"Return X with rooms/household and population/household appended,\n",
"        plus bedrooms/room when add_bedrooms_per_room is set.\"\"\"\n",
"        households = X[:, household_ix]\n",
"        rooms = X[:, rooms_ix]\n",
"        rooms_per_household = rooms / households\n",
"        population_per_household = X[:, population_ix] / households\n",
"        if not self.add_bedrooms_per_room:\n",
"            return np.c_[X, rooms_per_household, population_per_household]\n",
"        bedrooms_per_room = X[:, bedrooms_ix] / rooms\n",
"        return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]\n",
|
||
"\n",
|
||
"attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)\n",
|
||
"housing_extra_attribs = attr_adder.transform(housing.values)\n",
|
||
"\n",
|
||
"housing_extra_attribs = pd.DataFrame(housing_extra_attribs, columns=list(housing.columns)+[\"rooms_per_household\", \"population_per_household\"])\n",
|
||
"housing_extra_attribs.head()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 50,
|
||
"metadata": {
|
||
"collapsed": false
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"from sklearn.pipeline import Pipeline\n",
|
||
"from sklearn.preprocessing import StandardScaler\n",
|
||
"\n",
|
||
"num_pipeline = Pipeline([\n",
|
||
" ('imputer', Imputer(strategy=\"median\")),\n",
|
||
" ('attribs_adder', CombinedAttributesAdder()),\n",
|
||
" ('std_scaler', StandardScaler()),\n",
|
||
" ])\n",
|
||
"\n",
|
||
"num_pipeline.fit_transform(housing_num)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 51,
|
||
"metadata": {
|
||
"collapsed": false
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"from sklearn.pipeline import FeatureUnion\n",
|
||
"\n",
|
||
"class DataFrameSelector(BaseEstimator, TransformerMixin):\n",
"    \"\"\"Pipeline step that pulls the named columns out of a DataFrame as an array.\"\"\"\n",
"\n",
"    def __init__(self, attribute_names):\n",
"        self.attribute_names = attribute_names\n",
"\n",
"    def fit(self, X, y=None):\n",
"        \"\"\"Stateless: there is nothing to fit.\"\"\"\n",
"        return self\n",
"\n",
"    def transform(self, X):\n",
"        \"\"\"Return the values of the `attribute_names` columns of X.\"\"\"\n",
"        selected = X[self.attribute_names]\n",
"        return selected.values\n",
|
||
"\n",
|
||
"num_attribs = list(housing_num)\n",
|
||
"cat_attribs = [\"ocean_proximity\"]\n",
|
||
"\n",
|
||
"num_pipeline = Pipeline([\n",
|
||
" ('selector', DataFrameSelector(num_attribs)),\n",
|
||
" ('imputer', Imputer(strategy=\"median\")),\n",
|
||
" ('attribs_adder', CombinedAttributesAdder()),\n",
|
||
" ('std_scaler', StandardScaler()),\n",
|
||
" ])\n",
|
||
"\n",
|
||
"cat_pipeline = Pipeline([\n",
|
||
" ('selector', DataFrameSelector(cat_attribs)),\n",
|
||
" ('label_binarizer', LabelBinarizer()),\n",
|
||
" ])\n",
|
||
"\n",
|
||
"preparation_pipeline = FeatureUnion(transformer_list=[\n",
|
||
" (\"num_pipeline\", num_pipeline),\n",
|
||
" (\"cat_pipeline\", cat_pipeline),\n",
|
||
" ])\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 52,
|
||
"metadata": {
|
||
"collapsed": false
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"housing_prepared = preparation_pipeline.fit_transform(housing)\n",
|
||
"housing_prepared"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 53,
|
||
"metadata": {
|
||
"collapsed": false
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"housing_prepared.shape"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"# Select and train a model"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 54,
|
||
"metadata": {
|
||
"collapsed": false
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"from sklearn.linear_model import LinearRegression\n",
|
||
"\n",
|
||
"lin_reg = LinearRegression()\n",
|
||
"lin_reg.fit(housing_prepared, housing_labels)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 55,
|
||
"metadata": {
|
||
"collapsed": false
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"# let's try the full pipeline on a few training instances\n",
|
||
"some_data = housing.iloc[:5]\n",
|
||
"some_labels = housing_labels.iloc[:5]\n",
|
||
"some_data_prepared = preparation_pipeline.transform(some_data)\n",
|
||
"\n",
|
||
"print(\"Predictions:\\t\", lin_reg.predict(some_data_prepared))\n",
|
||
"print(\"Labels:\\t\\t\", list(some_labels))"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 56,
|
||
"metadata": {
|
||
"collapsed": false
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"from sklearn.metrics import mean_squared_error\n",
|
||
"\n",
|
||
"housing_predictions = lin_reg.predict(housing_prepared)\n",
|
||
"lin_mse = mean_squared_error(housing_labels, housing_predictions)\n",
|
||
"lin_rmse = np.sqrt(lin_mse)\n",
|
||
"lin_rmse"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 57,
|
||
"metadata": {
|
||
"collapsed": false
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"from sklearn.metrics import mean_absolute_error\n",
|
||
"\n",
|
||
"lin_mae = mean_absolute_error(housing_labels, housing_predictions)\n",
|
||
"lin_mae"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 58,
|
||
"metadata": {
|
||
"collapsed": false
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"from sklearn.tree import DecisionTreeRegressor\n",
|
||
"\n",
|
||
"tree_reg = DecisionTreeRegressor()\n",
|
||
"tree_reg.fit(housing_prepared, housing_labels)\n",
|
||
"housing_predictions = tree_reg.predict(housing_prepared)\n",
|
||
"tree_mse = mean_squared_error(housing_labels, housing_predictions)\n",
|
||
"tree_rmse = np.sqrt(tree_mse)\n",
|
||
"tree_rmse"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"# Fine-tune your model"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 59,
|
||
"metadata": {
|
||
"collapsed": false
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"from sklearn.cross_validation import cross_val_score\n",
|
||
"\n",
|
||
"tree_scores = cross_val_score(tree_reg, housing_prepared, housing_labels,\n",
|
||
" scoring=\"mean_squared_error\", cv=10)\n",
|
||
"tree_rmse_scores = np.sqrt(-tree_scores)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 60,
|
||
"metadata": {
|
||
"collapsed": false
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"def display_scores(scores):\n",
"    \"\"\"Print the scores followed by their mean and standard deviation.\"\"\"\n",
"    print(\"Scores:\", scores)\n",
"    for label, stat in ((\"Mean:\", scores.mean), (\"Standard deviation:\", scores.std)):\n",
"        print(label, stat())\n",
|
||
"\n",
|
||
"display_scores(tree_rmse_scores)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 61,
|
||
"metadata": {
|
||
"collapsed": false
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"lin_scores = cross_val_score(lin_reg, housing_prepared, housing_labels,\n",
|
||
" scoring=\"mean_squared_error\", cv=10)\n",
|
||
"lin_rmse_scores = np.sqrt(-lin_scores)\n",
|
||
"display_scores(lin_rmse_scores)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 62,
|
||
"metadata": {
|
||
"collapsed": false
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"from sklearn.ensemble import RandomForestRegressor\n",
|
||
"\n",
|
||
"forest_reg = RandomForestRegressor()\n",
|
||
"forest_reg.fit(housing_prepared, housing_labels)\n",
|
||
"housing_predictions = forest_reg.predict(housing_prepared)\n",
|
||
"forest_mse = mean_squared_error(housing_labels, housing_predictions)\n",
|
||
"forest_rmse = np.sqrt(forest_mse)\n",
|
||
"forest_rmse"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 63,
|
||
"metadata": {
|
||
"collapsed": false
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"from sklearn.cross_validation import cross_val_score\n",
|
||
"\n",
|
||
"forest_scores = cross_val_score(forest_reg, housing_prepared, housing_labels,\n",
|
||
" scoring=\"mean_squared_error\", cv=10)\n",
|
||
"forest_rmse_scores = np.sqrt(-forest_scores)\n",
|
||
"display_scores(forest_rmse_scores)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 64,
|
||
"metadata": {
|
||
"collapsed": false
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"scores = cross_val_score(lin_reg, housing_prepared, housing_labels, scoring=\"mean_squared_error\", cv=10)\n",
|
||
"pd.Series(np.sqrt(-scores)).describe()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 65,
|
||
"metadata": {
|
||
"collapsed": false
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"from sklearn.svm import SVR\n",
|
||
"\n",
|
||
"svm_reg = SVR(kernel=\"linear\")\n",
|
||
"svm_reg.fit(housing_prepared, housing_labels)\n",
|
||
"housing_predictions = svm_reg.predict(housing_prepared)\n",
|
||
"svm_mse = mean_squared_error(housing_labels, housing_predictions)\n",
|
||
"svm_rmse = np.sqrt(svm_mse)\n",
|
||
"svm_rmse"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 66,
|
||
"metadata": {
|
||
"collapsed": false
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"from sklearn.grid_search import GridSearchCV\n",
|
||
"\n",
|
||
"param_grid = [\n",
|
||
" {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},\n",
|
||
" {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},\n",
|
||
" ]\n",
|
||
"\n",
|
||
"forest_reg = RandomForestRegressor()\n",
|
||
"grid_search = GridSearchCV(forest_reg, param_grid, cv=5, scoring='mean_squared_error')\n",
|
||
"grid_search.fit(housing_prepared, housing_labels)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 67,
|
||
"metadata": {
|
||
"collapsed": false
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"grid_search.best_params_"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 68,
|
||
"metadata": {
|
||
"collapsed": false
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"grid_search.best_estimator_"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 69,
|
||
"metadata": {
|
||
"collapsed": false
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"for params, mean_score, scores in grid_search.grid_scores_:\n",
|
||
" print(np.sqrt(-mean_score), np.sqrt(-scores).std(), params)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 70,
|
||
"metadata": {
|
||
"collapsed": false
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"from sklearn.grid_search import RandomizedSearchCV\n",
|
||
"from scipy.stats import randint\n",
|
||
"\n",
|
||
"param_distribs = {\n",
|
||
" 'n_estimators': randint(low=1, high=200),\n",
|
||
" 'max_features': randint(low=1, high=8),\n",
|
||
" }\n",
|
||
"\n",
|
||
"forest_reg = RandomForestRegressor()\n",
|
||
"rnd_search = RandomizedSearchCV(forest_reg, param_distributions=param_distribs,\n",
|
||
" n_iter=10, cv=5, scoring='mean_squared_error')\n",
|
||
"rnd_search.fit(housing_prepared, housing_labels)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 71,
|
||
"metadata": {
|
||
"collapsed": false
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"for params, mean_score, scores in rnd_search.grid_scores_:\n",
|
||
" print(np.sqrt(-mean_score), np.sqrt(-scores).std(), params)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 72,
|
||
"metadata": {
|
||
"collapsed": false
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"feature_importances = grid_search.best_estimator_.feature_importances_\n",
|
||
"feature_importances"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 73,
|
||
"metadata": {
|
||
"collapsed": false
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"extra_attribs = [\"rooms_per_household\", \"population_per_household\", \"bedrooms_per_room\"]\n",
|
||
"cat_one_hot_attribs = list(encoder.classes_)\n",
|
||
"attributes = num_attribs + extra_attribs + cat_one_hot_attribs\n",
|
||
"sorted(zip(feature_importances, attributes), reverse=True)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 74,
|
||
"metadata": {
|
||
"collapsed": false
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"final_model = grid_search.best_estimator_\n",
|
||
"\n",
|
||
"X_test = strat_test_set.drop(\"median_house_value\", axis=1)\n",
|
||
"y_test = strat_test_set[\"median_house_value\"].copy()\n",
|
||
"\n",
|
||
"X_test_transformed = preparation_pipeline.transform(X_test)\n",
|
||
"final_predictions = final_model.predict(X_test_transformed)\n",
|
||
"\n",
|
||
"final_mse = mean_squared_error(y_test, final_predictions)\n",
|
||
"final_rmse = np.sqrt(final_mse)\n",
|
||
"final_rmse"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"# Extra material"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Label Binarizer hack\n",
|
||
"`LabelBinarizer`'s `fit_transform()` method only accepts one parameter `y` (because it was meant for labels, not predictors), so it does not work in a pipeline where the final estimator is a supervised estimator because in this case its `fit()` method takes two parameters `X` and `y`.\n",
|
||
"\n",
|
||
"This hack creates a supervision-friendly `LabelBinarizer`."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 75,
|
||
"metadata": {
|
||
"collapsed": false
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"class SupervisionFriendlyLabelBinarizer(LabelBinarizer):\n",
"    \"\"\"LabelBinarizer whose fit_transform() tolerates the extra `y` argument.\"\"\"\n",
"\n",
"    def fit_transform(self, X, y=None):\n",
"        # `y` is accepted and ignored; only X is binarized.\n",
"        return LabelBinarizer.fit_transform(self, X)\n",
|
||
"\n",
|
||
"# Replace the Labelbinarizer with a SupervisionFriendlyLabelBinarizer\n",
|
||
"cat_pipeline.steps[1] = (\"label_binarizer\", SupervisionFriendlyLabelBinarizer())\n",
|
||
"\n",
|
||
"# Now you can create a full pipeline with a supervised predictor at the end.\n",
|
||
"full_pipeline = Pipeline([\n",
|
||
" (\"preparation\", preparation_pipeline),\n",
|
||
" (\"linear\", LinearRegression())\n",
|
||
" ])\n",
|
||
"\n",
|
||
"full_pipeline.fit(housing, housing_labels)\n",
|
||
"full_pipeline.predict(some_data)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Model persistence using joblib"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 76,
|
||
"metadata": {
|
||
"collapsed": true
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"from sklearn.externals import joblib"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 77,
|
||
"metadata": {
|
||
"collapsed": false
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"joblib.dump(final_model, \"my_random_forest_regressor.pkl\")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 78,
|
||
"metadata": {
|
||
"collapsed": false
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"final_model_loaded = joblib.load(\"my_random_forest_regressor.pkl\")\n",
|
||
"final_model_loaded"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Example SciPy distributions for `RandomizedSearchCV`"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 79,
|
||
"metadata": {
|
||
"collapsed": false
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"from scipy.stats import geom, expon\n",
|
||
"geom_distrib=geom(0.5).rvs(10000)\n",
|
||
"expon_distrib=expon(scale=1).rvs(10000)\n",
|
||
"plt.hist(geom_distrib, bins=50)\n",
|
||
"plt.show()\n",
|
||
"plt.hist(expon_distrib, bins=50)\n",
|
||
"plt.show()"
|
||
]
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "Python 3",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.5.1"
|
||
},
|
||
"toc": {
|
||
"toc_cell": false,
|
||
"toc_number_sections": true,
|
||
"toc_threshold": 6,
|
||
"toc_window_display": false
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 0
|
||
}
|