diff --git a/02_end_to_end_machine_learning_project.ipynb b/02_end_to_end_machine_learning_project.ipynb index c50889a..5155781 100644 --- a/02_end_to_end_machine_learning_project.ipynb +++ b/02_end_to_end_machine_learning_project.ipynb @@ -89,13 +89,28 @@ "cell_type": "code", "execution_count": 2, "metadata": { - "collapsed": true, + "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ - "DATASETS_URL = \"https://github.com/ageron/handson-ml/raw/master/datasets\"" + "import os\n", + "import tarfile\n", + "from six.moves import urllib\n", + "\n", + "DOWNLOAD_ROOT = \"https://raw.githubusercontent.com/ageron/handson-ml/master/\"\n", + "HOUSING_PATH = \"datasets/housing\"\n", + "HOUSING_URL = DOWNLOAD_ROOT + HOUSING_PATH + \"/housing.tgz\"\n", + "\n", + "def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):\n", + " if not os.path.isdir(housing_path):\n", + " os.makedirs(housing_path)\n", + " tgz_path = os.path.join(housing_path, \"housing.tgz\")\n", + " urllib.request.urlretrieve(housing_url, tgz_path)\n", + " housing_tgz = tarfile.open(tgz_path)\n", + " housing_tgz.extractall(path=housing_path)\n", + " housing_tgz.close()" ] }, { @@ -107,40 +122,13 @@ "editable": true }, "outputs": [], - "source": [ - "import os\n", - "import tarfile\n", - "from six.moves import urllib\n", - "\n", - "HOUSING_PATH = \"datasets/housing\"\n", - "HOUSING_URL = DATASETS_URL + \"/housing/housing.tgz\"\n", - "\n", - "def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):\n", - " if not os.path.exists(housing_path):\n", - " os.makedirs(housing_path)\n", - " tgz_path = os.path.join(housing_path, \"housing.tgz\")\n", - " urllib.request.urlretrieve(housing_url, tgz_path)\n", - " housing_tgz = tarfile.open(tgz_path)\n", - " housing_tgz.extractall(path=housing_path)\n", - " housing_tgz.close()" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "collapsed": false, - "deletable": true, - 
"editable": true - }, - "outputs": [], "source": [ "fetch_housing_data()" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "metadata": { "collapsed": true, "deletable": true, @@ -157,7 +145,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "metadata": { "collapsed": false, "deletable": true, @@ -171,7 +159,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "metadata": { "collapsed": false, "deletable": true, @@ -184,7 +172,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "metadata": { "collapsed": false, "deletable": true, @@ -197,7 +185,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "metadata": { "collapsed": false, "deletable": true, @@ -205,12 +193,12 @@ }, "outputs": [], "source": [ - "print(housing.describe())" + "housing.describe()" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "metadata": { "collapsed": false, "deletable": true, @@ -220,11 +208,25 @@ "source": [ "%matplotlib inline\n", "import matplotlib.pyplot as plt\n", - "housing.hist(bins=50, figsize=(11,8))\n", + "housing.hist(bins=50, figsize=(20,15))\n", "save_fig(\"attribute_histogram_plots\")\n", "plt.show()" ] }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "# to make this notebook's output identical at every run\n", + "np.random.seed(42)" + ] + }, { "cell_type": "code", "execution_count": 11, @@ -236,11 +238,9 @@ "outputs": [], "source": [ "import numpy as np\n", - "import numpy.random as rnd\n", - "rnd.seed(42) # to make this notebook's output identical at every run\n", "\n", "def split_train_test(data, test_ratio):\n", - " shuffled_indices = rnd.permutation(len(data))\n", + " shuffled_indices = np.random.permutation(len(data))\n", " test_set_size = int(len(data) * test_ratio)\n", " test_indices = 
shuffled_indices[:test_set_size]\n", " train_indices = shuffled_indices[test_set_size:]\n", @@ -258,7 +258,7 @@ "outputs": [], "source": [ "train_set, test_set = split_train_test(housing, 0.2)\n", - "print(len(train_set), len(test_set))" + "print(len(train_set), \"train +\", len(test_set), \"test\")" ] }, { @@ -274,7 +274,7 @@ "import hashlib\n", "\n", "def test_set_check(identifier, test_ratio, hash):\n", - " return bytearray(hash(np.int64(identifier)).digest())[-1] < 256 * test_ratio\n", + " return hash(np.int64(identifier)).digest()[-1] < 256 * test_ratio\n", "\n", "def split_train_test_by_id(data, test_ratio, id_column, hash=hashlib.md5):\n", " ids = data[id_column]\n", @@ -286,15 +286,15 @@ "cell_type": "code", "execution_count": 14, "metadata": { - "collapsed": false, + "collapsed": true, "deletable": true, "editable": true }, "outputs": [], "source": [ - "housing_with_id = housing.reset_index() # adds an `index` column\n", - "train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, \"index\")\n", - "test_set.head()" + "# This version supports both Python 2 and Python 3, instead of just Python 3.\n", + "def test_set_check(identifier, test_ratio, hash):\n", + " return bytearray(hash(np.int64(identifier)).digest())[-1] < 256 * test_ratio" ] }, { @@ -307,10 +307,8 @@ }, "outputs": [], "source": [ - "from sklearn.model_selection import train_test_split\n", - "\n", - "train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)\n", - "test_set.head()" + "housing_with_id = housing.reset_index() # adds an `index` column\n", + "train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, \"index\")" ] }, { @@ -323,7 +321,8 @@ }, "outputs": [], "source": [ - "housing[\"median_income\"].hist()" + "housing_with_id[\"id\"] = housing[\"longitude\"] * 1000 + housing[\"latitude\"]\n", + "train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, \"id\")" ] }, { @@ -336,9 +335,7 @@ }, "outputs": [], "source": [ - 
"housing[\"income_cat\"] = np.ceil(housing[\"median_income\"] / 1.5)\n", - "housing[\"income_cat\"].where(housing[\"income_cat\"] < 5, 5.0, inplace=True)\n", - "housing[\"income_cat\"].value_counts()" + "test_set.head()" ] }, { @@ -350,6 +347,74 @@ "editable": true }, "outputs": [], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "\n", + "train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "test_set.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "housing[\"median_income\"].hist()" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "housing[\"income_cat\"] = np.ceil(housing[\"median_income\"] / 1.5)\n", + "housing[\"income_cat\"].where(housing[\"income_cat\"] < 5, 5.0, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "housing[\"income_cat\"].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], "source": [ "from sklearn.model_selection import StratifiedShuffleSplit\n", "\n", @@ -361,7 +426,20 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 24, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "housing[\"income_cat\"].value_counts() / len(housing)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, "metadata": { 
"collapsed": false, "deletable": true, @@ -385,7 +463,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 26, "metadata": { "collapsed": false, "deletable": true, @@ -398,7 +476,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 27, "metadata": { "collapsed": false, "deletable": true, @@ -406,8 +484,8 @@ }, "outputs": [], "source": [ - "for set in (strat_train_set, strat_test_set):\n", - " set.drop(\"income_cat\", axis=1, inplace=True)" + "for set_ in (strat_train_set, strat_test_set):\n", + " set_.drop(\"income_cat\", axis=1, inplace=True)" ] }, { @@ -422,7 +500,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 28, "metadata": { "collapsed": true, "deletable": true, @@ -435,7 +513,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 29, "metadata": { "collapsed": false, "deletable": true, @@ -449,7 +527,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 30, "metadata": { "collapsed": false, "deletable": true, @@ -463,28 +541,25 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 31, "metadata": { "collapsed": false, "deletable": true, - "editable": true, - "scrolled": true + "editable": true }, "outputs": [], "source": [ - "housing.plot(kind=\"scatter\", x=\"longitude\", y=\"latitude\",\n", - " s=housing['population']/100, label=\"population\",\n", - " c=\"median_house_value\", cmap=plt.get_cmap(\"jet\"),\n", - " colorbar=True, alpha=0.4, figsize=(10,7),\n", + "housing.plot(kind=\"scatter\", x=\"longitude\", y=\"latitude\", alpha=0.4,\n", + " s=housing[\"population\"]/100, label=\"population\", figsize=(10,7),\n", + " c=\"median_house_value\", cmap=plt.get_cmap(\"jet\"), colorbar=True,\n", ")\n", "plt.legend()\n", - "save_fig(\"housing_prices_scatterplot\")\n", - "plt.show()" + "save_fig(\"housing_prices_scatterplot\")" ] }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 32, "metadata": { 
"collapsed": false, "deletable": true, @@ -516,7 +591,20 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 33, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "corr_matrix = housing.corr()" + ] + }, + { + "cell_type": "code", + "execution_count": 34, "metadata": { "collapsed": false, "deletable": true, @@ -524,13 +612,12 @@ }, "outputs": [], "source": [ - "corr_matrix = housing.corr()\n", "corr_matrix[\"median_house_value\"].sort_values(ascending=False)" ] }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 35, "metadata": { "collapsed": false, "deletable": true, @@ -539,15 +626,14 @@ "outputs": [], "source": [ "housing.plot(kind=\"scatter\", x=\"median_income\", y=\"median_house_value\",\n", - " alpha=0.3)\n", + " alpha=0.1)\n", "plt.axis([0, 16, 0, 550000])\n", - "save_fig(\"income_vs_house_value_scatterplot\")\n", - "plt.show()" + "save_fig(\"income_vs_house_value_scatterplot\")" ] }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 36, "metadata": { "collapsed": false, "deletable": true, @@ -557,15 +643,15 @@ "source": [ "from pandas.tools.plotting import scatter_matrix\n", "\n", - "attributes = [\"median_house_value\", \"median_income\", \"total_rooms\", \"housing_median_age\"]\n", - "scatter_matrix(housing[attributes], figsize=(11, 8))\n", - "save_fig(\"scatter_matrix_plot\")\n", - "plt.show()" + "attributes = [\"median_house_value\", \"median_income\", \"total_rooms\",\n", + " \"housing_median_age\"]\n", + "scatter_matrix(housing[attributes], figsize=(12, 8))\n", + "save_fig(\"scatter_matrix_plot\")" ] }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 37, "metadata": { "collapsed": true, "deletable": true, @@ -573,14 +659,14 @@ }, "outputs": [], "source": [ - "housing[\"rooms_per_household\"] = housing[\"total_rooms\"] / housing[\"population\"]\n", - "housing[\"bedrooms_per_room\"] = 
housing[\"total_bedrooms\"] / housing[\"total_rooms\"]\n", - "housing[\"population_per_household\"] = housing[\"population\"] / housing[\"households\"]" + "housing[\"rooms_per_household\"] = housing[\"total_rooms\"]/housing[\"households\"]\n", + "housing[\"bedrooms_per_room\"] = housing[\"total_bedrooms\"]/housing[\"total_rooms\"]\n", + "housing[\"population_per_household\"]=housing[\"population\"]/housing[\"households\"]" ] }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 38, "metadata": { "collapsed": false, "deletable": true, @@ -594,7 +680,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 39, "metadata": { "collapsed": false, "deletable": true, @@ -610,7 +696,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 40, "metadata": { "collapsed": false, "deletable": true, @@ -633,7 +719,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 41, "metadata": { "collapsed": true, "deletable": true, @@ -647,7 +733,20 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 42, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "housing.iloc[21:24]" + ] + }, + { + "cell_type": "code", + "execution_count": 43, "metadata": { "collapsed": false, "deletable": true, @@ -656,25 +755,12 @@ "outputs": [], "source": [ "housing_copy = housing.copy().iloc[21:24]\n", - "housing_copy" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, - "outputs": [], - "source": [ "housing_copy.dropna(subset=[\"total_bedrooms\"]) # option 1" ] }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 44, "metadata": { "collapsed": false, "deletable": true, @@ -688,7 +774,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 45, "metadata": { "collapsed": false, "deletable": true, @@ -704,7 
+790,57 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 46, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "housing_copy.drop(\"total_bedrooms\", axis=1) # option 2" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "median = housing_copy[\"total_bedrooms\"].median()\n", + "housing_copy[\"total_bedrooms\"].fillna(median, inplace=True) # option 3\n", + "housing_copy" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "# Summary...\n", + "housing_copy = housing.copy().iloc[21:24]\n", + "housing_copy.dropna(subset=[\"total_bedrooms\"]) # option 1\n", + "\n", + "housing_copy = housing.copy().iloc[21:24]\n", + "housing_copy.drop(\"total_bedrooms\", axis=1) # option 2\n", + "\n", + "housing_copy = housing.copy().iloc[21:24]\n", + "median = housing_copy[\"total_bedrooms\"].median()\n", + "housing_copy[\"total_bedrooms\"].fillna(median, inplace=True) # option 3" + ] + }, + { + "cell_type": "code", + "execution_count": 49, "metadata": { "collapsed": false, "deletable": true, @@ -714,17 +850,38 @@ "source": [ "from sklearn.preprocessing import Imputer\n", "\n", - "imputer = Imputer(strategy='median')\n", - "housing_num = housing.drop(\"ocean_proximity\", axis=1)\n", - "imputer.fit(housing_num)\n", - "X = imputer.transform(housing_num)\n", - "housing_tr = pd.DataFrame(X, columns=housing_num.columns)\n", - "housing_tr.iloc[21:24]" + "imputer = Imputer(strategy=\"median\")" ] }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 50, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "housing_num = housing.drop(\"ocean_proximity\", axis=1)" + ] + }, + { + 
"cell_type": "code", + "execution_count": 51, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "imputer.fit(housing_num)" + ] + }, + { + "cell_type": "code", + "execution_count": 52, "metadata": { "collapsed": false, "deletable": true, @@ -737,7 +894,7 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 53, "metadata": { "collapsed": false, "deletable": true, @@ -750,7 +907,46 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 54, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "X = imputer.transform(housing_num)" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "housing_tr = pd.DataFrame(X, columns=housing_num.columns)" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "housing_tr.iloc[21:24]" + ] + }, + { + "cell_type": "code", + "execution_count": 57, "metadata": { "collapsed": false, "deletable": true, @@ -763,7 +959,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 58, "metadata": { "collapsed": false, "deletable": true, @@ -777,7 +973,7 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 59, "metadata": { "collapsed": false, "deletable": true, @@ -795,7 +991,7 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 60, "metadata": { "collapsed": false, "deletable": true, @@ -808,7 +1004,7 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 61, "metadata": { "collapsed": false, "deletable": true, @@ -825,7 +1021,7 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 62, "metadata": { "collapsed": false, "deletable": true, 
@@ -838,7 +1034,7 @@ }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 63, "metadata": { "collapsed": false, "deletable": true, @@ -849,12 +1045,13 @@ "from sklearn.preprocessing import LabelBinarizer\n", "\n", "encoder = LabelBinarizer()\n", - "encoder.fit_transform(housing_cat)" + "housing_cat_1hot = encoder.fit_transform(housing_cat)\n", + "housing_cat_1hot" ] }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 64, "metadata": { "collapsed": false, "deletable": true, @@ -876,20 +1073,32 @@ " population_per_household = X[:, population_ix] / X[:, household_ix]\n", " if self.add_bedrooms_per_room:\n", " bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]\n", - " return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]\n", + " return np.c_[X, rooms_per_household, population_per_household,\n", + " bedrooms_per_room]\n", " else:\n", " return np.c_[X, rooms_per_household, population_per_household]\n", "\n", "attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)\n", - "housing_extra_attribs = attr_adder.transform(housing.values)\n", - "\n", + "housing_extra_attribs = attr_adder.transform(housing.values)" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ "housing_extra_attribs = pd.DataFrame(housing_extra_attribs, columns=list(housing.columns)+[\"rooms_per_household\", \"population_per_household\"])\n", "housing_extra_attribs.head()" ] }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 66, "metadata": { "collapsed": false, "deletable": true, @@ -906,12 +1115,46 @@ " ('std_scaler', StandardScaler()),\n", " ])\n", "\n", - "num_pipeline.fit_transform(housing_num)" + "housing_num_tr = num_pipeline.fit_transform(housing_num)" ] }, { "cell_type": "code", - "execution_count": 51, + "execution_count": 67, + "metadata": { + "collapsed": false, + 
"deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "housing_num_tr" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "from sklearn.base import BaseEstimator, TransformerMixin\n", + "\n", + "class DataFrameSelector(BaseEstimator, TransformerMixin):\n", + " def __init__(self, attribute_names):\n", + " self.attribute_names = attribute_names\n", + " def fit(self, X, y=None):\n", + " return self\n", + " def transform(self, X):\n", + " return X[self.attribute_names].values" + ] + }, + { + "cell_type": "code", + "execution_count": 69, "metadata": { "collapsed": false, "deletable": true, @@ -921,14 +1164,6 @@ "source": [ "from sklearn.pipeline import FeatureUnion\n", "\n", - "class DataFrameSelector(BaseEstimator, TransformerMixin):\n", - " def __init__(self, attribute_names):\n", - " self.attribute_names = attribute_names\n", - " def fit(self, X, y=None):\n", - " return self\n", - " def transform(self, X):\n", - " return X[self.attribute_names].values\n", - "\n", "num_attribs = list(housing_num)\n", "cat_attribs = [\"ocean_proximity\"]\n", "\n", @@ -944,15 +1179,15 @@ " ('label_binarizer', LabelBinarizer()),\n", " ])\n", "\n", - "preparation_pipeline = FeatureUnion(transformer_list=[\n", + "full_pipeline = FeatureUnion(transformer_list=[\n", " (\"num_pipeline\", num_pipeline),\n", " (\"cat_pipeline\", cat_pipeline),\n", - " ])\n" + " ])" ] }, { "cell_type": "code", - "execution_count": 52, + "execution_count": 70, "metadata": { "collapsed": false, "deletable": true, @@ -960,13 +1195,13 @@ }, "outputs": [], "source": [ - "housing_prepared = preparation_pipeline.fit_transform(housing)\n", + "housing_prepared = full_pipeline.fit_transform(housing)\n", "housing_prepared" ] }, { "cell_type": "code", - "execution_count": 53, + "execution_count": 71, "metadata": { "collapsed": false, "deletable": true, @@ -989,7 +1224,7 @@ }, { 
"cell_type": "code", - "execution_count": 54, + "execution_count": 72, "metadata": { "collapsed": false, "deletable": true, @@ -1005,7 +1240,7 @@ }, { "cell_type": "code", - "execution_count": 55, + "execution_count": 73, "metadata": { "collapsed": false, "deletable": true, @@ -1016,15 +1251,40 @@ "# let's try the full pipeline on a few training instances\n", "some_data = housing.iloc[:5]\n", "some_labels = housing_labels.iloc[:5]\n", - "some_data_prepared = preparation_pipeline.transform(some_data)\n", + "some_data_prepared = full_pipeline.transform(some_data)\n", "\n", - "print(\"Predictions:\\t\", lin_reg.predict(some_data_prepared))\n", + "print(\"Predictions:\\t\", lin_reg.predict(some_data_prepared))" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ "print(\"Labels:\\t\\t\", list(some_labels))" ] }, { "cell_type": "code", - "execution_count": 56, + "execution_count": 75, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "some_data_prepared" + ] + }, + { + "cell_type": "code", + "execution_count": 76, "metadata": { "collapsed": false, "deletable": true, @@ -1042,7 +1302,7 @@ }, { "cell_type": "code", - "execution_count": 57, + "execution_count": 77, "metadata": { "collapsed": false, "deletable": true, @@ -1058,7 +1318,7 @@ }, { "cell_type": "code", - "execution_count": 58, + "execution_count": 78, "metadata": { "collapsed": false, "deletable": true, @@ -1069,7 +1329,19 @@ "from sklearn.tree import DecisionTreeRegressor\n", "\n", "tree_reg = DecisionTreeRegressor()\n", - "tree_reg.fit(housing_prepared, housing_labels)\n", + "tree_reg.fit(housing_prepared, housing_labels)" + ] + }, + { + "cell_type": "code", + "execution_count": 79, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ "housing_predictions = 
tree_reg.predict(housing_prepared)\n", "tree_mse = mean_squared_error(housing_labels, housing_predictions)\n", "tree_rmse = np.sqrt(tree_mse)\n", @@ -1088,7 +1360,7 @@ }, { "cell_type": "code", - "execution_count": 59, + "execution_count": 80, "metadata": { "collapsed": false, "deletable": true, @@ -1098,14 +1370,14 @@ "source": [ "from sklearn.model_selection import cross_val_score\n", "\n", - "tree_scores = cross_val_score(tree_reg, housing_prepared, housing_labels,\n", - " scoring=\"neg_mean_squared_error\", cv=10)\n", - "tree_rmse_scores = np.sqrt(-tree_scores)" + "scores = cross_val_score(tree_reg, housing_prepared, housing_labels,\n", + " scoring=\"neg_mean_squared_error\", cv=10)\n", + "tree_rmse_scores = np.sqrt(-scores)" ] }, { "cell_type": "code", - "execution_count": 60, + "execution_count": 81, "metadata": { "collapsed": false, "deletable": true, @@ -1123,7 +1395,7 @@ }, { "cell_type": "code", - "execution_count": 61, + "execution_count": 82, "metadata": { "collapsed": false, "deletable": true, @@ -1139,7 +1411,7 @@ }, { "cell_type": "code", - "execution_count": 62, + "execution_count": 83, "metadata": { "collapsed": false, "deletable": true, @@ -1150,7 +1422,19 @@ "from sklearn.ensemble import RandomForestRegressor\n", "\n", "forest_reg = RandomForestRegressor()\n", - "forest_reg.fit(housing_prepared, housing_labels)\n", + "forest_reg.fit(housing_prepared, housing_labels)" + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ "housing_predictions = forest_reg.predict(housing_prepared)\n", "forest_mse = mean_squared_error(housing_labels, housing_predictions)\n", "forest_rmse = np.sqrt(forest_mse)\n", @@ -1159,7 +1443,7 @@ }, { "cell_type": "code", - "execution_count": 63, + "execution_count": 85, "metadata": { "collapsed": false, "deletable": true, @@ -1177,7 +1461,7 @@ }, { "cell_type": "code", - "execution_count": 64, + 
"execution_count": 86, "metadata": { "collapsed": false, "deletable": true, @@ -1191,7 +1475,7 @@ }, { "cell_type": "code", - "execution_count": 65, + "execution_count": 87, "metadata": { "collapsed": false, "deletable": true, @@ -1211,7 +1495,7 @@ }, { "cell_type": "code", - "execution_count": 66, + "execution_count": 88, "metadata": { "collapsed": false, "deletable": true, @@ -1222,18 +1506,19 @@ "from sklearn.model_selection import GridSearchCV\n", "\n", "param_grid = [\n", - " {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},\n", - " {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},\n", - " ]\n", + " {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},\n", + " {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},\n", + " ]\n", "\n", "forest_reg = RandomForestRegressor()\n", - "grid_search = GridSearchCV(forest_reg, param_grid, cv=5, scoring='neg_mean_squared_error')\n", + "grid_search = GridSearchCV(forest_reg, param_grid, cv=5,\n", + " scoring='neg_mean_squared_error')\n", "grid_search.fit(housing_prepared, housing_labels)" ] }, { "cell_type": "code", - "execution_count": 67, + "execution_count": 89, "metadata": { "collapsed": false, "deletable": true, @@ -1246,7 +1531,7 @@ }, { "cell_type": "code", - "execution_count": 68, + "execution_count": 90, "metadata": { "collapsed": false, "deletable": true, @@ -1259,7 +1544,7 @@ }, { "cell_type": "code", - "execution_count": 69, + "execution_count": 91, "metadata": { "collapsed": false, "deletable": true, @@ -1274,7 +1559,7 @@ }, { "cell_type": "code", - "execution_count": 70, + "execution_count": 92, "metadata": { "collapsed": false, "deletable": true, @@ -1287,7 +1572,7 @@ }, { "cell_type": "code", - "execution_count": 71, + "execution_count": 93, "metadata": { "collapsed": false, "deletable": true, @@ -1311,7 +1596,7 @@ }, { "cell_type": "code", - "execution_count": 72, + "execution_count": 94, "metadata": { "collapsed": false, "deletable": true, 
@@ -1326,7 +1611,7 @@ }, { "cell_type": "code", - "execution_count": 73, + "execution_count": 95, "metadata": { "collapsed": false, "deletable": true, @@ -1340,7 +1625,7 @@ }, { "cell_type": "code", - "execution_count": 74, + "execution_count": 96, "metadata": { "collapsed": false, "deletable": true, @@ -1348,7 +1633,7 @@ }, "outputs": [], "source": [ - "extra_attribs = [\"rooms_per_household\", \"population_per_household\", \"bedrooms_per_room\"]\n", + "extra_attribs = [\"rooms_per_hhold\", \"pop_per_hhold\", \"bedrooms_per_room\"]\n", "cat_one_hot_attribs = list(encoder.classes_)\n", "attributes = num_attribs + extra_attribs + cat_one_hot_attribs\n", "sorted(zip(feature_importances, attributes), reverse=True)" @@ -1356,9 +1641,9 @@ }, { "cell_type": "code", - "execution_count": 75, + "execution_count": 97, "metadata": { - "collapsed": false, + "collapsed": true, "deletable": true, "editable": true }, @@ -1369,11 +1654,23 @@ "X_test = strat_test_set.drop(\"median_house_value\", axis=1)\n", "y_test = strat_test_set[\"median_house_value\"].copy()\n", "\n", - "X_test_transformed = preparation_pipeline.transform(X_test)\n", - "final_predictions = final_model.predict(X_test_transformed)\n", + "X_test_prepared = full_pipeline.transform(X_test)\n", + "final_predictions = final_model.predict(X_test_prepared)\n", "\n", "final_mse = mean_squared_error(y_test, final_predictions)\n", - "final_rmse = np.sqrt(final_mse)\n", + "final_rmse = np.sqrt(final_mse)" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ "final_rmse" ] }, @@ -1402,7 +1699,7 @@ }, { "cell_type": "code", - "execution_count": 76, + "execution_count": 99, "metadata": { "collapsed": false, "deletable": true, @@ -1418,13 +1715,13 @@ "cat_pipeline.steps[1] = (\"label_binarizer\", SupervisionFriendlyLabelBinarizer())\n", "\n", "# Now you can create a full pipeline with a supervised predictor 
at the end.\n", - "full_pipeline = Pipeline([\n", - " (\"preparation\", preparation_pipeline),\n", + "full_pipeline_with_predictor = Pipeline([\n", + " (\"preparation\", full_pipeline),\n", " (\"linear\", LinearRegression())\n", " ])\n", "\n", - "full_pipeline.fit(housing, housing_labels)\n", - "full_pipeline.predict(some_data)" + "full_pipeline_with_predictor.fit(housing, housing_labels)\n", + "full_pipeline_with_predictor.predict(some_data)" ] }, { @@ -1439,7 +1736,7 @@ }, { "cell_type": "code", - "execution_count": 77, + "execution_count": 100, "metadata": { "collapsed": true, "deletable": true, @@ -1447,34 +1744,23 @@ }, "outputs": [], "source": [ - "from sklearn.externals import joblib" + "my_model = full_pipeline_with_predictor" ] }, { "cell_type": "code", - "execution_count": 78, + "execution_count": 101, "metadata": { - "collapsed": false, + "collapsed": true, "deletable": true, "editable": true }, "outputs": [], "source": [ - "joblib.dump(final_model, \"my_random_forest_regressor.pkl\")" - ] - }, - { - "cell_type": "code", - "execution_count": 79, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, - "outputs": [], - "source": [ - "final_model_loaded = joblib.load(\"my_random_forest_regressor.pkl\")\n", - "final_model_loaded" + "from sklearn.externals import joblib\n", + "joblib.dump(my_model, \"my_model.pkl\") # DIFF\n", + "#...\n", + "my_model_loaded = joblib.load(\"my_model.pkl\") # DIFF" ] }, { @@ -1489,7 +1775,7 @@ }, { "cell_type": "code", - "execution_count": 80, + "execution_count": 102, "metadata": { "collapsed": false, "deletable": true, @@ -1539,7 +1825,7 @@ }, { "cell_type": "code", - "execution_count": 81, + "execution_count": 103, "metadata": { "collapsed": false, "deletable": true, @@ -1572,7 +1858,7 @@ }, { "cell_type": "code", - "execution_count": 82, + "execution_count": 104, "metadata": { "collapsed": false, "deletable": true, @@ -1597,7 +1883,7 @@ }, { "cell_type": "code", - "execution_count": 83, + 
"execution_count": 105, "metadata": { "collapsed": false, "deletable": true, @@ -1640,7 +1926,7 @@ }, { "cell_type": "code", - "execution_count": 84, + "execution_count": 106, "metadata": { "collapsed": false, "deletable": true, @@ -1679,7 +1965,7 @@ }, { "cell_type": "code", - "execution_count": 85, + "execution_count": 107, "metadata": { "collapsed": false, "deletable": true, @@ -1704,7 +1990,7 @@ }, { "cell_type": "code", - "execution_count": 86, + "execution_count": 108, "metadata": { "collapsed": false, "deletable": true, @@ -1737,7 +2023,7 @@ }, { "cell_type": "code", - "execution_count": 87, + "execution_count": 109, "metadata": { "collapsed": false, "deletable": true, @@ -1769,7 +2055,7 @@ }, { "cell_type": "code", - "execution_count": 88, + "execution_count": 110, "metadata": { "collapsed": false, "deletable": true, @@ -1791,7 +2077,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "The reciprocal distribution is useful when you have no idea what the scale of the hyperparameter should be (indeed, as you can see on the figure on the right, all scales are equally likely, within the given range), whereas the exponential distribution is best when you know (more or less) what the scale of the hyperparameter should be." 
] @@ -1818,7 +2107,7 @@ }, { "cell_type": "code", - "execution_count": 89, + "execution_count": 111, "metadata": { "collapsed": true, "deletable": true, @@ -1864,7 +2153,7 @@ }, { "cell_type": "code", - "execution_count": 90, + "execution_count": 112, "metadata": { "collapsed": true, "deletable": true, @@ -1887,7 +2176,7 @@ }, { "cell_type": "code", - "execution_count": 91, + "execution_count": 113, "metadata": { "collapsed": false, "deletable": true, @@ -1901,7 +2190,7 @@ }, { "cell_type": "code", - "execution_count": 92, + "execution_count": 114, "metadata": { "collapsed": false, "deletable": true, @@ -1924,7 +2213,7 @@ }, { "cell_type": "code", - "execution_count": 93, + "execution_count": 115, "metadata": { "collapsed": false, "deletable": true, @@ -1947,7 +2236,7 @@ }, { "cell_type": "code", - "execution_count": 94, + "execution_count": 116, "metadata": { "collapsed": false, "deletable": true, @@ -1956,14 +2245,14 @@ "outputs": [], "source": [ "preparation_and_feature_selection_pipeline = Pipeline([\n", - " ('preparation', preparation_pipeline),\n", + " ('preparation', full_pipeline),\n", " ('feature_selection', TopFeatureSelector(feature_importances, k))\n", "])" ] }, { "cell_type": "code", - "execution_count": 95, + "execution_count": 117, "metadata": { "collapsed": true, "deletable": true, @@ -1986,7 +2275,7 @@ }, { "cell_type": "code", - "execution_count": 96, + "execution_count": 118, "metadata": { "collapsed": false, "deletable": true, @@ -2009,7 +2298,7 @@ }, { "cell_type": "code", - "execution_count": 97, + "execution_count": 119, "metadata": { "collapsed": false, "deletable": true, @@ -2052,16 +2341,16 @@ }, { "cell_type": "code", - "execution_count": 98, + "execution_count": 121, "metadata": { - "collapsed": true, + "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "prepare_select_and_predict_pipeline = Pipeline([\n", - " ('preparation', preparation_pipeline),\n", + " ('preparation', full_pipeline),\n", " 
('feature_selection', TopFeatureSelector(feature_importances, k)),\n", " ('svm_reg', SVR(**rnd_search.best_params_))\n", "])" @@ -2069,7 +2358,7 @@ }, { "cell_type": "code", - "execution_count": 99, + "execution_count": 122, "metadata": { "collapsed": false, "deletable": true, @@ -2092,7 +2381,7 @@ }, { "cell_type": "code", - "execution_count": 100, + "execution_count": 123, "metadata": { "collapsed": false, "deletable": true, @@ -2139,7 +2428,7 @@ }, { "cell_type": "code", - "execution_count": 101, + "execution_count": 124, "metadata": { "collapsed": false, "deletable": true, @@ -2159,7 +2448,7 @@ }, { "cell_type": "code", - "execution_count": 102, + "execution_count": 125, "metadata": { "collapsed": false, "deletable": true, @@ -2182,7 +2471,7 @@ }, { "cell_type": "code", - "execution_count": 103, + "execution_count": 126, "metadata": { "collapsed": false, "deletable": true, diff --git a/09_up_and_running_with_tensorflow.ipynb b/09_up_and_running_with_tensorflow.ipynb index 361cc17..7cfc20b 100644 --- a/09_up_and_running_with_tensorflow.ipynb +++ b/09_up_and_running_with_tensorflow.ipynb @@ -2810,7 +2810,7 @@ }, "outputs": [], "source": [ - "n_epochs = 500\n", + "n_epochs = 1000\n", "batch_size = 50\n", "n_batches = int(np.ceil(m / batch_size))\n", "\n", @@ -2936,14 +2936,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Well, that looks pretty bad, doesn't it? But let's not forget that the Logistic Regression model has a linear decision boundary, so this is actually close to the best we can do with this model (unless we add more features, such as ${x_1}^2$, ${x_2}^2$ and $x_1 x_2$)." + "Well, that looks pretty bad, doesn't it? But let's not forget that the Logistic Regression model has a linear decision boundary, so this is actually close to the best we can do with this model (unless we add more features, as we will show in a second)." 
def add_polynomial_features(X_with_bias):
    """Append the squares and cubes of columns 1 and 2 to a 2-D array.

    The result has the original columns followed by, in this order:
    col1**2, col2**2, col1**3, col2**3 (i.e. 4 extra columns).

    NOTE(review): columns 1 and 2 are presumably the raw features x1 and x2,
    with column 0 holding the bias term -- confirm against the cell that
    builds X_train / X_test.
    """
    x1 = X_with_bias[:, 1]
    x2 = X_with_bias[:, 2]
    return np.c_[X_with_bias,
                 np.square(x1),
                 np.square(x2),
                 x1 ** 3,
                 x2 ** 3]


# Use the exact same transformation for both sets, so the training and test
# features can never get out of sync.
X_train_enhanced = add_polynomial_features(X_train)
X_test_enhanced = add_polynomial_features(X_test)
def logistic_regression(X, y, initializer=None, seed=42, learning_rate=0.01):
    """Build the Logistic Regression graph on top of existing inputs.

    Args:
        X: input tensor; its second dimension must be statically known, since
            it determines the number of rows of `theta`.
        y: target tensor, passed as the labels of the log loss.
        initializer: optional initial value for `theta`. When None, `theta`
            is drawn from `tf.random_uniform` between -1.0 and 1.0 with shape
            (second dimension of X, 1).
        seed: seed given to the default random initializer (unused when an
            explicit `initializer` is passed).
        learning_rate: learning rate of the gradient descent optimizer.

    Returns:
        The tuple (y_proba, loss, training_op, loss_summary, init, saver).
    """
    n_theta_rows = int(X.get_shape()[1])
    with tf.name_scope("logistic_regression"):
        with tf.name_scope("model"):
            theta_init = initializer
            if theta_init is None:
                theta_init = tf.random_uniform(
                    [n_theta_rows, 1], -1.0, 1.0, seed=seed)
            theta = tf.Variable(theta_init, name="theta")
            logits = tf.matmul(X, theta, name="logits")
            y_proba = tf.sigmoid(logits)
        with tf.name_scope("train"):
            loss = tf.losses.log_loss(y, y_proba, scope="loss")
            gradient_descent = tf.train.GradientDescentOptimizer(
                learning_rate=learning_rate)
            training_op = gradient_descent.minimize(loss)
            loss_summary = tf.summary.scalar('log_loss', loss)
        # The init op and the Saver are created last, once `theta` exists.
        with tf.name_scope("init"):
            init = tf.global_variables_initializer()
        with tf.name_scope("save"):
            saver = tf.train.Saver()
    graph_handles = (y_proba, loss, training_op, loss_summary, init, saver)
    return graph_handles
def log_dir(prefix=""):
    """Return a fresh, timestamped TensorBoard log directory path.

    The path has the form "tf_logs/[<prefix>-]run-<YYYYmmddHHMMSS>/", where
    the timestamp is the current UTC time. The optional `prefix` (followed by
    a dash) is only included when it is non-empty.
    """
    timestamp = datetime.utcnow().strftime("%Y%m%d%H%M%S")
    label = prefix + "-" if prefix else prefix
    run_name = label + "run-" + timestamp
    return "{}/{}/".format("tf_logs", run_name)
+ ] + }, + { + "cell_type": "code", + "execution_count": 133, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "n_epochs = 10001\n", + "batch_size = 50\n", + "n_batches = int(np.ceil(m / batch_size))\n", + "\n", + "checkpoint_path = \"/tmp/my_logreg_model.ckpt\"\n", + "checkpoint_epoch_path = checkpoint_path + \".epoch\"\n", + "final_model_path = \"./my_logreg_model\"\n", + "\n", + "with tf.Session() as sess:\n", + " if os.path.isfile(checkpoint_epoch_path):\n", + " # if the checkpoint file exists, restore the model and load the epoch number\n", + " with open(checkpoint_epoch_path, \"rb\") as f:\n", + " start_epoch = int(f.read())\n", + " print(\"Training was interrupted. Continuing at epoch\", start_epoch)\n", + " saver.restore(sess, checkpoint_path)\n", + " else:\n", + " start_epoch = 0\n", + " sess.run(init)\n", + "\n", + " for epoch in range(start_epoch, n_epochs):\n", + " for batch_index in range(n_batches):\n", + " X_batch, y_batch = random_batch(X_train_enhanced, y_train, batch_size)\n", + " sess.run(training_op, feed_dict={X: X_batch, y: y_batch})\n", + " loss_val, summary_str = sess.run([loss, loss_summary], feed_dict={X: X_test_enhanced, y: y_test})\n", + " file_writer.add_summary(summary_str, epoch)\n", + " if epoch % 500 == 0:\n", + " print(\"Epoch:\", epoch, \"\\tLoss:\", loss_val)\n", + " saver.save(sess, checkpoint_path)\n", + " with open(checkpoint_epoch_path, \"wb\") as f:\n", + " f.write(b\"%d\" % (epoch + 1))\n", + "\n", + " saver.save(sess, final_model_path)\n", + " y_proba_val = y_proba.eval(feed_dict={X: X_test_enhanced, y: y_test})\n", + " os.remove(checkpoint_epoch_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Once again, we can make predictions by just classifying as positive all the instances whose estimated probability is greater or equal to 0.5:" + ] + }, + { + "cell_type": "code", + "execution_count": 134, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": 
# Boolean mask of the instances predicted positive, flattened to 1D so that it
# can be used to select rows (y_pred is a column vector).
y_pred_idx = y_pred.reshape(-1)

# Draw the predicted positives first, then the predicted negatives.
for row_mask, marker_style, legend_label in (
        (y_pred_idx, 'go', "Positive"),
        (~y_pred_idx, 'r^', "Negative")):
    plt.plot(X_test[row_mask, 1], X_test[row_mask, 2], marker_style,
             label=legend_label)
plt.legend()
plt.show()
+ ] + }, + { + "cell_type": "code", + "execution_count": 138, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from scipy.stats import reciprocal\n", + "\n", + "n_search_iterations = 10\n", + "\n", + "for search_iteration in range(n_search_iterations):\n", + " batch_size = np.random.randint(1, 100)\n", + " learning_rate = reciprocal(0.0001, 0.1).rvs()\n", + "\n", + " n_inputs = 2 + 4\n", + " logdir = log_dir(\"logreg\")\n", + " \n", + " print(\"Iteration\", search_iteration)\n", + " print(\" logdir:\", logdir)\n", + " print(\" batch size:\", batch_size)\n", + " print(\" learning_rate:\", learning_rate)\n", + " print(\" training: \", end=\"\")\n", + "\n", + " tf.reset_default_graph()\n", + "\n", + " X = tf.placeholder(tf.float32, shape=(None, n_inputs + 1), name=\"X\")\n", + " y = tf.placeholder(tf.float32, shape=(None, 1), name=\"y\")\n", + "\n", + " y_proba, loss, training_op, loss_summary, init, saver = logistic_regression(\n", + " X, y, learning_rate=learning_rate)\n", + "\n", + " file_writer = tf.summary.FileWriter(logdir, tf.get_default_graph())\n", + "\n", + " n_epochs = 10001\n", + " n_batches = int(np.ceil(m / batch_size))\n", + "\n", + " final_model_path = \"./my_logreg_model_%d\" % search_iteration\n", + "\n", + " with tf.Session() as sess:\n", + " sess.run(init)\n", + "\n", + " for epoch in range(n_epochs):\n", + " for batch_index in range(n_batches):\n", + " X_batch, y_batch = random_batch(X_train_enhanced, y_train, batch_size)\n", + " sess.run(training_op, feed_dict={X: X_batch, y: y_batch})\n", + " loss_val, summary_str = sess.run([loss, loss_summary], feed_dict={X: X_test_enhanced, y: y_test})\n", + " file_writer.add_summary(summary_str, epoch)\n", + " if epoch % 500 == 0:\n", + " print(\".\", end=\"\")\n", + "\n", + " saver.save(sess, final_model_path)\n", + "\n", + " print()\n", + " y_proba_val = y_proba.eval(feed_dict={X: X_test_enhanced, y: y_test})\n", + " y_pred = (y_proba_val >= 0.5)\n", + " \n", + " print(\" 
precision:\", precision_score(y_test, y_pred))\n", + " print(\" recall:\", recall_score(y_test, y_pred))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The `reciprocal()` function from SciPy's `stats` module returns a random distribution that is commonly used when you have no idea of the optimal scale of a hyperparameter. See the exercise solutions for chapter 2 for more details. " ] }, {