diff --git a/02_end_to_end_machine_learning_project.ipynb b/02_end_to_end_machine_learning_project.ipynb index e3a88ed..1ee28e6 100644 --- a/02_end_to_end_machine_learning_project.ipynb +++ b/02_end_to_end_machine_learning_project.ipynb @@ -92,13 +92,14 @@ "source": [ "import os\n", "import tarfile\n", - "import urllib.request\n", + "from six.moves import urllib\n", "\n", "HOUSING_PATH = \"datasets/housing\"\n", "HOUSING_URL = DATASETS_URL + \"/housing/housing.tgz\"\n", "\n", "def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):\n", - " os.makedirs(housing_path, exist_ok=True)\n", + " if not os.path.exists(housing_path):\n", + " os.makedirs(housing_path)\n", " tgz_path = os.path.join(housing_path, \"housing.tgz\")\n", " urllib.request.urlretrieve(housing_url, tgz_path)\n", " housing_tgz = tarfile.open(tgz_path)\n", @@ -235,7 +236,7 @@ "import hashlib\n", "\n", "def test_set_check(identifier, test_ratio, hash):\n", - " return hash(str(identifier).encode(\"ascii\")).digest()[-1] < 256 * test_ratio\n", + " return bytearray(hash(np.int64(identifier)).digest())[-1] < 256 * test_ratio\n", "\n", "def split_train_test_by_id(data, test_ratio, id_column, hash=hashlib.md5):\n", " ids = data[id_column]\n", @@ -264,7 +265,7 @@ }, "outputs": [], "source": [ - "from sklearn.cross_validation import train_test_split\n", + "from sklearn.model_selection import train_test_split\n", "\n", "train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)\n", "test_set.head()" @@ -302,12 +303,12 @@ }, "outputs": [], "source": [ - "from sklearn.cross_validation import StratifiedShuffleSplit\n", + "from sklearn.model_selection import StratifiedShuffleSplit\n", "\n", - "split = StratifiedShuffleSplit(housing[\"income_cat\"], test_size=0.2, random_state=42)\n", - "train_index, test_index = next(iter(split))\n", - "strat_train_set = housing.loc[train_index]\n", - "strat_test_set = housing.loc[test_index]" + "split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)\n", + "for train_index, test_index in split.split(housing, housing[\"income_cat\"]):\n", + " strat_train_set = housing.loc[train_index]\n", + " strat_test_set = housing.loc[test_index]" ] }, { @@ -953,10 +954,10 @@ }, "outputs": [], "source": [ - "from sklearn.cross_validation import cross_val_score\n", + "from sklearn.model_selection import cross_val_score\n", "\n", "tree_scores = cross_val_score(tree_reg, housing_prepared, housing_labels,\n", - " scoring=\"mean_squared_error\", cv=10)\n", + " scoring=\"neg_mean_squared_error\", cv=10)\n", "tree_rmse_scores = np.sqrt(-tree_scores)" ] }, @@ -985,7 +986,7 @@ "outputs": [], "source": [ "lin_scores = cross_val_score(lin_reg, housing_prepared, housing_labels,\n", - " scoring=\"mean_squared_error\", cv=10)\n", + " scoring=\"neg_mean_squared_error\", cv=10)\n", "lin_rmse_scores = np.sqrt(-lin_scores)\n", "display_scores(lin_rmse_scores)" ] @@ -1016,10 +1017,10 @@ }, "outputs": [], "source": [ - "from sklearn.cross_validation import cross_val_score\n", + "from sklearn.model_selection import cross_val_score\n", "\n", "forest_scores = cross_val_score(forest_reg, housing_prepared, housing_labels,\n", - " scoring=\"mean_squared_error\", cv=10)\n", + " scoring=\"neg_mean_squared_error\", cv=10)\n", "forest_rmse_scores = np.sqrt(-forest_scores)\n", "display_scores(forest_rmse_scores)" ] @@ -1032,7 +1033,7 @@ }, "outputs": [], "source": [ - "scores = cross_val_score(lin_reg, housing_prepared, housing_labels, scoring=\"mean_squared_error\", cv=10)\n", + "scores = cross_val_score(lin_reg, housing_prepared, housing_labels, scoring=\"neg_mean_squared_error\", cv=10)\n", "pd.Series(np.sqrt(-scores)).describe()" ] }, @@ -1062,7 +1063,7 @@ }, "outputs": [], "source": [ - "from sklearn.grid_search import GridSearchCV\n", + "from sklearn.model_selection import GridSearchCV\n", "\n", "param_grid = [\n", " {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},\n", @@ -1070,7 +1071,7 @@ " ]\n", "\n", "forest_reg = RandomForestRegressor()\n", - "grid_search = GridSearchCV(forest_reg, param_grid, cv=5, scoring='mean_squared_error')\n", + "grid_search = GridSearchCV(forest_reg, param_grid, cv=5, scoring='neg_mean_squared_error')\n", "grid_search.fit(housing_prepared, housing_labels)" ] }, @@ -1104,8 +1105,9 @@ }, "outputs": [], "source": [ - "for params, mean_score, scores in grid_search.grid_scores_:\n", - " print(np.sqrt(-mean_score), np.sqrt(-scores).std(), params)" + "cvres = grid_search.cv_results_\n", + "for mean_score, params in zip(cvres[\"mean_test_score\"], cvres[\"params\"]):\n", + " print(np.sqrt(-mean_score), params)" ] }, { @@ -1116,18 +1118,7 @@ }, "outputs": [], "source": [ - "from sklearn.grid_search import RandomizedSearchCV\n", - "from scipy.stats import randint\n", - "\n", - "param_distribs = {\n", - " 'n_estimators': randint(low=1, high=200),\n", - " 'max_features': randint(low=1, high=8),\n", - " }\n", - "\n", - "forest_reg = RandomForestRegressor()\n", - "rnd_search = RandomizedSearchCV(forest_reg, param_distributions=param_distribs,\n", - " n_iter=10, cv=5, scoring='mean_squared_error')\n", - "rnd_search.fit(housing_prepared, housing_labels)" + "pd.DataFrame(grid_search.cv_results_)" ] }, { @@ -1138,8 +1129,18 @@ }, "outputs": [], "source": [ - "for params, mean_score, scores in rnd_search.grid_scores_:\n", - " print(np.sqrt(-mean_score), np.sqrt(-scores).std(), params)" + "from sklearn.model_selection import RandomizedSearchCV\n", + "from scipy.stats import randint\n", + "\n", + "param_distribs = {\n", + " 'n_estimators': randint(low=1, high=200),\n", + " 'max_features': randint(low=1, high=8),\n", + " }\n", + "\n", + "forest_reg = RandomForestRegressor()\n", + "rnd_search = RandomizedSearchCV(forest_reg, param_distributions=param_distribs,\n", + " n_iter=10, cv=5, scoring='neg_mean_squared_error')\n", + "rnd_search.fit(housing_prepared, housing_labels)" ] }, { @@ -1149,6 +1150,19 @@ "collapsed": false }, "outputs": [], + "source": [ + "cvres = rnd_search.cv_results_\n", + "for mean_score, params in zip(cvres[\"mean_test_score\"], cvres[\"params\"]):\n", + " print(np.sqrt(-mean_score), params)" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "metadata": { + "collapsed": false + }, + "outputs": [], "source": [ "feature_importances = grid_search.best_estimator_.feature_importances_\n", "feature_importances" @@ -1156,7 +1170,7 @@ }, { "cell_type": "code", - "execution_count": 73, + "execution_count": 74, "metadata": { "collapsed": false }, @@ -1170,7 +1184,7 @@ }, { "cell_type": "code", - "execution_count": 74, + "execution_count": 75, "metadata": { "collapsed": false }, @@ -1208,7 +1222,7 @@ }, { "cell_type": "code", - "execution_count": 75, + "execution_count": 76, "metadata": { "collapsed": false }, @@ -1240,7 +1254,7 @@ }, { "cell_type": "code", - "execution_count": 76, + "execution_count": 77, "metadata": { "collapsed": true }, @@ -1251,7 +1265,7 @@ }, { "cell_type": "code", - "execution_count": 77, + "execution_count": 78, "metadata": { "collapsed": false }, @@ -1262,7 +1276,7 @@ }, { "cell_type": "code", - "execution_count": 78, + "execution_count": 79, "metadata": { "collapsed": false }, @@ -1281,7 +1295,7 @@ }, { "cell_type": "code", - "execution_count": 79, + "execution_count": 80, "metadata": { "collapsed": false }, diff --git a/images/end_to_end_project/california.png b/images/end_to_end_project/california.png new file mode 100644 index 0000000..0103e3b Binary files /dev/null and b/images/end_to_end_project/california.png differ