Upgrade chapter 2 to scikit-learn 0.18 and ensure Python 2 and Python 3 both work

2016-11-03 23:47:11 +01:00 · 2016-11-03 23:47:11 +01:00 · 61f9b54cdb
parent dabda0f806
commit 61f9b54cdb
2 changed files with 54 additions and 40 deletions
--- a/02_end_to_end_machine_learning_project.ipynb
+++ b/02_end_to_end_machine_learning_project.ipynb
@ -92,13 +92,14 @@
   "source": [
    "import os\n",
    "import tarfile\n",
-    "import urllib.request\n",
+    "from six.moves import urllib\n",
    "\n",
    "HOUSING_PATH = \"datasets/housing\"\n",
    "HOUSING_URL = DATASETS_URL + \"/housing/housing.tgz\"\n",
    "\n",
    "def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):\n",
-    "    os.makedirs(housing_path, exist_ok=True)\n",
+    "    if not os.path.exists(housing_path):\n",
+    "        os.makedirs(housing_path)\n",
    "    tgz_path = os.path.join(housing_path, \"housing.tgz\")\n",
    "    urllib.request.urlretrieve(housing_url, tgz_path)\n",
    "    housing_tgz = tarfile.open(tgz_path)\n",
@ -235,7 +236,7 @@
    "import hashlib\n",
    "\n",
    "def test_set_check(identifier, test_ratio, hash):\n",
-    "    return hash(str(identifier).encode(\"ascii\")).digest()[-1] < 256 * test_ratio\n",
+    "    return bytearray(hash(np.int64(identifier)).digest())[-1] < 256 * test_ratio\n",
    "\n",
    "def split_train_test_by_id(data, test_ratio, id_column, hash=hashlib.md5):\n",
    "    ids = data[id_column]\n",
@ -264,7 +265,7 @@
   },
   "outputs": [],
   "source": [
-    "from sklearn.cross_validation import train_test_split\n",
+    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)\n",
    "test_set.head()"
@ -302,10 +303,10 @@
   },
   "outputs": [],
   "source": [
-    "from sklearn.cross_validation import StratifiedShuffleSplit\n",
+    "from sklearn.model_selection import StratifiedShuffleSplit\n",
    "\n",
-    "split = StratifiedShuffleSplit(housing[\"income_cat\"], test_size=0.2, random_state=42)\n",
-    "train_index, test_index = next(iter(split))\n",
+    "split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)\n",
+    "for train_index, test_index in split.split(housing, housing[\"income_cat\"]):\n",
    "    strat_train_set = housing.loc[train_index]\n",
    "    strat_test_set = housing.loc[test_index]"
   ]
@ -953,10 +954,10 @@
   },
   "outputs": [],
   "source": [
-    "from sklearn.cross_validation import cross_val_score\n",
+    "from sklearn.model_selection import cross_val_score\n",
    "\n",
    "tree_scores = cross_val_score(tree_reg, housing_prepared, housing_labels,\n",
-    "                              scoring=\"mean_squared_error\", cv=10)\n",
+    "                              scoring=\"neg_mean_squared_error\", cv=10)\n",
    "tree_rmse_scores = np.sqrt(-tree_scores)"
   ]
  },
@ -985,7 +986,7 @@
   "outputs": [],
   "source": [
    "lin_scores = cross_val_score(lin_reg, housing_prepared, housing_labels,\n",
-    "                             scoring=\"mean_squared_error\", cv=10)\n",
+    "                             scoring=\"neg_mean_squared_error\", cv=10)\n",
    "lin_rmse_scores = np.sqrt(-lin_scores)\n",
    "display_scores(lin_rmse_scores)"
   ]
@ -1016,10 +1017,10 @@
   },
   "outputs": [],
   "source": [
-    "from sklearn.cross_validation import cross_val_score\n",
+    "from sklearn.model_selection import cross_val_score\n",
    "\n",
    "forest_scores = cross_val_score(forest_reg, housing_prepared, housing_labels,\n",
-    "                                scoring=\"mean_squared_error\", cv=10)\n",
+    "                                scoring=\"neg_mean_squared_error\", cv=10)\n",
    "forest_rmse_scores = np.sqrt(-forest_scores)\n",
    "display_scores(forest_rmse_scores)"
   ]
@ -1032,7 +1033,7 @@
   },
   "outputs": [],
   "source": [
-    "scores = cross_val_score(lin_reg, housing_prepared, housing_labels, scoring=\"mean_squared_error\", cv=10)\n",
+    "scores = cross_val_score(lin_reg, housing_prepared, housing_labels, scoring=\"neg_mean_squared_error\", cv=10)\n",
    "pd.Series(np.sqrt(-scores)).describe()"
   ]
  },
@ -1062,7 +1063,7 @@
   },
   "outputs": [],
   "source": [
-    "from sklearn.grid_search import GridSearchCV\n",
+    "from sklearn.model_selection import GridSearchCV\n",
    "\n",
    "param_grid = [\n",
    "        {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},\n",
@ -1070,7 +1071,7 @@
    "    ]\n",
    "\n",
    "forest_reg = RandomForestRegressor()\n",
-    "grid_search = GridSearchCV(forest_reg, param_grid, cv=5, scoring='mean_squared_error')\n",
+    "grid_search = GridSearchCV(forest_reg, param_grid, cv=5, scoring='neg_mean_squared_error')\n",
    "grid_search.fit(housing_prepared, housing_labels)"
   ]
  },
@ -1104,8 +1105,9 @@
   },
   "outputs": [],
   "source": [
-    "for params, mean_score, scores in grid_search.grid_scores_:\n",
-    "    print(np.sqrt(-mean_score), np.sqrt(-scores).std(), params)"
+    "cvres = grid_search.cv_results_\n",
+    "for mean_score, params in zip(cvres[\"mean_test_score\"], cvres[\"params\"]):\n",
+    "    print(np.sqrt(-mean_score), params)"
   ]
  },
  {
@ -1116,18 +1118,7 @@
   },
   "outputs": [],
   "source": [
-    "from sklearn.grid_search import RandomizedSearchCV\n",
-    "from scipy.stats import randint\n",
-    "\n",
-    "param_distribs = {\n",
-    "        'n_estimators': randint(low=1, high=200),\n",
-    "        'max_features': randint(low=1, high=8),\n",
-    "    }\n",
-    "\n",
-    "forest_reg = RandomForestRegressor()\n",
-    "rnd_search = RandomizedSearchCV(forest_reg, param_distributions=param_distribs,\n",
-    "                                n_iter=10, cv=5, scoring='mean_squared_error')\n",
-    "rnd_search.fit(housing_prepared, housing_labels)"
+    "pd.DataFrame(grid_search.cv_results_)"
   ]
  },
  {
@ -1138,8 +1129,18 @@
   },
   "outputs": [],
   "source": [
-    "for params, mean_score, scores in rnd_search.grid_scores_:\n",
-    "    print(np.sqrt(-mean_score), np.sqrt(-scores).std(), params)"
+    "from sklearn.model_selection import RandomizedSearchCV\n",
+    "from scipy.stats import randint\n",
+    "\n",
+    "param_distribs = {\n",
+    "        'n_estimators': randint(low=1, high=200),\n",
+    "        'max_features': randint(low=1, high=8),\n",
+    "    }\n",
+    "\n",
+    "forest_reg = RandomForestRegressor()\n",
+    "rnd_search = RandomizedSearchCV(forest_reg, param_distributions=param_distribs,\n",
+    "                                n_iter=10, cv=5, scoring='neg_mean_squared_error')\n",
+    "rnd_search.fit(housing_prepared, housing_labels)"
   ]
  },
  {
@ -1149,6 +1150,19 @@
    "collapsed": false
   },
   "outputs": [],
+   "source": [
+    "cvres = rnd_search.cv_results_\n",
+    "for mean_score, params in zip(cvres[\"mean_test_score\"], cvres[\"params\"]):\n",
+    "    print(np.sqrt(-mean_score), params)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 73,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
   "source": [
    "feature_importances = grid_search.best_estimator_.feature_importances_\n",
    "feature_importances"
@ -1156,7 +1170,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 73,
+   "execution_count": 74,
   "metadata": {
    "collapsed": false
   },
@ -1170,7 +1184,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 74,
+   "execution_count": 75,
   "metadata": {
    "collapsed": false
   },
@ -1208,7 +1222,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 75,
+   "execution_count": 76,
   "metadata": {
    "collapsed": false
   },
@ -1240,7 +1254,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 76,
+   "execution_count": 77,
   "metadata": {
    "collapsed": true
   },
@ -1251,7 +1265,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 77,
+   "execution_count": 78,
   "metadata": {
    "collapsed": false
   },
@ -1262,7 +1276,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 78,
+   "execution_count": 79,
   "metadata": {
    "collapsed": false
   },
@ -1281,7 +1295,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 79,
+   "execution_count": 80,
   "metadata": {
    "collapsed": false
   },
--- a/images/end_to_end_project/california.png
+++ b/images/end_to_end_project/california.png