Upgrade chapter 2 to scikit-learn 0.18 and ensure Python 2 and Python 3 both work

main
Aurélien Geron 2016-11-03 23:47:11 +01:00
parent dabda0f806
commit 61f9b54cdb
2 changed files with 54 additions and 40 deletions

View File

@ -92,13 +92,14 @@
"source": [
"import os\n",
"import tarfile\n",
"import urllib.request\n",
"from six.moves import urllib\n",
"\n",
"HOUSING_PATH = \"datasets/housing\"\n",
"HOUSING_URL = DATASETS_URL + \"/housing/housing.tgz\"\n",
"\n",
"def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):\n",
" os.makedirs(housing_path, exist_ok=True)\n",
" if not os.path.exists(housing_path):\n",
" os.makedirs(housing_path)\n",
" tgz_path = os.path.join(housing_path, \"housing.tgz\")\n",
" urllib.request.urlretrieve(housing_url, tgz_path)\n",
" housing_tgz = tarfile.open(tgz_path)\n",
@ -235,7 +236,7 @@
"import hashlib\n",
"\n",
"def test_set_check(identifier, test_ratio, hash):\n",
" return hash(str(identifier).encode(\"ascii\")).digest()[-1] < 256 * test_ratio\n",
" return bytearray(hash(np.int64(identifier)).digest())[-1] < 256 * test_ratio\n",
"\n",
"def split_train_test_by_id(data, test_ratio, id_column, hash=hashlib.md5):\n",
" ids = data[id_column]\n",
@ -264,7 +265,7 @@
},
"outputs": [],
"source": [
"from sklearn.cross_validation import train_test_split\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)\n",
"test_set.head()"
@ -302,12 +303,12 @@
},
"outputs": [],
"source": [
"from sklearn.cross_validation import StratifiedShuffleSplit\n",
"from sklearn.model_selection import StratifiedShuffleSplit\n",
"\n",
"split = StratifiedShuffleSplit(housing[\"income_cat\"], test_size=0.2, random_state=42)\n",
"train_index, test_index = next(iter(split))\n",
"strat_train_set = housing.loc[train_index]\n",
"strat_test_set = housing.loc[test_index]"
"split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)\n",
"for train_index, test_index in split.split(housing, housing[\"income_cat\"]):\n",
" strat_train_set = housing.loc[train_index]\n",
" strat_test_set = housing.loc[test_index]"
]
},
{
@ -953,10 +954,10 @@
},
"outputs": [],
"source": [
"from sklearn.cross_validation import cross_val_score\n",
"from sklearn.model_selection import cross_val_score\n",
"\n",
"tree_scores = cross_val_score(tree_reg, housing_prepared, housing_labels,\n",
" scoring=\"mean_squared_error\", cv=10)\n",
" scoring=\"neg_mean_squared_error\", cv=10)\n",
"tree_rmse_scores = np.sqrt(-tree_scores)"
]
},
@ -985,7 +986,7 @@
"outputs": [],
"source": [
"lin_scores = cross_val_score(lin_reg, housing_prepared, housing_labels,\n",
" scoring=\"mean_squared_error\", cv=10)\n",
" scoring=\"neg_mean_squared_error\", cv=10)\n",
"lin_rmse_scores = np.sqrt(-lin_scores)\n",
"display_scores(lin_rmse_scores)"
]
@ -1016,10 +1017,10 @@
},
"outputs": [],
"source": [
"from sklearn.cross_validation import cross_val_score\n",
"from sklearn.model_selection import cross_val_score\n",
"\n",
"forest_scores = cross_val_score(forest_reg, housing_prepared, housing_labels,\n",
" scoring=\"mean_squared_error\", cv=10)\n",
" scoring=\"neg_mean_squared_error\", cv=10)\n",
"forest_rmse_scores = np.sqrt(-forest_scores)\n",
"display_scores(forest_rmse_scores)"
]
@ -1032,7 +1033,7 @@
},
"outputs": [],
"source": [
"scores = cross_val_score(lin_reg, housing_prepared, housing_labels, scoring=\"mean_squared_error\", cv=10)\n",
"scores = cross_val_score(lin_reg, housing_prepared, housing_labels, scoring=\"neg_mean_squared_error\", cv=10)\n",
"pd.Series(np.sqrt(-scores)).describe()"
]
},
@ -1062,7 +1063,7 @@
},
"outputs": [],
"source": [
"from sklearn.grid_search import GridSearchCV\n",
"from sklearn.model_selection import GridSearchCV\n",
"\n",
"param_grid = [\n",
" {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},\n",
@ -1070,7 +1071,7 @@
" ]\n",
"\n",
"forest_reg = RandomForestRegressor()\n",
"grid_search = GridSearchCV(forest_reg, param_grid, cv=5, scoring='mean_squared_error')\n",
"grid_search = GridSearchCV(forest_reg, param_grid, cv=5, scoring='neg_mean_squared_error')\n",
"grid_search.fit(housing_prepared, housing_labels)"
]
},
@ -1104,8 +1105,9 @@
},
"outputs": [],
"source": [
"for params, mean_score, scores in grid_search.grid_scores_:\n",
" print(np.sqrt(-mean_score), np.sqrt(-scores).std(), params)"
"cvres = grid_search.cv_results_\n",
"for mean_score, params in zip(cvres[\"mean_test_score\"], cvres[\"params\"]):\n",
" print(np.sqrt(-mean_score), params)"
]
},
{
@ -1116,18 +1118,7 @@
},
"outputs": [],
"source": [
"from sklearn.grid_search import RandomizedSearchCV\n",
"from scipy.stats import randint\n",
"\n",
"param_distribs = {\n",
" 'n_estimators': randint(low=1, high=200),\n",
" 'max_features': randint(low=1, high=8),\n",
" }\n",
"\n",
"forest_reg = RandomForestRegressor()\n",
"rnd_search = RandomizedSearchCV(forest_reg, param_distributions=param_distribs,\n",
" n_iter=10, cv=5, scoring='mean_squared_error')\n",
"rnd_search.fit(housing_prepared, housing_labels)"
"pd.DataFrame(grid_search.cv_results_)"
]
},
{
@ -1138,8 +1129,18 @@
},
"outputs": [],
"source": [
"for params, mean_score, scores in rnd_search.grid_scores_:\n",
" print(np.sqrt(-mean_score), np.sqrt(-scores).std(), params)"
"from sklearn.model_selection import RandomizedSearchCV\n",
"from scipy.stats import randint\n",
"\n",
"param_distribs = {\n",
" 'n_estimators': randint(low=1, high=200),\n",
" 'max_features': randint(low=1, high=8),\n",
" }\n",
"\n",
"forest_reg = RandomForestRegressor()\n",
"rnd_search = RandomizedSearchCV(forest_reg, param_distributions=param_distribs,\n",
" n_iter=10, cv=5, scoring='neg_mean_squared_error')\n",
"rnd_search.fit(housing_prepared, housing_labels)"
]
},
{
@ -1149,6 +1150,19 @@
"collapsed": false
},
"outputs": [],
"source": [
"cvres = rnd_search.cv_results_\n",
"for mean_score, params in zip(cvres[\"mean_test_score\"], cvres[\"params\"]):\n",
" print(np.sqrt(-mean_score), params)"
]
},
{
"cell_type": "code",
"execution_count": 73,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"feature_importances = grid_search.best_estimator_.feature_importances_\n",
"feature_importances"
@ -1156,7 +1170,7 @@
},
{
"cell_type": "code",
"execution_count": 73,
"execution_count": 74,
"metadata": {
"collapsed": false
},
@ -1170,7 +1184,7 @@
},
{
"cell_type": "code",
"execution_count": 74,
"execution_count": 75,
"metadata": {
"collapsed": false
},
@ -1208,7 +1222,7 @@
},
{
"cell_type": "code",
"execution_count": 75,
"execution_count": 76,
"metadata": {
"collapsed": false
},
@ -1240,7 +1254,7 @@
},
{
"cell_type": "code",
"execution_count": 76,
"execution_count": 77,
"metadata": {
"collapsed": true
},
@ -1251,7 +1265,7 @@
},
{
"cell_type": "code",
"execution_count": 77,
"execution_count": 78,
"metadata": {
"collapsed": false
},
@ -1262,7 +1276,7 @@
},
{
"cell_type": "code",
"execution_count": 78,
"execution_count": 79,
"metadata": {
"collapsed": false
},
@ -1281,7 +1295,7 @@
},
{
"cell_type": "code",
"execution_count": 79,
"execution_count": 80,
"metadata": {
"collapsed": false
},

Binary file not shown.

After

Width:  |  Height:  |  Size: 9.8 KiB