Upgrade chapter 2 to sklearn 0.18 and ensure python 2 and python 3 both work
parent
dabda0f806
commit
61f9b54cdb
|
@ -92,13 +92,14 @@
|
|||
"source": [
|
||||
"import os\n",
|
||||
"import tarfile\n",
|
||||
"import urllib.request\n",
|
||||
"from six.moves import urllib\n",
|
||||
"\n",
|
||||
"HOUSING_PATH = \"datasets/housing\"\n",
|
||||
"HOUSING_URL = DATASETS_URL + \"/housing/housing.tgz\"\n",
|
||||
"\n",
|
||||
"def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):\n",
|
||||
" os.makedirs(housing_path, exist_ok=True)\n",
|
||||
" if not os.path.exists(housing_path):\n",
|
||||
" os.makedirs(housing_path)\n",
|
||||
" tgz_path = os.path.join(housing_path, \"housing.tgz\")\n",
|
||||
" urllib.request.urlretrieve(housing_url, tgz_path)\n",
|
||||
" housing_tgz = tarfile.open(tgz_path)\n",
|
||||
|
@ -235,7 +236,7 @@
|
|||
"import hashlib\n",
|
||||
"\n",
|
||||
"def test_set_check(identifier, test_ratio, hash):\n",
|
||||
" return hash(str(identifier).encode(\"ascii\")).digest()[-1] < 256 * test_ratio\n",
|
||||
" return bytearray(hash(np.int64(identifier)).digest())[-1] < 256 * test_ratio\n",
|
||||
"\n",
|
||||
"def split_train_test_by_id(data, test_ratio, id_column, hash=hashlib.md5):\n",
|
||||
" ids = data[id_column]\n",
|
||||
|
@ -264,7 +265,7 @@
|
|||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from sklearn.cross_validation import train_test_split\n",
|
||||
"from sklearn.model_selection import train_test_split\n",
|
||||
"\n",
|
||||
"train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)\n",
|
||||
"test_set.head()"
|
||||
|
@ -302,12 +303,12 @@
|
|||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from sklearn.cross_validation import StratifiedShuffleSplit\n",
|
||||
"from sklearn.model_selection import StratifiedShuffleSplit\n",
|
||||
"\n",
|
||||
"split = StratifiedShuffleSplit(housing[\"income_cat\"], test_size=0.2, random_state=42)\n",
|
||||
"train_index, test_index = next(iter(split))\n",
|
||||
"strat_train_set = housing.loc[train_index]\n",
|
||||
"strat_test_set = housing.loc[test_index]"
|
||||
"split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)\n",
|
||||
"for train_index, test_index in split.split(housing, housing[\"income_cat\"]):\n",
|
||||
" strat_train_set = housing.loc[train_index]\n",
|
||||
" strat_test_set = housing.loc[test_index]"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -953,10 +954,10 @@
|
|||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from sklearn.cross_validation import cross_val_score\n",
|
||||
"from sklearn.model_selection import cross_val_score\n",
|
||||
"\n",
|
||||
"tree_scores = cross_val_score(tree_reg, housing_prepared, housing_labels,\n",
|
||||
" scoring=\"mean_squared_error\", cv=10)\n",
|
||||
" scoring=\"neg_mean_squared_error\", cv=10)\n",
|
||||
"tree_rmse_scores = np.sqrt(-tree_scores)"
|
||||
]
|
||||
},
|
||||
|
@ -985,7 +986,7 @@
|
|||
"outputs": [],
|
||||
"source": [
|
||||
"lin_scores = cross_val_score(lin_reg, housing_prepared, housing_labels,\n",
|
||||
" scoring=\"mean_squared_error\", cv=10)\n",
|
||||
" scoring=\"neg_mean_squared_error\", cv=10)\n",
|
||||
"lin_rmse_scores = np.sqrt(-lin_scores)\n",
|
||||
"display_scores(lin_rmse_scores)"
|
||||
]
|
||||
|
@ -1016,10 +1017,10 @@
|
|||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from sklearn.cross_validation import cross_val_score\n",
|
||||
"from sklearn.model_selection import cross_val_score\n",
|
||||
"\n",
|
||||
"forest_scores = cross_val_score(forest_reg, housing_prepared, housing_labels,\n",
|
||||
" scoring=\"mean_squared_error\", cv=10)\n",
|
||||
" scoring=\"neg_mean_squared_error\", cv=10)\n",
|
||||
"forest_rmse_scores = np.sqrt(-forest_scores)\n",
|
||||
"display_scores(forest_rmse_scores)"
|
||||
]
|
||||
|
@ -1032,7 +1033,7 @@
|
|||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"scores = cross_val_score(lin_reg, housing_prepared, housing_labels, scoring=\"mean_squared_error\", cv=10)\n",
|
||||
"scores = cross_val_score(lin_reg, housing_prepared, housing_labels, scoring=\"neg_mean_squared_error\", cv=10)\n",
|
||||
"pd.Series(np.sqrt(-scores)).describe()"
|
||||
]
|
||||
},
|
||||
|
@ -1062,7 +1063,7 @@
|
|||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from sklearn.grid_search import GridSearchCV\n",
|
||||
"from sklearn.model_selection import GridSearchCV\n",
|
||||
"\n",
|
||||
"param_grid = [\n",
|
||||
" {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},\n",
|
||||
|
@ -1070,7 +1071,7 @@
|
|||
" ]\n",
|
||||
"\n",
|
||||
"forest_reg = RandomForestRegressor()\n",
|
||||
"grid_search = GridSearchCV(forest_reg, param_grid, cv=5, scoring='mean_squared_error')\n",
|
||||
"grid_search = GridSearchCV(forest_reg, param_grid, cv=5, scoring='neg_mean_squared_error')\n",
|
||||
"grid_search.fit(housing_prepared, housing_labels)"
|
||||
]
|
||||
},
|
||||
|
@ -1104,8 +1105,9 @@
|
|||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"for params, mean_score, scores in grid_search.grid_scores_:\n",
|
||||
" print(np.sqrt(-mean_score), np.sqrt(-scores).std(), params)"
|
||||
"cvres = grid_search.cv_results_\n",
|
||||
"for mean_score, params in zip(cvres[\"mean_test_score\"], cvres[\"params\"]):\n",
|
||||
" print(np.sqrt(-mean_score), params)"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -1116,18 +1118,7 @@
|
|||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from sklearn.grid_search import RandomizedSearchCV\n",
|
||||
"from scipy.stats import randint\n",
|
||||
"\n",
|
||||
"param_distribs = {\n",
|
||||
" 'n_estimators': randint(low=1, high=200),\n",
|
||||
" 'max_features': randint(low=1, high=8),\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
"forest_reg = RandomForestRegressor()\n",
|
||||
"rnd_search = RandomizedSearchCV(forest_reg, param_distributions=param_distribs,\n",
|
||||
" n_iter=10, cv=5, scoring='mean_squared_error')\n",
|
||||
"rnd_search.fit(housing_prepared, housing_labels)"
|
||||
"pd.DataFrame(grid_search.cv_results_)"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -1138,8 +1129,18 @@
|
|||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"for params, mean_score, scores in rnd_search.grid_scores_:\n",
|
||||
" print(np.sqrt(-mean_score), np.sqrt(-scores).std(), params)"
|
||||
"from sklearn.model_selection import RandomizedSearchCV\n",
|
||||
"from scipy.stats import randint\n",
|
||||
"\n",
|
||||
"param_distribs = {\n",
|
||||
" 'n_estimators': randint(low=1, high=200),\n",
|
||||
" 'max_features': randint(low=1, high=8),\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
"forest_reg = RandomForestRegressor()\n",
|
||||
"rnd_search = RandomizedSearchCV(forest_reg, param_distributions=param_distribs,\n",
|
||||
" n_iter=10, cv=5, scoring='neg_mean_squared_error')\n",
|
||||
"rnd_search.fit(housing_prepared, housing_labels)"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -1149,6 +1150,19 @@
|
|||
"collapsed": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"cvres = rnd_search.cv_results_\n",
|
||||
"for mean_score, params in zip(cvres[\"mean_test_score\"], cvres[\"params\"]):\n",
|
||||
" print(np.sqrt(-mean_score), params)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 73,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"feature_importances = grid_search.best_estimator_.feature_importances_\n",
|
||||
"feature_importances"
|
||||
|
@ -1156,7 +1170,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 73,
|
||||
"execution_count": 74,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
|
@ -1170,7 +1184,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 74,
|
||||
"execution_count": 75,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
|
@ -1208,7 +1222,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 75,
|
||||
"execution_count": 76,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
|
@ -1240,7 +1254,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 76,
|
||||
"execution_count": 77,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
|
@ -1251,7 +1265,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 77,
|
||||
"execution_count": 78,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
|
@ -1262,7 +1276,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 78,
|
||||
"execution_count": 79,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
|
@ -1281,7 +1295,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 79,
|
||||
"execution_count": 80,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
|
|
Binary file not shown.
After Width: | Height: | Size: 9.8 KiB |
Loading…
Reference in New Issue