Upgrade chapter 2 to sklearn 0.18 and ensure python 2 and python 3 both work
parent dabda0f806
commit 61f9b54cdb
@@ -92,13 +92,14 @@
 "source": [
 "import os\n",
 "import tarfile\n",
-"import urllib.request\n",
+"from six.moves import urllib\n",
 "\n",
 "HOUSING_PATH = \"datasets/housing\"\n",
 "HOUSING_URL = DATASETS_URL + \"/housing/housing.tgz\"\n",
 "\n",
 "def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):\n",
-"    os.makedirs(housing_path, exist_ok=True)\n",
+"    if not os.path.exists(housing_path):\n",
+"        os.makedirs(housing_path)\n",
 "    tgz_path = os.path.join(housing_path, \"housing.tgz\")\n",
 "    urllib.request.urlretrieve(housing_url, tgz_path)\n",
 "    housing_tgz = tarfile.open(tgz_path)\n",
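
Note: both changes in this hunk are Python 2/3 compatibility fixes. The top-level urllib.request module and the exist_ok=True flag of os.makedirs are Python-3-only, so the upgraded cell routes the import through six and guards the mkdir explicitly. A minimal sketch of the resulting helper (HOUSING_URL and HOUSING_PATH as defined above; the extraction tail of the function is unchanged in this commit and elided here):

    import os
    import tarfile
    from six.moves import urllib  # makes urllib.request resolve on both py2 and py3

    def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
        if not os.path.exists(housing_path):  # py2-safe stand-in for exist_ok=True
            os.makedirs(housing_path)
        tgz_path = os.path.join(housing_path, "housing.tgz")
        urllib.request.urlretrieve(housing_url, tgz_path)  # download housing.tgz
        housing_tgz = tarfile.open(tgz_path)
        # ... extraction continues as in the notebook
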
@@ -235,7 +236,7 @@
 "import hashlib\n",
 "\n",
 "def test_set_check(identifier, test_ratio, hash):\n",
-"    return hash(str(identifier).encode(\"ascii\")).digest()[-1] < 256 * test_ratio\n",
+"    return bytearray(hash(np.int64(identifier)).digest())[-1] < 256 * test_ratio\n",
 "\n",
 "def split_train_test_by_id(data, test_ratio, id_column, hash=hashlib.md5):\n",
 "    ids = data[id_column]\n",
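
Note: the rewritten test_set_check fixes a Python 2 pitfall. On Python 2 a hashlib digest is a str, so digest()[-1] is a one-character string and comparing it to a number is meaningless; wrapping the digest in bytearray makes indexing return an int on both versions. Hashing np.int64(identifier) directly also works on both, since NumPy scalars expose the buffer interface hashlib accepts. A sketch of the upgraded check:

    import hashlib
    import numpy as np

    def test_set_check(identifier, test_ratio, hash=hashlib.md5):
        # bytearray(...)[-1] yields an int on py2 and py3 alike
        return bytearray(hash(np.int64(identifier)).digest())[-1] < 256 * test_ratio
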
@@ -264,7 +265,7 @@
 },
 "outputs": [],
 "source": [
-"from sklearn.cross_validation import train_test_split\n",
+"from sklearn.model_selection import train_test_split\n",
 "\n",
 "train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)\n",
 "test_set.head()"
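
Note: sklearn 0.18 moved the splitting and cross-validation helpers from sklearn.cross_validation into the new sklearn.model_selection module (the old module still imports but is deprecated). Code that must run on both versions can guard the import, along these lines:

    try:
        from sklearn.model_selection import train_test_split   # sklearn >= 0.18
    except ImportError:
        from sklearn.cross_validation import train_test_split  # older sklearn, deprecated
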
@@ -302,10 +303,10 @@
 },
 "outputs": [],
 "source": [
-"from sklearn.cross_validation import StratifiedShuffleSplit\n",
+"from sklearn.model_selection import StratifiedShuffleSplit\n",
 "\n",
-"split = StratifiedShuffleSplit(housing[\"income_cat\"], test_size=0.2, random_state=42)\n",
-"train_index, test_index = next(iter(split))\n",
+"split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)\n",
+"for train_index, test_index in split.split(housing, housing[\"income_cat\"]):\n",
 "    strat_train_set = housing.loc[train_index]\n",
 "    strat_test_set = housing.loc[test_index]"
 ]
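
Note: this is the biggest behavioural change of the 0.18 upgrade. Splitters are now configured up front (n_splits, test_size, random_state) and the data only appears in the .split(X, y) call, which yields (train_index, test_index) pairs; previously the labels were passed to the constructor and the splitter object itself was iterated. The new pattern, as in the hunk above (housing and its income_cat column come from earlier cells):

    from sklearn.model_selection import StratifiedShuffleSplit

    split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    # With n_splits=1 the loop body runs exactly once; stratification follows income_cat.
    for train_index, test_index in split.split(housing, housing["income_cat"]):
        strat_train_set = housing.loc[train_index]
        strat_test_set = housing.loc[test_index]
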
@@ -953,10 +954,10 @@
 },
 "outputs": [],
 "source": [
-"from sklearn.cross_validation import cross_val_score\n",
+"from sklearn.model_selection import cross_val_score\n",
 "\n",
 "tree_scores = cross_val_score(tree_reg, housing_prepared, housing_labels,\n",
-"                         scoring=\"mean_squared_error\", cv=10)\n",
+"                         scoring=\"neg_mean_squared_error\", cv=10)\n",
 "tree_rmse_scores = np.sqrt(-tree_scores)"
 ]
 },
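
Note: sklearn 0.18 made all scoring strings follow a greater-is-better convention, so "mean_squared_error" became "neg_mean_squared_error" and cross_val_score now returns negated MSE values; np.sqrt(-scores) therefore still yields the per-fold RMSE. The same one-word rename is applied to the remaining cross_val_score cells and the grid/randomized search cells below. For example (tree_reg, housing_prepared and housing_labels come from earlier cells):

    from sklearn.model_selection import cross_val_score
    import numpy as np

    scores = cross_val_score(tree_reg, housing_prepared, housing_labels,
                             scoring="neg_mean_squared_error", cv=10)  # 10 negated MSEs
    rmse_scores = np.sqrt(-scores)  # undo the negation before taking the root
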
@@ -985,7 +986,7 @@
 "outputs": [],
 "source": [
 "lin_scores = cross_val_score(lin_reg, housing_prepared, housing_labels,\n",
-"                        scoring=\"mean_squared_error\", cv=10)\n",
+"                        scoring=\"neg_mean_squared_error\", cv=10)\n",
 "lin_rmse_scores = np.sqrt(-lin_scores)\n",
 "display_scores(lin_rmse_scores)"
 ]
@@ -1016,10 +1017,10 @@
 },
 "outputs": [],
 "source": [
-"from sklearn.cross_validation import cross_val_score\n",
+"from sklearn.model_selection import cross_val_score\n",
 "\n",
 "forest_scores = cross_val_score(forest_reg, housing_prepared, housing_labels,\n",
-"                          scoring=\"mean_squared_error\", cv=10)\n",
+"                          scoring=\"neg_mean_squared_error\", cv=10)\n",
 "forest_rmse_scores = np.sqrt(-forest_scores)\n",
 "display_scores(forest_rmse_scores)"
 ]
@@ -1032,7 +1033,7 @@
 },
 "outputs": [],
 "source": [
-"scores = cross_val_score(lin_reg, housing_prepared, housing_labels, scoring=\"mean_squared_error\", cv=10)\n",
+"scores = cross_val_score(lin_reg, housing_prepared, housing_labels, scoring=\"neg_mean_squared_error\", cv=10)\n",
 "pd.Series(np.sqrt(-scores)).describe()"
 ]
 },
@@ -1062,7 +1063,7 @@
 },
 "outputs": [],
 "source": [
-"from sklearn.grid_search import GridSearchCV\n",
+"from sklearn.model_selection import GridSearchCV\n",
 "\n",
 "param_grid = [\n",
 "    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},\n",
@ -1070,7 +1071,7 @@
|
||||||
" ]\n",
|
" ]\n",
|
||||||
"\n",
|
"\n",
|
||||||
"forest_reg = RandomForestRegressor()\n",
|
"forest_reg = RandomForestRegressor()\n",
|
||||||
"grid_search = GridSearchCV(forest_reg, param_grid, cv=5, scoring='mean_squared_error')\n",
|
"grid_search = GridSearchCV(forest_reg, param_grid, cv=5, scoring='neg_mean_squared_error')\n",
|
||||||
"grid_search.fit(housing_prepared, housing_labels)"
|
"grid_search.fit(housing_prepared, housing_labels)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
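
Note: GridSearchCV made the same move from sklearn.grid_search to sklearn.model_selection; its call signature and the usual result attributes are unchanged by the move. For instance (assuming the fit above has run):

    print(grid_search.best_params_)            # best hyperparameter combination found
    best_model = grid_search.best_estimator_   # refit on the full training set by default
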
@@ -1104,8 +1105,9 @@
 },
 "outputs": [],
 "source": [
-"for params, mean_score, scores in grid_search.grid_scores_:\n",
-"    print(np.sqrt(-mean_score), np.sqrt(-scores).std(), params)"
+"cvres = grid_search.cv_results_\n",
+"for mean_score, params in zip(cvres[\"mean_test_score\"], cvres[\"params\"]):\n",
+"    print(np.sqrt(-mean_score), params)"
 ]
 },
 {
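
Note: sklearn 0.18 replaced grid_scores_ (a list of (params, mean score, per-fold scores) tuples) with cv_results_, a dict of parallel arrays keyed by names such as "mean_test_score", "std_test_score" and "params", which is also why it loads straight into a DataFrame in the next hunk. The per-fold spread the old code printed is still available if wanted, roughly like this:

    cvres = grid_search.cv_results_  # dict of parallel arrays, one entry per candidate
    for mean_score, std, params in zip(cvres["mean_test_score"],
                                       cvres["std_test_score"], cvres["params"]):
        print(np.sqrt(-mean_score), std, params)  # std is of the negated MSEs, not the RMSEs
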
@@ -1116,18 +1118,7 @@
 },
 "outputs": [],
 "source": [
-"from sklearn.grid_search import RandomizedSearchCV\n",
-"from scipy.stats import randint\n",
-"\n",
-"param_distribs = {\n",
-"        'n_estimators': randint(low=1, high=200),\n",
-"        'max_features': randint(low=1, high=8),\n",
-"    }\n",
-"\n",
-"forest_reg = RandomForestRegressor()\n",
-"rnd_search = RandomizedSearchCV(forest_reg, param_distributions=param_distribs,\n",
-"                                n_iter=10, cv=5, scoring='mean_squared_error')\n",
-"rnd_search.fit(housing_prepared, housing_labels)"
+"pd.DataFrame(grid_search.cv_results_)"
 ]
 },
 {
@@ -1138,8 +1129,18 @@
 },
 "outputs": [],
 "source": [
-"for params, mean_score, scores in rnd_search.grid_scores_:\n",
-"    print(np.sqrt(-mean_score), np.sqrt(-scores).std(), params)"
+"from sklearn.model_selection import RandomizedSearchCV\n",
+"from scipy.stats import randint\n",
+"\n",
+"param_distribs = {\n",
+"        'n_estimators': randint(low=1, high=200),\n",
+"        'max_features': randint(low=1, high=8),\n",
+"    }\n",
+"\n",
+"forest_reg = RandomForestRegressor()\n",
+"rnd_search = RandomizedSearchCV(forest_reg, param_distributions=param_distribs,\n",
+"                                n_iter=10, cv=5, scoring='neg_mean_squared_error')\n",
+"rnd_search.fit(housing_prepared, housing_labels)"
 ]
 },
 {
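
Note: the RandomizedSearchCV cell gets the same module move and scoring rename (the cell itself is also shuffled to a new position, hence the removal above and re-insertion here). The scipy.stats.randint objects are frozen discrete-uniform distributions that the search samples once per candidate, so n_iter=10 with cv=5 costs 10 × 5 = 50 model fits. For instance:

    from scipy.stats import randint

    dist = randint(low=1, high=200)  # discrete uniform over {1, ..., 199}
    dist.rvs(random_state=42)        # one draw, as RandomizedSearchCV does per candidate
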
@@ -1149,6 +1150,19 @@
 "collapsed": false
 },
 "outputs": [],
+"source": [
+"cvres = rnd_search.cv_results_\n",
+"for mean_score, params in zip(cvres[\"mean_test_score\"], cvres[\"params\"]):\n",
+"    print(np.sqrt(-mean_score), params)"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 73,
+"metadata": {
+"collapsed": false
+},
+"outputs": [],
 "source": [
 "feature_importances = grid_search.best_estimator_.feature_importances_\n",
 "feature_importances"
@@ -1156,7 +1170,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 73,
+"execution_count": 74,
 "metadata": {
 "collapsed": false
 },
@@ -1170,7 +1184,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 74,
+"execution_count": 75,
 "metadata": {
 "collapsed": false
 },
@@ -1208,7 +1222,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 75,
+"execution_count": 76,
 "metadata": {
 "collapsed": false
 },
@@ -1240,7 +1254,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 76,
+"execution_count": 77,
 "metadata": {
 "collapsed": true
 },
@@ -1251,7 +1265,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 77,
+"execution_count": 78,
 "metadata": {
 "collapsed": false
 },
@@ -1262,7 +1276,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 78,
+"execution_count": 79,
 "metadata": {
 "collapsed": false
 },
@@ -1281,7 +1295,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 79,
+"execution_count": 80,
 "metadata": {
 "collapsed": false
 },
Binary file not shown (image added, 9.8 KiB).