Merge remote-tracking branch 'upstream/master' into upstream

main
rickiepark 2018-01-15 11:28:39 +09:00
commit 3699ad54a4
1 changed files with 25 additions and 78 deletions

View File

@ -406,9 +406,7 @@
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 30, "execution_count": 30,
"metadata": { "metadata": {},
"collapsed": true
},
"outputs": [], "outputs": [],
"source": [ "source": [
"housing = strat_train_set.copy()" "housing = strat_train_set.copy()"
@ -486,9 +484,7 @@
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 35, "execution_count": 35,
"metadata": { "metadata": {},
"collapsed": true
},
"outputs": [], "outputs": [],
"source": [ "source": [
"corr_matrix = housing.corr()" "corr_matrix = housing.corr()"
@ -533,9 +529,7 @@
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 39, "execution_count": 39,
"metadata": { "metadata": {},
"collapsed": true
},
"outputs": [], "outputs": [],
"source": [ "source": [
"housing[\"rooms_per_household\"] = housing[\"total_rooms\"]/housing[\"households\"]\n", "housing[\"rooms_per_household\"] = housing[\"total_rooms\"]/housing[\"households\"]\n",
@ -591,9 +585,7 @@
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 43, "execution_count": 43,
"metadata": { "metadata": {},
"collapsed": true
},
"outputs": [], "outputs": [],
"source": [ "source": [
"housing = strat_train_set.drop(\"median_house_value\", axis=1) # drop labels for training set\n", "housing = strat_train_set.drop(\"median_house_value\", axis=1) # drop labels for training set\n",
@ -642,9 +634,7 @@
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 48, "execution_count": 48,
"metadata": { "metadata": {},
"collapsed": true
},
"outputs": [], "outputs": [],
"source": [ "source": [
"from sklearn.preprocessing import Imputer\n", "from sklearn.preprocessing import Imputer\n",
@ -662,9 +652,7 @@
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 49, "execution_count": 49,
"metadata": { "metadata": {},
"collapsed": true
},
"outputs": [], "outputs": [],
"source": [ "source": [
"housing_num = housing.drop('ocean_proximity', axis=1)\n", "housing_num = housing.drop('ocean_proximity', axis=1)\n",
@ -715,9 +703,7 @@
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 53, "execution_count": 53,
"metadata": { "metadata": {},
"collapsed": true
},
"outputs": [], "outputs": [],
"source": [ "source": [
"X = imputer.transform(housing_num)" "X = imputer.transform(housing_num)"
@ -726,9 +712,7 @@
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 54, "execution_count": 54,
"metadata": { "metadata": {},
"collapsed": true
},
"outputs": [], "outputs": [],
"source": [ "source": [
"housing_tr = pd.DataFrame(X, columns=housing_num.columns,\n", "housing_tr = pd.DataFrame(X, columns=housing_num.columns,\n",
@ -859,9 +843,7 @@
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 63, "execution_count": 63,
"metadata": { "metadata": {},
"collapsed": true
},
"outputs": [], "outputs": [],
"source": [ "source": [
"# Definition of the CategoricalEncoder class, copied from PR #9151.\n", "# Definition of the CategoricalEncoder class, copied from PR #9151.\n",
@ -1126,9 +1108,7 @@
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 68, "execution_count": 68,
"metadata": { "metadata": {},
"collapsed": true
},
"outputs": [], "outputs": [],
"source": [ "source": [
"from sklearn.base import BaseEstimator, TransformerMixin\n", "from sklearn.base import BaseEstimator, TransformerMixin\n",
@ -1175,9 +1155,7 @@
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 70, "execution_count": 70,
"metadata": { "metadata": {},
"collapsed": true
},
"outputs": [], "outputs": [],
"source": [ "source": [
"from sklearn.pipeline import Pipeline\n", "from sklearn.pipeline import Pipeline\n",
@ -1211,9 +1189,7 @@
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 72, "execution_count": 72,
"metadata": { "metadata": {},
"collapsed": true
},
"outputs": [], "outputs": [],
"source": [ "source": [
"from sklearn.base import BaseEstimator, TransformerMixin\n", "from sklearn.base import BaseEstimator, TransformerMixin\n",
@ -1261,9 +1237,7 @@
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 74, "execution_count": 74,
"metadata": { "metadata": {},
"collapsed": true
},
"outputs": [], "outputs": [],
"source": [ "source": [
"from sklearn.pipeline import FeatureUnion\n", "from sklearn.pipeline import FeatureUnion\n",
@ -1411,9 +1385,7 @@
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 85, "execution_count": 85,
"metadata": { "metadata": {},
"collapsed": true
},
"outputs": [], "outputs": [],
"source": [ "source": [
"from sklearn.model_selection import cross_val_score\n", "from sklearn.model_selection import cross_val_score\n",
@ -1644,9 +1616,7 @@
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 102, "execution_count": 102,
"metadata": { "metadata": {},
"collapsed": true
},
"outputs": [], "outputs": [],
"source": [ "source": [
"final_model = grid_search.best_estimator_\n", "final_model = grid_search.best_estimator_\n",
@ -1709,9 +1679,7 @@
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 105, "execution_count": 105,
"metadata": { "metadata": {},
"collapsed": true
},
"outputs": [], "outputs": [],
"source": [ "source": [
"my_model = full_pipeline_with_predictor" "my_model = full_pipeline_with_predictor"
@ -1720,9 +1688,7 @@
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 106, "execution_count": 106,
"metadata": { "metadata": {},
"collapsed": true
},
"outputs": [], "outputs": [],
"source": [ "source": [
"from sklearn.externals import joblib\n", "from sklearn.externals import joblib\n",
@ -1991,9 +1957,7 @@
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 116, "execution_count": 116,
"metadata": { "metadata": {},
"collapsed": true
},
"outputs": [], "outputs": [],
"source": [ "source": [
"from sklearn.base import BaseEstimator, TransformerMixin\n", "from sklearn.base import BaseEstimator, TransformerMixin\n",
@ -2029,9 +1993,7 @@
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 117, "execution_count": 117,
"metadata": { "metadata": {},
"collapsed": true
},
"outputs": [], "outputs": [],
"source": [ "source": [
"k = 5" "k = 5"
@ -2089,9 +2051,7 @@
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 121, "execution_count": 121,
"metadata": { "metadata": {},
"collapsed": true
},
"outputs": [], "outputs": [],
"source": [ "source": [
"preparation_and_feature_selection_pipeline = Pipeline([\n", "preparation_and_feature_selection_pipeline = Pipeline([\n",
@ -2103,9 +2063,7 @@
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 122, "execution_count": 122,
"metadata": { "metadata": {},
"collapsed": true
},
"outputs": [], "outputs": [],
"source": [ "source": [
"housing_prepared_top_k_features = preparation_and_feature_selection_pipeline.fit_transform(housing)" "housing_prepared_top_k_features = preparation_and_feature_selection_pipeline.fit_transform(housing)"
@ -2167,9 +2125,7 @@
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 125, "execution_count": 125,
"metadata": { "metadata": {},
"collapsed": true
},
"outputs": [], "outputs": [],
"source": [ "source": [
"prepare_select_and_predict_pipeline = Pipeline([\n", "prepare_select_and_predict_pipeline = Pipeline([\n",
@ -2237,7 +2193,7 @@
"source": [ "source": [
"param_grid = [\n", "param_grid = [\n",
" {'preparation__num_pipeline__imputer__strategy': ['mean', 'median', 'most_frequent'],\n", " {'preparation__num_pipeline__imputer__strategy': ['mean', 'median', 'most_frequent'],\n",
" 'feature_selection__k': [3, 4, 5, 6, 7]}\n", " 'feature_selection__k': list(range(1, len(feature_importances) + 1))}\n",
"]\n", "]\n",
"\n", "\n",
"grid_search_prep = GridSearchCV(prepare_select_and_predict_pipeline, param_grid, cv=5,\n", "grid_search_prep = GridSearchCV(prepare_select_and_predict_pipeline, param_grid, cv=5,\n",
@ -2258,16 +2214,7 @@
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"Great! It seems that we had the right imputer strategy (median), and apparently only the top 7 features are useful (out of 9), the last 2 seem to just add some noise." "The best imputer strategy is `most_frequent` and apparently almost all features are useful (15 out of 16). The last one (`ISLAND`) seems to just add some noise."
]
},
{
"cell_type": "code",
"execution_count": 130,
"metadata": {},
"outputs": [],
"source": [
"housing.shape"
] ]
}, },
{ {
@ -2294,7 +2241,7 @@
"name": "python", "name": "python",
"nbconvert_exporter": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "pygments_lexer": "ipython3",
"version": "3.6.2" "version": "3.6.3"
}, },
"nav_menu": { "nav_menu": {
"height": "279px", "height": "279px",