From c8b7f045eeb0d755e5020105f93c275923318add Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Geron?= Date: Sun, 14 Jan 2018 09:11:47 +0100 Subject: [PATCH] Fix hyperparameter search and comment at the end of the solution of exercise 5, chapter 2 --- 02_end_to_end_machine_learning_project.ipynb | 103 +++++-------------- 1 file changed, 25 insertions(+), 78 deletions(-) diff --git a/02_end_to_end_machine_learning_project.ipynb b/02_end_to_end_machine_learning_project.ipynb index 1cadabb..1e51f9a 100644 --- a/02_end_to_end_machine_learning_project.ipynb +++ b/02_end_to_end_machine_learning_project.ipynb @@ -406,9 +406,7 @@ { "cell_type": "code", "execution_count": 30, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "housing = strat_train_set.copy()" @@ -486,9 +484,7 @@ { "cell_type": "code", "execution_count": 35, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "corr_matrix = housing.corr()" @@ -533,9 +529,7 @@ { "cell_type": "code", "execution_count": 39, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "housing[\"rooms_per_household\"] = housing[\"total_rooms\"]/housing[\"households\"]\n", @@ -591,9 +585,7 @@ { "cell_type": "code", "execution_count": 43, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "housing = strat_train_set.drop(\"median_house_value\", axis=1) # drop labels for training set\n", @@ -642,9 +634,7 @@ { "cell_type": "code", "execution_count": 48, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.preprocessing import Imputer\n", @@ -662,9 +652,7 @@ { "cell_type": "code", "execution_count": 49, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "housing_num = housing.drop('ocean_proximity', axis=1)\n", @@ -715,9 +703,7 @@ { "cell_type": "code", "execution_count": 53, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "X = imputer.transform(housing_num)" @@ -726,9 +712,7 @@ { "cell_type": "code", "execution_count": 54, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "housing_tr = pd.DataFrame(X, columns=housing_num.columns,\n", @@ -859,9 +843,7 @@ { "cell_type": "code", "execution_count": 63, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "# Definition of the CategoricalEncoder class, copied from PR #9151.\n", @@ -1126,9 +1108,7 @@ { "cell_type": "code", "execution_count": 68, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.base import BaseEstimator, TransformerMixin\n", @@ -1175,9 +1155,7 @@ { "cell_type": "code", "execution_count": 70, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.pipeline import Pipeline\n", @@ -1211,9 +1189,7 @@ { "cell_type": "code", "execution_count": 72, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.base import BaseEstimator, TransformerMixin\n", @@ -1261,9 +1237,7 @@ { "cell_type": "code", "execution_count": 74, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.pipeline import FeatureUnion\n", @@ -1411,9 +1385,7 @@ { "cell_type": "code", "execution_count": 85, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.model_selection import cross_val_score\n", @@ -1644,9 +1616,7 @@ { "cell_type": "code", "execution_count": 102, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "final_model = grid_search.best_estimator_\n", @@ -1709,9 +1679,7 @@ { "cell_type": "code", "execution_count": 105, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "my_model = full_pipeline_with_predictor" @@ -1720,9 +1688,7 @@ { "cell_type": "code", "execution_count": 106, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.externals import joblib\n", @@ -1991,9 +1957,7 @@ { "cell_type": "code", "execution_count": 116, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.base import BaseEstimator, TransformerMixin\n", @@ -2029,9 +1993,7 @@ { "cell_type": "code", "execution_count": 117, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "k = 5" @@ -2089,9 +2051,7 @@ { "cell_type": "code", "execution_count": 121, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "preparation_and_feature_selection_pipeline = Pipeline([\n", @@ -2103,9 +2063,7 @@ { "cell_type": "code", "execution_count": 122, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "housing_prepared_top_k_features = preparation_and_feature_selection_pipeline.fit_transform(housing)" @@ -2167,9 +2125,7 @@ { "cell_type": "code", "execution_count": 125, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "prepare_select_and_predict_pipeline = Pipeline([\n", @@ -2237,7 +2193,7 @@ "source": [ "param_grid = [\n", " {'preparation__num_pipeline__imputer__strategy': ['mean', 'median', 'most_frequent'],\n", - " 'feature_selection__k': [3, 4, 5, 6, 7]}\n", + " 'feature_selection__k': list(range(1, len(feature_importances) + 1))}\n", "]\n", "\n", "grid_search_prep = GridSearchCV(prepare_select_and_predict_pipeline, param_grid, cv=5,\n", @@ -2258,16 +2214,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Great! It seems that we had the right imputer strategy (median), and apparently only the top 7 features are useful (out of 9), the last 2 seem to just add some noise." - ] - }, - { - "cell_type": "code", - "execution_count": 130, - "metadata": {}, - "outputs": [], - "source": [ - "housing.shape" + "The best imputer strategy is `most_frequent` and apparently almost all features are useful (15 out of 16). The last one (`ISLAND`) seems to just add some noise." ] }, { @@ -2294,7 +2241,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.2" + "version": "3.6.3" }, "nav_menu": { "height": "279px",