Merge remote-tracking branch 'upstream/master' into upstream

2018-01-15 11:28:39 +09:00 · 2018-01-15 11:28:39 +09:00 · 3699ad54a4
parent 71a75de2f5 c8b7f045ee
commit 3699ad54a4
1 changed files with 25 additions and 78 deletions
--- a/02_end_to_end_machine_learning_project.ipynb
+++ b/02_end_to_end_machine_learning_project.ipynb
@ -406,9 +406,7 @@
  {
   "cell_type": "code",
   "execution_count": 30,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
   "outputs": [],
   "source": [
    "housing = strat_train_set.copy()"
@ -486,9 +484,7 @@
  {
   "cell_type": "code",
   "execution_count": 35,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
   "outputs": [],
   "source": [
    "corr_matrix = housing.corr()"
@ -533,9 +529,7 @@
  {
   "cell_type": "code",
   "execution_count": 39,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
   "outputs": [],
   "source": [
    "housing[\"rooms_per_household\"] = housing[\"total_rooms\"]/housing[\"households\"]\n",
@ -591,9 +585,7 @@
  {
   "cell_type": "code",
   "execution_count": 43,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
   "outputs": [],
   "source": [
    "housing = strat_train_set.drop(\"median_house_value\", axis=1) # drop labels for training set\n",
@ -642,9 +634,7 @@
  {
   "cell_type": "code",
   "execution_count": 48,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import Imputer\n",
@ -662,9 +652,7 @@
  {
   "cell_type": "code",
   "execution_count": 49,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
   "outputs": [],
   "source": [
    "housing_num = housing.drop('ocean_proximity', axis=1)\n",
@ -715,9 +703,7 @@
  {
   "cell_type": "code",
   "execution_count": 53,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
   "outputs": [],
   "source": [
    "X = imputer.transform(housing_num)"
@ -726,9 +712,7 @@
  {
   "cell_type": "code",
   "execution_count": 54,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
   "outputs": [],
   "source": [
    "housing_tr = pd.DataFrame(X, columns=housing_num.columns,\n",
@ -859,9 +843,7 @@
  {
   "cell_type": "code",
   "execution_count": 63,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
   "outputs": [],
   "source": [
    "# Definition of the CategoricalEncoder class, copied from PR #9151.\n",
@ -1126,9 +1108,7 @@
  {
   "cell_type": "code",
   "execution_count": 68,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.base import BaseEstimator, TransformerMixin\n",
@ -1175,9 +1155,7 @@
  {
   "cell_type": "code",
   "execution_count": 70,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.pipeline import Pipeline\n",
@ -1211,9 +1189,7 @@
  {
   "cell_type": "code",
   "execution_count": 72,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.base import BaseEstimator, TransformerMixin\n",
@ -1261,9 +1237,7 @@
  {
   "cell_type": "code",
   "execution_count": 74,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.pipeline import FeatureUnion\n",
@ -1411,9 +1385,7 @@
  {
   "cell_type": "code",
   "execution_count": 85,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import cross_val_score\n",
@ -1644,9 +1616,7 @@
  {
   "cell_type": "code",
   "execution_count": 102,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
   "outputs": [],
   "source": [
    "final_model = grid_search.best_estimator_\n",
@ -1709,9 +1679,7 @@
  {
   "cell_type": "code",
   "execution_count": 105,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
   "outputs": [],
   "source": [
    "my_model = full_pipeline_with_predictor"
@ -1720,9 +1688,7 @@
  {
   "cell_type": "code",
   "execution_count": 106,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.externals import joblib\n",
@ -1991,9 +1957,7 @@
  {
   "cell_type": "code",
   "execution_count": 116,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.base import BaseEstimator, TransformerMixin\n",
@ -2029,9 +1993,7 @@
  {
   "cell_type": "code",
   "execution_count": 117,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
   "outputs": [],
   "source": [
    "k = 5"
@ -2089,9 +2051,7 @@
  {
   "cell_type": "code",
   "execution_count": 121,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
   "outputs": [],
   "source": [
    "preparation_and_feature_selection_pipeline = Pipeline([\n",
@ -2103,9 +2063,7 @@
  {
   "cell_type": "code",
   "execution_count": 122,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
   "outputs": [],
   "source": [
    "housing_prepared_top_k_features = preparation_and_feature_selection_pipeline.fit_transform(housing)"
@ -2167,9 +2125,7 @@
  {
   "cell_type": "code",
   "execution_count": 125,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
   "outputs": [],
   "source": [
    "prepare_select_and_predict_pipeline = Pipeline([\n",
@ -2237,7 +2193,7 @@
   "source": [
    "param_grid = [\n",
    "        {'preparation__num_pipeline__imputer__strategy': ['mean', 'median', 'most_frequent'],\n",
-    "         'feature_selection__k': [3, 4, 5, 6, 7]}\n",
+    "         'feature_selection__k': list(range(1, len(feature_importances) + 1))}\n",
    "]\n",
    "\n",
    "grid_search_prep = GridSearchCV(prepare_select_and_predict_pipeline, param_grid, cv=5,\n",
@ -2258,16 +2214,7 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "Great! It seems that we had the right imputer strategy (median), and apparently only the top 7 features are useful (out of 9), the last 2 seem to just add some noise."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 130,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "housing.shape"
+    "The best imputer strategy is `most_frequent` and apparently almost all features are useful (15 out of 16). The last one (`ISLAND`) seems to just add some noise."
   ]
  },
  {
@ -2294,7 +2241,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.6.2"
+   "version": "3.6.3"
  },
  "nav_menu": {
   "height": "279px",