From 1b16a81fe5fe242e1a6d9d49489047aebdac3e60 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Aur=C3=A9lien=20Geron?= <ageron@users.noreply.github.com>
Date: Mon, 11 Oct 2021 20:51:34 +1300
Subject: [PATCH] Set OneHotEncoder's handle_unknown='ignore' to avoid warnings

---
 02_end_to_end_machine_learning_project.ipynb | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/02_end_to_end_machine_learning_project.ipynb b/02_end_to_end_machine_learning_project.ipynb
index d8c8349..2ddabdb 100644
--- a/02_end_to_end_machine_learning_project.ipynb
+++ b/02_end_to_end_machine_learning_project.ipynb
@@ -2291,12 +2291,21 @@
     "**Warning**: the following cell may take close to 45 minutes to run, or more depending on your hardware."
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**Note:** In the code below, I've set the `OneHotEncoder`'s `handle_unknown` hyperparameter to `'ignore'` to avoid warnings during the grid search. By default, the `OneHotEncoder` uses `handle_unknown='error'`, meaning that it raises an error when it is asked to transform data containing a category it did not see during training. If we kept the default, the `GridSearchCV` would run into such errors whenever the training part of a fold does not contain every category. This is likely to happen since there's only one sample in the `'ISLAND'` category, and it may end up in the test set in some of the folds. The `GridSearchCV` would then issue a warning and discard the results for those folds, and it's best to avoid that."
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 137,
    "metadata": {},
    "outputs": [],
    "source": [
+    "full_pipeline.set_params(cat__handle_unknown='ignore')  # set it on the unfitted \"cat\" encoder so that GridSearchCV's clones inherit it\n",
+    "\n",
     "param_grid = [{\n",
     "    'preparation__num__imputer__strategy': ['mean', 'median', 'most_frequent'],\n",
     "    'feature_selection__k': list(range(1, len(feature_importances) + 1))\n",