Remove unneeded imports, remove verbose=3, reformat to fit 80 chars wide

main
Aurélien Geron 2021-10-30 13:54:57 +13:00
parent ad7bf27fbf
commit 90ace1a01c
1 changed file with 26 additions and 46 deletions
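
The reformatting targets an 80-character line width. As a rough illustrative sketch (the notebook filename below is an assumption, not part of this commit), one way to list any code-cell lines that still exceed that width is to walk the notebook's JSON:

import json

# Hypothetical path; adjust to wherever the notebook lives locally.
notebook_path = "03_classification.ipynb"

with open(notebook_path) as f:
    notebook = json.load(f)

# Each .ipynb code cell stores its source as a list of line strings.
for cell_index, cell in enumerate(notebook["cells"]):
    if cell["cell_type"] != "code":
        continue
    for line_number, line in enumerate(cell["source"], start=1):
        width = len(line.rstrip("\n"))
        if width > 80:
            print(f"cell {cell_index}, line {line_number}: {width} chars")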

@@ -1038,6 +1038,7 @@
"metadata": {},
"outputs": [],
"source": [
"np.random.seed(42)\n",
"noise = np.random.randint(0, 100, (len(X_train), 784))\n",
"X_train_mod = X_train + noise\n",
"noise = np.random.randint(0, 100, (len(X_test), 784))\n",
@@ -1403,7 +1404,6 @@
"metadata": {},
"outputs": [],
"source": [
"from pathlib import Path\n",
"import pandas as pd\n",
"import urllib.request\n",
"\n",
@@ -1624,7 +1624,6 @@
"source": [
"from sklearn.pipeline import Pipeline\n",
"from sklearn.impute import SimpleImputer\n",
"from sklearn.preprocessing import StandardScaler\n",
"\n",
"num_pipeline = Pipeline([\n",
" (\"imputer\", SimpleImputer(strategy=\"median\")),\n",
@@ -1731,8 +1730,6 @@
"metadata": {},
"outputs": [],
"source": [
"from sklearn.ensemble import RandomForestClassifier\n",
"\n",
"forest_clf = RandomForestClassifier(n_estimators=100, random_state=42)\n",
"forest_clf.fit(X_train, y_train)"
]
@@ -1767,8 +1764,6 @@
"metadata": {},
"outputs": [],
"source": [
"from sklearn.model_selection import cross_val_score\n",
"\n",
"forest_scores = cross_val_score(forest_clf, X_train, y_train, cv=10)\n",
"forest_scores.mean()"
]
@@ -1820,8 +1815,6 @@
"metadata": {},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
"\n",
"plt.figure(figsize=(8, 4))\n",
"plt.plot([1]*10, svm_scores, \".\")\n",
"plt.plot([2]*10, forest_scores, \".\")\n",
@@ -1867,7 +1860,8 @@
"outputs": [],
"source": [
"train_data[\"RelativesOnboard\"] = train_data[\"SibSp\"] + train_data[\"Parch\"]\n",
"train_data[[\"RelativesOnboard\", \"Survived\"]].groupby(['RelativesOnboard']).mean()"
"train_data[[\"RelativesOnboard\", \"Survived\"]].groupby(\n",
" ['RelativesOnboard']).mean()"
]
},
{
@@ -1899,9 +1893,7 @@
"metadata": {},
"outputs": [],
"source": [
"from pathlib import Path\n",
"import tarfile\n",
"import urllib.request\n",
"\n",
"def fetch_spam_data():\n",
" root = \"http://spamassassin.apache.org/old/publiccorpus/\"\n",
@@ -2141,7 +2133,8 @@
"X = np.array(ham_emails + spam_emails, dtype=object)\n",
"y = np.array([0] * len(ham_emails) + [1] * len(spam_emails))\n",
"\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)"
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,\n",
" random_state=42)"
]
},
{
@@ -2247,9 +2240,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's throw in some stemming! For this to work, you need to install the Natural Language Toolkit ([NLTK](http://www.nltk.org/)). It's as simple as running the following command (don't forget to activate your virtualenv first; if you don't have one, you will likely need administrator rights, or use the `--user` option):\n",
"\n",
"`$ pip3 install nltk`"
"Let's throw in some stemming! We will use the Natural Language Toolkit ([NLTK](http://www.nltk.org/)):"
]
},
{
@@ -2258,24 +2249,19 @@
"metadata": {},
"outputs": [],
"source": [
"try:\n",
" import nltk\n",
"import nltk\n",
"\n",
" stemmer = nltk.PorterStemmer()\n",
" for word in (\"Computations\", \"Computation\", \"Computing\", \"Computed\", \"Compute\", \"Compulsive\"):\n",
" print(word, \"=>\", stemmer.stem(word))\n",
"except ImportError:\n",
" print(\"Error: stemming requires the NLTK module.\")\n",
" stemmer = None"
"stemmer = nltk.PorterStemmer()\n",
"for word in (\"Computations\", \"Computation\", \"Computing\", \"Computed\", \"Compute\",\n",
" \"Compulsive\"):\n",
" print(word, \"=>\", stemmer.stem(word))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We will also need a way to replace URLs with the word \"URL\". For this, we could use hard core [regular expressions](https://mathiasbynens.be/demo/url-regex) but we will just use the [urlextract](https://github.com/lipoja/URLExtract) library. You can install it with the following command (don't forget to activate your virtualenv first; if you don't have one, you will likely need administrator rights, or use the `--user` option):\n",
"\n",
"`$ pip3 install urlextract`"
"We will also need a way to replace URLs with the word \"URL\". For this, we could use hard core [regular expressions](https://mathiasbynens.be/demo/url-regex) but we will just use the [urlextract](https://github.com/lipoja/URLExtract) library:"
]
},
{
@@ -2306,14 +2292,12 @@
"metadata": {},
"outputs": [],
"source": [
"try:\n",
" import urlextract # may require an Internet connection to download root domain names\n",
" \n",
" url_extractor = urlextract.URLExtract()\n",
" print(url_extractor.find_urls(\"Will it detect github.com and https://youtu.be/7Pq-S557XQU?t=3m32s\"))\n",
"except ImportError:\n",
" print(\"Error: replacing URLs requires the urlextract module.\")\n",
" url_extractor = None"
"import urlextract # may require an Internet connection to download root domain\n",
" # names\n",
"\n",
"url_extractor = urlextract.URLExtract()\n",
"some_text = \"Will it detect github.com and https://youtu.be/7Pq-S557XQU?t=3m32s\"\n",
"print(url_extractor.find_urls(some_text))"
]
},
{
@@ -2332,8 +2316,9 @@
"from sklearn.base import BaseEstimator, TransformerMixin\n",
"\n",
"class EmailToWordCounterTransformer(BaseEstimator, TransformerMixin):\n",
" def __init__(self, strip_headers=True, lower_case=True, remove_punctuation=True,\n",
" replace_urls=True, replace_numbers=True, stemming=True):\n",
" def __init__(self, strip_headers=True, lower_case=True,\n",
" remove_punctuation=True, replace_urls=True,\n",
" replace_numbers=True, stemming=True):\n",
" self.strip_headers = strip_headers\n",
" self.lower_case = lower_case\n",
" self.remove_punctuation = remove_punctuation\n",
@@ -2417,7 +2402,8 @@
" for word, count in word_count.items():\n",
" total_count[word] += min(count, 10)\n",
" most_common = total_count.most_common()[:self.vocabulary_size]\n",
" self.vocabulary_ = {word: index + 1 for index, (word, count) in enumerate(most_common)}\n",
" self.vocabulary_ = {word: index + 1\n",
" for index, (word, count) in enumerate(most_common)}\n",
" return self\n",
" def transform(self, X, y=None):\n",
" rows = []\n",
@@ -2428,7 +2414,8 @@
" rows.append(row)\n",
" cols.append(self.vocabulary_.get(word, 0))\n",
" data.append(count)\n",
" return csr_matrix((data, (rows, cols)), shape=(len(X), self.vocabulary_size + 1))"
" return csr_matrix((data, (rows, cols)),\n",
" shape=(len(X), self.vocabulary_size + 1))"
]
},
{
@@ -2490,13 +2477,6 @@
"X_train_transformed = preprocess_pipeline.fit_transform(X_train)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Note**: to be future-proof, we set `solver=\"lbfgs\"` since this will be the default value in Scikit-Learn 0.22."
]
},
{
"cell_type": "code",
"execution_count": 145,
@@ -2507,7 +2487,7 @@
"from sklearn.model_selection import cross_val_score\n",
"\n",
"log_clf = LogisticRegression(max_iter=1000, random_state=42)\n",
"score = cross_val_score(log_clf, X_train_transformed, y_train, cv=3, verbose=3)\n",
"score = cross_val_score(log_clf, X_train_transformed, y_train, cv=3)\n",
"score.mean()"
]
},