Remove unneeded imports, remove verbose=3, reformat to fit 80 chars wide

main
Aurélien Geron 2021-10-30 13:54:57 +13:00
parent ad7bf27fbf
commit 90ace1a01c
1 changed files with 26 additions and 46 deletions

View File

@ -1038,6 +1038,7 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"np.random.seed(42)\n",
"noise = np.random.randint(0, 100, (len(X_train), 784))\n", "noise = np.random.randint(0, 100, (len(X_train), 784))\n",
"X_train_mod = X_train + noise\n", "X_train_mod = X_train + noise\n",
"noise = np.random.randint(0, 100, (len(X_test), 784))\n", "noise = np.random.randint(0, 100, (len(X_test), 784))\n",
@ -1403,7 +1404,6 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"from pathlib import Path\n",
"import pandas as pd\n", "import pandas as pd\n",
"import urllib.request\n", "import urllib.request\n",
"\n", "\n",
@ -1624,7 +1624,6 @@
"source": [ "source": [
"from sklearn.pipeline import Pipeline\n", "from sklearn.pipeline import Pipeline\n",
"from sklearn.impute import SimpleImputer\n", "from sklearn.impute import SimpleImputer\n",
"from sklearn.preprocessing import StandardScaler\n",
"\n", "\n",
"num_pipeline = Pipeline([\n", "num_pipeline = Pipeline([\n",
" (\"imputer\", SimpleImputer(strategy=\"median\")),\n", " (\"imputer\", SimpleImputer(strategy=\"median\")),\n",
@ -1731,8 +1730,6 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"from sklearn.ensemble import RandomForestClassifier\n",
"\n",
"forest_clf = RandomForestClassifier(n_estimators=100, random_state=42)\n", "forest_clf = RandomForestClassifier(n_estimators=100, random_state=42)\n",
"forest_clf.fit(X_train, y_train)" "forest_clf.fit(X_train, y_train)"
] ]
@ -1767,8 +1764,6 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"from sklearn.model_selection import cross_val_score\n",
"\n",
"forest_scores = cross_val_score(forest_clf, X_train, y_train, cv=10)\n", "forest_scores = cross_val_score(forest_clf, X_train, y_train, cv=10)\n",
"forest_scores.mean()" "forest_scores.mean()"
] ]
@ -1820,8 +1815,6 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"import matplotlib.pyplot as plt\n",
"\n",
"plt.figure(figsize=(8, 4))\n", "plt.figure(figsize=(8, 4))\n",
"plt.plot([1]*10, svm_scores, \".\")\n", "plt.plot([1]*10, svm_scores, \".\")\n",
"plt.plot([2]*10, forest_scores, \".\")\n", "plt.plot([2]*10, forest_scores, \".\")\n",
@ -1867,7 +1860,8 @@
"outputs": [], "outputs": [],
"source": [ "source": [
"train_data[\"RelativesOnboard\"] = train_data[\"SibSp\"] + train_data[\"Parch\"]\n", "train_data[\"RelativesOnboard\"] = train_data[\"SibSp\"] + train_data[\"Parch\"]\n",
"train_data[[\"RelativesOnboard\", \"Survived\"]].groupby(['RelativesOnboard']).mean()" "train_data[[\"RelativesOnboard\", \"Survived\"]].groupby(\n",
" ['RelativesOnboard']).mean()"
] ]
}, },
{ {
@ -1899,9 +1893,7 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"from pathlib import Path\n",
"import tarfile\n", "import tarfile\n",
"import urllib.request\n",
"\n", "\n",
"def fetch_spam_data():\n", "def fetch_spam_data():\n",
" root = \"http://spamassassin.apache.org/old/publiccorpus/\"\n", " root = \"http://spamassassin.apache.org/old/publiccorpus/\"\n",
@ -2141,7 +2133,8 @@
"X = np.array(ham_emails + spam_emails, dtype=object)\n", "X = np.array(ham_emails + spam_emails, dtype=object)\n",
"y = np.array([0] * len(ham_emails) + [1] * len(spam_emails))\n", "y = np.array([0] * len(ham_emails) + [1] * len(spam_emails))\n",
"\n", "\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)" "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,\n",
" random_state=42)"
] ]
}, },
{ {
@ -2247,9 +2240,7 @@
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"Let's throw in some stemming! For this to work, you need to install the Natural Language Toolkit ([NLTK](http://www.nltk.org/)). It's as simple as running the following command (don't forget to activate your virtualenv first; if you don't have one, you will likely need administrator rights, or use the `--user` option):\n", "Let's throw in some stemming! We will use the Natural Language Toolkit ([NLTK](http://www.nltk.org/)):"
"\n",
"`$ pip3 install nltk`"
] ]
}, },
{ {
@ -2258,24 +2249,19 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"try:\n",
"import nltk\n", "import nltk\n",
"\n", "\n",
"stemmer = nltk.PorterStemmer()\n", "stemmer = nltk.PorterStemmer()\n",
" for word in (\"Computations\", \"Computation\", \"Computing\", \"Computed\", \"Compute\", \"Compulsive\"):\n", "for word in (\"Computations\", \"Computation\", \"Computing\", \"Computed\", \"Compute\",\n",
" print(word, \"=>\", stemmer.stem(word))\n", " \"Compulsive\"):\n",
"except ImportError:\n", " print(word, \"=>\", stemmer.stem(word))"
" print(\"Error: stemming requires the NLTK module.\")\n",
" stemmer = None"
] ]
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"We will also need a way to replace URLs with the word \"URL\". For this, we could use hard core [regular expressions](https://mathiasbynens.be/demo/url-regex) but we will just use the [urlextract](https://github.com/lipoja/URLExtract) library. You can install it with the following command (don't forget to activate your virtualenv first; if you don't have one, you will likely need administrator rights, or use the `--user` option):\n", "We will also need a way to replace URLs with the word \"URL\". For this, we could use hard core [regular expressions](https://mathiasbynens.be/demo/url-regex) but we will just use the [urlextract](https://github.com/lipoja/URLExtract) library:"
"\n",
"`$ pip3 install urlextract`"
] ]
}, },
{ {
@ -2306,14 +2292,12 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"try:\n", "import urlextract # may require an Internet connection to download root domain\n",
" import urlextract # may require an Internet connection to download root domain names\n", " # names\n",
"\n", "\n",
"url_extractor = urlextract.URLExtract()\n", "url_extractor = urlextract.URLExtract()\n",
" print(url_extractor.find_urls(\"Will it detect github.com and https://youtu.be/7Pq-S557XQU?t=3m32s\"))\n", "some_text = \"Will it detect github.com and https://youtu.be/7Pq-S557XQU?t=3m32s\"\n",
"except ImportError:\n", "print(url_extractor.find_urls(some_text))"
" print(\"Error: replacing URLs requires the urlextract module.\")\n",
" url_extractor = None"
] ]
}, },
{ {
@ -2332,8 +2316,9 @@
"from sklearn.base import BaseEstimator, TransformerMixin\n", "from sklearn.base import BaseEstimator, TransformerMixin\n",
"\n", "\n",
"class EmailToWordCounterTransformer(BaseEstimator, TransformerMixin):\n", "class EmailToWordCounterTransformer(BaseEstimator, TransformerMixin):\n",
" def __init__(self, strip_headers=True, lower_case=True, remove_punctuation=True,\n", " def __init__(self, strip_headers=True, lower_case=True,\n",
" replace_urls=True, replace_numbers=True, stemming=True):\n", " remove_punctuation=True, replace_urls=True,\n",
" replace_numbers=True, stemming=True):\n",
" self.strip_headers = strip_headers\n", " self.strip_headers = strip_headers\n",
" self.lower_case = lower_case\n", " self.lower_case = lower_case\n",
" self.remove_punctuation = remove_punctuation\n", " self.remove_punctuation = remove_punctuation\n",
@ -2417,7 +2402,8 @@
" for word, count in word_count.items():\n", " for word, count in word_count.items():\n",
" total_count[word] += min(count, 10)\n", " total_count[word] += min(count, 10)\n",
" most_common = total_count.most_common()[:self.vocabulary_size]\n", " most_common = total_count.most_common()[:self.vocabulary_size]\n",
" self.vocabulary_ = {word: index + 1 for index, (word, count) in enumerate(most_common)}\n", " self.vocabulary_ = {word: index + 1\n",
" for index, (word, count) in enumerate(most_common)}\n",
" return self\n", " return self\n",
" def transform(self, X, y=None):\n", " def transform(self, X, y=None):\n",
" rows = []\n", " rows = []\n",
@ -2428,7 +2414,8 @@
" rows.append(row)\n", " rows.append(row)\n",
" cols.append(self.vocabulary_.get(word, 0))\n", " cols.append(self.vocabulary_.get(word, 0))\n",
" data.append(count)\n", " data.append(count)\n",
" return csr_matrix((data, (rows, cols)), shape=(len(X), self.vocabulary_size + 1))" " return csr_matrix((data, (rows, cols)),\n",
" shape=(len(X), self.vocabulary_size + 1))"
] ]
}, },
{ {
@ -2490,13 +2477,6 @@
"X_train_transformed = preprocess_pipeline.fit_transform(X_train)" "X_train_transformed = preprocess_pipeline.fit_transform(X_train)"
] ]
}, },
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Note**: to be future-proof, we set `solver=\"lbfgs\"` since this will be the default value in Scikit-Learn 0.22."
]
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 145, "execution_count": 145,
@ -2507,7 +2487,7 @@
"from sklearn.model_selection import cross_val_score\n", "from sklearn.model_selection import cross_val_score\n",
"\n", "\n",
"log_clf = LogisticRegression(max_iter=1000, random_state=42)\n", "log_clf = LogisticRegression(max_iter=1000, random_state=42)\n",
"score = cross_val_score(log_clf, X_train_transformed, y_train, cv=3, verbose=3)\n", "score = cross_val_score(log_clf, X_train_transformed, y_train, cv=3)\n",
"score.mean()" "score.mean()"
] ]
}, },