Remove unneeded imports, remove verbose=3, reformat to fit 80 chars wide
parent
ad7bf27fbf
commit
90ace1a01c
|
@ -1038,6 +1038,7 @@
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
|
"np.random.seed(42)\n",
|
||||||
"noise = np.random.randint(0, 100, (len(X_train), 784))\n",
|
"noise = np.random.randint(0, 100, (len(X_train), 784))\n",
|
||||||
"X_train_mod = X_train + noise\n",
|
"X_train_mod = X_train + noise\n",
|
||||||
"noise = np.random.randint(0, 100, (len(X_test), 784))\n",
|
"noise = np.random.randint(0, 100, (len(X_test), 784))\n",
|
||||||
|
@ -1403,7 +1404,6 @@
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"from pathlib import Path\n",
|
|
||||||
"import pandas as pd\n",
|
"import pandas as pd\n",
|
||||||
"import urllib.request\n",
|
"import urllib.request\n",
|
||||||
"\n",
|
"\n",
|
||||||
|
@ -1624,7 +1624,6 @@
|
||||||
"source": [
|
"source": [
|
||||||
"from sklearn.pipeline import Pipeline\n",
|
"from sklearn.pipeline import Pipeline\n",
|
||||||
"from sklearn.impute import SimpleImputer\n",
|
"from sklearn.impute import SimpleImputer\n",
|
||||||
"from sklearn.preprocessing import StandardScaler\n",
|
|
||||||
"\n",
|
"\n",
|
||||||
"num_pipeline = Pipeline([\n",
|
"num_pipeline = Pipeline([\n",
|
||||||
" (\"imputer\", SimpleImputer(strategy=\"median\")),\n",
|
" (\"imputer\", SimpleImputer(strategy=\"median\")),\n",
|
||||||
|
@ -1731,8 +1730,6 @@
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"from sklearn.ensemble import RandomForestClassifier\n",
|
|
||||||
"\n",
|
|
||||||
"forest_clf = RandomForestClassifier(n_estimators=100, random_state=42)\n",
|
"forest_clf = RandomForestClassifier(n_estimators=100, random_state=42)\n",
|
||||||
"forest_clf.fit(X_train, y_train)"
|
"forest_clf.fit(X_train, y_train)"
|
||||||
]
|
]
|
||||||
|
@ -1767,8 +1764,6 @@
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"from sklearn.model_selection import cross_val_score\n",
|
|
||||||
"\n",
|
|
||||||
"forest_scores = cross_val_score(forest_clf, X_train, y_train, cv=10)\n",
|
"forest_scores = cross_val_score(forest_clf, X_train, y_train, cv=10)\n",
|
||||||
"forest_scores.mean()"
|
"forest_scores.mean()"
|
||||||
]
|
]
|
||||||
|
@ -1820,8 +1815,6 @@
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"import matplotlib.pyplot as plt\n",
|
|
||||||
"\n",
|
|
||||||
"plt.figure(figsize=(8, 4))\n",
|
"plt.figure(figsize=(8, 4))\n",
|
||||||
"plt.plot([1]*10, svm_scores, \".\")\n",
|
"plt.plot([1]*10, svm_scores, \".\")\n",
|
||||||
"plt.plot([2]*10, forest_scores, \".\")\n",
|
"plt.plot([2]*10, forest_scores, \".\")\n",
|
||||||
|
@ -1867,7 +1860,8 @@
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"train_data[\"RelativesOnboard\"] = train_data[\"SibSp\"] + train_data[\"Parch\"]\n",
|
"train_data[\"RelativesOnboard\"] = train_data[\"SibSp\"] + train_data[\"Parch\"]\n",
|
||||||
"train_data[[\"RelativesOnboard\", \"Survived\"]].groupby(['RelativesOnboard']).mean()"
|
"train_data[[\"RelativesOnboard\", \"Survived\"]].groupby(\n",
|
||||||
|
" ['RelativesOnboard']).mean()"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -1899,9 +1893,7 @@
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"from pathlib import Path\n",
|
|
||||||
"import tarfile\n",
|
"import tarfile\n",
|
||||||
"import urllib.request\n",
|
|
||||||
"\n",
|
"\n",
|
||||||
"def fetch_spam_data():\n",
|
"def fetch_spam_data():\n",
|
||||||
" root = \"http://spamassassin.apache.org/old/publiccorpus/\"\n",
|
" root = \"http://spamassassin.apache.org/old/publiccorpus/\"\n",
|
||||||
|
@ -2141,7 +2133,8 @@
|
||||||
"X = np.array(ham_emails + spam_emails, dtype=object)\n",
|
"X = np.array(ham_emails + spam_emails, dtype=object)\n",
|
||||||
"y = np.array([0] * len(ham_emails) + [1] * len(spam_emails))\n",
|
"y = np.array([0] * len(ham_emails) + [1] * len(spam_emails))\n",
|
||||||
"\n",
|
"\n",
|
||||||
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)"
|
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,\n",
|
||||||
|
" random_state=42)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -2247,9 +2240,7 @@
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"Let's throw in some stemming! For this to work, you need to install the Natural Language Toolkit ([NLTK](http://www.nltk.org/)). It's as simple as running the following command (don't forget to activate your virtualenv first; if you don't have one, you will likely need administrator rights, or use the `--user` option):\n",
|
"Let's throw in some stemming! We will use the Natural Language Toolkit ([NLTK](http://www.nltk.org/)):"
|
||||||
"\n",
|
|
||||||
"`$ pip3 install nltk`"
|
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -2258,24 +2249,19 @@
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"try:\n",
|
|
||||||
"import nltk\n",
|
"import nltk\n",
|
||||||
"\n",
|
"\n",
|
||||||
"stemmer = nltk.PorterStemmer()\n",
|
"stemmer = nltk.PorterStemmer()\n",
|
||||||
" for word in (\"Computations\", \"Computation\", \"Computing\", \"Computed\", \"Compute\", \"Compulsive\"):\n",
|
"for word in (\"Computations\", \"Computation\", \"Computing\", \"Computed\", \"Compute\",\n",
|
||||||
" print(word, \"=>\", stemmer.stem(word))\n",
|
" \"Compulsive\"):\n",
|
||||||
"except ImportError:\n",
|
" print(word, \"=>\", stemmer.stem(word))"
|
||||||
" print(\"Error: stemming requires the NLTK module.\")\n",
|
|
||||||
" stemmer = None"
|
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"We will also need a way to replace URLs with the word \"URL\". For this, we could use hard core [regular expressions](https://mathiasbynens.be/demo/url-regex) but we will just use the [urlextract](https://github.com/lipoja/URLExtract) library. You can install it with the following command (don't forget to activate your virtualenv first; if you don't have one, you will likely need administrator rights, or use the `--user` option):\n",
|
"We will also need a way to replace URLs with the word \"URL\". For this, we could use hard core [regular expressions](https://mathiasbynens.be/demo/url-regex) but we will just use the [urlextract](https://github.com/lipoja/URLExtract) library:"
|
||||||
"\n",
|
|
||||||
"`$ pip3 install urlextract`"
|
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -2306,14 +2292,12 @@
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"try:\n",
|
"import urlextract # may require an Internet connection to download root domain\n",
|
||||||
" import urlextract # may require an Internet connection to download root domain names\n",
|
" # names\n",
|
||||||
"\n",
|
"\n",
|
||||||
"url_extractor = urlextract.URLExtract()\n",
|
"url_extractor = urlextract.URLExtract()\n",
|
||||||
" print(url_extractor.find_urls(\"Will it detect github.com and https://youtu.be/7Pq-S557XQU?t=3m32s\"))\n",
|
"some_text = \"Will it detect github.com and https://youtu.be/7Pq-S557XQU?t=3m32s\"\n",
|
||||||
"except ImportError:\n",
|
"print(url_extractor.find_urls(some_text))"
|
||||||
" print(\"Error: replacing URLs requires the urlextract module.\")\n",
|
|
||||||
" url_extractor = None"
|
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -2332,8 +2316,9 @@
|
||||||
"from sklearn.base import BaseEstimator, TransformerMixin\n",
|
"from sklearn.base import BaseEstimator, TransformerMixin\n",
|
||||||
"\n",
|
"\n",
|
||||||
"class EmailToWordCounterTransformer(BaseEstimator, TransformerMixin):\n",
|
"class EmailToWordCounterTransformer(BaseEstimator, TransformerMixin):\n",
|
||||||
" def __init__(self, strip_headers=True, lower_case=True, remove_punctuation=True,\n",
|
" def __init__(self, strip_headers=True, lower_case=True,\n",
|
||||||
" replace_urls=True, replace_numbers=True, stemming=True):\n",
|
" remove_punctuation=True, replace_urls=True,\n",
|
||||||
|
" replace_numbers=True, stemming=True):\n",
|
||||||
" self.strip_headers = strip_headers\n",
|
" self.strip_headers = strip_headers\n",
|
||||||
" self.lower_case = lower_case\n",
|
" self.lower_case = lower_case\n",
|
||||||
" self.remove_punctuation = remove_punctuation\n",
|
" self.remove_punctuation = remove_punctuation\n",
|
||||||
|
@ -2417,7 +2402,8 @@
|
||||||
" for word, count in word_count.items():\n",
|
" for word, count in word_count.items():\n",
|
||||||
" total_count[word] += min(count, 10)\n",
|
" total_count[word] += min(count, 10)\n",
|
||||||
" most_common = total_count.most_common()[:self.vocabulary_size]\n",
|
" most_common = total_count.most_common()[:self.vocabulary_size]\n",
|
||||||
" self.vocabulary_ = {word: index + 1 for index, (word, count) in enumerate(most_common)}\n",
|
" self.vocabulary_ = {word: index + 1\n",
|
||||||
|
" for index, (word, count) in enumerate(most_common)}\n",
|
||||||
" return self\n",
|
" return self\n",
|
||||||
" def transform(self, X, y=None):\n",
|
" def transform(self, X, y=None):\n",
|
||||||
" rows = []\n",
|
" rows = []\n",
|
||||||
|
@ -2428,7 +2414,8 @@
|
||||||
" rows.append(row)\n",
|
" rows.append(row)\n",
|
||||||
" cols.append(self.vocabulary_.get(word, 0))\n",
|
" cols.append(self.vocabulary_.get(word, 0))\n",
|
||||||
" data.append(count)\n",
|
" data.append(count)\n",
|
||||||
" return csr_matrix((data, (rows, cols)), shape=(len(X), self.vocabulary_size + 1))"
|
" return csr_matrix((data, (rows, cols)),\n",
|
||||||
|
" shape=(len(X), self.vocabulary_size + 1))"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -2490,13 +2477,6 @@
|
||||||
"X_train_transformed = preprocess_pipeline.fit_transform(X_train)"
|
"X_train_transformed = preprocess_pipeline.fit_transform(X_train)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"**Note**: to be future-proof, we set `solver=\"lbfgs\"` since this will be the default value in Scikit-Learn 0.22."
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 145,
|
"execution_count": 145,
|
||||||
|
@ -2507,7 +2487,7 @@
|
||||||
"from sklearn.model_selection import cross_val_score\n",
|
"from sklearn.model_selection import cross_val_score\n",
|
||||||
"\n",
|
"\n",
|
||||||
"log_clf = LogisticRegression(max_iter=1000, random_state=42)\n",
|
"log_clf = LogisticRegression(max_iter=1000, random_state=42)\n",
|
||||||
"score = cross_val_score(log_clf, X_train_transformed, y_train, cv=3, verbose=3)\n",
|
"score = cross_val_score(log_clf, X_train_transformed, y_train, cv=3)\n",
|
||||||
"score.mean()"
|
"score.mean()"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
|
Loading…
Reference in New Issue