From d17126475d43547b8a5de437ec89a351185e693c Mon Sep 17 00:00:00 2001 From: Ian Beauregard Date: Tue, 11 Aug 2020 09:33:51 -0400 Subject: [PATCH 1/4] Update 03_classification.ipynb Use a function parameter instead of a global constant. --- 03_classification.ipynb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/03_classification.ipynb b/03_classification.ipynb index b574513..519a16c 100644 --- a/03_classification.ipynb +++ b/03_classification.ipynb @@ -1906,12 +1906,12 @@ "def fetch_spam_data(spam_url=SPAM_URL, spam_path=SPAM_PATH):\n", " if not os.path.isdir(spam_path):\n", " os.makedirs(spam_path)\n", - " for filename, url in ((\"ham.tar.bz2\", HAM_URL), (\"spam.tar.bz2\", SPAM_URL)):\n", + " for filename, url in ((\"ham.tar.bz2\", HAM_URL), (\"spam.tar.bz2\", spam_url)):\n", " path = os.path.join(spam_path, filename)\n", " if not os.path.isfile(path):\n", " urllib.request.urlretrieve(url, path)\n", " tar_bz2_file = tarfile.open(path)\n", - " tar_bz2_file.extractall(path=SPAM_PATH)\n", + " tar_bz2_file.extractall(path=spam_path)\n", " tar_bz2_file.close()" ] }, From a102114c62b248161d7ade9f15cfdecaad5f4831 Mon Sep 17 00:00:00 2001 From: Ian Beauregard Date: Tue, 11 Aug 2020 09:36:32 -0400 Subject: [PATCH 2/4] Update 03_classification.ipynb Create a function parameter for improved consistency. 
--- 03_classification.ipynb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/03_classification.ipynb b/03_classification.ipynb index 519a16c..ca683d8 100644 --- a/03_classification.ipynb +++ b/03_classification.ipynb @@ -1903,10 +1903,10 @@ "SPAM_URL = DOWNLOAD_ROOT + \"20030228_spam.tar.bz2\"\n", "SPAM_PATH = os.path.join(\"datasets\", \"spam\")\n", "\n", - "def fetch_spam_data(spam_url=SPAM_URL, spam_path=SPAM_PATH):\n", + "def fetch_spam_data(ham_url=HAM_URL, spam_url=SPAM_URL, spam_path=SPAM_PATH):\n", " if not os.path.isdir(spam_path):\n", " os.makedirs(spam_path)\n", - " for filename, url in ((\"ham.tar.bz2\", HAM_URL), (\"spam.tar.bz2\", spam_url)):\n", + " for filename, url in ((\"ham.tar.bz2\", ham_url), (\"spam.tar.bz2\", spam_url)):\n", " path = os.path.join(spam_path, filename)\n", " if not os.path.isfile(path):\n", " urllib.request.urlretrieve(url, path)\n", From d7afbd511d5bdb40d0d56e2392ac086009fbc74a Mon Sep 17 00:00:00 2001 From: Ian Beauregard Date: Tue, 11 Aug 2020 09:57:53 -0400 Subject: [PATCH 3/4] Better regex to match numbers in 03_classification The previous regex would not match the fractional part of a decimal number unless it was followed by "E" notation. Also added an optional "+/-" sign in the "E" part.
--- 03_classification.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/03_classification.ipynb b/03_classification.ipynb index ca683d8..f088b87 100644 --- a/03_classification.ipynb +++ b/03_classification.ipynb @@ -2342,7 +2342,7 @@ " for url in urls:\n", " text = text.replace(url, \" URL \")\n", " if self.replace_numbers:\n", - " text = re.sub(r'\\d+(?:\\.\\d*(?:[eE]\\d+))?', 'NUMBER', text)\n", + " text = re.sub(r'\\d+(?:\\.\\d*)?(?:[eE][+-]?\\d+)?', 'NUMBER', text)\n", " if self.remove_punctuation:\n", " text = re.sub(r'\\W+', ' ', text, flags=re.M)\n", " word_counts = Counter(text.split())\n", From 508b22e84d0826a431f3b7ecd19af1614b19508e Mon Sep 17 00:00:00 2001 From: Ian Beauregard Date: Tue, 11 Aug 2020 20:29:05 -0400 Subject: [PATCH 4/4] Remove unnecessary code in 03_classification --- 03_classification.ipynb | 1 - 1 file changed, 1 deletion(-) diff --git a/03_classification.ipynb b/03_classification.ipynb index f088b87..4dd82ac 100644 --- a/03_classification.ipynb +++ b/03_classification.ipynb @@ -2405,7 +2405,6 @@ " for word, count in word_count.items():\n", " total_count[word] += min(count, 10)\n", " most_common = total_count.most_common()[:self.vocabulary_size]\n", - " self.most_common_ = most_common\n", " self.vocabulary_ = {word: index + 1 for index, (word, count) in enumerate(most_common)}\n", " return self\n", " def transform(self, X, y=None):\n",