From d7afbd511d5bdb40d0d56e2392ac086009fbc74a Mon Sep 17 00:00:00 2001 From: Ian Beauregard Date: Tue, 11 Aug 2020 09:57:53 -0400 Subject: [PATCH] Better regex to match numbers in 03_classification The previous regex would not fully match a decimal number that has no "E" notation (only the integer part was matched). Also added an optional "+/-" sign in the "E" part. --- 03_classification.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/03_classification.ipynb b/03_classification.ipynb index ca683d8..f088b87 100644 --- a/03_classification.ipynb +++ b/03_classification.ipynb @@ -2342,7 +2342,7 @@ " for url in urls:\n", " text = text.replace(url, \" URL \")\n", " if self.replace_numbers:\n", - " text = re.sub(r'\\d+(?:\\.\\d*(?:[eE]\\d+))?', 'NUMBER', text)\n", + " text = re.sub(r'\\d+(?:\\.\\d*)?(?:[eE][+-]?\\d+)?', 'NUMBER', text)\n", " if self.remove_punctuation:\n", " text = re.sub(r'\\W+', ' ', text, flags=re.M)\n", " word_counts = Counter(text.split())\n",