Replace pip with pip3, and add random_state to LogisticRegression

main
Aurélien Geron 2018-05-08 12:33:46 +02:00
parent 4dbb575933
commit c4e72ddc49
1 changed file with 27 additions and 46 deletions


@@ -451,7 +451,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
-"Note: there is an [issue](https://github.com/scikit-learn/scikit-learn/issues/9589) introduced in Scikit-Learn 0.19.0 where the result of `cross_val_predict()` is incorrect in the binary classification case when using `method=\"decision_function\"`, as in the code above. The resulting array has an extra first dimension full of 0s. We need to add this small hack for now to work around this issue:"
+"Note: there was an [issue](https://github.com/scikit-learn/scikit-learn/issues/9589) in Scikit-Learn 0.19.0 (fixed in 0.19.1) where the result of `cross_val_predict()` was incorrect in the binary classification case when using `method=\"decision_function\"`, as in the code above. The resulting array had an extra first dimension full of 0s. Just in case you are using 0.19.0, we need to add this small hack to work around this issue:"
]
},
{
@@ -469,7 +469,7 @@
"metadata": {},
"outputs": [],
"source": [
-"# hack to work around issue #9589 introduced in Scikit-Learn 0.19.0\n",
+"# hack to work around issue #9589 in Scikit-Learn 0.19.0\n",
"if y_scores.ndim == 2:\n",
" y_scores = y_scores[:, 1]"
]
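For readers following the change outside the notebook: the shape problem and the workaround can be reproduced on a toy dataset. This is a sketch, not the notebook's own data — `make_classification` and the variable names here are illustrative, and it assumes Scikit-Learn is installed:

```python
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_predict

# Toy binary classification problem (illustrative only)
X, y = make_classification(n_samples=200, random_state=42)

sgd_clf = SGDClassifier(random_state=42)
y_scores = cross_val_predict(sgd_clf, X, y, cv=3, method="decision_function")

# On Scikit-Learn 0.19.0 this array had an extra dimension of shape
# (n_samples, 2); on fixed versions it is 1-D. The hack from the
# notebook keeps only the useful column either way.
if y_scores.ndim == 2:
    y_scores = y_scores[:, 1]

print(y_scores.shape)  # (200,)
```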
@@ -1831,9 +1831,7 @@
{
"cell_type": "code",
"execution_count": 126,
-"metadata": {
-"collapsed": true
-},
+"metadata": {},
"outputs": [],
"source": [
"import os\n",
@@ -1860,9 +1858,7 @@
{
"cell_type": "code",
"execution_count": 127,
-"metadata": {
-"collapsed": true
-},
+"metadata": {},
"outputs": [],
"source": [
"fetch_spam_data()"
@@ -1878,9 +1874,7 @@
{
"cell_type": "code",
"execution_count": 128,
-"metadata": {
-"collapsed": true
-},
+"metadata": {},
"outputs": [],
"source": [
"HAM_DIR = os.path.join(SPAM_PATH, \"easy_ham\")\n",
@@ -1917,9 +1911,7 @@
{
"cell_type": "code",
"execution_count": 131,
-"metadata": {
-"collapsed": true
-},
+"metadata": {},
"outputs": [],
"source": [
"import email\n",
@@ -1934,9 +1926,7 @@
{
"cell_type": "code",
"execution_count": 132,
-"metadata": {
-"collapsed": true
-},
+"metadata": {},
"outputs": [],
"source": [
"ham_emails = [load_email(is_spam=False, filename=name) for name in ham_filenames]\n",
@@ -1978,9 +1968,7 @@
{
"cell_type": "code",
"execution_count": 135,
-"metadata": {
-"collapsed": true
-},
+"metadata": {},
"outputs": [],
"source": [
"def get_email_structure(email):\n",
@@ -1999,9 +1987,7 @@
{
"cell_type": "code",
"execution_count": 136,
-"metadata": {
-"collapsed": true
-},
+"metadata": {},
"outputs": [],
"source": [
"from collections import Counter\n",
@@ -2082,9 +2068,7 @@
{
"cell_type": "code",
"execution_count": 141,
-"metadata": {
-"collapsed": true
-},
+"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
@@ -2106,9 +2090,7 @@
{
"cell_type": "code",
"execution_count": 142,
-"metadata": {
-"collapsed": true
-},
+"metadata": {},
"outputs": [],
"source": [
"import re\n",
@@ -2167,9 +2149,7 @@
{
"cell_type": "code",
"execution_count": 145,
-"metadata": {
-"collapsed": true
-},
+"metadata": {},
"outputs": [],
"source": [
"def email_to_text(email):\n",
@@ -2205,7 +2185,7 @@
"source": [
"Let's throw in some stemming! For this to work, you need to install the Natural Language Toolkit ([NLTK](http://www.nltk.org/)). It's as simple as running the following command (don't forget to activate your virtualenv first; if you don't have one, you will likely need administrator rights, or use the `--user` option):\n",
"\n",
-"`$ pip install nltk`"
+"`$ pip3 install nltk`"
]
},
{
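As an aside, the stemming step this cell introduces boils down to NLTK's Porter stemmer collapsing inflected forms onto a common stem. A minimal sketch, assuming `nltk` is installed via the `pip3` command above (the word list is illustrative):

```python
from nltk.stem import PorterStemmer  # requires: pip3 install nltk

stemmer = PorterStemmer()
# Different surface forms reduce to the same stem, "comput"
for word in ("Computations", "Computation", "Computing", "Computed", "Compute"):
    print(word, "=>", stemmer.stem(word))
```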
@@ -2231,7 +2211,7 @@
"source": [
"We will also need a way to replace URLs with the word \"URL\". For this, we could use hard core [regular expressions](https://mathiasbynens.be/demo/url-regex) but we will just use the [urlextract](https://github.com/lipoja/URLExtract) library. You can install it with the following command (don't forget to activate your virtualenv first; if you don't have one, you will likely need administrator rights, or use the `--user` option):\n",
"\n",
-"`$ pip install urlextract`"
+"`$ pip3 install urlextract`"
]
},
{
@@ -2260,9 +2240,7 @@
{
"cell_type": "code",
"execution_count": 149,
-"metadata": {
-"collapsed": true
-},
+"metadata": {},
"outputs": [],
"source": [
"from sklearn.base import BaseEstimator, TransformerMixin\n",
@@ -2339,9 +2317,7 @@
{
"cell_type": "code",
"execution_count": 151,
-"metadata": {
-"collapsed": true
-},
+"metadata": {},
"outputs": [],
"source": [
"from scipy.sparse import csr_matrix\n",
@@ -2416,9 +2392,7 @@
{
"cell_type": "code",
"execution_count": 155,
-"metadata": {
-"collapsed": true
-},
+"metadata": {},
"outputs": [],
"source": [
"from sklearn.pipeline import Pipeline\n",
@@ -2440,7 +2414,7 @@
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.model_selection import cross_val_score\n",
"\n",
-"log_clf = LogisticRegression()\n",
+"log_clf = LogisticRegression(random_state=42)\n",
"score = cross_val_score(log_clf, X_train_transformed, y_train, cv=3, verbose=3)\n",
"score.mean()"
]
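The point of adding `random_state=42` here is reproducibility: two identically seeded fits yield the same model, so the cross-validation scores become repeatable run to run. A minimal sketch on toy data (the dataset and variable names are illustrative, not the notebook's spam features):

```python
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

# Toy problem (illustrative only)
X, y = make_classification(n_samples=100, random_state=0)

# Same seed in both fits; random_state matters for stochastic solvers
# such as liblinear (the default in Scikit-Learn 0.19) or saga
clf_a = LogisticRegression(random_state=42).fit(X, y)
clf_b = LogisticRegression(random_state=42).fit(X, y)

print(np.array_equal(clf_a.coef_, clf_b.coef_))  # True
```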
@@ -2464,7 +2438,7 @@
"\n",
"X_test_transformed = preprocess_pipeline.transform(X_test)\n",
"\n",
-"log_clf = LogisticRegression()\n",
+"log_clf = LogisticRegression(random_state=42)\n",
"log_clf.fit(X_train_transformed, y_train)\n",
"\n",
"y_pred = log_clf.predict(X_test_transformed)\n",
@@ -2472,6 +2446,13 @@
"print(\"Precision: {:.2f}%\".format(100 * precision_score(y_test, y_pred)))\n",
"print(\"Recall: {:.2f}%\".format(100 * recall_score(y_test, y_pred)))"
]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": []
}
],
"metadata": {
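For reference, the precision and recall figures printed by the final cell come from the usual TP/FP/FN counts. A standalone illustration with made-up labels (the values below are for this toy example only, not the notebook's spam results):

```python
from sklearn.metrics import precision_score, recall_score

# Toy labels: 2 true positives, 1 false positive, 1 false negative
y_test = [1, 1, 1, 0, 0, 0]
y_pred = [1, 1, 0, 1, 0, 0]

# precision = TP / (TP + FP) = 2/3; recall = TP / (TP + FN) = 2/3
print("Precision: {:.2f}%".format(100 * precision_score(y_test, y_pred)))
print("Recall: {:.2f}%".format(100 * recall_score(y_test, y_pred)))
```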
@@ -2490,7 +2471,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
-"version": "3.6.5"
+"version": "3.5.2"
},
"nav_menu": {},
"toc": {