From e7a1273c486d7629b889482e41743d956d2fac11 Mon Sep 17 00:00:00 2001 From: ziembla Date: Thu, 30 Nov 2017 06:09:45 +0100 Subject: [PATCH 01/42] Docker environment minutiae Docker compose project name set to avoid collisions, smiley dropped from README heading --- docker/.env | 1 + docker/README.md | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) create mode 100644 docker/.env diff --git a/docker/.env b/docker/.env new file mode 100644 index 0000000..16adf41 --- /dev/null +++ b/docker/.env @@ -0,0 +1 @@ +COMPOSE_PROJECT_NAME=handson-ml diff --git a/docker/README.md b/docker/README.md index 50b6f12..2355c45 100644 --- a/docker/README.md +++ b/docker/README.md @@ -1,5 +1,5 @@ -# Hands-on Machine Learning in Docker :-) +# Hands-on Machine Learning in Docker This is the Docker configuration which allows you to run and tweak the book's notebooks without installing any dependencies on your machine!
OK, any except `docker`. With `docker-compose`. Well, you may also want `make` (but it is only used as thin layer to call a few simple `docker-compose` commands). From 8d16b3061d5ba3b5282190c13547f33819099ede Mon Sep 17 00:00:00 2001 From: ziembla Date: Thu, 30 Nov 2017 12:09:16 +0100 Subject: [PATCH 02/42] Patches to nbdiff for skipping noisy metadata, some local config Nbdiff --ignore-details skils autoscroll, collapsed, deletable, editable, toc (pull request on the way). Enabling empty pass, no git pager, ignoring gitdiff nbdiff details. --- docker/Dockerfile | 43 ++++++++++++++++++++++++----------- docker/bashrc | 4 ++-- docker/nbdime-1-details.patch | 17 ++++++++++++++ docker/nbdime-2-toc.patch | 11 +++++++++ 4 files changed, 60 insertions(+), 15 deletions(-) create mode 100644 docker/nbdime-1-details.patch create mode 100644 docker/nbdime-2-toc.patch diff --git a/docker/Dockerfile b/docker/Dockerfile index 54e5510..6b2852e 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -21,8 +21,10 @@ RUN adduser ${username} --uid ${userid} --gecos '' --disabled-password \ ENV HOME /home/${username} -WORKDIR ${HOME}/handson-ml -RUN chown ${username}:${username} ${HOME}/handson-ml +ARG workdir=${HOME}/handson-ml + +WORKDIR ${workdir} +RUN chown ${username}:${username} ${workdir} USER ${username} @@ -30,7 +32,7 @@ RUN jupyter contrib nbextension install --user RUN jupyter nbextension enable toc2/main -# INFO: Uncomment the RUN command below for easy and constant notebook URL (just localhost:8888) +## INFO: Uncomment the RUN command below for easy and constant notebook URL (just localhost:8888) # That will switch jupyter to using empty password instead of a token. # To avoid making a security hole you SHOULD in fact not only uncomment but # regenerate the hash for your own non-empty password and replace the hash below. 
@@ -38,12 +40,12 @@ RUN jupyter nbextension enable toc2/main # from notebook.auth import passwd # passwd() # and take the hash from the output -#RUN mkdir -p ${HOME}/.jupyter && \ -# echo 'c.NotebookApp.password = u"sha1:c6bbcba2d04b:f969e403db876dcfbe26f47affe41909bd53392e"' \ -# >> ${HOME}/.jupyter/jupyter_notebook_config.py +RUN mkdir -p ${HOME}/.jupyter && \ + echo 'c.NotebookApp.password = u"sha1:c6bbcba2d04b:f969e403db876dcfbe26f47affe41909bd53392e"' \ + >> ${HOME}/.jupyter/jupyter_notebook_config.py -# INFO: Uncomment the RUN command below to disable git diff paging -#RUN git config --global core.pager '' +## INFO: Uncomment the RUN command below to disable git diff paging +RUN git config --global core.pager '' # INFO: Below - work in progress, nbdime not totally integrated, still it enables diffing @@ -54,18 +56,33 @@ RUN jupyter nbextension enable toc2/main # to get nbdiff between checkpointed version and current version of the given notebook USER root WORKDIR / - RUN conda install -y -c conda-forge nbdime - USER ${username} -WORKDIR ${HOME}/handson-ml +WORKDIR ${workdir} RUN git-nbdiffdriver config --enable --global -# INFO: Uncomment the RUN command below to ignore metadata in nbdiff within git diff +## INFO: Optionally uncomment any (one) of the following RUN commands below to ignore either +# metadata or details in nbdiff within git diff #RUN git config --global diff.jupyternotebook.command 'git-nbdiffdriver diff --ignore-metadata' +RUN git config --global diff.jupyternotebook.command 'git-nbdiffdriver diff --ignore-details' + + +## +RUN ls -l /tmp/ +COPY docker/nbdime-*.patch /tmp/ +RUN ls -l /tmp/ +USER root +WORKDIR / +RUN patch -d /opt/conda/lib/python3.6/site-packages -p1 --forward --reject-file=- < \ + /tmp/nbdime-1-details.patch \ + && patch -d /opt/conda/lib/python3.6/site-packages -p1 --forward --reject-file=- < \ + /tmp/nbdime-2-toc.patch +RUN rm /tmp/nbdime-*.patch +USER ${username} +WORKDIR ${workdir} COPY docker/bashrc /tmp/bashrc RUN cat /tmp/bashrc >> ${HOME}/.bashrc -RUN sudo rm -rf /tmp/bashrc +RUN sudo rm /tmp/bashrc diff --git a/docker/bashrc b/docker/bashrc index 3535389..b1bce45 100644 --- a/docker/bashrc +++ b/docker/bashrc @@ -1,4 +1,4 @@ -alias ll="ls -l" +alias ll="ls -alF" nbd() { DIRNAME=$(dirname "$1") @@ -8,5 +8,5 @@ nbd() { CHECKPOINT_COPY=$DIRNAME/.ipynb_checkpoints/$BASENAME-checkpoint.ipynb # echo "How change $CHECKPOINT_COPY into $WORKING_COPY" - nbdiff "$CHECKPOINT_COPY" "$WORKING_COPY" + nbdiff "$CHECKPOINT_COPY" "$WORKING_COPY" --ignore-details } diff --git a/docker/nbdime-1-details.patch b/docker/nbdime-1-details.patch new file mode 100644 index 0000000..98f76d6 --- /dev/null +++ b/docker/nbdime-1-details.patch @@ -0,0 +1,17 @@ +--- a/nbdime/diffing/notebooks.py ++++ b/nbdime/diffing/notebooks.py +@@ -548,8 +548,12 @@ def set_notebook_diff_targets(sources=True, outputs=True, attachments=True, meta + metadata_keys = ("/cells/*/metadata", "/metadata", "/cells/*/outputs/*/metadata") + if metadata: + for key in metadata_keys: +- if key in notebook_differs: +- del notebook_differs[key] ++ if details: ++ if key in notebook_differs: ++ del notebook_differs[key] ++ else: ++ notebook_differs[key] = diff_ignore_keys( ++ inner_differ=diff, ignore_keys=['collapsed', 'autoscroll', 'deletable', 'editable']) + else: + for key in metadata_keys: + notebook_differs[key] = diff_ignore diff --git a/docker/nbdime-2-toc.patch b/docker/nbdime-2-toc.patch new file mode 100644 index 0000000..4924e66 --- /dev/null +++ b/docker/nbdime-2-toc.patch @@ -0,0 +1,11 @@ 
+--- a/nbdime/diffing/notebooks.py ++++ b/nbdime/diffing/notebooks.py +@@ -553,7 +553,7 @@ + del notebook_differs[key] + else: + notebook_differs[key] = diff_ignore_keys( +- inner_differ=diff, ignore_keys=['collapsed', 'autoscroll', 'deletable', 'editable']) ++ inner_differ=diff, ignore_keys=['toc', 'collapsed', 'autoscroll', 'deletable', 'editable']) + else: + for key in metadata_keys: + notebook_differs[key] = diff_ignore From 8586120c3d21f4b0b6c11db18fe86b7b3f22f8c1 Mon Sep 17 00:00:00 2001 From: ziembla Date: Thu, 30 Nov 2017 12:59:26 +0100 Subject: [PATCH 03/42] Git filter testing demo --- docker/Dockerfile | 22 ++++++++++++++------- docker/ipynb_cleaner.py | 42 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+), 7 deletions(-) create mode 100755 docker/ipynb_cleaner.py diff --git a/docker/Dockerfile b/docker/Dockerfile index 6b2852e..5daacee 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -40,9 +40,9 @@ RUN jupyter nbextension enable toc2/main # from notebook.auth import passwd # passwd() # and take the hash from the output -RUN mkdir -p ${HOME}/.jupyter && \ - echo 'c.NotebookApp.password = u"sha1:c6bbcba2d04b:f969e403db876dcfbe26f47affe41909bd53392e"' \ - >> ${HOME}/.jupyter/jupyter_notebook_config.py +#RUN mkdir -p ${HOME}/.jupyter && \ +# echo 'c.NotebookApp.password = u"sha1:c6bbcba2d04b:f969e403db876dcfbe26f47affe41909bd53392e"' \ +# >> ${HOME}/.jupyter/jupyter_notebook_config.py ## INFO: Uncomment the RUN command below to disable git diff paging RUN git config --global core.pager '' @@ -65,13 +65,11 @@ RUN git-nbdiffdriver config --enable --global ## INFO: Optionally uncomment any (one) of the following RUN commands below to ignore either # metadata or details in nbdiff within git diff #RUN git config --global diff.jupyternotebook.command 'git-nbdiffdriver diff --ignore-metadata' -RUN git config --global diff.jupyternotebook.command 'git-nbdiffdriver diff --ignore-details' +#RUN git config --global diff.jupyternotebook.command 'git-nbdiffdriver diff --ignore-details' -## -RUN ls -l /tmp/ +# INFO: Dirty nbdime patching COPY docker/nbdime-*.patch /tmp/ -RUN ls -l /tmp/ USER root WORKDIR / RUN patch -d /opt/conda/lib/python3.6/site-packages -p1 --forward --reject-file=- < \ @@ -86,3 +84,13 @@ WORKDIR ${workdir} COPY docker/bashrc /tmp/bashrc RUN cat /tmp/bashrc >> ${HOME}/.bashrc RUN sudo rm /tmp/bashrc + + +# INFO: Git filter testing +COPY docker/ipynb_cleaner.py /usr/bin/ipynb_cleaner +RUN mkdir -p ~/.config/git \ + && echo '*.ipynb filter=clean_ipynb' >> ~/.config/git/attributes \ + && git config --global filter.clean_ipynb.clean ipynb_cleaner \ + && git config --global filter.clean_ipynb.smudge cat + +# && git config --global filter.clean_ipynb.clean 'ipynb_cleaner %f' diff --git a/docker/ipynb_cleaner.py b/docker/ipynb_cleaner.py new file mode 100755 index 0000000..d34d7a6 --- /dev/null +++ b/docker/ipynb_cleaner.py @@ -0,0 +1,42 @@ +#!/usr/bin/env python + +""" + +******************************** +DANGER - W.I.P. - TESTING ONLY!! +******************************** + +Clean jupyter notebook for git operations +Based on "Keeping IPython notebooks under Git version control" +(see: + https://gist.github.com/pbugnion/ea2797393033b54674af + http://pascalbugnion.net/blog/ipython-notebooks-and-git.html + http://stackoverflow.com/a/20844506/827862 +) +""" + +import sys +import json + +sys.stderr.write("\n\nCAUTION ! W.I.P ! 
Only dropping some test metadata, don't commit!\n\n") + +def log(x): + sys.stderr.write("\n\n[{}]\n\n\n".format(x)) +def logj(x): + sys.stderr.write("\n\n") + json.dump(x, sys.stderr, sort_keys=True, indent=1, separators=(",",": ")) + sys.stderr.write("\n\n") + +log(sys.argv) +#sys.exit(17) + +nb = sys.stdin.read() +json_in = json.loads(nb) + +logj(json_in["metadata"]) +del json_in["metadata"]["nav_menu"] +del json_in["metadata"]["toc"] +json_in["metadata"]["language_info"]["version"]="17.0" +logj(json_in["metadata"]) + +json.dump(json_in, sys.stdout, sort_keys=True, indent=1, separators=(",",": ")) From c50b5d9b3b5754f8bb84a5b290a22b3688c7d30a Mon Sep 17 00:00:00 2001 From: Stefan Schweter Date: Thu, 30 Nov 2017 17:36:19 +0100 Subject: [PATCH 04/42] capsnet: fix margin loss formula --- extra_capsnets.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extra_capsnets.ipynb b/extra_capsnets.ipynb index 67e67bd..89e343e 100644 --- a/extra_capsnets.ipynb +++ b/extra_capsnets.ipynb @@ -1191,7 +1191,7 @@ "source": [ "The paper uses a special margin loss to make it possible to detect two or more different digits in each image:\n", "\n", - "$ L_k = T_k \\max(0, m^{+} - \\|\\mathbf{v}_k\\|)^2 - \\lambda (1 - T_k) \\max(0, \\|\\mathbf{v}_k\\| - m^{-})^2$\n", + "$ L_k = T_k \\max(0, m^{+} - \\|\\mathbf{v}_k\\|)^2 + \\lambda (1 - T_k) \\max(0, \\|\\mathbf{v}_k\\| - m^{-})^2$\n", "\n", "* $T_k$ is equal to 1 if the digit of class $k$ is present, or 0 otherwise.\n", "* In the paper, $m^{+} = 0.9$, $m^{-} = 0.1$ and $\\lambda = 0.5$.\n", From 72621ecdc575dfa159749f2beb30d504d09410be Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Geron?= Date: Thu, 30 Nov 2017 19:00:21 +0100 Subject: [PATCH 05/42] Fix the link to the first video, and add a link and an embed for the second video --- extra_capsnets.ipynb | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/extra_capsnets.ipynb b/extra_capsnets.ipynb index 67e67bd..ea3ae25 100644 --- a/extra_capsnets.ipynb +++ b/extra_capsnets.ipynb @@ -32,7 +32,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Watch [this video](https://www.youtube.com/embed/pPN8d0E3900) to understand the key ideas behind Capsule Networks:" + "Watch [this video](https://youtu.be/pPN8d0E3900) to understand the key ideas behind Capsule Networks:" ] }, { @@ -42,12 +42,23 @@ "outputs": [], "source": [ "from IPython.display import HTML\n", - "\n", - "# Display the video in an iframe:\n", - "HTML(\"\"\"\"\"\")" + "HTML(\"\"\"\"\"\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You may also want to watch [this video](https://youtu.be/2Kawrd5szHE), which presents the main difficulties in this notebook:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "HTML(\"\"\"\"\"\")" ] }, { @@ -510,6 +521,7 @@ "metadata": {}, "source": [ "We can apply this function to compute $\\hat{\\mathbf{u}}_{j|i}$ for every pair of capsules ($i$, $j$) like this (recall that there are 6×6×32=1152 capsules in the first layer, and 10 in the second layer):\n", + "\n", "$\n", "\\pmatrix{\n", " \\mathbf{W}_{1,1} & \\mathbf{W}_{1,2} & \\cdots & \\mathbf{W}_{1,10} \\\\\n", @@ -2172,7 +2184,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.2" + "version": "3.6.3" } }, "nbformat": 4, From ef9df82689a0a530e50b0433033594f41cdb4af7 Mon Sep 17 00:00:00 2001 From: ziembla Date: Fri, 1 Dec 2017 10:56:36 +0100 
Subject: [PATCH 06/42] Dockerfile publishable cleanup, git diff filter testing removed --- docker/Dockerfile | 54 +++++++++++++++++------------------------ docker/ipynb_cleaner.py | 42 -------------------------------- 2 files changed, 22 insertions(+), 74 deletions(-) delete mode 100755 docker/ipynb_cleaner.py diff --git a/docker/Dockerfile b/docker/Dockerfile index 5daacee..e7efc36 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -32,28 +32,13 @@ RUN jupyter contrib nbextension install --user RUN jupyter nbextension enable toc2/main -## INFO: Uncomment the RUN command below for easy and constant notebook URL (just localhost:8888) -# That will switch jupyter to using empty password instead of a token. -# To avoid making a security hole you SHOULD in fact not only uncomment but -# regenerate the hash for your own non-empty password and replace the hash below. -# You can compute a password hash in any notebook, just run the code: -# from notebook.auth import passwd -# passwd() -# and take the hash from the output -#RUN mkdir -p ${HOME}/.jupyter && \ -# echo 'c.NotebookApp.password = u"sha1:c6bbcba2d04b:f969e403db876dcfbe26f47affe41909bd53392e"' \ -# >> ${HOME}/.jupyter/jupyter_notebook_config.py - -## INFO: Uncomment the RUN command below to disable git diff paging -RUN git config --global core.pager '' - - -# INFO: Below - work in progress, nbdime not totally integrated, still it enables diffing -# notebooks with nbdiff (and nbdiff support in git diff command) after connecting to -# the container by "make exec" (docker exec) -# Try: -# nbd NOTEBOOK_NAME.ipynb -# to get nbdiff between checkpointed version and current version of the given notebook +# INFO: Jupyter and nbdime extension are not totally integrated (anaconda image is py36, +# nbdime checks for py35 at the moment, still the config below enables diffing +# notebooks with nbdiff (and nbdiff support in git diff command) after connecting +# to the container by "make exec" (or "docker-compose exec handson-ml bash") +# You may also try running: +# nbd NOTEBOOK_NAME.ipynb +# to get nbdiff between checkpointed version and current version of the given notebook USER root WORKDIR / RUN conda install -y -c conda-forge nbdime @@ -62,10 +47,10 @@ WORKDIR ${workdir} RUN git-nbdiffdriver config --enable --global -## INFO: Optionally uncomment any (one) of the following RUN commands below to ignore either +# INFO: Optionally uncomment any (one) of the following RUN commands below to ignore either # metadata or details in nbdiff within git diff #RUN git config --global diff.jupyternotebook.command 'git-nbdiffdriver diff --ignore-metadata' -#RUN git config --global diff.jupyternotebook.command 'git-nbdiffdriver diff --ignore-details' +RUN git config --global diff.jupyternotebook.command 'git-nbdiffdriver diff --ignore-details' # INFO: Dirty nbdime patching @@ -85,12 +70,17 @@ COPY docker/bashrc /tmp/bashrc RUN cat /tmp/bashrc >> ${HOME}/.bashrc RUN sudo rm /tmp/bashrc +# INFO: Uncomment the RUN command below to disable git diff paging +#RUN git config --global core.pager '' -# INFO: Git filter testing -COPY docker/ipynb_cleaner.py /usr/bin/ipynb_cleaner -RUN mkdir -p ~/.config/git \ - && echo '*.ipynb filter=clean_ipynb' >> ~/.config/git/attributes \ - && git config --global filter.clean_ipynb.clean ipynb_cleaner \ - && git config --global filter.clean_ipynb.smudge cat - -# && git config --global filter.clean_ipynb.clean 'ipynb_cleaner %f' +# INFO: Uncomment the RUN command below for easy and constant notebook URL (just localhost:8888) +# 
That will switch jupyter to using empty password instead of a token. +# To avoid making a security hole you SHOULD in fact not only uncomment but +# regenerate the hash for your own non-empty password and replace the hash below. +# You can compute a password hash in any notebook, just run the code: +# from notebook.auth import passwd +# passwd() +# and take the hash from the output +#RUN mkdir -p ${HOME}/.jupyter && \ +# echo 'c.NotebookApp.password = u"sha1:c6bbcba2d04b:f969e403db876dcfbe26f47affe41909bd53392e"' \ +# >> ${HOME}/.jupyter/jupyter_notebook_config.py diff --git a/docker/ipynb_cleaner.py b/docker/ipynb_cleaner.py deleted file mode 100755 index d34d7a6..0000000 --- a/docker/ipynb_cleaner.py +++ /dev/null @@ -1,42 +0,0 @@ -#!/usr/bin/env python - -""" - -******************************** -DANGER - W.I.P. - TESTING ONLY!! -******************************** - -Clean jupyter notebook for git operations -Based on "Keeping IPython notebooks under Git version control" -(see: - https://gist.github.com/pbugnion/ea2797393033b54674af - http://pascalbugnion.net/blog/ipython-notebooks-and-git.html - http://stackoverflow.com/a/20844506/827862 -) -""" - -import sys -import json - -sys.stderr.write("\n\nCAUTION ! W.I.P ! Only dropping some test metadata, don't commit!\n\n") - -def log(x): - sys.stderr.write("\n\n[{}]\n\n\n".format(x)) -def logj(x): - sys.stderr.write("\n\n") - json.dump(x, sys.stderr, sort_keys=True, indent=1, separators=(",",": ")) - sys.stderr.write("\n\n") - -log(sys.argv) -#sys.exit(17) - -nb = sys.stdin.read() -json_in = json.loads(nb) - -logj(json_in["metadata"]) -del json_in["metadata"]["nav_menu"] -del json_in["metadata"]["toc"] -json_in["metadata"]["language_info"]["version"]="17.0" -logj(json_in["metadata"]) - -json.dump(json_in, sys.stdout, sort_keys=True, indent=1, separators=(",",": ")) From 107de893049dea3afa26e432beb4158ceddf64ed Mon Sep 17 00:00:00 2001 From: ziembla Date: Fri, 1 Dec 2017 11:28:18 +0100 Subject: [PATCH 07/42] Nbdime patching ignored if the original file was changed --- docker/Dockerfile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index e7efc36..a8fafa0 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -53,14 +53,14 @@ RUN git-nbdiffdriver config --enable --global RUN git config --global diff.jupyternotebook.command 'git-nbdiffdriver diff --ignore-details' -# INFO: Dirty nbdime patching +# INFO: Dirty nbdime patching (ignored if not matching) COPY docker/nbdime-*.patch /tmp/ USER root WORKDIR / RUN patch -d /opt/conda/lib/python3.6/site-packages -p1 --forward --reject-file=- < \ - /tmp/nbdime-1-details.patch \ + /tmp/nbdime-2-toc.patch || true \ && patch -d /opt/conda/lib/python3.6/site-packages -p1 --forward --reject-file=- < \ - /tmp/nbdime-2-toc.patch + /tmp/nbdime-2-toc.patch || true RUN rm /tmp/nbdime-*.patch USER ${username} WORKDIR ${workdir} From ddb9784176586d618a9e6b4cc39f5f10ae6d19a1 Mon Sep 17 00:00:00 2001 From: ziembla Date: Mon, 4 Dec 2017 11:33:16 +0100 Subject: [PATCH 08/42] tensorflow version unpined, tensorboard support, home variable fix --- docker/Dockerfile | 17 ++++++++--------- docker/README.md | 4 +++- docker/bashrc | 6 ++++++ docker/docker-compose.yml | 1 + 4 files changed, 18 insertions(+), 10 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index a8fafa0..bfccb99 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -9,20 +9,19 @@ RUN apt-get update && apt-get upgrade -y \ && rm -rf /var/lib/apt/lists/* RUN conda install -y -c 
conda-forge \ - tensorflow=1.0.0 \ + tensorflow \ jupyter_contrib_nbextensions ARG username ARG userid +ARG home=/home/${username} +ARG workdir=${home}/handson-ml + RUN adduser ${username} --uid ${userid} --gecos '' --disabled-password \ && echo "${username} ALL=(root) NOPASSWD:ALL" > /etc/sudoers.d/${username} \ && chmod 0440 /etc/sudoers.d/${username} -ENV HOME /home/${username} - -ARG workdir=${HOME}/handson-ml - WORKDIR ${workdir} RUN chown ${username}:${username} ${workdir} @@ -58,7 +57,7 @@ COPY docker/nbdime-*.patch /tmp/ USER root WORKDIR / RUN patch -d /opt/conda/lib/python3.6/site-packages -p1 --forward --reject-file=- < \ - /tmp/nbdime-2-toc.patch || true \ + /tmp/nbdime-1-details.patch || true \ && patch -d /opt/conda/lib/python3.6/site-packages -p1 --forward --reject-file=- < \ /tmp/nbdime-2-toc.patch || true RUN rm /tmp/nbdime-*.patch @@ -67,7 +66,7 @@ WORKDIR ${workdir} COPY docker/bashrc /tmp/bashrc -RUN cat /tmp/bashrc >> ${HOME}/.bashrc +RUN cat /tmp/bashrc >> ${home}/.bashrc RUN sudo rm /tmp/bashrc # INFO: Uncomment the RUN command below to disable git diff paging @@ -81,6 +80,6 @@ RUN sudo rm /tmp/bashrc # from notebook.auth import passwd # passwd() # and take the hash from the output -#RUN mkdir -p ${HOME}/.jupyter && \ +#RUN mkdir -p ${home}/.jupyter && \ # echo 'c.NotebookApp.password = u"sha1:c6bbcba2d04b:f969e403db876dcfbe26f47affe41909bd53392e"' \ -# >> ${HOME}/.jupyter/jupyter_notebook_config.py +# >> ${home}/.jupyter/jupyter_notebook_config.py diff --git a/docker/README.md b/docker/README.md index 2355c45..037ae22 100644 --- a/docker/README.md +++ b/docker/README.md @@ -32,7 +32,9 @@ You can close the server just by pressing `Ctrl-C` in terminal window. Run `make exec` (or `docker-compose exec handson-ml bash`) while the server is running to run an additional `bash` shell inside the `handson-ml` container. Now you're inside the environment prepared within the image. -One of the usefull things that can be done there may be comparing versions of the notebooks using the `nbdiff` command if you haven't got `nbdime` installed locally (it is **way** better than plain `diff` for notebooks). See [Tools for diffing and merging of Jupyter notebooks](https://github.com/jupyter/nbdime) for more details. +One of the usefull things that can be done there would be starting TensorBoard (for example with simple `tb` command, see bashrc file). + +Another one may be comparing versions of the notebooks using the `nbdiff` command if you haven't got `nbdime` installed locally (it is **way** better than plain `diff` for notebooks). See [Tools for diffing and merging of Jupyter notebooks](https://github.com/jupyter/nbdime) for more details. You can see changes you made relative to the version in git using `git diff` which is integrated with `nbdiff`. 
diff --git a/docker/bashrc b/docker/bashrc index b1bce45..619677d 100644 --- a/docker/bashrc +++ b/docker/bashrc @@ -10,3 +10,9 @@ nbd() { # echo "How change $CHECKPOINT_COPY into $WORKING_COPY" nbdiff "$CHECKPOINT_COPY" "$WORKING_COPY" --ignore-details } + +tb() { + python -m tensorboard.main --logdir=tf_logs +} + +alias tensorboard="python -m tensorboard.main" diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index 8a9718c..d4b46e4 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -15,6 +15,7 @@ services: max-size: 50m ports: - "8888:8888" + - "6006:6006" volumes: - ../:/home/devel/handson-ml command: /opt/conda/bin/jupyter notebook --ip='*' --port=8888 --no-browser From 63c1523528dae7fa10878b0af3f6bc71765ff11b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Geron?= Date: Thu, 7 Dec 2017 18:57:30 -0800 Subject: [PATCH 09/42] Replace n_inputs with n_outputs, fixes #125 --- 04_training_linear_models.ipynb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/04_training_linear_models.ipynb b/04_training_linear_models.ipynb index c0bea14..a32fdea 100644 --- a/04_training_linear_models.ipynb +++ b/04_training_linear_models.ipynb @@ -1909,7 +1909,7 @@ " error = Y_proba - Y_train_one_hot\n", " if iteration % 500 == 0:\n", " print(iteration, loss)\n", - " gradients = 1/m * X_train.T.dot(error) + np.r_[np.zeros([1, n_inputs]), alpha * Theta[1:]]\n", + " gradients = 1/m * X_train.T.dot(error) + np.r_[np.zeros([1, n_outputs]), alpha * Theta[1:]]\n", " Theta = Theta - eta * gradients" ] }, @@ -1987,7 +1987,7 @@ " l2_loss = 1/2 * np.sum(np.square(Theta[1:]))\n", " loss = xentropy_loss + alpha * l2_loss\n", " error = Y_proba - Y_train_one_hot\n", - " gradients = 1/m * X_train.T.dot(error) + np.r_[np.zeros([1, n_inputs]), alpha * Theta[1:]]\n", + " gradients = 1/m * X_train.T.dot(error) + np.r_[np.zeros([1, n_outputs]), alpha * Theta[1:]]\n", " Theta = Theta - eta * gradients\n", "\n", " logits = X_valid.dot(Theta)\n", From 6e4004f16c8ffe31d61c7fc127feb0d8f947cc4b Mon Sep 17 00:00:00 2001 From: ziembla Date: Sat, 9 Dec 2017 20:17:56 +0100 Subject: [PATCH 10/42] scripts for jupyter notebooks cleanup, bin subdir on path --- docker/Dockerfile | 1 + docker/bashrc | 19 +----- docker/bin/nbclean_checkpoints | 116 +++++++++++++++++++++++++++++++++ docker/bin/nbdiff_checkpoint | 9 +++ docker/bin/rm_empty_subdirs | 54 +++++++++++++++ docker/bin/tensorboard | 2 + 6 files changed, 184 insertions(+), 17 deletions(-) create mode 100755 docker/bin/nbclean_checkpoints create mode 100755 docker/bin/nbdiff_checkpoint create mode 100755 docker/bin/rm_empty_subdirs create mode 100755 docker/bin/tensorboard diff --git a/docker/Dockerfile b/docker/Dockerfile index bfccb99..adf97f1 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -67,6 +67,7 @@ WORKDIR ${workdir} COPY docker/bashrc /tmp/bashrc RUN cat /tmp/bashrc >> ${home}/.bashrc +RUN echo "export PATH=\"${workdir}/docker/bin:$PATH\"" >> ${home}/.bashrc RUN sudo rm /tmp/bashrc # INFO: Uncomment the RUN command below to disable git diff paging diff --git a/docker/bashrc b/docker/bashrc index 619677d..ff19745 100644 --- a/docker/bashrc +++ b/docker/bashrc @@ -1,18 +1,3 @@ alias ll="ls -alF" - -nbd() { - DIRNAME=$(dirname "$1") - BASENAME=$(basename "$1" .ipynb) - - WORKING_COPY=$DIRNAME/$BASENAME.ipynb - CHECKPOINT_COPY=$DIRNAME/.ipynb_checkpoints/$BASENAME-checkpoint.ipynb - - # echo "How change $CHECKPOINT_COPY into $WORKING_COPY" - nbdiff "$CHECKPOINT_COPY" "$WORKING_COPY" --ignore-details -} - 
-tb() { - python -m tensorboard.main --logdir=tf_logs -} - -alias tensorboard="python -m tensorboard.main" +alias nbd="nbdiff_checkpoint" +alias tb="tensorboard --logdir=tf_logs" diff --git a/docker/bin/nbclean_checkpoints b/docker/bin/nbclean_checkpoints new file mode 100755 index 0000000..ba4aaf9 --- /dev/null +++ b/docker/bin/nbclean_checkpoints @@ -0,0 +1,116 @@ +#!/usr/bin/env python + +import collections +import glob +import hashlib +import os +import subprocess + + +class NotebookAnalyser: + + def __init__(self, dry_run=False, verbose=False, colorful=False): + self._dry_run = dry_run + self._verbose = verbose + self._colors = collections.defaultdict(lambda: "") + if colorful: + for color in [ + NotebookAnalyser.COLOR_WHITE, + NotebookAnalyser.COLOR_RED, + NotebookAnalyser.COLOR_GREEN, + NotebookAnalyser.COLOR_YELLOW, + ]: + self._colors[color] = "\033[{}m".format(color) + + NOTEBOOK_SUFFIX = ".ipynb" + CHECKPOINT_DIR = NOTEBOOK_SUFFIX + "_checkpoints" + CHECKPOINT_MASK = "*-checkpoint" + NOTEBOOK_SUFFIX + CHECKPOINT_MASK_LEN = len(CHECKPOINT_MASK) - 1 + + @staticmethod + def get_hash(file_path): + with open(file_path, "rb") as input: + hash = hashlib.md5() + for chunk in iter(lambda: input.read(4096), b""): + hash.update(chunk) + return hash.hexdigest() + + MESSAGE_ORPHANED = "missing " + MESSAGE_MODIFIED = "modified" + MESSAGE_DELETED = "DELETING" + + COLOR_WHITE = "0" + COLOR_RED = "31" + COLOR_GREEN = "32" + COLOR_YELLOW = "33" + + def log(self, message, file, color=COLOR_WHITE): + color_on = self._colors[color] + color_off = self._colors[NotebookAnalyser.COLOR_WHITE] + print("{}{}{}: {}".format(color_on, message, color_off, file)) + + def clean_checkpoints(self, directory): + for checkpoint_path in sorted(glob.glob(os.path.join(directory, NotebookAnalyser.CHECKPOINT_MASK))): + + workfile_dir = os.path.dirname(os.path.dirname(checkpoint_path)) + workfile_name = os.path.basename(checkpoint_path)[:-NotebookAnalyser.CHECKPOINT_MASK_LEN] + NotebookAnalyser.NOTEBOOK_SUFFIX + workfile_path = os.path.join(workfile_dir, workfile_name) + + status = "" + if not os.path.isfile(workfile_path): + if self._verbose: + self.log(NotebookAnalyser.MESSAGE_ORPHANED, workfile_path, NotebookAnalyser.COLOR_RED) + else: + checkpoint_stat = os.stat(checkpoint_path) + workfile_stat = os.stat(workfile_path) + + modified = workfile_stat.st_size != checkpoint_stat.st_size + + if not modified: + checkpoint_hash = NotebookAnalyser.get_hash(checkpoint_path) + workfile_hash = NotebookAnalyser.get_hash(workfile_path) + modified = checkpoint_hash != workfile_hash + + if modified: + if self._verbose: + self.log(NotebookAnalyser.MESSAGE_MODIFIED, workfile_path, NotebookAnalyser.COLOR_YELLOW) + else: + self.log(NotebookAnalyser.MESSAGE_DELETED, checkpoint_path, NotebookAnalyser.COLOR_GREEN) + if not self._dry_run: + os.remove(checkpoint_path) + + if not self._dry_run and not os.listdir(directory): + self.log(NotebookAnalyser.MESSAGE_DELETED, directory, NotebookAnalyser.COLOR_GREEN) + os.rmdir(directory) + + def clean_checkpoints_recursively(self, directory): + for (root, subdirs, files) in os.walk(directory): + subdirs.sort() # INFO: traverse alphabetically + if NotebookAnalyser.CHECKPOINT_DIR in subdirs: + subdirs.remove(NotebookAnalyser.CHECKPOINT_DIR) # INFO: don't recurse there + self.clean_checkpoints(os.path.join(root, NotebookAnalyser.CHECKPOINT_DIR)) + + +def main(): + import argparse + parser = argparse.ArgumentParser(description="Remove checkpointed versions of those jupyter notebooks that are identical to 
their working copies.", + epilog="""Notebooks will be reported as either + "DELETED" if the working copy and checkpointed version are identical + (checkpoint will be deleted), + "missing" if there is a checkpoint but no corresponding working file can be found + or "modified" if notebook and the checkpoint are not byte-to-byte identical. + If removal of checkpoints results in empty ".ipynb_checkpoints" directory + that directory is also deleted. + """) #, formatter_class=argparse.RawDescriptionHelpFormatter) + parser.add_argument("dirs", metavar="DIR", type=str, nargs="*", default=".", help="directories to search") + parser.add_argument("-d", "--dry-run", action="store_true", help="only print messages, don't perform any removals") + parser.add_argument("-v", "--verbose", action="store_true", help="verbose mode") + parser.add_argument("-c", "--color", action="store_true", help="colorful mode") + args = parser.parse_args() + + analyser = NotebookAnalyser(args.dry_run, args.verbose, args.color) + for directory in args.dirs: + analyser.clean_checkpoints_recursively(directory) + +if __name__ == "__main__": + main() diff --git a/docker/bin/nbdiff_checkpoint b/docker/bin/nbdiff_checkpoint new file mode 100755 index 0000000..ffbb21c --- /dev/null +++ b/docker/bin/nbdiff_checkpoint @@ -0,0 +1,9 @@ +#!/bin/bash +DIRNAME=$(dirname "$1") +BASENAME=$(basename "$1" .ipynb) + +WORKING_COPY=$DIRNAME/$BASENAME.ipynb +CHECKPOINT_COPY=$DIRNAME/.ipynb_checkpoints/$BASENAME-checkpoint.ipynb + +echo "How change $CHECKPOINT_COPY into $WORKING_COPY" +nbdiff "$CHECKPOINT_COPY" "$WORKING_COPY" --ignore-details diff --git a/docker/bin/rm_empty_subdirs b/docker/bin/rm_empty_subdirs new file mode 100755 index 0000000..8734b84 --- /dev/null +++ b/docker/bin/rm_empty_subdirs @@ -0,0 +1,54 @@ +#!/usr/bin/env python + +import os + +def remove_empty_directories(initial_dir, + allow_initial_delete=False, ignore_nonexistant_initial=False, + dry_run=False, quiet=False): + + FORBIDDEN_SUBDIRS = set([".git"]) + + if not os.path.isdir(initial_dir) and not ignore_nonexistant_initial: + raise RuntimeError("Initial directory '{}' not found!".format(initial_dir)) + + message = "removed" + if dry_run: + message = "to be " + message + + deleted = set() + + for (directory, subdirs, files) in os.walk(initial_dir, topdown=False): + forbidden = False + parent = directory + while parent: + parent, dirname = os.path.split(parent) + if dirname in FORBIDDEN_SUBDIRS: + forbidden = True + break + if forbidden: + continue + + is_empty = len(files) < 1 and len(set([os.path.join(directory, s) for s in subdirs]) - deleted) < 1 + + if is_empty and (initial_dir != directory or allow_initial_delete): + if not quiet: + print("{}: {}".format(message, directory)) + deleted.add(directory) + if not dry_run: + os.rmdir(directory) + +def main(): + import argparse + parser = argparse.ArgumentParser(description="Remove empty directories recursively in subtree.") + parser.add_argument("dir", metavar="DIR", type=str, nargs="*", default=".", help="directory to be searched") + parser.add_argument("-r", "--allow-dir-removal", action="store_true", help="allow deletion of DIR itself") + parser.add_argument("-i", "--ignore-nonexistent-dir", action="store_true", help="don't throw an error if DIR doesn't exist") + parser.add_argument("-d", "--dry-run", action="store_true", help="only print messages, don't perform any removals") + parser.add_argument("-q", "--quiet", action="store_true", help="don't print names of directories being removed") + args = parser.parse_args() + 
for directory in args.dir: + remove_empty_directories(directory, args.allow_dir_removal, args.ignore_nonexistent_dir, + args.dry_run, args.quiet) + +if __name__ == "__main__": + main() diff --git a/docker/bin/tensorboard b/docker/bin/tensorboard new file mode 100755 index 0000000..dd7294d --- /dev/null +++ b/docker/bin/tensorboard @@ -0,0 +1,2 @@ +#!/bin/bash +python -m tensorboard.main "$@" From 5bb9d6d3dfba750b7e0cbcfe26733b17e8685219 Mon Sep 17 00:00:00 2001 From: ziembla Date: Sun, 10 Dec 2017 18:38:25 +0100 Subject: [PATCH 11/42] help message for nbdiff_checkpoint --- docker/bin/nbdiff_checkpoint | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/docker/bin/nbdiff_checkpoint b/docker/bin/nbdiff_checkpoint index ffbb21c..2969e1b 100755 --- a/docker/bin/nbdiff_checkpoint +++ b/docker/bin/nbdiff_checkpoint @@ -1,9 +1,16 @@ #!/bin/bash +if [ "$#" -ne 1 ]; then + echo "usage: nbdiff_checkpoint NOTEBOOK.ipynb" + echo + echo "Show differences between given jupyter notebook and its checkpointed version (in .ipynb_checkpoints subdirectory)" + exit +fi + DIRNAME=$(dirname "$1") BASENAME=$(basename "$1" .ipynb) WORKING_COPY=$DIRNAME/$BASENAME.ipynb CHECKPOINT_COPY=$DIRNAME/.ipynb_checkpoints/$BASENAME-checkpoint.ipynb -echo "How change $CHECKPOINT_COPY into $WORKING_COPY" +echo "----- Analysing how to change $CHECKPOINT_COPY into $WORKING_COPY -----" nbdiff "$CHECKPOINT_COPY" "$WORKING_COPY" --ignore-details From 30fef69ed026ee117db464766bd95af8c7df1d5e Mon Sep 17 00:00:00 2001 From: ziembla Date: Sun, 10 Dec 2017 18:18:33 +0000 Subject: [PATCH 12/42] rm_empty_subdirs changed to require explicit argument (defaulting to current dir withdrawn as potentially harmful) --- docker/bin/rm_empty_subdirs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/bin/rm_empty_subdirs b/docker/bin/rm_empty_subdirs index 8734b84..34f3ea9 100755 --- a/docker/bin/rm_empty_subdirs +++ b/docker/bin/rm_empty_subdirs @@ -40,7 +40,7 @@ def remove_empty_directories(initial_dir, def main(): import argparse parser = argparse.ArgumentParser(description="Remove empty directories recursively in subtree.") - parser.add_argument("dir", metavar="DIR", type=str, nargs="*", default=".", help="directory to be searched") + parser.add_argument("dir", metavar="DIR", type=str, nargs="+", help="directory to be searched") parser.add_argument("-r", "--allow-dir-removal", action="store_true", help="allow deletion of DIR itself") parser.add_argument("-i", "--ignore-nonexistent-dir", action="store_true", help="don't throw an error if DIR doesn't exist") parser.add_argument("-d", "--dry-run", action="store_true", help="only print messages, don't perform any removals") From 1d370f40016b2f9fa88b6486b5c3a726f7aac473 Mon Sep 17 00:00:00 2001 From: ziembla Date: Mon, 11 Dec 2017 06:52:17 +0100 Subject: [PATCH 13/42] nbdiff_checkpoint parameter parsing fixed --- docker/bin/nbdiff_checkpoint | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docker/bin/nbdiff_checkpoint b/docker/bin/nbdiff_checkpoint index 2969e1b..9ce7cd0 100755 --- a/docker/bin/nbdiff_checkpoint +++ b/docker/bin/nbdiff_checkpoint @@ -1,5 +1,5 @@ #!/bin/bash -if [ "$#" -ne 1 ]; then +if [[ "$#" -lt 1 || "$1" =~ ^((-h)|(--help))$ ]] ; then echo "usage: nbdiff_checkpoint NOTEBOOK.ipynb" echo echo "Show differences between given jupyter notebook and its checkpointed version (in .ipynb_checkpoints subdirectory)" @@ -8,9 +8,10 @@ fi DIRNAME=$(dirname "$1") BASENAME=$(basename "$1" .ipynb) +shift 
WORKING_COPY=$DIRNAME/$BASENAME.ipynb CHECKPOINT_COPY=$DIRNAME/.ipynb_checkpoints/$BASENAME-checkpoint.ipynb echo "----- Analysing how to change $CHECKPOINT_COPY into $WORKING_COPY -----" -nbdiff "$CHECKPOINT_COPY" "$WORKING_COPY" --ignore-details +nbdiff "$CHECKPOINT_COPY" "$WORKING_COPY" --ignore-details "$@" From 60bb0e4e502bdc711ca5d339b2e2d2692195c14c Mon Sep 17 00:00:00 2001 From: ziembla Date: Mon, 11 Dec 2017 16:19:24 +0100 Subject: [PATCH 14/42] Uncommentable section in Dockerfile to autosave .py and .html alongside .ipynb --- docker/Dockerfile | 13 ++++++++++--- docker/{bashrc => bashrc.bash} | 0 docker/jupyter_notebook_config.py | 15 +++++++++++++++ 3 files changed, 25 insertions(+), 3 deletions(-) rename docker/{bashrc => bashrc.bash} (100%) create mode 100644 docker/jupyter_notebook_config.py diff --git a/docker/Dockerfile b/docker/Dockerfile index adf97f1..2d24d04 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -65,10 +65,17 @@ USER ${username} WORKDIR ${workdir} -COPY docker/bashrc /tmp/bashrc -RUN cat /tmp/bashrc >> ${home}/.bashrc +COPY docker/bashrc.bash /tmp/ +RUN cat /tmp/bashrc.bash >> ${home}/.bashrc RUN echo "export PATH=\"${workdir}/docker/bin:$PATH\"" >> ${home}/.bashrc -RUN sudo rm /tmp/bashrc +RUN sudo rm /tmp/bashrc.bash + + +# INFO: Uncomment lines below to enable automatic save of python-only and html-only +# exports alongside the notebook +#COPY docker/jupyter_notebook_config.py /tmp/ +#RUN cat /tmp/jupyter_notebook_config.py >> ${home}/.jupyter/jupyter_notebook_config.py +#RUN sudo rm /tmp/jupyter_notebook_config.py # INFO: Uncomment the RUN command below to disable git diff paging #RUN git config --global core.pager '' diff --git a/docker/bashrc b/docker/bashrc.bash similarity index 100% rename from docker/bashrc rename to docker/bashrc.bash diff --git a/docker/jupyter_notebook_config.py b/docker/jupyter_notebook_config.py new file mode 100644 index 0000000..971a49a --- /dev/null +++ b/docker/jupyter_notebook_config.py @@ -0,0 +1,15 @@ +import os +import subprocess + +def export_script_and_view(model, os_path, contents_manager): + if model["type"] != "notebook": + return + dir_name, file_name = os.path.split(os_path) + file_base, file_ext = os.path.splitext(file_name) + if file_base.startswith("Untitled"): + return + export_name = file_base if file_ext == ".ipynb" else file_name + subprocess.check_call(["jupyter", "nbconvert", "--to", "script", file_name, "--output", export_name + "_script"], cwd=dir_name) + subprocess.check_call(["jupyter", "nbconvert", "--to", "html", file_name, "--output", export_name + "_view"], cwd=dir_name) + +c.FileContentsManager.post_save_hook = export_script_and_view From 9dfaa950d2091e7f37ddba996c68c60e79e05c3b Mon Sep 17 00:00:00 2001 From: ziembla Date: Mon, 11 Dec 2017 22:02:42 +0100 Subject: [PATCH 15/42] Dockerfile to spaces --- docker/Dockerfile | 30 +++++++++++++++--------------- docker/Makefile | 2 +- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 2d24d04..b4ec526 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -1,16 +1,16 @@ FROM continuumio/anaconda3 RUN apt-get update && apt-get upgrade -y \ - && apt-get install -y \ - libpq-dev \ - build-essential \ - git \ - sudo \ - && rm -rf /var/lib/apt/lists/* + && apt-get install -y \ + libpq-dev \ + build-essential \ + git \ + sudo \ + && rm -rf /var/lib/apt/lists/* RUN conda install -y -c conda-forge \ - tensorflow \ - jupyter_contrib_nbextensions + tensorflow \ + jupyter_contrib_nbextensions ARG 
username ARG userid @@ -19,8 +19,8 @@ ARG home=/home/${username} ARG workdir=${home}/handson-ml RUN adduser ${username} --uid ${userid} --gecos '' --disabled-password \ - && echo "${username} ALL=(root) NOPASSWD:ALL" > /etc/sudoers.d/${username} \ - && chmod 0440 /etc/sudoers.d/${username} + && echo "${username} ALL=(root) NOPASSWD:ALL" > /etc/sudoers.d/${username} \ + && chmod 0440 /etc/sudoers.d/${username} WORKDIR ${workdir} RUN chown ${username}:${username} ${workdir} @@ -57,9 +57,9 @@ COPY docker/nbdime-*.patch /tmp/ USER root WORKDIR / RUN patch -d /opt/conda/lib/python3.6/site-packages -p1 --forward --reject-file=- < \ - /tmp/nbdime-1-details.patch || true \ - && patch -d /opt/conda/lib/python3.6/site-packages -p1 --forward --reject-file=- < \ - /tmp/nbdime-2-toc.patch || true + /tmp/nbdime-1-details.patch || true \ + && patch -d /opt/conda/lib/python3.6/site-packages -p1 --forward --reject-file=- < \ + /tmp/nbdime-2-toc.patch || true RUN rm /tmp/nbdime-*.patch USER ${username} WORKDIR ${workdir} @@ -89,5 +89,5 @@ RUN sudo rm /tmp/bashrc.bash # passwd() # and take the hash from the output #RUN mkdir -p ${home}/.jupyter && \ -# echo 'c.NotebookApp.password = u"sha1:c6bbcba2d04b:f969e403db876dcfbe26f47affe41909bd53392e"' \ -# >> ${home}/.jupyter/jupyter_notebook_config.py +# echo 'c.NotebookApp.password = u"sha1:c6bbcba2d04b:f969e403db876dcfbe26f47affe41909bd53392e"' \ +# >> ${home}/.jupyter/jupyter_notebook_config.py diff --git a/docker/Makefile b/docker/Makefile index 6078fc9..f85c49a 100644 --- a/docker/Makefile +++ b/docker/Makefile @@ -4,7 +4,7 @@ help: run: docker-compose up exec: - docker-compose exec handson-ml /bin/bash + docker-compose exec handson-ml bash build: stop .FORCE docker-compose build rebuild: stop .FORCE From ed40ca2be3bd4c569e3125df0863810177c833da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Geron?= Date: Tue, 19 Dec 2017 21:35:51 +0100 Subject: [PATCH 16/42] Add comment about Python 3.6 SSL issue on MacOSX, fixes #145 --- README.md | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index bbec5aa..96c0fb3 100644 --- a/README.md +++ b/README.md @@ -32,7 +32,11 @@ Of course, you obviously need Python. Python 2 is already preinstalled on most s $ python --version # for Python 2 $ python3 --version # for Python 3 -Any Python 3 version should be fine, preferably ≥3.5. If you don't have Python 3, I recommend installing it (Python ≥2.6 should work, but it is deprecated so Python 3 is preferable). To do so, you have several options: on Windows or MacOSX, you can just download it from [python.org](https://www.python.org/downloads/). On MacOSX, you can alternatively use [MacPorts](https://www.macports.org/) or [Homebrew](https://brew.sh/). On Linux, unless you know what you are doing, you should use your system's packaging system. For example, on Debian or Ubuntu, type: +Any Python 3 version should be fine, preferably ≥3.5. If you don't have Python 3, I recommend installing it (Python ≥2.6 should work, but it is deprecated so Python 3 is preferable). To do so, you have several options: on Windows or MacOSX, you can just download it from [python.org](https://www.python.org/downloads/). On MacOSX, you can alternatively use [MacPorts](https://www.macports.org/) or [Homebrew](https://brew.sh/). 
If you are using Python 3.6 on MacOSX, you need to run the following command to install the `certifi` package of certificates because Python 3.6 on MacOSX has no certificates to validate SSL connections (see this [StackOverflow question](https://stackoverflow.com/questions/27835619/urllib-and-ssl-certificate-verify-failed-error)): + + $ /Applications/Python\ 3.6/Install\ Certificates.command + +On Linux, unless you know what you are doing, you should use your system's packaging system. For example, on Debian or Ubuntu, type: $ sudo apt-get update $ sudo apt-get install python3 @@ -49,9 +53,9 @@ When using Anaconda, you can optionally create an isolated Python environment de This creates a fresh Python 3.5 environment called `mlbook` (you can change the name if you want to), and it activates it. This environment contains all the scientific libraries that come with Anaconda. This includes all the libraries we will need (NumPy, Matplotlib, Pandas, Jupyter and a few others), except for TensorFlow, so let's install it: - $ conda install -n mlbook -c conda-forge tensorflow=1.0.0 + $ conda install -n mlbook -c conda-forge tensorflow=1.4.0 -This installs TensorFlow 1.0.0 in the `mlbook` environment (fetching it from the `conda-forge` repository). If you chose not to create an `mlbook` environment, then just remove the `-n mlbook` option. +This installs TensorFlow 1.4.0 in the `mlbook` environment (fetching it from the `conda-forge` repository). If you chose not to create an `mlbook` environment, then just remove the `-n mlbook` option. Next, you can optionally install Jupyter extensions. These are useful to have nice tables of contents in the notebooks, but they are not required. From f2020952d00a7343e130f4b78b07d6b063ef4cb3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Geron?= Date: Tue, 19 Dec 2017 22:40:17 +0100 Subject: [PATCH 17/42] Fix error in MyLinearSVC, fixes #140 --- 05_support_vector_machines.ipynb | 660 ++++++------------------------- 1 file changed, 121 insertions(+), 539 deletions(-) diff --git a/05_support_vector_machines.ipynb b/05_support_vector_machines.ipynb index 687d74b..abbc1c1 100644 --- a/05_support_vector_machines.ipynb +++ b/05_support_vector_machines.ipynb @@ -2,10 +2,7 @@ "cells": [ { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "**Chapter 5 – Support Vector Machines**\n", "\n", @@ -14,20 +11,14 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "# Setup" ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "First, let's make sure this notebook works well in both python 2 and 3, import a few common modules, ensure MatplotLib plots figures inline and prepare a function to save the figures:" ] @@ -35,11 +26,7 @@ { "cell_type": "code", "execution_count": 1, - "metadata": { - "collapsed": true, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "# To support both python 2 and python 3\n", @@ -74,20 +61,14 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "# Large margin classification" ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "The next few code cells generate the first figures in chapter 5. 
The first actual code sample comes after:" ] @@ -95,11 +76,7 @@ { "cell_type": "code", "execution_count": 2, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.svm import SVC\n", @@ -121,11 +98,7 @@ { "cell_type": "code", "execution_count": 3, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "# Bad models\n", @@ -179,10 +152,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "# Sensitivity to feature scales" ] @@ -190,11 +160,7 @@ { "cell_type": "code", "execution_count": 4, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "Xs = np.array([[1, 50], [5, 20], [3, 80], [5, 60]]).astype(np.float64)\n", @@ -230,10 +196,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "# Sensitivity to outliers" ] @@ -241,11 +204,7 @@ { "cell_type": "code", "execution_count": 5, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "X_outliers = np.array([[3.4, 1.3], [3.2, 0.8]])\n", @@ -295,20 +254,14 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "# Large margin *vs* margin violations" ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "This is the first code example in chapter 5:" ] @@ -316,11 +269,7 @@ { "cell_type": "code", "execution_count": 6, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", @@ -344,11 +293,7 @@ { "cell_type": "code", "execution_count": 7, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "svm_clf.predict([[5.5, 1.7]])" @@ -356,10 +301,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Now let's generate the graph comparing different regularization settings:" ] @@ -367,11 +309,7 @@ { "cell_type": "code", "execution_count": 8, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "scaler = StandardScaler()\n", @@ -394,11 +332,7 @@ { "cell_type": "code", "execution_count": 9, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "# Convert to unscaled parameters\n", @@ -422,11 +356,7 @@ { "cell_type": "code", "execution_count": 10, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "plt.figure(figsize=(12,3.2))\n", @@ -454,9 +384,7 @@ { "cell_type": "markdown", "metadata": { - "collapsed": true, - "deletable": true, - "editable": true + "collapsed": true }, "source": [ "# Non-linear classification" @@ -465,11 +393,7 @@ { "cell_type": "code", "execution_count": 11, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "X1D = np.linspace(-4, 4, 9).reshape(-1, 1)\n", @@ -508,11 +432,7 @@ { "cell_type": "code", "execution_count": 12, - 
"metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.datasets import make_moons\n", @@ -533,11 +453,7 @@ { "cell_type": "code", "execution_count": 13, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.datasets import make_moons\n", @@ -556,11 +472,7 @@ { "cell_type": "code", "execution_count": 14, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "def plot_predictions(clf, axes):\n", @@ -583,11 +495,7 @@ { "cell_type": "code", "execution_count": 15, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.svm import SVC\n", @@ -602,11 +510,7 @@ { "cell_type": "code", "execution_count": 16, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "poly100_kernel_svm_clf = Pipeline([\n", @@ -619,11 +523,7 @@ { "cell_type": "code", "execution_count": 17, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "plt.figure(figsize=(11, 4))\n", @@ -646,9 +546,6 @@ "cell_type": "code", "execution_count": 18, "metadata": { - "collapsed": false, - "deletable": true, - "editable": true, "scrolled": true }, "outputs": [], @@ -716,11 +613,7 @@ { "cell_type": "code", "execution_count": 19, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "x1_example = X1D[3, 0]\n", @@ -732,11 +625,7 @@ { "cell_type": "code", "execution_count": 20, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "rbf_kernel_svm_clf = Pipeline([\n", @@ -750,9 +639,6 @@ "cell_type": "code", "execution_count": 21, "metadata": { - "collapsed": false, - "deletable": true, - "editable": true, "scrolled": true }, "outputs": [], @@ -787,10 +673,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "# Regression\n" ] @@ -798,11 +681,7 @@ { "cell_type": "code", "execution_count": 22, - "metadata": { - "collapsed": true, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "np.random.seed(42)\n", @@ -814,11 +693,7 @@ { "cell_type": "code", "execution_count": 23, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.svm import LinearSVR\n", @@ -830,11 +705,7 @@ { "cell_type": "code", "execution_count": 24, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "svm_reg1 = LinearSVR(epsilon=1.5, random_state=42)\n", @@ -857,11 +728,7 @@ { "cell_type": "code", "execution_count": 25, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "def plot_svm_regression(svm_reg, X, y, axes):\n", @@ -898,11 +765,7 @@ { "cell_type": "code", "execution_count": 26, - "metadata": { - "collapsed": true, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "np.random.seed(42)\n", @@ -914,11 +777,7 @@ { "cell_type": "code", "execution_count": 27, - "metadata": { 
- "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.svm import SVR\n", @@ -930,11 +789,7 @@ { "cell_type": "code", "execution_count": 28, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.svm import SVR\n", @@ -948,11 +803,7 @@ { "cell_type": "code", "execution_count": 29, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "plt.figure(figsize=(9, 4))\n", @@ -969,10 +820,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "# Under the hood" ] @@ -980,11 +828,7 @@ { "cell_type": "code", "execution_count": 30, - "metadata": { - "collapsed": true, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "iris = datasets.load_iris()\n", @@ -995,11 +839,7 @@ { "cell_type": "code", "execution_count": 31, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "from mpl_toolkits.mplot3d import Axes3D\n", @@ -1042,10 +882,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "# Small weight vector results in a large margin" ] @@ -1053,11 +890,7 @@ { "cell_type": "code", "execution_count": 32, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "def plot_2D_decision_function(w, b, ylabel=True, x1_lim=[-3, 3]):\n", @@ -1091,11 +924,7 @@ { "cell_type": "code", "execution_count": 33, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.svm import SVC\n", @@ -1112,10 +941,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "# Hinge loss" ] @@ -1123,11 +949,7 @@ { "cell_type": "code", "execution_count": 34, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "t = np.linspace(-2, 4, 200)\n", @@ -1148,20 +970,14 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "# Extra material" ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "## Training time" ] @@ -1169,11 +985,7 @@ { "cell_type": "code", "execution_count": 35, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "X, y = make_moons(n_samples=1000, noise=0.4, random_state=42)\n", @@ -1184,11 +996,7 @@ { "cell_type": "code", "execution_count": 36, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "import time\n", @@ -1210,10 +1018,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "## Linear SVM classifier implementation using Batch Gradient Descent" ] @@ -1221,11 +1026,7 @@ { "cell_type": "code", "execution_count": 37, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "# Training set\n", @@ -1236,11 +1037,7 @@ { 
"cell_type": "code", "execution_count": 38, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.base import BaseEstimator\n", @@ -1286,7 +1083,7 @@ "\n", " self.intercept_ = np.array([b])\n", " self.coef_ = np.array([w])\n", - " support_vectors_idx = (X_t.dot(w) + b < 1).ravel()\n", + " support_vectors_idx = (X_t.dot(w) + t * b < 1).ravel()\n", " self.support_vectors_ = X[support_vectors_idx]\n", " return self\n", "\n", @@ -1305,11 +1102,7 @@ { "cell_type": "code", "execution_count": 39, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "plt.plot(range(svm_clf.n_epochs), svm_clf.Js)\n", @@ -1319,11 +1112,7 @@ { "cell_type": "code", "execution_count": 40, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "print(svm_clf.intercept_, svm_clf.coef_)" @@ -1332,11 +1121,7 @@ { "cell_type": "code", "execution_count": 41, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "svm_clf2 = SVC(kernel=\"linear\", C=C)\n", @@ -1347,11 +1132,7 @@ { "cell_type": "code", "execution_count": 42, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "yr = y.ravel()\n", @@ -1378,9 +1159,6 @@ "cell_type": "code", "execution_count": 43, "metadata": { - "collapsed": false, - "deletable": true, - "editable": true, "scrolled": true }, "outputs": [], @@ -1412,20 +1190,14 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "# Exercise solutions" ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "## 1. to 7." ] @@ -1433,9 +1205,7 @@ { "cell_type": "markdown", "metadata": { - "collapsed": true, - "deletable": true, - "editable": true + "collapsed": true }, "source": [ "See appendix A." @@ -1443,30 +1213,21 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "# 8." ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "_Exercise: train a `LinearSVC` on a linearly separable dataset. Then train an `SVC` and a `SGDClassifier` on the same dataset. See if you can get them to produce roughly the same model._" ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Let's use the Iris dataset: the Iris Setosa and Iris Versicolor classes are linearly separable." 
] @@ -1475,9 +1236,7 @@ "cell_type": "code", "execution_count": 44, "metadata": { - "collapsed": true, - "deletable": true, - "editable": true + "collapsed": true }, "outputs": [], "source": [ @@ -1495,11 +1254,7 @@ { "cell_type": "code", "execution_count": 45, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.svm import SVC, LinearSVC\n", @@ -1528,10 +1283,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Let's plot the decision boundaries of these three models:" ] @@ -1539,11 +1291,7 @@ { "cell_type": "code", "execution_count": 46, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "# Compute the slope and bias of each decision boundary\n", @@ -1576,40 +1324,28 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Close enough!" ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "# 9." ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "_Exercise: train an SVM classifier on the MNIST dataset. Since SVM classifiers are binary classifiers, you will need to use one-versus-all to classify all 10 digits. You may want to tune the hyperparameters using small validation sets to speed up the process. What accuracy can you reach?_" ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "First, let's load the dataset and split it into a training set and a test set. We could use `train_test_split()` but people usually just take the first 60,000 instances for the training set, and the last 10,000 instances for the test set (this makes it possible to compare your model's performance with others): " ] @@ -1617,11 +1353,7 @@ { "cell_type": "code", "execution_count": 47, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.datasets import fetch_mldata\n", @@ -1638,10 +1370,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Many training algorithms are sensitive to the order of the training instances, so it's generally good practice to shuffle them first:" ] @@ -1650,9 +1379,7 @@ "cell_type": "code", "execution_count": 48, "metadata": { - "collapsed": true, - "deletable": true, - "editable": true + "collapsed": true }, "outputs": [], "source": [ @@ -1664,10 +1391,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Let's start simple, with a linear SVM classifier. It will automatically use the One-vs-All (also called One-vs-the-Rest, OvR) strategy, so there's nothing special we need to do. Easy!" 
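# For context (an illustrative sketch, not taken from the patch): LinearSVC
# applies a one-vs-rest scheme by default (multi_class="ovr"), which is
# conceptually the same as wrapping a binary classifier explicitly:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC

ovr_clf = OneVsRestClassifier(LinearSVC(random_state=42))
# ovr_clf.fit(X_train, y_train) would train one binary detector per digit class.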
] @@ -1675,11 +1399,7 @@ { "cell_type": "code", "execution_count": 49, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "lin_clf = LinearSVC(random_state=42)\n", @@ -1688,10 +1408,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Let's make predictions on the training set and measure the accuracy (we don't want to measure it on the test set yet, since we have not selected and trained the final model yet):" ] @@ -1699,11 +1416,7 @@ { "cell_type": "code", "execution_count": 50, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.metrics import accuracy_score\n", @@ -1714,10 +1427,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Wow, 82% accuracy on MNIST is a really bad performance. This linear model is certainly too simple for MNIST, but perhaps we just needed to scale the data first:" ] @@ -1725,11 +1435,7 @@ { "cell_type": "code", "execution_count": 51, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "scaler = StandardScaler()\n", @@ -1740,11 +1446,7 @@ { "cell_type": "code", "execution_count": 52, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "lin_clf = LinearSVC(random_state=42)\n", @@ -1754,11 +1456,7 @@ { "cell_type": "code", "execution_count": 53, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "y_pred = lin_clf.predict(X_train_scaled)\n", @@ -1767,10 +1465,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "That's much better (we cut the error rate in two), but still not great at all for MNIST. If we want to use an SVM, we will have to use a kernel. Let's try an `SVC` with an RBF kernel (the default).\n", "\n", @@ -1780,11 +1475,7 @@ { "cell_type": "code", "execution_count": 54, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "svm_clf = SVC(decision_function_shape=\"ovr\")\n", @@ -1794,11 +1485,7 @@ { "cell_type": "code", "execution_count": 55, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "y_pred = svm_clf.predict(X_train_scaled)\n", @@ -1807,10 +1494,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "That's promising, we get better performance even though we trained the model on 6 times less data. Let's tune the hyperparameters by doing a randomized search with cross validation. 
We will do this on a small dataset just to speed up the process:" ] @@ -1818,11 +1502,7 @@ { "cell_type": "code", "execution_count": 56, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.model_selection import RandomizedSearchCV\n", @@ -1836,11 +1516,7 @@ { "cell_type": "code", "execution_count": 57, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "rnd_search_cv.best_estimator_" @@ -1849,11 +1525,7 @@ { "cell_type": "code", "execution_count": 58, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "rnd_search_cv.best_score_" @@ -1861,10 +1533,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "This looks pretty low but remember we only trained the model on 1,000 instances. Let's retrain the best estimator on the whole training set (run this at night, it will take hours):" ] @@ -1872,11 +1541,7 @@ { "cell_type": "code", "execution_count": 59, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "rnd_search_cv.best_estimator_.fit(X_train_scaled, y_train)" @@ -1885,11 +1550,7 @@ { "cell_type": "code", "execution_count": 60, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "y_pred = rnd_search_cv.best_estimator_.predict(X_train_scaled)\n", @@ -1898,10 +1559,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Ah, this looks good! Let's select this model. Now we can test it on the test set:" ] @@ -1909,11 +1567,7 @@ { "cell_type": "code", "execution_count": 61, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "y_pred = rnd_search_cv.best_estimator_.predict(X_test_scaled)\n", @@ -1922,40 +1576,28 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Not too bad, but apparently the model is overfitting slightly. It's tempting to tweak the hyperparameters a bit more (e.g. decreasing `C` and/or `gamma`), but we would run the risk of overfitting the test set. Other people have found that the hyperparameters `C=5` and `gamma=0.005` yield even better performance (over 98% accuracy). By running the randomized search for longer and on a larger part of the training set, you may be able to find this as well." ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "## 10." 
] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "_Exercise: train an SVM regressor on the California housing dataset._" ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Let's load the dataset using Scikit-Learn's `fetch_california_housing()` function:" ] @@ -1964,9 +1606,7 @@ "cell_type": "code", "execution_count": 62, "metadata": { - "collapsed": true, - "deletable": true, - "editable": true + "collapsed": true }, "outputs": [], "source": [ @@ -1979,10 +1619,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Split it into a training set and a test set:" ] @@ -1991,9 +1628,7 @@ "cell_type": "code", "execution_count": 63, "metadata": { - "collapsed": true, - "deletable": true, - "editable": true + "collapsed": true }, "outputs": [], "source": [ @@ -2004,10 +1639,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Don't forget to scale the data:" ] @@ -2016,9 +1648,7 @@ "cell_type": "code", "execution_count": 64, "metadata": { - "collapsed": true, - "deletable": true, - "editable": true + "collapsed": true }, "outputs": [], "source": [ @@ -2031,10 +1661,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Let's train a simple `LinearSVR` first:" ] @@ -2042,11 +1669,7 @@ { "cell_type": "code", "execution_count": 65, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.svm import LinearSVR\n", @@ -2057,10 +1680,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Let's see how it performs on the training set:" ] @@ -2068,11 +1688,7 @@ { "cell_type": "code", "execution_count": 66, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.metrics import mean_squared_error\n", @@ -2084,10 +1700,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Let's look at the RMSE:" ] @@ -2095,11 +1708,7 @@ { "cell_type": "code", "execution_count": 67, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "np.sqrt(mse)" @@ -2107,10 +1716,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "In this training set, the targets are tens of thousands of dollars. The RMSE gives a rough idea of the kind of error you should expect (with a higher weight for large errors): so with this model we can expect errors somewhere around $10,000. Not great. Let's see if we can do better with an RBF Kernel. 
We will use randomized search with cross validation to find the appropriate hyperparameter values for `C` and `gamma`:" ] @@ -2118,11 +1724,7 @@ { "cell_type": "code", "execution_count": 68, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.svm import SVR\n", @@ -2137,11 +1739,7 @@ { "cell_type": "code", "execution_count": 69, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "rnd_search_cv.best_estimator_" @@ -2149,10 +1747,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Now let's measure the RMSE on the training set:" ] @@ -2160,11 +1755,7 @@ { "cell_type": "code", "execution_count": 70, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "y_pred = rnd_search_cv.best_estimator_.predict(X_train_scaled)\n", @@ -2174,10 +1765,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Looks much better than the linear model. Let's select this model and evaluate it on the test set:" ] @@ -2185,11 +1773,7 @@ { "cell_type": "code", "execution_count": 71, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "y_pred = rnd_search_cv.best_estimator_.predict(X_test_scaled)\n", @@ -2201,9 +1785,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": true, - "deletable": true, - "editable": true + "collapsed": true }, "outputs": [], "source": [] @@ -2225,7 +1807,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.3" + "version": "3.6.3" }, "nav_menu": {}, "toc": { @@ -2239,5 +1821,5 @@ } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } From 1eaa53a6a21cd22a4a10c125e9dcb78c25b35375 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Geron?= Date: Tue, 19 Dec 2017 23:14:20 +0100 Subject: [PATCH 18/42] Add thanks to contributors in README.md --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index 96c0fb3..faa749c 100644 --- a/README.md +++ b/README.md @@ -109,3 +109,6 @@ This should open up your browser, and you should see Jupyter's tree view, with t Note: you can also visit [http://localhost:8888/nbextensions](http://localhost:8888/nbextensions) to activate and configure Jupyter extensions. Congrats! You are ready to learn Machine Learning, hands on! + +# Contributors +I would like to thank everyone who contributed to this project, either by providing useful feedback, filing issues or submitting Pull Requests. Special thanks go to Steven Bunkley and Ziembla who created the `docker` directory. 
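The notebook changes in the patch above are largely mechanical: they strip per-cell metadata keys such as `collapsed`, `deletable` and `editable`, which otherwise clutter diffs, and bump the recorded Python version. A minimal sketch of how such a cleanup could be scripted with the `nbformat` package (illustrative only and simplified: it drops the keys unconditionally, whereas the patch keeps `"collapsed": true`; the file name is made up, and this is not necessarily the tooling used to produce the patch):

    import nbformat

    path = "notebook.ipynb"  # illustrative file name
    nb = nbformat.read(path, as_version=4)
    for cell in nb.cells:
        for key in ("collapsed", "deletable", "editable"):
            cell.metadata.pop(key, None)  # drop the key if present
    nbformat.write(nb, path)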
From 9328b858c55c98562256bfa70b93e57ea7fc5172 Mon Sep 17 00:00:00 2001 From: rickiepark Date: Thu, 21 Dec 2017 16:03:37 +0900 Subject: [PATCH 19/42] add gitignore, environment.yml --- .gitignore | 2 ++ environment.yml | 16 ++++++++++++++++ 2 files changed, 18 insertions(+) create mode 100644 environment.yml diff --git a/.gitignore b/.gitignore index c77a27e..b8f995c 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,5 @@ my_* datasets/words datasets/flowers datasets/spam +*.gz +datasets/mnist/train-labels-idx1-ubyte diff --git a/environment.yml b/environment.yml new file mode 100644 index 0000000..d930bee --- /dev/null +++ b/environment.yml @@ -0,0 +1,16 @@ +name: handson-ml +dependencies: +- python=3.5 +- jupyter +- matplotlib +- numexpr +- numpy +- pandas +- Pillow +- psutil +- scikit-learn +- scipy +- sympy +- pip: + - tensorflow + - watermark From f558bf43e52a62c65ac5c36c613f11923b06e4ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Geron?= Date: Fri, 5 Jan 2018 14:36:11 +0100 Subject: [PATCH 20/42] Upgrade to latest pandas version, update resampling API --- tools_pandas.ipynb | 923 +++++++++++++++++---------------------------- 1 file changed, 340 insertions(+), 583 deletions(-) diff --git a/tools_pandas.ipynb b/tools_pandas.ipynb index 379443e..6580f20 100644 --- a/tools_pandas.ipynb +++ b/tools_pandas.ipynb @@ -23,9 +23,7 @@ { "cell_type": "code", "execution_count": 1, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "from __future__ import division, print_function, unicode_literals" @@ -41,9 +39,7 @@ { "cell_type": "code", "execution_count": 2, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "import pandas as pd" @@ -71,9 +67,7 @@ { "cell_type": "code", "execution_count": 3, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "s = pd.Series([2,-1,3,5])\n", @@ -91,9 +85,7 @@ { "cell_type": "code", "execution_count": 4, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", @@ -110,9 +102,7 @@ { "cell_type": "code", "execution_count": 5, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "s + [1000,2000,3000,4000]" @@ -128,9 +118,7 @@ { "cell_type": "code", "execution_count": 6, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "s + 1000" @@ -146,9 +134,7 @@ { "cell_type": "code", "execution_count": 7, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "s < 0" @@ -165,9 +151,7 @@ { "cell_type": "code", "execution_count": 8, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "s2 = pd.Series([68, 83, 112, 68], index=[\"alice\", \"bob\", \"charles\", \"darwin\"])\n", @@ -184,9 +168,7 @@ { "cell_type": "code", "execution_count": 9, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "s2[\"bob\"]" @@ -202,9 +184,7 @@ { "cell_type": "code", "execution_count": 10, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "s2[1]" @@ -220,9 +200,7 @@ { "cell_type": "code", "execution_count": 11, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "s2.loc[\"bob\"]" @@ -231,9 +209,7 @@ { "cell_type": "code", "execution_count": 12, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "s2.iloc[1]" @@ -249,9 +225,7 @@ { "cell_type": "code", 
"execution_count": 13, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "s2.iloc[1:3]" @@ -267,9 +241,7 @@ { "cell_type": "code", "execution_count": 14, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "surprise = pd.Series([1000, 1001, 1002, 1003])\n", @@ -279,9 +251,7 @@ { "cell_type": "code", "execution_count": 15, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "surprise_slice = surprise[2:]\n", @@ -298,9 +268,7 @@ { "cell_type": "code", "execution_count": 16, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "try:\n", @@ -319,9 +287,7 @@ { "cell_type": "code", "execution_count": 17, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "surprise_slice.iloc[0]" @@ -338,9 +304,7 @@ { "cell_type": "code", "execution_count": 18, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "weights = {\"alice\": 68, \"bob\": 83, \"colin\": 86, \"darwin\": 68}\n", @@ -358,9 +322,7 @@ { "cell_type": "code", "execution_count": 19, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "s4 = pd.Series(weights, index = [\"colin\", \"alice\"])\n", @@ -378,9 +340,7 @@ { "cell_type": "code", "execution_count": 20, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "print(s2.keys())\n", @@ -401,9 +361,7 @@ { "cell_type": "code", "execution_count": 21, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "s5 = pd.Series([1000,1000,1000,1000])\n", @@ -431,9 +389,7 @@ { "cell_type": "code", "execution_count": 22, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "meaning = pd.Series(42, [\"life\", \"universe\", \"everything\"])\n", @@ -451,9 +407,7 @@ { "cell_type": "code", "execution_count": 23, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "s6 = pd.Series([83, 68], index=[\"bob\", \"alice\"], name=\"weights\")\n", @@ -465,14 +419,13 @@ "metadata": {}, "source": [ "## Plotting a `Series`\n", - "Pandas makes it easy to plot `Series` data using matplotlib (for more details on matplotlib, check out the [matplotlib tutorial](tools_matplotlib.ipynb)). Just import matplotlib and call the `plot` method:" + "Pandas makes it easy to plot `Series` data using matplotlib (for more details on matplotlib, check out the [matplotlib tutorial](tools_matplotlib.ipynb)). Just import matplotlib and call the `plot()` method:" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { - "collapsed": false, "scrolled": true }, "outputs": [], @@ -504,15 +457,13 @@ "* it can handle timezones.\n", "\n", "## Time range\n", - "Let's start by creating a time series using `timerange`. This returns a `DatetimeIndex` containing one datetime per hour for 12 hours starting on October 29th 2016 at 5:30pm." + "Let's start by creating a time series using `pd.date_range()`. This returns a `DatetimeIndex` containing one datetime per hour for 12 hours starting on October 29th 2016 at 5:30pm." 
] }, { "cell_type": "code", "execution_count": 25, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "dates = pd.date_range('2016/10/29 5:30pm', periods=12, freq='H')\n", @@ -529,9 +480,7 @@ { "cell_type": "code", "execution_count": 26, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "temp_series = pd.Series(temperatures, dates)\n", @@ -548,9 +497,7 @@ { "cell_type": "code", "execution_count": 27, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "temp_series.plot(kind=\"bar\")\n", @@ -564,15 +511,13 @@ "metadata": {}, "source": [ "## Resampling\n", - "Pandas let's us resample a time series very simply. Just call the `resample` method and specify a new frequency:" + "Pandas lets us resample a time series very simply. Just call the `resample()` method and specify a new frequency:" ] }, { "cell_type": "code", "execution_count": 28, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "temp_series_freq_2H = temp_series.resample(\"2H\")\n", @@ -583,15 +528,29 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Let's take a look at the result:" + "The resampling operation is actually a deferred operation, which is why we did not get a `Series` object, but a `DatetimeIndexResampler` object instead. To actually perform the resampling operation, we can simply call the `mean()` method: Pandas will compute the mean of every pair of consecutive hours:" ] }, { "cell_type": "code", "execution_count": 29, - "metadata": { - "collapsed": false - }, + "metadata": {}, + "outputs": [], + "source": [ + "temp_series_freq_2H = temp_series_freq_2H.mean()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's plot the result:" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, "outputs": [], "source": [ "temp_series_freq_2H.plot(kind=\"bar\")\n", @@ -602,18 +561,33 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Note how the values have automatically been aggregated into 2-hour periods. If we look at the 6-8pm period, for example, we had a value of `5.1` at 6:30pm, and `6.1` at 7:30pm. After resampling, we just have one value of `5.6`, which is the mean of `5.1` and `6.1`. Computing the mean is the default behavior, but it is also possible to use a different aggregation function, for example we can decide to keep the minimum value of each period:" + "Note how the values have automatically been aggregated into 2-hour periods. If we look at the 6-8pm period, for example, we had a value of `5.1` at 6:30pm, and `6.1` at 7:30pm. After resampling, we just have one value of `5.6`, which is the mean of `5.1` and `6.1`. 
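# For readers upgrading from older pandas (this is the gist of the API change
# in this patch): aggregation used to be passed to resample() directly via the
# `how` argument, e.g. temp_series.resample("2H", how="mean"); that argument
# was deprecated (and removed in later versions), so the aggregation method is
# now called on the resampler object returned by resample():
temp_series_freq_2H = temp_series.resample("2H").mean()   # current API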
Rather than computing the mean, we could have used any other aggregation function, for example we can decide to keep the minimum value of each period:" ] }, { "cell_type": "code", - "execution_count": 30, - "metadata": { - "collapsed": false - }, + "execution_count": 31, + "metadata": {}, "outputs": [], "source": [ - "temp_series_freq_2H = temp_series.resample(\"2H\", how=np.min)\n", + "temp_series_freq_2H = temp_series.resample(\"2H\").min()\n", + "temp_series_freq_2H" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Or, equivalently, we could use the `apply()` method instead:" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [], + "source": [ + "temp_series_freq_2H = temp_series.resample(\"2H\").apply(np.min)\n", "temp_series_freq_2H" ] }, @@ -627,13 +601,11 @@ }, { "cell_type": "code", - "execution_count": 31, - "metadata": { - "collapsed": false - }, + "execution_count": 33, + "metadata": {}, "outputs": [], "source": [ - "temp_series_freq_15min = temp_series.resample(\"15Min\")\n", + "temp_series_freq_15min = temp_series.resample(\"15Min\").mean()\n", "temp_series_freq_15min.head(n=10) # `head` displays the top n values" ] }, @@ -641,14 +613,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "One solution is to fill the gaps by interpolating. We just call the `interpolate` method. The default is to use linear interpolation, but we can also select another method, such as cubic interpolation:" + "One solution is to fill the gaps by interpolating. We just call the `interpolate()` method. The default is to use linear interpolation, but we can also select another method, such as cubic interpolation:" ] }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 34, "metadata": { - "collapsed": false, "scrolled": true }, "outputs": [], @@ -659,10 +630,8 @@ }, { "cell_type": "code", - "execution_count": 33, - "metadata": { - "collapsed": false - }, + "execution_count": 35, + "metadata": {}, "outputs": [], "source": [ "temp_series.plot(label=\"Period: 1 hour\")\n", @@ -676,15 +645,13 @@ "metadata": {}, "source": [ "## Timezones\n", - "By default datetimes are *naive*: they are not aware of timezones, so 2016-10-30 02:30 might mean October 30th 2016 at 2:30am in Paris or in New York. We can make datetimes timezone *aware* by calling the `tz_localize` method:" + "By default datetimes are *naive*: they are not aware of timezones, so 2016-10-30 02:30 might mean October 30th 2016 at 2:30am in Paris or in New York. 
We can make datetimes timezone *aware* by calling the `tz_localize()` method:" ] }, { "cell_type": "code", - "execution_count": 34, - "metadata": { - "collapsed": false - }, + "execution_count": 36, + "metadata": {}, "outputs": [], "source": [ "temp_series_ny = temp_series.tz_localize(\"America/New_York\")\n", @@ -702,10 +669,8 @@ }, { "cell_type": "code", - "execution_count": 35, - "metadata": { - "collapsed": false - }, + "execution_count": 37, + "metadata": {}, "outputs": [], "source": [ "temp_series_paris = temp_series_ny.tz_convert(\"Europe/Paris\")\n", @@ -721,10 +686,8 @@ }, { "cell_type": "code", - "execution_count": 36, - "metadata": { - "collapsed": false - }, + "execution_count": 38, + "metadata": {}, "outputs": [], "source": [ "temp_series_paris_naive = temp_series_paris.tz_localize(None)\n", @@ -740,10 +703,8 @@ }, { "cell_type": "code", - "execution_count": 37, - "metadata": { - "collapsed": false - }, + "execution_count": 39, + "metadata": {}, "outputs": [], "source": [ "try:\n", @@ -762,10 +723,8 @@ }, { "cell_type": "code", - "execution_count": 38, - "metadata": { - "collapsed": false - }, + "execution_count": 40, + "metadata": {}, "outputs": [], "source": [ "temp_series_paris_naive.tz_localize(\"Europe/Paris\", ambiguous=\"infer\")" @@ -776,15 +735,13 @@ "metadata": {}, "source": [ "## Periods\n", - "The `period_range` function returns a `PeriodIndex` instead of a `DatetimeIndex`. For example, let's get all quarters in 2016 and 2017:" + "The `pd.period_range()` function returns a `PeriodIndex` instead of a `DatetimeIndex`. For example, let's get all quarters in 2016 and 2017:" ] }, { "cell_type": "code", - "execution_count": 39, - "metadata": { - "collapsed": false - }, + "execution_count": 41, + "metadata": {}, "outputs": [], "source": [ "quarters = pd.period_range('2016Q1', periods=8, freq='Q')\n", @@ -800,10 +757,8 @@ }, { "cell_type": "code", - "execution_count": 40, - "metadata": { - "collapsed": false - }, + "execution_count": 42, + "metadata": {}, "outputs": [], "source": [ "quarters + 3" @@ -813,15 +768,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The `asfreq` method lets us change the frequency of the `PeriodIndex`. All periods are lengthened or shortened accordingly. For example, let's convert all the quarterly periods to monthly periods (zooming in):" + "The `asfreq()` method lets us change the frequency of the `PeriodIndex`. All periods are lengthened or shortened accordingly. 
For example, let's convert all the quarterly periods to monthly periods (zooming in):" ] }, { "cell_type": "code", - "execution_count": 41, - "metadata": { - "collapsed": false - }, + "execution_count": 43, + "metadata": {}, "outputs": [], "source": [ "quarters.asfreq(\"M\")" @@ -836,10 +789,8 @@ }, { "cell_type": "code", - "execution_count": 42, - "metadata": { - "collapsed": false - }, + "execution_count": 44, + "metadata": {}, "outputs": [], "source": [ "quarters.asfreq(\"M\", how=\"start\")" @@ -854,10 +805,8 @@ }, { "cell_type": "code", - "execution_count": 43, - "metadata": { - "collapsed": false - }, + "execution_count": 45, + "metadata": {}, "outputs": [], "source": [ "quarters.asfreq(\"A\")" @@ -872,10 +821,8 @@ }, { "cell_type": "code", - "execution_count": 44, - "metadata": { - "collapsed": false - }, + "execution_count": 46, + "metadata": {}, "outputs": [], "source": [ "quarterly_revenue = pd.Series([300, 320, 290, 390, 320, 360, 310, 410], index = quarters)\n", @@ -884,10 +831,8 @@ }, { "cell_type": "code", - "execution_count": 45, - "metadata": { - "collapsed": false - }, + "execution_count": 47, + "metadata": {}, "outputs": [], "source": [ "quarterly_revenue.plot(kind=\"line\")\n", @@ -903,10 +848,8 @@ }, { "cell_type": "code", - "execution_count": 46, - "metadata": { - "collapsed": false - }, + "execution_count": 48, + "metadata": {}, "outputs": [], "source": [ "last_hours = quarterly_revenue.to_timestamp(how=\"end\", freq=\"H\")\n", @@ -922,10 +865,8 @@ }, { "cell_type": "code", - "execution_count": 47, - "metadata": { - "collapsed": false - }, + "execution_count": 49, + "metadata": {}, "outputs": [], "source": [ "last_hours.to_period()" @@ -940,10 +881,8 @@ }, { "cell_type": "code", - "execution_count": 48, - "metadata": { - "collapsed": false - }, + "execution_count": 50, + "metadata": {}, "outputs": [], "source": [ "months_2016 = pd.period_range(\"2016\", periods=12, freq=\"M\")\n", @@ -965,10 +904,8 @@ }, { "cell_type": "code", - "execution_count": 49, - "metadata": { - "collapsed": false - }, + "execution_count": 51, + "metadata": {}, "outputs": [], "source": [ "people_dict = {\n", @@ -1001,10 +938,8 @@ }, { "cell_type": "code", - "execution_count": 50, - "metadata": { - "collapsed": false - }, + "execution_count": 52, + "metadata": {}, "outputs": [], "source": [ "people[\"birthyear\"]" @@ -1019,10 +954,8 @@ }, { "cell_type": "code", - "execution_count": 51, - "metadata": { - "collapsed": false - }, + "execution_count": 53, + "metadata": {}, "outputs": [], "source": [ "people[[\"birthyear\", \"hobby\"]]" @@ -1037,10 +970,8 @@ }, { "cell_type": "code", - "execution_count": 52, - "metadata": { - "collapsed": false - }, + "execution_count": 54, + "metadata": {}, "outputs": [], "source": [ "d2 = pd.DataFrame(\n", @@ -1060,10 +991,8 @@ }, { "cell_type": "code", - "execution_count": 53, - "metadata": { - "collapsed": false - }, + "execution_count": 55, + "metadata": {}, "outputs": [], "source": [ "values = [\n", @@ -1088,16 +1017,14 @@ }, { "cell_type": "code", - "execution_count": 54, - "metadata": { - "collapsed": false - }, + "execution_count": 56, + "metadata": {}, "outputs": [], "source": [ "masked_array = np.ma.asarray(values, dtype=np.object)\n", "masked_array[(0, 2), (1, 2)] = np.ma.masked\n", "d3 = pd.DataFrame(\n", - " values,\n", + " masked_array,\n", " columns=[\"birthyear\", \"children\", \"hobby\", \"weight\"],\n", " index=[\"alice\", \"bob\", \"charles\"]\n", " )\n", @@ -1113,10 +1040,8 @@ }, { "cell_type": "code", - "execution_count": 55, - "metadata": { - 
"collapsed": false - }, + "execution_count": 57, + "metadata": {}, "outputs": [], "source": [ "d4 = pd.DataFrame(\n", @@ -1136,10 +1061,8 @@ }, { "cell_type": "code", - "execution_count": 56, - "metadata": { - "collapsed": false - }, + "execution_count": 58, + "metadata": {}, "outputs": [], "source": [ "people = pd.DataFrame({\n", @@ -1161,10 +1084,8 @@ }, { "cell_type": "code", - "execution_count": 57, - "metadata": { - "collapsed": false - }, + "execution_count": 59, + "metadata": {}, "outputs": [], "source": [ "d5 = pd.DataFrame(\n", @@ -1191,10 +1112,8 @@ }, { "cell_type": "code", - "execution_count": 58, - "metadata": { - "collapsed": false - }, + "execution_count": 60, + "metadata": {}, "outputs": [], "source": [ "d5[\"public\"]" @@ -1202,13 +1121,11 @@ }, { "cell_type": "code", - "execution_count": 59, - "metadata": { - "collapsed": false - }, + "execution_count": 61, + "metadata": {}, "outputs": [], "source": [ - "d5[\"public\", \"hobby\"] # Same result as d4[\"public\"][\"hobby\"]" + "d5[\"public\", \"hobby\"] # Same result as d5[\"public\"][\"hobby\"]" ] }, { @@ -1221,10 +1138,8 @@ }, { "cell_type": "code", - "execution_count": 60, - "metadata": { - "collapsed": false - }, + "execution_count": 62, + "metadata": {}, "outputs": [], "source": [ "d5" @@ -1234,15 +1149,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "There are two levels of columns, and two levels of indices. We can drop a column level by calling `droplevel` (the same goes for indices):" + "There are two levels of columns, and two levels of indices. We can drop a column level by calling `droplevel()` (the same goes for indices):" ] }, { "cell_type": "code", - "execution_count": 61, - "metadata": { - "collapsed": false - }, + "execution_count": 63, + "metadata": {}, "outputs": [], "source": [ "d5.columns = d5.columns.droplevel(level = 0)\n", @@ -1259,10 +1172,8 @@ }, { "cell_type": "code", - "execution_count": 62, - "metadata": { - "collapsed": false - }, + "execution_count": 64, + "metadata": {}, "outputs": [], "source": [ "d6 = d5.T\n", @@ -1274,15 +1185,13 @@ "metadata": {}, "source": [ "## Stacking and unstacking levels\n", - "Calling the `stack` method will push the lowest column level after the lowest index:" + "Calling the `stack()` method will push the lowest column level after the lowest index:" ] }, { "cell_type": "code", - "execution_count": 63, - "metadata": { - "collapsed": false - }, + "execution_count": 65, + "metadata": {}, "outputs": [], "source": [ "d7 = d6.stack()\n", @@ -1295,15 +1204,13 @@ "source": [ "Note that many `NaN` values appeared. This makes sense because many new combinations did not exist before (eg. there was no `bob` in `London`).\n", "\n", - "Calling `unstack` will do the reverse, once again creating many `NaN` values." + "Calling `unstack()` will do the reverse, once again creating many `NaN` values." ] }, { "cell_type": "code", - "execution_count": 64, - "metadata": { - "collapsed": false - }, + "execution_count": 66, + "metadata": {}, "outputs": [], "source": [ "d8 = d7.unstack()\n", @@ -1319,10 +1226,8 @@ }, { "cell_type": "code", - "execution_count": 65, - "metadata": { - "collapsed": false - }, + "execution_count": 67, + "metadata": {}, "outputs": [], "source": [ "d9 = d8.unstack()\n", @@ -1333,14 +1238,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The `stack` and `unstack` methods let you select the `level` to stack/unstack. 
You can even stack/unstack multiple levels at once:" + "The `stack()` and `unstack()` methods let you select the `level` to stack/unstack. You can even stack/unstack multiple levels at once:" ] }, { "cell_type": "code", - "execution_count": 66, + "execution_count": 68, "metadata": { - "collapsed": false, "scrolled": true }, "outputs": [], @@ -1354,7 +1258,7 @@ "metadata": {}, "source": [ "## Most methods return modified copies\n", - "As you may have noticed, the `stack` and `unstack` methods do not modify the object they apply to. Instead, they work on a copy and return that copy. This is true of most methods in pandas." + "As you may have noticed, the `stack()` and `unstack()` methods do not modify the object they apply to. Instead, they work on a copy and return that copy. This is true of most methods in pandas." ] }, { @@ -1367,10 +1271,8 @@ }, { "cell_type": "code", - "execution_count": 67, - "metadata": { - "collapsed": false - }, + "execution_count": 69, + "metadata": {}, "outputs": [], "source": [ "people" @@ -1380,15 +1282,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The `loc` attribute lets you access rows instead of columns. The result is `Series` object in which the `DataFrame`'s column names are mapped to row index labels:" + "The `loc` attribute lets you access rows instead of columns. The result is a `Series` object in which the `DataFrame`'s column names are mapped to row index labels:" ] }, { "cell_type": "code", - "execution_count": 68, - "metadata": { - "collapsed": false - }, + "execution_count": 70, + "metadata": {}, "outputs": [], "source": [ "people.loc[\"charles\"]" @@ -1403,10 +1303,8 @@ }, { "cell_type": "code", - "execution_count": 69, - "metadata": { - "collapsed": false - }, + "execution_count": 71, + "metadata": {}, "outputs": [], "source": [ "people.iloc[2]" @@ -1421,10 +1319,8 @@ }, { "cell_type": "code", - "execution_count": 70, - "metadata": { - "collapsed": false - }, + "execution_count": 72, + "metadata": {}, "outputs": [], "source": [ "people.iloc[1:3]" @@ -1439,10 +1335,8 @@ }, { "cell_type": "code", - "execution_count": 71, - "metadata": { - "collapsed": false - }, + "execution_count": 73, + "metadata": {}, "outputs": [], "source": [ "people[np.array([True, False, True])]" @@ -1457,10 +1351,8 @@ }, { "cell_type": "code", - "execution_count": 72, - "metadata": { - "collapsed": false - }, + "execution_count": 74, + "metadata": {}, "outputs": [], "source": [ "people[people[\"birthyear\"] < 1990]" @@ -1476,10 +1368,8 @@ }, { "cell_type": "code", - "execution_count": 73, - "metadata": { - "collapsed": false - }, + "execution_count": 75, + "metadata": {}, "outputs": [], "source": [ "people" @@ -1487,13 +1377,11 @@ }, { "cell_type": "code", - "execution_count": 74, - "metadata": { - "collapsed": false - }, + "execution_count": 76, + "metadata": {}, "outputs": [], "source": [ - "people[\"age\"] = 2016 - people[\"birthyear\"] # adds a new column \"age\"\n", + "people[\"age\"] = 2018 - people[\"birthyear\"] # adds a new column \"age\"\n", "people[\"over 30\"] = people[\"age\"] > 30 # adds another column \"over 30\"\n", "birthyears = people.pop(\"birthyear\")\n", "del people[\"children\"]\n", @@ -1503,10 +1391,8 @@ }, { "cell_type": "code", - "execution_count": 75, - "metadata": { - "collapsed": false - }, + "execution_count": 77, + "metadata": {}, "outputs": [], "source": [ "birthyears" @@ -1521,10 +1407,8 @@ }, { "cell_type": "code", - "execution_count": 76, - "metadata": { - "collapsed": false - }, + "execution_count": 78, + "metadata": {}, 
"outputs": [], "source": [ "people[\"pets\"] = pd.Series({\"bob\": 0, \"charles\": 5, \"eugene\":1}) # alice is missing, eugene is ignored\n", @@ -1535,15 +1419,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "When adding a new column, it is added at the end (on the right) by default. You can also insert a column anywhere else using the `insert` method:" + "When adding a new column, it is added at the end (on the right) by default. You can also insert a column anywhere else using the `insert()` method:" ] }, { "cell_type": "code", - "execution_count": 77, - "metadata": { - "collapsed": false - }, + "execution_count": 79, + "metadata": {}, "outputs": [], "source": [ "people.insert(1, \"height\", [172, 181, 185])\n", @@ -1555,15 +1437,13 @@ "metadata": {}, "source": [ "## Assigning new columns\n", - "You can also create new columns by calling the `assign` method. Note that this returns a new `DataFrame` object, the original is not modified:" + "You can also create new columns by calling the `assign()` method. Note that this returns a new `DataFrame` object, the original is not modified:" ] }, { "cell_type": "code", - "execution_count": 78, - "metadata": { - "collapsed": false - }, + "execution_count": 80, + "metadata": {}, "outputs": [], "source": [ "people.assign(\n", @@ -1581,10 +1461,8 @@ }, { "cell_type": "code", - "execution_count": 79, - "metadata": { - "collapsed": false - }, + "execution_count": 81, + "metadata": {}, "outputs": [], "source": [ "try:\n", @@ -1605,10 +1483,8 @@ }, { "cell_type": "code", - "execution_count": 80, - "metadata": { - "collapsed": false - }, + "execution_count": 82, + "metadata": {}, "outputs": [], "source": [ "d6 = people.assign(body_mass_index = people[\"weight\"] / (people[\"height\"] / 100) ** 2)\n", @@ -1624,10 +1500,8 @@ }, { "cell_type": "code", - "execution_count": 81, - "metadata": { - "collapsed": false - }, + "execution_count": 83, + "metadata": {}, "outputs": [], "source": [ "try:\n", @@ -1643,15 +1517,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "But fear not, there is a simple solution. You can pass a function to the `assign` method (typically a `lambda` function), and this function will be called with the `DataFrame` as a parameter:" + "But fear not, there is a simple solution. You can pass a function to the `assign()` method (typically a `lambda` function), and this function will be called with the `DataFrame` as a parameter:" ] }, { "cell_type": "code", - "execution_count": 82, - "metadata": { - "collapsed": false - }, + "execution_count": 84, + "metadata": {}, "outputs": [], "source": [ "(people\n", @@ -1677,10 +1549,8 @@ }, { "cell_type": "code", - "execution_count": 83, - "metadata": { - "collapsed": false - }, + "execution_count": 85, + "metadata": {}, "outputs": [], "source": [ "people.eval(\"weight / (height/100) ** 2 > 25\")" @@ -1690,18 +1560,16 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Assignment expressions are also supported, and contrary to the `assign` method, this does not create a copy of the `DataFrame`, instead it directly modifies it:" + "Assignment expressions are also supported. 
Let's set `inplace=True` to directly modify the `DataFrame` rather than getting a modified copy:" ] }, { "cell_type": "code", - "execution_count": 84, - "metadata": { - "collapsed": false - }, + "execution_count": 86, + "metadata": {}, "outputs": [], "source": [ - "people.eval(\"body_mass_index = weight / (height/100) ** 2\")\n", + "people.eval(\"body_mass_index = weight / (height/100) ** 2\", inplace=True)\n", "people" ] }, @@ -1714,14 +1582,12 @@ }, { "cell_type": "code", - "execution_count": 85, - "metadata": { - "collapsed": false - }, + "execution_count": 87, + "metadata": {}, "outputs": [], "source": [ "overweight_threshold = 30\n", - "people.eval(\"overweight = body_mass_index > @overweight_threshold\")\n", + "people.eval(\"overweight = body_mass_index > @overweight_threshold\", inplace=True)\n", "people" ] }, @@ -1730,15 +1596,13 @@ "metadata": {}, "source": [ "## Querying a `DataFrame`\n", - "The `query` method lets you filter a `DataFrame` based on a query expression:" + "The `query()` method lets you filter a `DataFrame` based on a query expression:" ] }, { "cell_type": "code", - "execution_count": 86, - "metadata": { - "collapsed": false - }, + "execution_count": 88, + "metadata": {}, "outputs": [], "source": [ "people.query(\"age > 30 and pets == 0\")" @@ -1754,10 +1618,8 @@ }, { "cell_type": "code", - "execution_count": 87, - "metadata": { - "collapsed": false - }, + "execution_count": 89, + "metadata": {}, "outputs": [], "source": [ "people.sort_index(ascending=False)" @@ -1772,10 +1634,8 @@ }, { "cell_type": "code", - "execution_count": 88, - "metadata": { - "collapsed": false - }, + "execution_count": 90, + "metadata": {}, "outputs": [], "source": [ "people.sort_index(axis=1, inplace=True)\n", @@ -1791,10 +1651,8 @@ }, { "cell_type": "code", - "execution_count": 89, - "metadata": { - "collapsed": false - }, + "execution_count": 91, + "metadata": {}, "outputs": [], "source": [ "people.sort_values(by=\"age\", inplace=True)\n", @@ -1813,10 +1671,8 @@ }, { "cell_type": "code", - "execution_count": 90, - "metadata": { - "collapsed": false - }, + "execution_count": 92, + "metadata": {}, "outputs": [], "source": [ "people.plot(kind = \"line\", x = \"body_mass_index\", y = [\"height\", \"weight\"])\n", @@ -1827,14 +1683,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "You can pass extra arguments supported by matplotlib's functions. For example, we can create scatterplot and pass it a list of sizes using the `s` argument of matplotlib's `scatter` function:" + "You can pass extra arguments supported by matplotlib's functions. 
For example, we can create scatterplot and pass it a list of sizes using the `s` argument of matplotlib's `scatter()` function:" ] }, { "cell_type": "code", - "execution_count": 91, + "execution_count": 93, "metadata": { - "collapsed": false, "scrolled": true }, "outputs": [], @@ -1860,10 +1715,8 @@ }, { "cell_type": "code", - "execution_count": 92, - "metadata": { - "collapsed": false - }, + "execution_count": 94, + "metadata": {}, "outputs": [], "source": [ "grades_array = np.array([[8,8,9],[10,9,9],[4, 8, 2], [9, 10, 10]])\n", @@ -1880,10 +1733,8 @@ }, { "cell_type": "code", - "execution_count": 93, - "metadata": { - "collapsed": false - }, + "execution_count": 95, + "metadata": {}, "outputs": [], "source": [ "np.sqrt(grades)" @@ -1898,10 +1749,8 @@ }, { "cell_type": "code", - "execution_count": 94, - "metadata": { - "collapsed": false - }, + "execution_count": 96, + "metadata": {}, "outputs": [], "source": [ "grades + 1" @@ -1916,9 +1765,8 @@ }, { "cell_type": "code", - "execution_count": 95, + "execution_count": 97, "metadata": { - "collapsed": false, "scrolled": false }, "outputs": [], @@ -1935,10 +1783,8 @@ }, { "cell_type": "code", - "execution_count": 96, - "metadata": { - "collapsed": false - }, + "execution_count": 98, + "metadata": {}, "outputs": [], "source": [ "grades.mean()" @@ -1953,10 +1799,8 @@ }, { "cell_type": "code", - "execution_count": 97, - "metadata": { - "collapsed": false - }, + "execution_count": 99, + "metadata": {}, "outputs": [], "source": [ "(grades > 5).all()" @@ -1971,10 +1815,8 @@ }, { "cell_type": "code", - "execution_count": 98, - "metadata": { - "collapsed": false - }, + "execution_count": 100, + "metadata": {}, "outputs": [], "source": [ "(grades > 5).all(axis = 1)" @@ -1989,10 +1831,8 @@ }, { "cell_type": "code", - "execution_count": 99, - "metadata": { - "collapsed": false - }, + "execution_count": 101, + "metadata": {}, "outputs": [], "source": [ "(grades == 10).any(axis = 1)" @@ -2007,10 +1847,8 @@ }, { "cell_type": "code", - "execution_count": 100, - "metadata": { - "collapsed": false - }, + "execution_count": 102, + "metadata": {}, "outputs": [], "source": [ "grades - grades.mean() # equivalent to: grades - [7.75, 8.75, 7.50]" @@ -2025,10 +1863,8 @@ }, { "cell_type": "code", - "execution_count": 101, - "metadata": { - "collapsed": false - }, + "execution_count": 103, + "metadata": {}, "outputs": [], "source": [ "pd.DataFrame([[7.75, 8.75, 7.50]]*4, index=grades.index, columns=grades.columns)" @@ -2043,9 +1879,8 @@ }, { "cell_type": "code", - "execution_count": 102, + "execution_count": 104, "metadata": { - "collapsed": false, "scrolled": true }, "outputs": [], @@ -2063,10 +1898,8 @@ }, { "cell_type": "code", - "execution_count": 103, - "metadata": { - "collapsed": false - }, + "execution_count": 105, + "metadata": {}, "outputs": [], "source": [ "bonus_array = np.array([[0,np.nan,2],[np.nan,1,0],[0, 1, 0], [3, 3, 0]])\n", @@ -2076,9 +1909,8 @@ }, { "cell_type": "code", - "execution_count": 104, + "execution_count": 106, "metadata": { - "collapsed": false, "scrolled": true }, "outputs": [], @@ -2095,14 +1927,13 @@ "## Handling missing data\n", "Dealing with missing data is a frequent task when working with real life data. Pandas offers a few tools to handle missing data.\n", " \n", - "Let's try to fix the problem above. For example, we can decide that missing data should result in a zero, instead of `NaN`. We can replace all `NaN` values by a any value using the `fillna` method:" + "Let's try to fix the problem above. 
For example, we can decide that missing data should result in a zero, instead of `NaN`. We can replace all `NaN` values by a any value using the `fillna()` method:" ] }, { "cell_type": "code", - "execution_count": 105, + "execution_count": 107, "metadata": { - "collapsed": false, "scrolled": true }, "outputs": [], @@ -2119,10 +1950,8 @@ }, { "cell_type": "code", - "execution_count": 106, - "metadata": { - "collapsed": false - }, + "execution_count": 108, + "metadata": {}, "outputs": [], "source": [ "fixed_bonus_points = bonus_points.fillna(0)\n", @@ -2142,10 +1971,8 @@ }, { "cell_type": "code", - "execution_count": 107, - "metadata": { - "collapsed": false - }, + "execution_count": 109, + "metadata": {}, "outputs": [], "source": [ "bonus_points" @@ -2160,9 +1987,8 @@ }, { "cell_type": "code", - "execution_count": 108, + "execution_count": 110, "metadata": { - "collapsed": false, "scrolled": false }, "outputs": [], @@ -2179,10 +2005,8 @@ }, { "cell_type": "code", - "execution_count": 109, - "metadata": { - "collapsed": false - }, + "execution_count": 111, + "metadata": {}, "outputs": [], "source": [ "better_bonus_points = bonus_points.copy()\n", @@ -2201,10 +2025,8 @@ }, { "cell_type": "code", - "execution_count": 110, - "metadata": { - "collapsed": false - }, + "execution_count": 112, + "metadata": {}, "outputs": [], "source": [ "grades + better_bonus_points" @@ -2219,9 +2041,8 @@ }, { "cell_type": "code", - "execution_count": 111, + "execution_count": 113, "metadata": { - "collapsed": false, "scrolled": true }, "outputs": [], @@ -2235,15 +2056,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "There's not much we can do about December and Colin: it's bad enough that we are making up bonus points, but we can't reasonably make up grades (well I guess some teachers probably do). So let's call the `dropna` method to get rid of rows that are full of `NaN`s:" + "There's not much we can do about December and Colin: it's bad enough that we are making up bonus points, but we can't reasonably make up grades (well I guess some teachers probably do). 
So let's call the `dropna()` method to get rid of rows that are full of `NaN`s:" ] }, { "cell_type": "code", - "execution_count": 112, - "metadata": { - "collapsed": false - }, + "execution_count": 114, + "metadata": {}, "outputs": [], "source": [ "final_grades_clean = final_grades.dropna(how=\"all\")\n", @@ -2259,10 +2078,8 @@ }, { "cell_type": "code", - "execution_count": 113, - "metadata": { - "collapsed": false - }, + "execution_count": 115, + "metadata": {}, "outputs": [], "source": [ "final_grades_clean = final_grades_clean.dropna(axis=1, how=\"all\")\n", @@ -2281,9 +2098,8 @@ }, { "cell_type": "code", - "execution_count": 114, + "execution_count": 116, "metadata": { - "collapsed": false, "scrolled": true }, "outputs": [], @@ -2301,10 +2117,8 @@ }, { "cell_type": "code", - "execution_count": 115, - "metadata": { - "collapsed": false - }, + "execution_count": 117, + "metadata": {}, "outputs": [], "source": [ "grouped_grades = final_grades.groupby(\"hobby\")\n", @@ -2320,10 +2134,8 @@ }, { "cell_type": "code", - "execution_count": 116, - "metadata": { - "collapsed": false - }, + "execution_count": 118, + "metadata": {}, "outputs": [], "source": [ "grouped_grades.mean()" @@ -2346,10 +2158,8 @@ }, { "cell_type": "code", - "execution_count": 117, - "metadata": { - "collapsed": false - }, + "execution_count": 119, + "metadata": {}, "outputs": [], "source": [ "bonus_points" @@ -2357,10 +2167,8 @@ }, { "cell_type": "code", - "execution_count": 118, - "metadata": { - "collapsed": false - }, + "execution_count": 120, + "metadata": {}, "outputs": [], "source": [ "more_grades = final_grades_clean.stack().reset_index()\n", @@ -2373,15 +2181,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Now we can call the `pivot_table` function for this `DataFrame`, asking to group by the `name` column. By default, `pivot_table` computes the `mean` of each numeric column:" + "Now we can call the `pd.pivot_table()` function for this `DataFrame`, asking to group by the `name` column. 
By default, `pivot_table()` computes the mean of each numeric column:" ] }, { "cell_type": "code", - "execution_count": 119, - "metadata": { - "collapsed": false - }, + "execution_count": 121, + "metadata": {}, "outputs": [], "source": [ "pd.pivot_table(more_grades, index=\"name\")" @@ -2391,15 +2197,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We can change the aggregation function by setting the `aggfunc` attribute, and we can also specify the list of columns whose values will be aggregated:" + "We can change the aggregation function by setting the `aggfunc` argument, and we can also specify the list of columns whose values will be aggregated:" ] }, { "cell_type": "code", - "execution_count": 120, - "metadata": { - "collapsed": false - }, + "execution_count": 122, + "metadata": {}, "outputs": [], "source": [ "pd.pivot_table(more_grades, index=\"name\", values=[\"grade\",\"bonus\"], aggfunc=np.max)" @@ -2414,10 +2218,8 @@ }, { "cell_type": "code", - "execution_count": 121, - "metadata": { - "collapsed": false - }, + "execution_count": 123, + "metadata": {}, "outputs": [], "source": [ "pd.pivot_table(more_grades, index=\"name\", values=\"grade\", columns=\"month\", margins=True)" @@ -2432,10 +2234,8 @@ }, { "cell_type": "code", - "execution_count": 122, - "metadata": { - "collapsed": false - }, + "execution_count": 124, + "metadata": {}, "outputs": [], "source": [ "pd.pivot_table(more_grades, index=(\"name\", \"month\"), margins=True)" @@ -2451,9 +2251,8 @@ }, { "cell_type": "code", - "execution_count": 123, + "execution_count": 125, "metadata": { - "collapsed": false, "scrolled": false }, "outputs": [], @@ -2469,14 +2268,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The `head` method returns the top 5 rows:" + "The `head()` method returns the top 5 rows:" ] }, { "cell_type": "code", - "execution_count": 124, + "execution_count": 126, "metadata": { - "collapsed": false, "scrolled": false }, "outputs": [], @@ -2488,15 +2286,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Of course there's also a `tail` function to view the bottom 5 rows. You can pass the number of rows you want:" + "Of course there's also a `tail()` function to view the bottom 5 rows. 
You can pass the number of rows you want:" ] }, { "cell_type": "code", - "execution_count": 125, - "metadata": { - "collapsed": false - }, + "execution_count": 127, + "metadata": {}, "outputs": [], "source": [ "large_df.tail(n=2)" @@ -2506,14 +2302,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The `info` method prints out a summary of each columns contents:" + "The `info()` method prints out a summary of each columns contents:" ] }, { "cell_type": "code", - "execution_count": 126, + "execution_count": 128, "metadata": { - "collapsed": false, "scrolled": false }, "outputs": [], @@ -2525,7 +2320,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Finally, the `describe` method gives a nice overview of the main aggregated values over each column:\n", + "Finally, the `describe()` method gives a nice overview of the main aggregated values over each column:\n", "* `count`: number of non-null (not NaN) values\n", "* `mean`: mean of non-null values\n", "* `std`: [standard deviation](https://en.wikipedia.org/wiki/Standard_deviation) of non-null values\n", @@ -2536,9 +2331,8 @@ }, { "cell_type": "code", - "execution_count": 127, + "execution_count": 129, "metadata": { - "collapsed": false, "scrolled": false }, "outputs": [], @@ -2556,10 +2350,8 @@ }, { "cell_type": "code", - "execution_count": 128, - "metadata": { - "collapsed": false - }, + "execution_count": 130, + "metadata": {}, "outputs": [], "source": [ "my_df = pd.DataFrame(\n", @@ -2580,10 +2372,8 @@ }, { "cell_type": "code", - "execution_count": 129, - "metadata": { - "collapsed": true - }, + "execution_count": 131, + "metadata": {}, "outputs": [], "source": [ "my_df.to_csv(\"my_df.csv\")\n", @@ -2600,10 +2390,8 @@ }, { "cell_type": "code", - "execution_count": 130, - "metadata": { - "collapsed": false - }, + "execution_count": 132, + "metadata": {}, "outputs": [], "source": [ "for filename in (\"my_df.csv\", \"my_df.html\", \"my_df.json\"):\n", @@ -2624,10 +2412,8 @@ }, { "cell_type": "code", - "execution_count": 131, - "metadata": { - "collapsed": false - }, + "execution_count": 133, + "metadata": {}, "outputs": [], "source": [ "try:\n", @@ -2646,10 +2432,8 @@ }, { "cell_type": "code", - "execution_count": 132, - "metadata": { - "collapsed": false - }, + "execution_count": 134, + "metadata": {}, "outputs": [], "source": [ "my_df_loaded = pd.read_csv(\"my_df.csv\", index_col=0)\n", @@ -2665,10 +2449,8 @@ }, { "cell_type": "code", - "execution_count": 133, - "metadata": { - "collapsed": false - }, + "execution_count": 135, + "metadata": {}, "outputs": [], "source": [ "us_cities = None\n", @@ -2700,10 +2482,8 @@ }, { "cell_type": "code", - "execution_count": 134, - "metadata": { - "collapsed": false - }, + "execution_count": 136, + "metadata": {}, "outputs": [], "source": [ "city_loc = pd.DataFrame(\n", @@ -2719,10 +2499,8 @@ }, { "cell_type": "code", - "execution_count": 135, - "metadata": { - "collapsed": false - }, + "execution_count": 137, + "metadata": {}, "outputs": [], "source": [ "city_pop = pd.DataFrame(\n", @@ -2739,15 +2517,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Now let's join these `DataFrame`s using the `merge` function:" + "Now let's join these `DataFrame`s using the `merge()` function:" ] }, { "cell_type": "code", - "execution_count": 136, - "metadata": { - "collapsed": false - }, + "execution_count": 138, + "metadata": {}, "outputs": [], "source": [ "pd.merge(left=city_loc, right=city_pop, on=\"city\")" @@ -2764,10 +2540,8 @@ }, { "cell_type": "code", - "execution_count": 137, - 
"metadata": { - "collapsed": false - }, + "execution_count": 139, + "metadata": {}, "outputs": [], "source": [ "all_cities = pd.merge(left=city_loc, right=city_pop, on=\"city\", how=\"outer\")\n", @@ -2783,10 +2557,8 @@ }, { "cell_type": "code", - "execution_count": 138, - "metadata": { - "collapsed": false - }, + "execution_count": 140, + "metadata": {}, "outputs": [], "source": [ "pd.merge(left=city_loc, right=city_pop, on=\"city\", how=\"right\")" @@ -2801,10 +2573,8 @@ }, { "cell_type": "code", - "execution_count": 139, - "metadata": { - "collapsed": false - }, + "execution_count": 141, + "metadata": {}, "outputs": [], "source": [ "city_pop2 = city_pop.copy()\n", @@ -2817,15 +2587,13 @@ "metadata": {}, "source": [ "## Concatenation\n", - "Rather than joining `DataFrame`s, we may just want to concatenate them. That's what `concat` is for:" + "Rather than joining `DataFrame`s, we may just want to concatenate them. That's what `concat()` is for:" ] }, { "cell_type": "code", - "execution_count": 140, - "metadata": { - "collapsed": false - }, + "execution_count": 142, + "metadata": {}, "outputs": [], "source": [ "result_concat = pd.concat([city_loc, city_pop])\n", @@ -2841,10 +2609,8 @@ }, { "cell_type": "code", - "execution_count": 141, - "metadata": { - "collapsed": false - }, + "execution_count": 143, + "metadata": {}, "outputs": [], "source": [ "result_concat.loc[3]" @@ -2859,10 +2625,8 @@ }, { "cell_type": "code", - "execution_count": 142, - "metadata": { - "collapsed": false - }, + "execution_count": 144, + "metadata": {}, "outputs": [], "source": [ "pd.concat([city_loc, city_pop], ignore_index=True)" @@ -2877,10 +2641,8 @@ }, { "cell_type": "code", - "execution_count": 143, - "metadata": { - "collapsed": false - }, + "execution_count": 145, + "metadata": {}, "outputs": [], "source": [ "pd.concat([city_loc, city_pop], join=\"inner\")" @@ -2895,9 +2657,8 @@ }, { "cell_type": "code", - "execution_count": 144, + "execution_count": 146, "metadata": { - "collapsed": false, "scrolled": true }, "outputs": [], @@ -2914,9 +2675,8 @@ }, { "cell_type": "code", - "execution_count": 145, + "execution_count": 147, "metadata": { - "collapsed": false, "scrolled": true }, "outputs": [], @@ -2935,15 +2695,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The `append` method is a useful shorthand for concatenating `DataFrame`s vertically:" + "The `append()` method is a useful shorthand for concatenating `DataFrame`s vertically:" ] }, { "cell_type": "code", - "execution_count": 146, - "metadata": { - "collapsed": false - }, + "execution_count": 148, + "metadata": {}, "outputs": [], "source": [ "city_loc.append(city_pop)" @@ -2953,7 +2711,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "As always in pandas, the `append` method does *not* actually modify `city_loc`: it works on a copy and returns the modified copy." + "As always in pandas, the `append()` method does *not* actually modify `city_loc`: it works on a copy and returns the modified copy." 
] }, { @@ -2966,10 +2724,8 @@ }, { "cell_type": "code", - "execution_count": 147, - "metadata": { - "collapsed": false - }, + "execution_count": 149, + "metadata": {}, "outputs": [], "source": [ "city_eco = city_pop.copy()\n", @@ -2986,10 +2742,8 @@ }, { "cell_type": "code", - "execution_count": 148, - "metadata": { - "collapsed": false - }, + "execution_count": 150, + "metadata": {}, "outputs": [], "source": [ "city_eco[\"economy\"] = city_eco[\"eco_code\"].astype('category')\n", @@ -3005,10 +2759,8 @@ }, { "cell_type": "code", - "execution_count": 149, - "metadata": { - "collapsed": false - }, + "execution_count": 151, + "metadata": {}, "outputs": [], "source": [ "city_eco[\"economy\"].cat.categories = [\"Finance\", \"Energy\", \"Tourism\"]\n", @@ -3024,10 +2776,8 @@ }, { "cell_type": "code", - "execution_count": 150, - "metadata": { - "collapsed": false - }, + "execution_count": 152, + "metadata": {}, "outputs": [], "source": [ "city_eco.sort_values(by=\"economy\", ascending=False)" @@ -3042,25 +2792,32 @@ "# What next?\n", "As you probably noticed by now, pandas is quite a large library with *many* features. Although we went through the most important features, there is still a lot to discover. Probably the best way to learn more is to get your hands dirty with some real-life data. It is also a good idea to go through pandas' excellent [documentation](http://pandas.pydata.org/pandas-docs/stable/index.html), in particular the [Cookbook](http://pandas.pydata.org/pandas-docs/stable/cookbook.html)." ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { "kernelspec": { - "display_name": "Python 2", + "display_name": "Python 3", "language": "python", - "name": "python2" + "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", - "version": 2 + "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.11" + "pygments_lexer": "ipython3", + "version": "3.6.3" }, "toc": { "toc_cell": false, @@ -3071,5 +2828,5 @@ } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } From c8b7f045eeb0d755e5020105f93c275923318add Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Geron?= Date: Sun, 14 Jan 2018 09:11:47 +0100 Subject: [PATCH 21/42] Fix hyperparameter search and comment at the end of the solution of exercise 5, chapter 2 --- 02_end_to_end_machine_learning_project.ipynb | 103 +++++-------------- 1 file changed, 25 insertions(+), 78 deletions(-) diff --git a/02_end_to_end_machine_learning_project.ipynb b/02_end_to_end_machine_learning_project.ipynb index 1cadabb..1e51f9a 100644 --- a/02_end_to_end_machine_learning_project.ipynb +++ b/02_end_to_end_machine_learning_project.ipynb @@ -406,9 +406,7 @@ { "cell_type": "code", "execution_count": 30, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "housing = strat_train_set.copy()" @@ -486,9 +484,7 @@ { "cell_type": "code", "execution_count": 35, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "corr_matrix = housing.corr()" @@ -533,9 +529,7 @@ { "cell_type": "code", "execution_count": 39, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "housing[\"rooms_per_household\"] = housing[\"total_rooms\"]/housing[\"households\"]\n", @@ -591,9 +585,7 @@ { "cell_type": "code", "execution_count": 43, - "metadata": { - "collapsed": true 
- }, + "metadata": {}, "outputs": [], "source": [ "housing = strat_train_set.drop(\"median_house_value\", axis=1) # drop labels for training set\n", @@ -642,9 +634,7 @@ { "cell_type": "code", "execution_count": 48, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.preprocessing import Imputer\n", @@ -662,9 +652,7 @@ { "cell_type": "code", "execution_count": 49, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "housing_num = housing.drop('ocean_proximity', axis=1)\n", @@ -715,9 +703,7 @@ { "cell_type": "code", "execution_count": 53, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "X = imputer.transform(housing_num)" @@ -726,9 +712,7 @@ { "cell_type": "code", "execution_count": 54, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "housing_tr = pd.DataFrame(X, columns=housing_num.columns,\n", @@ -859,9 +843,7 @@ { "cell_type": "code", "execution_count": 63, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "# Definition of the CategoricalEncoder class, copied from PR #9151.\n", @@ -1126,9 +1108,7 @@ { "cell_type": "code", "execution_count": 68, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.base import BaseEstimator, TransformerMixin\n", @@ -1175,9 +1155,7 @@ { "cell_type": "code", "execution_count": 70, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.pipeline import Pipeline\n", @@ -1211,9 +1189,7 @@ { "cell_type": "code", "execution_count": 72, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.base import BaseEstimator, TransformerMixin\n", @@ -1261,9 +1237,7 @@ { "cell_type": "code", "execution_count": 74, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.pipeline import FeatureUnion\n", @@ -1411,9 +1385,7 @@ { "cell_type": "code", "execution_count": 85, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.model_selection import cross_val_score\n", @@ -1644,9 +1616,7 @@ { "cell_type": "code", "execution_count": 102, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "final_model = grid_search.best_estimator_\n", @@ -1709,9 +1679,7 @@ { "cell_type": "code", "execution_count": 105, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "my_model = full_pipeline_with_predictor" @@ -1720,9 +1688,7 @@ { "cell_type": "code", "execution_count": 106, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.externals import joblib\n", @@ -1991,9 +1957,7 @@ { "cell_type": "code", "execution_count": 116, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.base import BaseEstimator, TransformerMixin\n", @@ -2029,9 +1993,7 @@ { "cell_type": "code", "execution_count": 117, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "k = 5" @@ -2089,9 +2051,7 @@ { "cell_type": "code", "execution_count": 121, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "preparation_and_feature_selection_pipeline = Pipeline([\n", @@ -2103,9 +2063,7 @@ { "cell_type": "code", "execution_count": 122, - "metadata": { - "collapsed": true - }, + "metadata": {}, 
"outputs": [], "source": [ "housing_prepared_top_k_features = preparation_and_feature_selection_pipeline.fit_transform(housing)" @@ -2167,9 +2125,7 @@ { "cell_type": "code", "execution_count": 125, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "prepare_select_and_predict_pipeline = Pipeline([\n", @@ -2237,7 +2193,7 @@ "source": [ "param_grid = [\n", " {'preparation__num_pipeline__imputer__strategy': ['mean', 'median', 'most_frequent'],\n", - " 'feature_selection__k': [3, 4, 5, 6, 7]}\n", + " 'feature_selection__k': list(range(1, len(feature_importances) + 1))}\n", "]\n", "\n", "grid_search_prep = GridSearchCV(prepare_select_and_predict_pipeline, param_grid, cv=5,\n", @@ -2258,16 +2214,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Great! It seems that we had the right imputer strategy (median), and apparently only the top 7 features are useful (out of 9), the last 2 seem to just add some noise." - ] - }, - { - "cell_type": "code", - "execution_count": 130, - "metadata": {}, - "outputs": [], - "source": [ - "housing.shape" + "The best imputer strategy is `most_frequent` and apparently almost all features are useful (15 out of 16). The last one (`ISLAND`) seems to just add some noise." ] }, { @@ -2294,7 +2241,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.2" + "version": "3.6.3" }, "nav_menu": { "height": "279px", From 94914db82ed5ee17dfa08f31aaf7a8175717c4eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Geron?= Date: Mon, 15 Jan 2018 17:25:17 +0100 Subject: [PATCH 22/42] Make the code example 1-1 easier to read, and create a better `prepare_country_stats()` function --- 01_the_machine_learning_landscape.ipynb | 455 ++++++++++++------------ 1 file changed, 236 insertions(+), 219 deletions(-) diff --git a/01_the_machine_learning_landscape.ipynb b/01_the_machine_learning_landscape.ipynb index 6a080af..8b99fce 100644 --- a/01_the_machine_learning_landscape.ipynb +++ b/01_the_machine_learning_landscape.ipynb @@ -2,10 +2,7 @@ "cells": [ { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "**Chapter 1 – The Machine Learning landscape**\n", "\n", @@ -14,20 +11,14 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "# Setup" ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "First, let's make sure this notebook works well in both python 2 and 3, import a few common modules, ensure MatplotLib plots figures inline and prepare a function to save the figures:" ] @@ -36,9 +27,6 @@ "cell_type": "code", "execution_count": 1, "metadata": { - "collapsed": false, - "deletable": true, - "editable": true, "slideshow": { "slide_type": "-" } @@ -50,11 +38,10 @@ "\n", "# Common imports\n", "import numpy as np\n", - "import numpy.random as rnd\n", "import os\n", "\n", "# to make this notebook's output stable across runs\n", - "rnd.seed(42)\n", + "np.random.seed(42)\n", "\n", "# To plot pretty figures\n", "%matplotlib inline\n", @@ -73,35 +60,173 @@ " print(\"Saving figure\", fig_id)\n", " if tight_layout:\n", " plt.tight_layout()\n", - " plt.savefig(path, format='png', dpi=300)" + " plt.savefig(path, format='png', dpi=300)\n", + "\n", + "# Ignore useless warnings (see SciPy issue #5998)\n", + "import warnings\n", + "warnings.filterwarnings(action=\"ignore\", module=\"scipy\", 
message=\"^internal gelsd\")" ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ - "# Load and prepare Life satisfaction data" + "# Code example 1-1" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This function just merges the OECD's life satisfaction data and the IMF's GDP per capita data. It's a bit too long and boring and it's not specific to Machine Learning, which is why I left it out of the book." ] }, { "cell_type": "code", "execution_count": 2, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ + "def prepare_country_stats(oecd_bli, gdp_per_capita):\n", + " oecd_bli = oecd_bli[oecd_bli[\"INEQUALITY\"]==\"TOT\"]\n", + " oecd_bli = oecd_bli.pivot(index=\"Country\", columns=\"Indicator\", values=\"Value\")\n", + " gdp_per_capita.rename(columns={\"2015\": \"GDP per capita\"}, inplace=True)\n", + " gdp_per_capita.set_index(\"Country\", inplace=True)\n", + " full_country_stats = pd.merge(left=oecd_bli, right=gdp_per_capita,\n", + " left_index=True, right_index=True)\n", + " full_country_stats.sort_values(by=\"GDP per capita\", inplace=True)\n", + " remove_indices = [0, 1, 6, 8, 33, 34, 35]\n", + " keep_indices = list(set(range(36)) - set(remove_indices))\n", + " return full_country_stats[[\"GDP per capita\", 'Life satisfaction']].iloc[keep_indices]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The code in the book expects the data files to be located in the current directory. I just tweaked it here to fetch the files in datasets/lifesat." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "datapath = os.path.join(\"datasets\", \"lifesat\", \"\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# Code example\n", + "import matplotlib\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", "import pandas as pd\n", + "import sklearn.linear_model\n", "\n", - "# Download CSV from http://stats.oecd.org/index.aspx?DataSetCode=BLI\n", - "datapath = \"datasets/lifesat/\"\n", + "# Load the data\n", + "oecd_bli = pd.read_csv(datapath + \"oecd_bli_2015.csv\", thousands=',')\n", + "gdp_per_capita = pd.read_csv(datapath + \"gdp_per_capita.csv\",thousands=',',delimiter='\\t',\n", + " encoding='latin1', na_values=\"n/a\")\n", "\n", - "oecd_bli = pd.read_csv(datapath+\"oecd_bli_2015.csv\", thousands=',')\n", + "# Prepare the data\n", + "country_stats = prepare_country_stats(oecd_bli, gdp_per_capita)\n", + "X = np.c_[country_stats[\"GDP per capita\"]]\n", + "y = np.c_[country_stats[\"Life satisfaction\"]]\n", + "\n", + "# Visualize the data\n", + "country_stats.plot(kind='scatter', x=\"GDP per capita\", y='Life satisfaction')\n", + "plt.show()\n", + "\n", + "# Select a linear model\n", + "model = sklearn.linear_model.LinearRegression()\n", + "\n", + "# Train the model\n", + "model.fit(X, y)\n", + "\n", + "# Make a prediction for Cyprus\n", + "X_new = [[22587]] # Cyprus' GDP per capita\n", + "print(model.predict(X_new)) # outputs [[ 5.96242338]]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": 
[], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Note: you can ignore the rest of this notebook, it just generates many of the figures in chapter 1." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Load and prepare Life satisfaction data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you want, you can get fresh data from the OECD's website.\n", + "Download the CSV from http://stats.oecd.org/index.aspx?DataSetCode=BLI\n", + "and save it to `datasets/lifesat/`." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "oecd_bli = pd.read_csv(datapath + \"oecd_bli_2015.csv\", thousands=',')\n", "oecd_bli = oecd_bli[oecd_bli[\"INEQUALITY\"]==\"TOT\"]\n", "oecd_bli = oecd_bli.pivot(index=\"Country\", columns=\"Indicator\", values=\"Value\")\n", "oecd_bli.head(2)" @@ -109,12 +234,8 @@ }, { "cell_type": "code", - "execution_count": 3, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 6, + "metadata": {}, "outputs": [], "source": [ "oecd_bli[\"Life satisfaction\"].head()" @@ -122,25 +243,24 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "# Load and prepare GDP per capita data" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Just like above, you can update the GDP per capita data if you want. Just download data from http://goo.gl/j1MSKe (=> imf.org) and save it to `datasets/lifesat/`." 
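If you want a quick sanity check that both files are in place before running the loading cells below, something like this works (a minimal sketch; only the two file names and the `datasets/lifesat/` path come from this notebook, the rest is illustrative):

```python
import os

datapath = os.path.join("datasets", "lifesat", "")
for filename in ("oecd_bli_2015.csv", "gdp_per_capita.csv"):
    if os.path.isfile(datapath + filename):
        print("{} found".format(filename))
    else:
        print("{} is missing, please download it into {}".format(filename, datapath))
```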
+ ] + }, { "cell_type": "code", - "execution_count": 4, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 7, + "metadata": {}, "outputs": [], "source": [ - "# Download data from http://goo.gl/j1MSKe (=> imf.org)\n", "gdp_per_capita = pd.read_csv(datapath+\"gdp_per_capita.csv\", thousands=',', delimiter='\\t',\n", " encoding='latin1', na_values=\"n/a\")\n", "gdp_per_capita.rename(columns={\"2015\": \"GDP per capita\"}, inplace=True)\n", @@ -150,12 +270,8 @@ }, { "cell_type": "code", - "execution_count": 5, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 8, + "metadata": {}, "outputs": [], "source": [ "full_country_stats = pd.merge(left=oecd_bli, right=gdp_per_capita, left_index=True, right_index=True)\n", @@ -165,12 +281,8 @@ }, { "cell_type": "code", - "execution_count": 6, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 9, + "metadata": {}, "outputs": [], "source": [ "full_country_stats[[\"GDP per capita\", 'Life satisfaction']].loc[\"United States\"]" @@ -178,12 +290,8 @@ }, { "cell_type": "code", - "execution_count": 7, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 10, + "metadata": {}, "outputs": [], "source": [ "remove_indices = [0, 1, 6, 8, 33, 34, 35]\n", @@ -195,12 +303,8 @@ }, { "cell_type": "code", - "execution_count": 8, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 11, + "metadata": {}, "outputs": [], "source": [ "sample_data.plot(kind='scatter', x=\"GDP per capita\", y='Life satisfaction', figsize=(5,3))\n", @@ -224,25 +328,17 @@ }, { "cell_type": "code", - "execution_count": 9, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 12, + "metadata": {}, "outputs": [], "source": [ - "sample_data.to_csv(\"life_satisfaction_vs_gdp_per_capita.csv\")" + "sample_data.to_csv(os.path.join(\"datasets\", \"lifesat\", \"lifesat.csv\"))" ] }, { "cell_type": "code", - "execution_count": 10, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 13, + "metadata": {}, "outputs": [], "source": [ "sample_data.loc[list(position_text.keys())]" @@ -250,12 +346,8 @@ }, { "cell_type": "code", - "execution_count": 11, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 14, + "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", @@ -278,12 +370,8 @@ }, { "cell_type": "code", - "execution_count": 12, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 15, + "metadata": {}, "outputs": [], "source": [ "from sklearn import linear_model\n", @@ -297,12 +385,8 @@ }, { "cell_type": "code", - "execution_count": 13, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 16, + "metadata": {}, "outputs": [], "source": [ "sample_data.plot(kind='scatter', x=\"GDP per capita\", y='Life satisfaction', figsize=(5,3))\n", @@ -317,12 +401,8 @@ }, { "cell_type": "code", - "execution_count": 14, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 17, + "metadata": {}, "outputs": [], "source": [ "cyprus_gdp_per_capita = gdp_per_capita.loc[\"Cyprus\"][\"GDP per capita\"]\n", @@ -333,12 +413,8 @@ }, { "cell_type": "code", - 
"execution_count": 15, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 18, + "metadata": {}, "outputs": [], "source": [ "sample_data.plot(kind='scatter', x=\"GDP per capita\", y='Life satisfaction', figsize=(5,3), s=1)\n", @@ -356,12 +432,8 @@ }, { "cell_type": "code", - "execution_count": 16, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 19, + "metadata": {}, "outputs": [], "source": [ "sample_data[7:10]" @@ -369,12 +441,8 @@ }, { "cell_type": "code", - "execution_count": 17, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 20, + "metadata": {}, "outputs": [], "source": [ "(5.1+5.7+6.5)/3" @@ -382,28 +450,29 @@ }, { "cell_type": "code", - "execution_count": 18, - "metadata": { - "collapsed": true, - "deletable": true, - "editable": true - }, + "execution_count": 21, + "metadata": {}, "outputs": [], "source": [ "backup = oecd_bli, gdp_per_capita\n", "\n", "def prepare_country_stats(oecd_bli, gdp_per_capita):\n", - " return sample_data" + " oecd_bli = oecd_bli[oecd_bli[\"INEQUALITY\"]==\"TOT\"]\n", + " oecd_bli = oecd_bli.pivot(index=\"Country\", columns=\"Indicator\", values=\"Value\")\n", + " gdp_per_capita.rename(columns={\"2015\": \"GDP per capita\"}, inplace=True)\n", + " gdp_per_capita.set_index(\"Country\", inplace=True)\n", + " full_country_stats = pd.merge(left=oecd_bli, right=gdp_per_capita,\n", + " left_index=True, right_index=True)\n", + " full_country_stats.sort_values(by=\"GDP per capita\", inplace=True)\n", + " remove_indices = [0, 1, 6, 8, 33, 34, 35]\n", + " keep_indices = list(set(range(36)) - set(remove_indices))\n", + " return full_country_stats[[\"GDP per capita\", 'Life satisfaction']].iloc[keep_indices]" ] }, { "cell_type": "code", - "execution_count": 19, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 22, + "metadata": {}, "outputs": [], "source": [ "# Code example\n", @@ -440,12 +509,8 @@ }, { "cell_type": "code", - "execution_count": 20, - "metadata": { - "collapsed": true, - "deletable": true, - "editable": true - }, + "execution_count": 23, + "metadata": {}, "outputs": [], "source": [ "oecd_bli, gdp_per_capita = backup" @@ -453,12 +518,8 @@ }, { "cell_type": "code", - "execution_count": 21, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 24, + "metadata": {}, "outputs": [], "source": [ "missing_data" @@ -466,12 +527,8 @@ }, { "cell_type": "code", - "execution_count": 22, - "metadata": { - "collapsed": true, - "deletable": true, - "editable": true - }, + "execution_count": 25, + "metadata": {}, "outputs": [], "source": [ "position_text2 = {\n", @@ -487,12 +544,8 @@ }, { "cell_type": "code", - "execution_count": 23, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 26, + "metadata": {}, "outputs": [], "source": [ "sample_data.plot(kind='scatter', x=\"GDP per capita\", y='Life satisfaction', figsize=(8,3))\n", @@ -522,12 +575,8 @@ }, { "cell_type": "code", - "execution_count": 24, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 27, + "metadata": {}, "outputs": [], "source": [ "full_country_stats.plot(kind='scatter', x=\"GDP per capita\", y='Life satisfaction', figsize=(8,3))\n", @@ -550,12 +599,8 @@ }, { "cell_type": "code", - "execution_count": 25, - "metadata": { - "collapsed": 
false, - "deletable": true, - "editable": true - }, + "execution_count": 28, + "metadata": {}, "outputs": [], "source": [ "full_country_stats.loc[[c for c in full_country_stats.index if \"W\" in c.upper()]][\"Life satisfaction\"]" @@ -563,12 +608,8 @@ }, { "cell_type": "code", - "execution_count": 26, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 29, + "metadata": {}, "outputs": [], "source": [ "gdp_per_capita.loc[[c for c in gdp_per_capita.index if \"W\" in c.upper()]].head()" @@ -576,12 +617,8 @@ }, { "cell_type": "code", - "execution_count": 27, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 30, + "metadata": {}, "outputs": [], "source": [ "plt.figure(figsize=(8,3))\n", @@ -611,12 +648,8 @@ }, { "cell_type": "code", - "execution_count": 28, - "metadata": { - "collapsed": true, - "deletable": true, - "editable": true - }, + "execution_count": 31, + "metadata": {}, "outputs": [], "source": [ "backup = oecd_bli, gdp_per_capita\n", @@ -627,12 +660,8 @@ }, { "cell_type": "code", - "execution_count": 29, - "metadata": { - "collapsed": true, - "deletable": true, - "editable": true - }, + "execution_count": 32, + "metadata": {}, "outputs": [], "source": [ "# Replace this linear model:\n", @@ -641,12 +670,8 @@ }, { "cell_type": "code", - "execution_count": 30, - "metadata": { - "collapsed": true, - "deletable": true, - "editable": true - }, + "execution_count": 33, + "metadata": {}, "outputs": [], "source": [ "# with this k-neighbors regression model:\n", @@ -655,12 +680,8 @@ }, { "cell_type": "code", - "execution_count": 31, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 34, + "metadata": {}, "outputs": [], "source": [ "X = np.c_[country_stats[\"GDP per capita\"]]\n", @@ -677,11 +698,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [] } @@ -702,7 +719,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.3" + "version": "3.6.3" }, "nav_menu": {}, "toc": { @@ -723,5 +740,5 @@ } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } From 87040e084e8ac70d8c91745b5fb31206dcdf49de Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Geron?= Date: Thu, 18 Jan 2018 17:41:32 +0100 Subject: [PATCH 23/42] Replace reduce_sum with reduce_mean: adds an extra .1% accuracy :) --- extra_capsnets.ipynb | 386 ++++++++++++++++--------------------------- 1 file changed, 138 insertions(+), 248 deletions(-) diff --git a/extra_capsnets.ipynb b/extra_capsnets.ipynb index cdfeb2d..20e5dd1 100644 --- a/extra_capsnets.ipynb +++ b/extra_capsnets.ipynb @@ -77,10 +77,8 @@ }, { "cell_type": "code", - "execution_count": 2, - "metadata": { - "collapsed": true - }, + "execution_count": 3, + "metadata": {}, "outputs": [], "source": [ "from __future__ import division, print_function, unicode_literals" @@ -95,10 +93,8 @@ }, { "cell_type": "code", - "execution_count": 3, - "metadata": { - "collapsed": true - }, + "execution_count": 4, + "metadata": {}, "outputs": [], "source": [ "%matplotlib inline\n", @@ -115,10 +111,8 @@ }, { "cell_type": "code", - "execution_count": 4, - "metadata": { - "collapsed": true - }, + "execution_count": 5, + "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", @@ -141,10 +135,8 @@ }, { "cell_type": "code", - "execution_count": 5, - 
"metadata": { - "collapsed": true - }, + "execution_count": 6, + "metadata": {}, "outputs": [], "source": [ "tf.reset_default_graph()" @@ -159,10 +151,8 @@ }, { "cell_type": "code", - "execution_count": 6, - "metadata": { - "collapsed": true - }, + "execution_count": 7, + "metadata": {}, "outputs": [], "source": [ "np.random.seed(42)\n", @@ -185,7 +175,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -203,7 +193,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -228,7 +218,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -286,10 +276,8 @@ }, { "cell_type": "code", - "execution_count": 10, - "metadata": { - "collapsed": true - }, + "execution_count": 11, + "metadata": {}, "outputs": [], "source": [ "X = tf.placeholder(shape=[None, 28, 28, 1], dtype=tf.float32, name=\"X\")" @@ -311,10 +299,8 @@ }, { "cell_type": "code", - "execution_count": 11, - "metadata": { - "collapsed": true - }, + "execution_count": 12, + "metadata": {}, "outputs": [], "source": [ "caps1_n_maps = 32\n", @@ -331,10 +317,8 @@ }, { "cell_type": "code", - "execution_count": 12, - "metadata": { - "collapsed": true - }, + "execution_count": 13, + "metadata": {}, "outputs": [], "source": [ "conv1_params = {\n", @@ -356,10 +340,8 @@ }, { "cell_type": "code", - "execution_count": 13, - "metadata": { - "collapsed": true - }, + "execution_count": 14, + "metadata": {}, "outputs": [], "source": [ "conv1 = tf.layers.conv2d(X, name=\"conv1\", **conv1_params)\n", @@ -382,10 +364,8 @@ }, { "cell_type": "code", - "execution_count": 14, - "metadata": { - "collapsed": true - }, + "execution_count": 15, + "metadata": {}, "outputs": [], "source": [ "caps1_raw = tf.reshape(conv2, [-1, caps1_n_caps, caps1_n_dims],\n", @@ -407,10 +387,8 @@ }, { "cell_type": "code", - "execution_count": 15, - "metadata": { - "collapsed": true - }, + "execution_count": 16, + "metadata": {}, "outputs": [], "source": [ "def squash(s, axis=-1, epsilon=1e-7, name=None):\n", @@ -432,10 +410,8 @@ }, { "cell_type": "code", - "execution_count": 16, - "metadata": { - "collapsed": true - }, + "execution_count": 17, + "metadata": {}, "outputs": [], "source": [ "caps1_output = squash(caps1_raw, name=\"caps1_output\")" @@ -478,10 +454,8 @@ }, { "cell_type": "code", - "execution_count": 17, - "metadata": { - "collapsed": true - }, + "execution_count": 18, + "metadata": {}, "outputs": [], "source": [ "caps2_n_caps = 10\n", @@ -568,10 +542,8 @@ }, { "cell_type": "code", - "execution_count": 18, - "metadata": { - "collapsed": true - }, + "execution_count": 19, + "metadata": {}, "outputs": [], "source": [ "init_sigma = 0.01\n", @@ -591,10 +563,8 @@ }, { "cell_type": "code", - "execution_count": 19, - "metadata": { - "collapsed": true - }, + "execution_count": 20, + "metadata": {}, "outputs": [], "source": [ "batch_size = tf.shape(X)[0]\n", @@ -610,10 +580,8 @@ }, { "cell_type": "code", - "execution_count": 20, - "metadata": { - "collapsed": true - }, + "execution_count": 21, + "metadata": {}, "outputs": [], "source": [ "caps1_output_expanded = tf.expand_dims(caps1_output, -1,\n", @@ -633,7 +601,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 22, "metadata": {}, "outputs": [], "source": [ @@ -649,7 +617,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 23, "metadata": {}, "outputs": [], "source": [ @@ 
-665,10 +633,8 @@ }, { "cell_type": "code", - "execution_count": 23, - "metadata": { - "collapsed": true - }, + "execution_count": 24, + "metadata": {}, "outputs": [], "source": [ "caps2_predicted = tf.matmul(W_tiled, caps1_output_tiled,\n", @@ -684,7 +650,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 25, "metadata": {}, "outputs": [], "source": [ @@ -714,10 +680,8 @@ }, { "cell_type": "code", - "execution_count": 25, - "metadata": { - "collapsed": true - }, + "execution_count": 26, + "metadata": {}, "outputs": [], "source": [ "raw_weights = tf.zeros([batch_size, caps1_n_caps, caps2_n_caps, 1, 1],\n", @@ -747,10 +711,8 @@ }, { "cell_type": "code", - "execution_count": 26, - "metadata": { - "collapsed": true - }, + "execution_count": 27, + "metadata": {}, "outputs": [], "source": [ "routing_weights = tf.nn.softmax(raw_weights, dim=2, name=\"routing_weights\")" @@ -765,10 +727,8 @@ }, { "cell_type": "code", - "execution_count": 27, - "metadata": { - "collapsed": true - }, + "execution_count": 28, + "metadata": {}, "outputs": [], "source": [ "weighted_predictions = tf.multiply(routing_weights, caps2_predicted,\n", @@ -797,10 +757,8 @@ }, { "cell_type": "code", - "execution_count": 28, - "metadata": { - "collapsed": true - }, + "execution_count": 29, + "metadata": {}, "outputs": [], "source": [ "caps2_output_round_1 = squash(weighted_sum, axis=-2,\n", @@ -809,7 +767,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 30, "metadata": {}, "outputs": [], "source": [ @@ -853,7 +811,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 31, "metadata": {}, "outputs": [], "source": [ @@ -869,7 +827,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 32, "metadata": {}, "outputs": [], "source": [ @@ -885,10 +843,8 @@ }, { "cell_type": "code", - "execution_count": 32, - "metadata": { - "collapsed": true - }, + "execution_count": 33, + "metadata": {}, "outputs": [], "source": [ "caps2_output_round_1_tiled = tf.tile(\n", @@ -905,10 +861,8 @@ }, { "cell_type": "code", - "execution_count": 33, - "metadata": { - "collapsed": true - }, + "execution_count": 34, + "metadata": {}, "outputs": [], "source": [ "agreement = tf.matmul(caps2_predicted, caps2_output_round_1_tiled,\n", @@ -924,10 +878,8 @@ }, { "cell_type": "code", - "execution_count": 34, - "metadata": { - "collapsed": true - }, + "execution_count": 35, + "metadata": {}, "outputs": [], "source": [ "raw_weights_round_2 = tf.add(raw_weights, agreement,\n", @@ -943,10 +895,8 @@ }, { "cell_type": "code", - "execution_count": 35, - "metadata": { - "collapsed": true - }, + "execution_count": 36, + "metadata": {}, "outputs": [], "source": [ "routing_weights_round_2 = tf.nn.softmax(raw_weights_round_2,\n", @@ -972,10 +922,8 @@ }, { "cell_type": "code", - "execution_count": 36, - "metadata": { - "collapsed": true - }, + "execution_count": 37, + "metadata": {}, "outputs": [], "source": [ "caps2_output = caps2_output_round_2" @@ -1003,7 +951,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 38, "metadata": {}, "outputs": [], "source": [ @@ -1043,7 +991,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 39, "metadata": {}, "outputs": [], "source": [ @@ -1073,10 +1021,8 @@ }, { "cell_type": "code", - "execution_count": 39, - "metadata": { - "collapsed": true - }, + "execution_count": 40, + "metadata": {}, "outputs": [], "source": [ "def safe_norm(s, axis=-1, epsilon=1e-7, keep_dims=False, name=None):\n", @@ 
-1088,10 +1034,8 @@ }, { "cell_type": "code", - "execution_count": 40, - "metadata": { - "collapsed": true - }, + "execution_count": 41, + "metadata": {}, "outputs": [], "source": [ "y_proba = safe_norm(caps2_output, axis=-2, name=\"y_proba\")" @@ -1106,10 +1050,8 @@ }, { "cell_type": "code", - "execution_count": 41, - "metadata": { - "collapsed": true - }, + "execution_count": 42, + "metadata": {}, "outputs": [], "source": [ "y_proba_argmax = tf.argmax(y_proba, axis=2, name=\"y_proba\")" @@ -1124,7 +1066,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 43, "metadata": {}, "outputs": [], "source": [ @@ -1140,10 +1082,8 @@ }, { "cell_type": "code", - "execution_count": 43, - "metadata": { - "collapsed": true - }, + "execution_count": 44, + "metadata": {}, "outputs": [], "source": [ "y_pred = tf.squeeze(y_proba_argmax, axis=[1,2], name=\"y_pred\")" @@ -1151,7 +1091,7 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 45, "metadata": {}, "outputs": [], "source": [ @@ -1181,10 +1121,8 @@ }, { "cell_type": "code", - "execution_count": 45, - "metadata": { - "collapsed": true - }, + "execution_count": 46, + "metadata": {}, "outputs": [], "source": [ "y = tf.placeholder(shape=[None], dtype=tf.int64, name=\"y\")" @@ -1212,10 +1150,8 @@ }, { "cell_type": "code", - "execution_count": 46, - "metadata": { - "collapsed": true - }, + "execution_count": 47, + "metadata": {}, "outputs": [], "source": [ "m_plus = 0.9\n", @@ -1232,10 +1168,8 @@ }, { "cell_type": "code", - "execution_count": 47, - "metadata": { - "collapsed": true - }, + "execution_count": 48, + "metadata": {}, "outputs": [], "source": [ "T = tf.one_hot(y, depth=caps2_n_caps, name=\"T\")" @@ -1250,7 +1184,7 @@ }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 49, "metadata": {}, "outputs": [], "source": [ @@ -1267,7 +1201,7 @@ }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 50, "metadata": {}, "outputs": [], "source": [ @@ -1283,10 +1217,8 @@ }, { "cell_type": "code", - "execution_count": 50, - "metadata": { - "collapsed": true - }, + "execution_count": 51, + "metadata": {}, "outputs": [], "source": [ "caps2_output_norm = safe_norm(caps2_output, axis=-2, keep_dims=True,\n", @@ -1302,10 +1234,8 @@ }, { "cell_type": "code", - "execution_count": 51, - "metadata": { - "collapsed": true - }, + "execution_count": 52, + "metadata": {}, "outputs": [], "source": [ "present_error_raw = tf.square(tf.maximum(0., m_plus - caps2_output_norm),\n", @@ -1323,10 +1253,8 @@ }, { "cell_type": "code", - "execution_count": 52, - "metadata": { - "collapsed": true - }, + "execution_count": 53, + "metadata": {}, "outputs": [], "source": [ "absent_error_raw = tf.square(tf.maximum(0., caps2_output_norm - m_minus),\n", @@ -1344,10 +1272,8 @@ }, { "cell_type": "code", - "execution_count": 53, - "metadata": { - "collapsed": true - }, + "execution_count": 54, + "metadata": {}, "outputs": [], "source": [ "L = tf.add(T * present_error, lambda_ * (1.0 - T) * absent_error,\n", @@ -1363,10 +1289,8 @@ }, { "cell_type": "code", - "execution_count": 54, - "metadata": { - "collapsed": true - }, + "execution_count": 55, + "metadata": {}, "outputs": [], "source": [ "margin_loss = tf.reduce_mean(tf.reduce_sum(L, axis=1), name=\"margin_loss\")" @@ -1409,10 +1333,8 @@ }, { "cell_type": "code", - "execution_count": 55, - "metadata": { - "collapsed": true - }, + "execution_count": 56, + "metadata": {}, "outputs": [], "source": [ "mask_with_labels = tf.placeholder_with_default(False, 
shape=(),\n", @@ -1428,10 +1350,8 @@ }, { "cell_type": "code", - "execution_count": 56, - "metadata": { - "collapsed": true - }, + "execution_count": 57, + "metadata": {}, "outputs": [], "source": [ "reconstruction_targets = tf.cond(mask_with_labels, # condition\n", @@ -1458,10 +1378,8 @@ }, { "cell_type": "code", - "execution_count": 57, - "metadata": { - "collapsed": true - }, + "execution_count": 58, + "metadata": {}, "outputs": [], "source": [ "reconstruction_mask = tf.one_hot(reconstruction_targets,\n", @@ -1478,7 +1396,7 @@ }, { "cell_type": "code", - "execution_count": 58, + "execution_count": 59, "metadata": {}, "outputs": [], "source": [ @@ -1494,7 +1412,7 @@ }, { "cell_type": "code", - "execution_count": 59, + "execution_count": 60, "metadata": {}, "outputs": [], "source": [ @@ -1510,10 +1428,8 @@ }, { "cell_type": "code", - "execution_count": 60, - "metadata": { - "collapsed": true - }, + "execution_count": 61, + "metadata": {}, "outputs": [], "source": [ "reconstruction_mask_reshaped = tf.reshape(\n", @@ -1530,10 +1446,8 @@ }, { "cell_type": "code", - "execution_count": 61, - "metadata": { - "collapsed": true - }, + "execution_count": 62, + "metadata": {}, "outputs": [], "source": [ "caps2_output_masked = tf.multiply(\n", @@ -1543,7 +1457,7 @@ }, { "cell_type": "code", - "execution_count": 62, + "execution_count": 63, "metadata": {}, "outputs": [], "source": [ @@ -1559,10 +1473,8 @@ }, { "cell_type": "code", - "execution_count": 63, - "metadata": { - "collapsed": true - }, + "execution_count": 64, + "metadata": {}, "outputs": [], "source": [ "decoder_input = tf.reshape(caps2_output_masked,\n", @@ -1579,7 +1491,7 @@ }, { "cell_type": "code", - "execution_count": 64, + "execution_count": 65, "metadata": {}, "outputs": [], "source": [ @@ -1602,10 +1514,8 @@ }, { "cell_type": "code", - "execution_count": 65, - "metadata": { - "collapsed": true - }, + "execution_count": 66, + "metadata": {}, "outputs": [], "source": [ "n_hidden1 = 512\n", @@ -1615,10 +1525,8 @@ }, { "cell_type": "code", - "execution_count": 66, - "metadata": { - "collapsed": true - }, + "execution_count": 67, + "metadata": {}, "outputs": [], "source": [ "with tf.name_scope(\"decoder\"):\n", @@ -1649,16 +1557,14 @@ }, { "cell_type": "code", - "execution_count": 67, - "metadata": { - "collapsed": true - }, + "execution_count": 68, + "metadata": {}, "outputs": [], "source": [ "X_flat = tf.reshape(X, [-1, n_output], name=\"X_flat\")\n", "squared_difference = tf.square(X_flat - decoder_output,\n", " name=\"squared_difference\")\n", - "reconstruction_loss = tf.reduce_sum(squared_difference,\n", + "reconstruction_loss = tf.reduce_mean(squared_difference,\n", " name=\"reconstruction_loss\")" ] }, @@ -1678,10 +1584,8 @@ }, { "cell_type": "code", - "execution_count": 68, - "metadata": { - "collapsed": true - }, + "execution_count": 69, + "metadata": {}, "outputs": [], "source": [ "alpha = 0.0005\n", @@ -1712,10 +1616,8 @@ }, { "cell_type": "code", - "execution_count": 69, - "metadata": { - "collapsed": true - }, + "execution_count": 70, + "metadata": {}, "outputs": [], "source": [ "correct = tf.equal(y, y_pred, name=\"correct\")\n", @@ -1738,10 +1640,8 @@ }, { "cell_type": "code", - "execution_count": 70, - "metadata": { - "collapsed": true - }, + "execution_count": 71, + "metadata": {}, "outputs": [], "source": [ "optimizer = tf.train.AdamOptimizer()\n", @@ -1764,10 +1664,8 @@ }, { "cell_type": "code", - "execution_count": 71, - "metadata": { - "collapsed": true - }, + "execution_count": 72, + "metadata": {}, "outputs": [], 
"source": [ "init = tf.global_variables_initializer()\n", @@ -1804,7 +1702,7 @@ }, { "cell_type": "code", - "execution_count": 72, + "execution_count": 73, "metadata": {}, "outputs": [], "source": [ @@ -1870,7 +1768,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Training is finished, we reached over 99.3% accuracy on the validation set after just 5 epochs, things are looking good. Now let's evaluate the model on the test set." + "Training is finished, we reached over 99.4% accuracy on the validation set after just 5 epochs, things are looking good. Now let's evaluate the model on the test set." ] }, { @@ -1882,7 +1780,7 @@ }, { "cell_type": "code", - "execution_count": 73, + "execution_count": 74, "metadata": {}, "outputs": [], "source": [ @@ -1915,7 +1813,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We reach 99.43% accuracy on the test set. Pretty nice. :)" + "We reach 99.53% accuracy on the test set. Pretty nice. :)" ] }, { @@ -1934,7 +1832,7 @@ }, { "cell_type": "code", - "execution_count": 74, + "execution_count": 75, "metadata": {}, "outputs": [], "source": [ @@ -1966,7 +1864,7 @@ }, { "cell_type": "code", - "execution_count": 75, + "execution_count": 76, "metadata": {}, "outputs": [], "source": [ @@ -2022,7 +1920,7 @@ }, { "cell_type": "code", - "execution_count": 76, + "execution_count": 77, "metadata": {}, "outputs": [], "source": [ @@ -2038,10 +1936,8 @@ }, { "cell_type": "code", - "execution_count": 77, - "metadata": { - "collapsed": true - }, + "execution_count": 78, + "metadata": {}, "outputs": [], "source": [ "def tweak_pose_parameters(output_vectors, min=-0.5, max=0.5, n_steps=11):\n", @@ -2062,10 +1958,8 @@ }, { "cell_type": "code", - "execution_count": 78, - "metadata": { - "collapsed": true - }, + "execution_count": 79, + "metadata": {}, "outputs": [], "source": [ "n_steps = 11\n", @@ -2084,7 +1978,7 @@ }, { "cell_type": "code", - "execution_count": 79, + "execution_count": 80, "metadata": {}, "outputs": [], "source": [ @@ -2108,10 +2002,8 @@ }, { "cell_type": "code", - "execution_count": 80, - "metadata": { - "collapsed": true - }, + "execution_count": 81, + "metadata": {}, "outputs": [], "source": [ "tweak_reconstructions = decoder_output_value.reshape(\n", @@ -2127,7 +2019,7 @@ }, { "cell_type": "code", - "execution_count": 81, + "execution_count": 82, "metadata": {}, "outputs": [], "source": [ @@ -2161,9 +2053,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [] } From 916d64c7a27e56afd985691318503d0e3fd75efa Mon Sep 17 00:00:00 2001 From: Nick Graham Date: Mon, 22 Jan 2018 14:08:12 -0600 Subject: [PATCH 24/42] Fixes typo --- 03_classification.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/03_classification.ipynb b/03_classification.ipynb index 1e7960a..b021d97 100644 --- a/03_classification.ipynb +++ b/03_classification.ipynb @@ -2281,7 +2281,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "It seems that the ham emails are more often plain text, while spam has quite a lot of HTML. Moreover, quite a few ham emails are signed using PGP, while no spam is. In short, it seems that the email structure is a usual information to have." + "It seems that the ham emails are more often plain text, while spam has quite a lot of HTML. Moreover, quite a few ham emails are signed using PGP, while no spam is. In short, it seems that the email structure is useful information to have." 
] }, { From 487dd9a2c5b2af04b8e73fb21e7cc933ce3f5659 Mon Sep 17 00:00:00 2001 From: Nick Graham Date: Mon, 22 Jan 2018 14:33:22 -0600 Subject: [PATCH 25/42] Fixes precision / recall percent notation 0.95 = 95% 0.95% = 0.0095 --- 03_classification.ipynb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/03_classification.ipynb b/03_classification.ipynb index 1e7960a..062fccb 100644 --- a/03_classification.ipynb +++ b/03_classification.ipynb @@ -2714,8 +2714,8 @@ "\n", "y_pred = log_clf.predict(X_test_transformed)\n", "\n", - "print(\"Precision: {:.2f}%\".format(precision_score(y_test, y_pred)))\n", - "print(\"Recall: {:.2f}%\".format(recall_score(y_test, y_pred)))" + "print(\"Precision: {:.2f}%\".format(100 * precision_score(y_test, y_pred)))\n", + "print(\"Recall: {:.2f}%\".format(100 * recall_score(y_test, y_pred)))" ] } ], From 385d635e929fafbb396bb99171b0292a8bcda078 Mon Sep 17 00:00:00 2001 From: rickiepark Date: Tue, 30 Jan 2018 17:19:37 +0900 Subject: [PATCH 26/42] add params for avoiding warn and improving perf. --- 02_end_to_end_machine_learning_project.ipynb | 2 +- 03_classification.ipynb | 6 +++--- 04_training_linear_models.ipynb | 8 ++++---- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/02_end_to_end_machine_learning_project.ipynb b/02_end_to_end_machine_learning_project.ipynb index 1e51f9a..14a7c20 100644 --- a/02_end_to_end_machine_learning_project.ipynb +++ b/02_end_to_end_machine_learning_project.ipynb @@ -1503,7 +1503,7 @@ "forest_reg = RandomForestRegressor(random_state=42)\n", "# train across 5 folds, that's a total of (12+6)*5=90 rounds of training \n", "grid_search = GridSearchCV(forest_reg, param_grid, cv=5,\n", - " scoring='neg_mean_squared_error')\n", + " scoring='neg_mean_squared_error', return_train_score=True)\n", "grid_search.fit(housing_prepared, housing_labels)" ] }, diff --git a/03_classification.ipynb b/03_classification.ipynb index 0f8b455..7c66716 100644 --- a/03_classification.ipynb +++ b/03_classification.ipynb @@ -241,7 +241,7 @@ "source": [ "from sklearn.linear_model import SGDClassifier\n", "\n", - "sgd_clf = SGDClassifier(random_state=42)\n", + "sgd_clf = SGDClassifier(max_iter=5, random_state=42)\n", "sgd_clf.fit(X_train, y_train_5)" ] }, @@ -766,7 +766,7 @@ "outputs": [], "source": [ "from sklearn.multiclass import OneVsOneClassifier\n", - "ovo_clf = OneVsOneClassifier(SGDClassifier(random_state=42))\n", + "ovo_clf = OneVsOneClassifier(SGDClassifier(max_iter=5, random_state=42))\n", "ovo_clf.fit(X_train, y_train)\n", "ovo_clf.predict([some_digit])" ] @@ -1185,7 +1185,7 @@ "param_grid = [{'weights': [\"uniform\", \"distance\"], 'n_neighbors': [3, 4, 5]}]\n", "\n", "knn_clf = KNeighborsClassifier()\n", - "grid_search = GridSearchCV(knn_clf, param_grid, cv=5, verbose=3)\n", + "grid_search = GridSearchCV(knn_clf, param_grid, cv=5, verbose=3, n_jobs=-1)\n", "grid_search.fit(X_train, y_train)" ] }, diff --git a/04_training_linear_models.ipynb b/04_training_linear_models.ipynb index a32fdea..4cf264e 100644 --- a/04_training_linear_models.ipynb +++ b/04_training_linear_models.ipynb @@ -452,7 +452,7 @@ "outputs": [], "source": [ "from sklearn.linear_model import SGDRegressor\n", - "sgd_reg = SGDRegressor(n_iter=50, penalty=None, eta0=0.1, random_state=42)\n", + "sgd_reg = SGDRegressor(max_iter=50, penalty=None, eta0=0.1, random_state=42)\n", "sgd_reg.fit(X, y.ravel())" ] }, @@ -880,7 +880,7 @@ }, "outputs": [], "source": [ - "sgd_reg = SGDRegressor(penalty=\"l2\", random_state=42)\n", + "sgd_reg = SGDRegressor(max_iter=5, 
penalty=\"l2\", random_state=42)\n", "sgd_reg.fit(X, y.ravel())\n", "sgd_reg.predict([[1.5]])" ] @@ -981,7 +981,7 @@ "X_train_poly_scaled = poly_scaler.fit_transform(X_train)\n", "X_val_poly_scaled = poly_scaler.transform(X_val)\n", "\n", - "sgd_reg = SGDRegressor(n_iter=1,\n", + "sgd_reg = SGDRegressor(max_iter=1,\n", " penalty=None,\n", " eta0=0.0005,\n", " warm_start=True,\n", @@ -1030,7 +1030,7 @@ "outputs": [], "source": [ "from sklearn.base import clone\n", - "sgd_reg = SGDRegressor(n_iter=1, warm_start=True, penalty=None,\n", + "sgd_reg = SGDRegressor(max_iter=1, warm_start=True, penalty=None,\n", " learning_rate=\"constant\", eta0=0.0005, random_state=42)\n", "\n", "minimum_val_error = float(\"inf\")\n", From 483bc589cef24b7e4bfce066f85bbb618329ed2c Mon Sep 17 00:00:00 2001 From: rickiepark Date: Tue, 30 Jan 2018 17:26:07 +0900 Subject: [PATCH 27/42] sync with upstream --- .gitignore | 2 -- environment.yml | 16 ---------------- 2 files changed, 18 deletions(-) delete mode 100644 environment.yml diff --git a/.gitignore b/.gitignore index b8f995c..c77a27e 100644 --- a/.gitignore +++ b/.gitignore @@ -11,5 +11,3 @@ my_* datasets/words datasets/flowers datasets/spam -*.gz -datasets/mnist/train-labels-idx1-ubyte diff --git a/environment.yml b/environment.yml deleted file mode 100644 index d930bee..0000000 --- a/environment.yml +++ /dev/null @@ -1,16 +0,0 @@ -name: handson-ml -dependencies: -- python=3.5 -- jupyter -- matplotlib -- numexpr -- numpy -- pandas -- Pillow -- psutil -- scikit-learn -- scipy -- sympy -- pip: - - tensorflow - - watermark From 31d2f0d6955f98d5a3f2751a1fefa118318c64c3 Mon Sep 17 00:00:00 2001 From: rickiepark Date: Tue, 30 Jan 2018 17:49:44 +0900 Subject: [PATCH 28/42] add n_jobs param --- 03_classification.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/03_classification.ipynb b/03_classification.ipynb index 7c66716..284d0a6 100644 --- a/03_classification.ipynb +++ b/03_classification.ipynb @@ -948,7 +948,7 @@ "metadata": {}, "outputs": [], "source": [ - "y_train_knn_pred = cross_val_predict(knn_clf, X_train, y_multilabel, cv=3)\n", + "y_train_knn_pred = cross_val_predict(knn_clf, X_train, y_multilabel, cv=3, n_jobs=-1)\n", "f1_score(y_multilabel, y_train_knn_pred, average=\"macro\")" ] }, From 1a6094a8dcd3a4e347427b218bf5d166c4517e76 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Geron?= Date: Tue, 6 Feb 2018 16:36:23 +0100 Subject: [PATCH 29/42] Fix error in training a stacked encoder one encoder at a time (fixes #166) --- 15_autoencoders.ipynb | 43 +++++++++++++++---------------------------- 1 file changed, 15 insertions(+), 28 deletions(-) diff --git a/15_autoencoders.ipynb b/15_autoencoders.ipynb index 0629c1f..e42329f 100644 --- a/15_autoencoders.ipynb +++ b/15_autoencoders.ipynb @@ -31,9 +31,7 @@ { "cell_type": "code", "execution_count": 1, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "# To support both python 2 and python 3\n", @@ -80,9 +78,7 @@ { "cell_type": "code", "execution_count": 2, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "def plot_image(image, shape=[28, 28]):\n", @@ -93,9 +89,7 @@ { "cell_type": "code", "execution_count": 3, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "def plot_multiple_images(images, n_rows, n_cols, pad=2):\n", @@ -126,9 +120,7 @@ { "cell_type": "code", "execution_count": 4, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "import 
numpy.random as rnd\n", @@ -419,9 +411,7 @@ { "cell_type": "code", "execution_count": 14, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "reset_graph()\n", @@ -479,9 +469,7 @@ { "cell_type": "code", "execution_count": 16, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "saver = tf.train.Saver()" @@ -545,9 +533,7 @@ { "cell_type": "code", "execution_count": 19, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "reset_graph()\n", @@ -555,8 +541,9 @@ "from functools import partial\n", "\n", "def train_autoencoder(X_train, n_neurons, n_epochs, batch_size,\n", - " learning_rate = 0.01, l2_reg = 0.0005,\n", - " activation=tf.nn.elu, seed=42):\n", + " learning_rate = 0.01, l2_reg = 0.0005, seed=42,\n", + " hidden_activation=tf.nn.elu,\n", + " output_activation=tf.nn.elu):\n", " graph = tf.Graph()\n", " with graph.as_default():\n", " tf.set_random_seed(seed)\n", @@ -567,12 +554,11 @@ " \n", " my_dense_layer = partial(\n", " tf.layers.dense,\n", - " activation=activation,\n", " kernel_initializer=tf.contrib.layers.variance_scaling_initializer(),\n", " kernel_regularizer=tf.contrib.layers.l2_regularizer(l2_reg))\n", "\n", - " hidden = my_dense_layer(X, n_neurons, name=\"hidden\")\n", - " outputs = my_dense_layer(hidden, n_inputs, activation=None, name=\"outputs\")\n", + " hidden = my_dense_layer(X, n_neurons, activation=hidden_activation, name=\"hidden\")\n", + " outputs = my_dense_layer(hidden, n_inputs, activation=output_activation, name=\"outputs\")\n", "\n", " reconstruction_loss = tf.reduce_mean(tf.square(outputs - X))\n", "\n", @@ -614,7 +600,8 @@ "metadata": {}, "outputs": [], "source": [ - "hidden_output, W1, b1, W4, b4 = train_autoencoder(mnist.train.images, n_neurons=300, n_epochs=4, batch_size=150)\n", + "hidden_output, W1, b1, W4, b4 = train_autoencoder(mnist.train.images, n_neurons=300, n_epochs=4, batch_size=150,\n", + " output_activation=None)\n", "_, W2, b2, W3, b3 = train_autoencoder(hidden_output, n_neurons=150, n_epochs=4, batch_size=150)" ] }, @@ -1748,7 +1735,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.2" + "version": "3.6.4" }, "nav_menu": { "height": "381px", From 3a72a2b4aafed0d422e996f33142685a1c13ee7e Mon Sep 17 00:00:00 2001 From: arodiss Date: Sat, 10 Feb 2018 19:24:56 +0200 Subject: [PATCH 30/42] made initial weights higher-variance --- extra_capsnets.ipynb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/extra_capsnets.ipynb b/extra_capsnets.ipynb index 20e5dd1..1569b4a 100644 --- a/extra_capsnets.ipynb +++ b/extra_capsnets.ipynb @@ -537,7 +537,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Okay, let's start by creating a trainable variable of shape (1, 1152, 10, 16, 8) that will hold all the transformation matrices. The first dimension of size 1 will make this array easy to tile. We initialize this variable randomly using a normal distribution with a standard deviation to 0.01." + "Okay, let's start by creating a trainable variable of shape (1, 1152, 10, 16, 8) that will hold all the transformation matrices. The first dimension of size 1 will make this array easy to tile. We initialize this variable randomly using a normal distribution with a standard deviation to 0.1." 
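As a rough NumPy illustration of the shapes involved in that cell (the notebook itself builds this with tf.random_normal and tf.tile; the batch size and the use of np.tile here are illustrative assumptions, not the notebook's code):

    import numpy as np

    init_sigma = 0.1                       # standard deviation of the normal initializer
    caps1_n_caps, caps1_n_dims = 1152, 8   # primary capsules
    caps2_n_caps, caps2_n_dims = 10, 16    # digit capsules
    batch_size = 2                         # illustrative only

    # One (16, 8) transformation matrix per (primary capsule, digit capsule) pair;
    # the leading dimension of size 1 lets the same weights be tiled for every instance.
    W = init_sigma * np.random.randn(1, caps1_n_caps, caps2_n_caps, caps2_n_dims, caps1_n_dims)
    W_tiled = np.tile(W, (batch_size, 1, 1, 1, 1))
    print(W_tiled.shape)                   # (2, 1152, 10, 16, 8)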
] }, { @@ -546,7 +546,7 @@ "metadata": {}, "outputs": [], "source": [ - "init_sigma = 0.01\n", + "init_sigma = 0.1\n", "\n", "W_init = tf.random_normal(\n", " shape=(1, caps1_n_caps, caps2_n_caps, caps2_n_dims, caps1_n_dims),\n", From de6f9895e923ca1051c3e645316800784de083fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Geron?= Date: Wed, 21 Feb 2018 23:04:09 +0100 Subject: [PATCH 31/42] Remove duplicate code in notebook for chapter 4, fixed #180 --- 04_training_linear_models.ipynb | 670 ++++++-------------------------- 1 file changed, 121 insertions(+), 549 deletions(-) diff --git a/04_training_linear_models.ipynb b/04_training_linear_models.ipynb index 4cf264e..1804968 100644 --- a/04_training_linear_models.ipynb +++ b/04_training_linear_models.ipynb @@ -2,40 +2,28 @@ "cells": [ { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "**Chapter 4 – Training Linear Models**" ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "_This notebook contains all the sample code and solutions to the exercises in chapter 4._" ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "# Setup" ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "First, let's make sure this notebook works well in both python 2 and 3, import a few common modules, ensure MatplotLib plots figures inline and prepare a function to save the figures:" ] @@ -43,11 +31,7 @@ { "cell_type": "code", "execution_count": 1, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "# To support both python 2 and python 3\n", @@ -82,10 +66,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "# Linear regression using the Normal Equation" ] @@ -93,11 +74,7 @@ { "cell_type": "code", "execution_count": 2, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", @@ -109,11 +86,7 @@ { "cell_type": "code", "execution_count": 3, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "plt.plot(X, y, \"b.\")\n", @@ -127,11 +100,7 @@ { "cell_type": "code", "execution_count": 4, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "X_b = np.c_[np.ones((100, 1)), X] # add x0 = 1 to each instance\n", @@ -141,11 +110,7 @@ { "cell_type": "code", "execution_count": 5, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "theta_best" @@ -154,11 +119,7 @@ { "cell_type": "code", "execution_count": 6, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "X_new = np.array([[0], [2]])\n", @@ -170,11 +131,7 @@ { "cell_type": "code", "execution_count": 7, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "plt.plot(X_new, y_predict, \"r-\")\n", @@ -185,10 +142,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, 
"source": [ "The figure in the book actually corresponds to the following code, with a legend and axis labels:" ] @@ -196,11 +150,7 @@ { "cell_type": "code", "execution_count": 8, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "plt.plot(X_new, y_predict, \"r-\", linewidth=2, label=\"Predictions\")\n", @@ -216,11 +166,7 @@ { "cell_type": "code", "execution_count": 9, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.linear_model import LinearRegression\n", @@ -232,11 +178,7 @@ { "cell_type": "code", "execution_count": 10, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "lin_reg.predict(X_new)" @@ -244,10 +186,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "# Linear regression using batch gradient descent" ] @@ -255,11 +194,7 @@ { "cell_type": "code", "execution_count": 11, - "metadata": { - "collapsed": true, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "eta = 0.1\n", @@ -275,11 +210,7 @@ { "cell_type": "code", "execution_count": 12, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "theta" @@ -288,11 +219,7 @@ { "cell_type": "code", "execution_count": 13, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "X_new_b.dot(theta)" @@ -301,11 +228,7 @@ { "cell_type": "code", "execution_count": 14, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "theta_path_bgd = []\n", @@ -325,29 +248,13 @@ " theta_path.append(theta)\n", " plt.xlabel(\"$x_1$\", fontsize=18)\n", " plt.axis([0, 2, 0, 15])\n", - " plt.title(r\"$\\eta = {}$\".format(eta), fontsize=16)\n", - "\n", - "np.random.seed(42)\n", - "theta = np.random.randn(2,1) # random initialization\n", - "\n", - "plt.figure(figsize=(10,4))\n", - "plt.subplot(131); plot_gradient_descent(theta, eta=0.02)\n", - "plt.ylabel(\"$y$\", rotation=0, fontsize=18)\n", - "plt.subplot(132); plot_gradient_descent(theta, eta=0.1, theta_path=theta_path_bgd)\n", - "plt.subplot(133); plot_gradient_descent(theta, eta=0.5)\n", - "\n", - "save_fig(\"gradient_descent_plot\")\n", - "plt.show()" + " plt.title(r\"$\\eta = {}$\".format(eta), fontsize=16)" ] }, { "cell_type": "code", "execution_count": 15, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "np.random.seed(42)\n", @@ -365,10 +272,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "# Stochastic Gradient Descent" ] @@ -376,11 +280,7 @@ { "cell_type": "code", "execution_count": 16, - "metadata": { - "collapsed": true, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "theta_path_sgd = []\n", @@ -391,11 +291,7 @@ { "cell_type": "code", "execution_count": 17, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "n_epochs = 50\n", @@ -431,11 +327,7 @@ { "cell_type": "code", "execution_count": 18, - "metadata": { - "collapsed": false, - "deletable": true, - 
"editable": true - }, + "metadata": {}, "outputs": [], "source": [ "theta" @@ -444,11 +336,7 @@ { "cell_type": "code", "execution_count": 19, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.linear_model import SGDRegressor\n", @@ -459,11 +347,7 @@ { "cell_type": "code", "execution_count": 20, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "sgd_reg.intercept_, sgd_reg.coef_" @@ -471,10 +355,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "# Mini-batch gradient descent" ] @@ -482,11 +363,7 @@ { "cell_type": "code", "execution_count": 21, - "metadata": { - "collapsed": true, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "theta_path_mgd = []\n", @@ -519,11 +396,7 @@ { "cell_type": "code", "execution_count": 22, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "theta" @@ -532,11 +405,7 @@ { "cell_type": "code", "execution_count": 23, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "theta_path_bgd = np.array(theta_path_bgd)\n", @@ -547,11 +416,7 @@ { "cell_type": "code", "execution_count": 24, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "plt.figure(figsize=(7,4))\n", @@ -568,10 +433,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "# Polynomial regression" ] @@ -580,9 +442,7 @@ "cell_type": "code", "execution_count": 25, "metadata": { - "collapsed": true, - "deletable": true, - "editable": true + "collapsed": true }, "outputs": [], "source": [ @@ -596,9 +456,7 @@ "cell_type": "code", "execution_count": 26, "metadata": { - "collapsed": true, - "deletable": true, - "editable": true + "collapsed": true }, "outputs": [], "source": [ @@ -610,11 +468,7 @@ { "cell_type": "code", "execution_count": 27, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "plt.plot(X, y, \"b.\")\n", @@ -628,11 +482,7 @@ { "cell_type": "code", "execution_count": 28, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.preprocessing import PolynomialFeatures\n", @@ -644,11 +494,7 @@ { "cell_type": "code", "execution_count": 29, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "X_poly[0]" @@ -657,11 +503,7 @@ { "cell_type": "code", "execution_count": 30, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "lin_reg = LinearRegression()\n", @@ -672,11 +514,7 @@ { "cell_type": "code", "execution_count": 31, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "X_new=np.linspace(-3, 3, 100).reshape(100, 1)\n", @@ -695,11 +533,7 @@ { "cell_type": "code", "execution_count": 32, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.preprocessing import 
StandardScaler\n", @@ -730,11 +564,7 @@ { "cell_type": "code", "execution_count": 33, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.metrics import mean_squared_error\n", @@ -760,11 +590,7 @@ { "cell_type": "code", "execution_count": 34, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "lin_reg = LinearRegression()\n", @@ -777,11 +603,7 @@ { "cell_type": "code", "execution_count": 35, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.pipeline import Pipeline\n", @@ -799,10 +621,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "# Regularized models" ] @@ -810,11 +629,7 @@ { "cell_type": "code", "execution_count": 36, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.linear_model import Ridge\n", @@ -857,11 +672,7 @@ { "cell_type": "code", "execution_count": 37, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.linear_model import Ridge\n", @@ -873,11 +684,7 @@ { "cell_type": "code", "execution_count": 38, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "sgd_reg = SGDRegressor(max_iter=5, penalty=\"l2\", random_state=42)\n", @@ -888,11 +695,7 @@ { "cell_type": "code", "execution_count": 39, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "ridge_reg = Ridge(alpha=1, solver=\"sag\", random_state=42)\n", @@ -903,11 +706,7 @@ { "cell_type": "code", "execution_count": 40, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.linear_model import Lasso\n", @@ -926,11 +725,7 @@ { "cell_type": "code", "execution_count": 41, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.linear_model import Lasso\n", @@ -942,11 +737,7 @@ { "cell_type": "code", "execution_count": 42, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.linear_model import ElasticNet\n", @@ -959,9 +750,6 @@ "cell_type": "code", "execution_count": 43, "metadata": { - "collapsed": false, - "deletable": true, - "editable": true, "scrolled": true }, "outputs": [], @@ -1022,11 +810,7 @@ { "cell_type": "code", "execution_count": 44, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.base import clone\n", @@ -1049,11 +833,7 @@ { "cell_type": "code", "execution_count": 45, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "best_epoch, best_model" @@ -1062,11 +842,7 @@ { "cell_type": "code", "execution_count": 46, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "%matplotlib inline\n", @@ -1077,11 +853,7 @@ { "cell_type": "code", "execution_count": 47, - 
"metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "t1a, t1b, t2a, t2b = -1, 3, -1.5, 1.5\n", @@ -1108,11 +880,7 @@ { "cell_type": "code", "execution_count": 48, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "def bgd_path(theta, X, y, l1, l2, core = 1, eta = 0.1, n_iterations = 50):\n", @@ -1175,10 +943,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "# Logistic regression" ] @@ -1186,11 +951,7 @@ { "cell_type": "code", "execution_count": 49, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "t = np.linspace(-10, 10, 100)\n", @@ -1211,11 +972,7 @@ { "cell_type": "code", "execution_count": 50, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn import datasets\n", @@ -1226,11 +983,7 @@ { "cell_type": "code", "execution_count": 51, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "print(iris.DESCR)" @@ -1240,9 +993,7 @@ "cell_type": "code", "execution_count": 52, "metadata": { - "collapsed": true, - "deletable": true, - "editable": true + "collapsed": true }, "outputs": [], "source": [ @@ -1253,11 +1004,7 @@ { "cell_type": "code", "execution_count": 53, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.linear_model import LogisticRegression\n", @@ -1268,11 +1015,7 @@ { "cell_type": "code", "execution_count": 54, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "X_new = np.linspace(0, 3, 1000).reshape(-1, 1)\n", @@ -1284,10 +1027,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "The figure in the book actually is actually a bit fancier:" ] @@ -1295,11 +1035,7 @@ { "cell_type": "code", "execution_count": 55, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "X_new = np.linspace(0, 3, 1000).reshape(-1, 1)\n", @@ -1326,11 +1062,7 @@ { "cell_type": "code", "execution_count": 56, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "decision_boundary" @@ -1339,11 +1071,7 @@ { "cell_type": "code", "execution_count": 57, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "log_reg.predict([[1.7], [1.5]])" @@ -1352,11 +1080,7 @@ { "cell_type": "code", "execution_count": 58, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.linear_model import LogisticRegression\n", @@ -1400,11 +1124,7 @@ { "cell_type": "code", "execution_count": 59, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "X = iris[\"data\"][:, (2, 3)] # petal length, petal width\n", @@ -1417,11 +1137,7 @@ { "cell_type": "code", "execution_count": 60, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, 
+ "metadata": {}, "outputs": [], "source": [ "x0, x1 = np.meshgrid(\n", @@ -1459,11 +1175,7 @@ { "cell_type": "code", "execution_count": 61, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "softmax_reg.predict([[5, 2]])" @@ -1472,11 +1184,7 @@ { "cell_type": "code", "execution_count": 62, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "softmax_reg.predict_proba([[5, 2]])" @@ -1484,40 +1192,28 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "# Exercise solutions" ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "## 1. to 11." ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "See appendix A." ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "## 12. Batch Gradient Descent with early stopping for Softmax Regression\n", "(without using Scikit-Learn)" @@ -1525,10 +1221,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Let's start by loading the data. We will just reuse the Iris dataset we loaded earlier." ] @@ -1537,9 +1230,7 @@ "cell_type": "code", "execution_count": 63, "metadata": { - "collapsed": true, - "deletable": true, - "editable": true + "collapsed": true }, "outputs": [], "source": [ @@ -1549,10 +1240,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "We need to add the bias term for every instance ($x_0 = 1$):" ] @@ -1561,9 +1249,7 @@ "cell_type": "code", "execution_count": 64, "metadata": { - "collapsed": true, - "deletable": true, - "editable": true + "collapsed": true }, "outputs": [], "source": [ @@ -1572,10 +1258,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "And let's set the random seed so the output of this exercise solution is reproducible:" ] @@ -1584,9 +1267,7 @@ "cell_type": "code", "execution_count": 65, "metadata": { - "collapsed": true, - "deletable": true, - "editable": true + "collapsed": true }, "outputs": [], "source": [ @@ -1595,10 +1276,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "The easiest option to split the dataset into a training set, a validation set and a test set would be to use Scikit-Learn's `train_test_split()` function, but the point of this exercise is to try understand the algorithms by implementing them manually. So here is one possible implementation:" ] @@ -1606,11 +1284,7 @@ { "cell_type": "code", "execution_count": 66, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "test_ratio = 0.2\n", @@ -1633,10 +1307,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "The targets are currently class indices (0, 1 or 2), but we need target class probabilities to train the Softmax Regression model. 
Each instance will have target class probabilities equal to 0.0 for all classes except for the target class which will have a probability of 1.0 (in other words, the vector of class probabilities for ay given instance is a one-hot vector). Let's write a small function to convert the vector of class indices into a matrix containing a one-hot vector for each instance:" ] @@ -1645,9 +1316,7 @@ "cell_type": "code", "execution_count": 67, "metadata": { - "collapsed": true, - "deletable": true, - "editable": true + "collapsed": true }, "outputs": [], "source": [ @@ -1661,10 +1330,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Let's test this function on the first 10 instances:" ] @@ -1672,11 +1338,7 @@ { "cell_type": "code", "execution_count": 68, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "y_train[:10]" @@ -1685,11 +1347,7 @@ { "cell_type": "code", "execution_count": 69, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "to_one_hot(y_train[:10])" @@ -1697,10 +1355,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Looks good, so let's create the target class probabilities matrix for the training set and the test set:" ] @@ -1709,9 +1364,7 @@ "cell_type": "code", "execution_count": 70, "metadata": { - "collapsed": true, - "deletable": true, - "editable": true + "collapsed": true }, "outputs": [], "source": [ @@ -1722,10 +1375,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Now let's implement the Softmax function. Recall that it is defined by the following equation:\n", "\n", @@ -1736,9 +1386,7 @@ "cell_type": "code", "execution_count": 71, "metadata": { - "collapsed": true, - "deletable": true, - "editable": true + "collapsed": true }, "outputs": [], "source": [ @@ -1750,10 +1398,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "We are almost ready to start training. Let's define the number of inputs and outputs:" ] @@ -1762,9 +1407,7 @@ "cell_type": "code", "execution_count": 72, "metadata": { - "collapsed": true, - "deletable": true, - "editable": true + "collapsed": true }, "outputs": [], "source": [ @@ -1774,10 +1417,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Now here comes the hardest part: training! Theoretically, it's simple: it's just a matter of translating the math equations into Python code. But in practice, it can be quite tricky: in particular, it's easy to mix up the order of the terms, or the indices. You can even end up with code that looks like it's working but is actually not computing exactly the right thing. When unsure, you should write down the shape of each term in the equation and make sure the corresponding terms in your code match closely. It can also help to evaluate each term independently and print them out. 
The good news it that you won't have to do this everyday, since all this is well implemented by Scikit-Learn, but it will help you understand what's going on under the hood.\n", "\n", @@ -1796,11 +1436,7 @@ { "cell_type": "code", "execution_count": 73, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "eta = 0.01\n", @@ -1823,10 +1459,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "And that's it! The Softmax model is trained. Let's look at the model parameters:" ] @@ -1834,11 +1467,7 @@ { "cell_type": "code", "execution_count": 74, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "Theta" @@ -1846,10 +1475,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Let's make predictions for the validation set and check the accuracy score:" ] @@ -1857,11 +1483,7 @@ { "cell_type": "code", "execution_count": 75, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "logits = X_valid.dot(Theta)\n", @@ -1874,10 +1496,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Well, this model looks pretty good. For the sake of the exercise, let's add a bit of $\\ell_2$ regularization. The following training code is similar to the one above, but the loss now has an additional $\\ell_2$ penalty, and the gradients have the proper additional term (note that we don't regularize the first element of `Theta` since this corresponds to the bias term). Also, let's try increasing the learning rate `eta`." ] @@ -1885,11 +1504,7 @@ { "cell_type": "code", "execution_count": 76, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "eta = 0.1\n", @@ -1915,10 +1530,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Because of the additional $\\ell_2$ penalty, the loss seems greater than earlier, but perhaps this model will perform better? Let's find out:" ] @@ -1926,11 +1538,7 @@ { "cell_type": "code", "execution_count": 77, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "logits = X_valid.dot(Theta)\n", @@ -1943,20 +1551,14 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Cool, perfect accuracy! We probably just got lucky with this validation set, but still, it's pleasant." ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Now let's add early stopping. For this we just need to measure the loss on the validation set at every iteration and stop when the error starts growing." 
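A self-contained sketch of the early-stopping loop described in that cell, using plain batch gradient descent on synthetic linear data rather than the notebook's Softmax exercise (the data, model and learning rate are illustrative assumptions):

    import numpy as np

    rng = np.random.RandomState(42)
    X_train, y_train = rng.randn(80, 3), rng.randn(80)
    X_valid, y_valid = rng.randn(20, 3), rng.randn(20)

    eta = 0.1
    theta = rng.randn(3)
    best_loss, best_theta = float("inf"), theta.copy()

    for iteration in range(1000):
        # one batch gradient descent step on the training set
        gradients = 2 / len(X_train) * X_train.T.dot(X_train.dot(theta) - y_train)
        theta = theta - eta * gradients
        # measure the validation loss and stop as soon as it starts growing
        val_loss = np.mean((X_valid.dot(theta) - y_valid) ** 2)
        if val_loss < best_loss:
            best_loss, best_theta = val_loss, theta.copy()
        else:
            print("stopping early at iteration", iteration)
            break
    # on a convex toy problem the loop may simply run to completion;
    # best_theta keeps the lowest-validation-error parameters either way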
] @@ -1964,11 +1566,7 @@ { "cell_type": "code", "execution_count": 78, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "eta = 0.1 \n", @@ -2008,11 +1606,7 @@ { "cell_type": "code", "execution_count": 79, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "logits = X_valid.dot(Theta)\n", @@ -2025,20 +1619,14 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Still perfect, but faster." ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Now let's plot the model's predictions on the whole dataset:" ] @@ -2046,11 +1634,7 @@ { "cell_type": "code", "execution_count": 80, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "x0, x1 = np.meshgrid(\n", @@ -2087,10 +1671,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "And now let's measure the final model's accuracy on the test set:" ] @@ -2098,11 +1679,7 @@ { "cell_type": "code", "execution_count": 81, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "logits = X_test.dot(Theta)\n", @@ -2115,10 +1692,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Our perfect model turns out to have slight imperfections. This variability is likely due to the very small size of the dataset: depending on how you sample the training set, validation set and the test set, you can get quite different results. Try changing the random seed and running the code again a few times, you will see that the results will vary." ] @@ -2127,9 +1701,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": true, - "deletable": true, - "editable": true + "collapsed": true }, "outputs": [], "source": [] @@ -2151,7 +1723,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.3" + "version": "3.6.4" }, "nav_menu": {}, "toc": { @@ -2165,5 +1737,5 @@ } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } From 5350ca17b1d8ea7fadb528d21e4f8a622399af1b Mon Sep 17 00:00:00 2001 From: Abhinav Upadhyay Date: Tue, 27 Feb 2018 11:57:11 +0530 Subject: [PATCH 32/42] Fix a typo: s/it/is --- 15_autoencoders.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/15_autoencoders.ipynb b/15_autoencoders.ipynb index e42329f..1e8299a 100644 --- a/15_autoencoders.ipynb +++ b/15_autoencoders.ipynb @@ -520,7 +520,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "There are many ways to train one Autoencoder at a time. The first approach it to train each Autoencoder using a different graph, then we create the Stacked Autoencoder by simply initializing it with the weights and biases copied from these Autoencoders." + "There are many ways to train one Autoencoder at a time. The first approach is to train each Autoencoder using a different graph, then we create the Stacked Autoencoder by simply initializing it with the weights and biases copied from these Autoencoders." 
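In outline, the second half of that idea, wiring a Stacked Autoencoder from weights and biases copied out of two separately trained Autoencoders, looks like this sketch (random arrays stand in for what `train_autoencoder()` returns; the ELU activation and layer sizes follow the notebook, the fake mini-batch is an illustrative assumption):

    import numpy as np

    rng = np.random.RandomState(42)
    n_inputs, n_hidden1, n_hidden2 = 784, 300, 150

    # stand-ins for the weights/biases returned by the two training phases
    W1, b1 = rng.randn(n_inputs, n_hidden1) * 0.01, np.zeros(n_hidden1)   # encoder of AE 1
    W4, b4 = rng.randn(n_hidden1, n_inputs) * 0.01, np.zeros(n_inputs)    # decoder of AE 1
    W2, b2 = rng.randn(n_hidden1, n_hidden2) * 0.01, np.zeros(n_hidden2)  # encoder of AE 2
    W3, b3 = rng.randn(n_hidden2, n_hidden1) * 0.01, np.zeros(n_hidden1)  # decoder of AE 2

    def elu(z):
        return np.where(z < 0, np.exp(z) - 1, z)

    # the stacked autoencoder simply reuses them in encoder / decoder order
    X = rng.rand(5, n_inputs)                    # a fake mini-batch of 5 flattened images
    hidden1 = elu(X.dot(W1) + b1)
    hidden2 = elu(hidden1.dot(W2) + b2)          # codings
    hidden3 = elu(hidden2.dot(W3) + b3)
    outputs = hidden3.dot(W4) + b4               # reconstructions
    print(outputs.shape)                         # (5, 784)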
] }, { From 749a44fbc196b06716f1b0902688d033abac9d53 Mon Sep 17 00:00:00 2001 From: Dror Atariah Date: Fri, 2 Mar 2018 18:44:05 +0100 Subject: [PATCH 33/42] Fix GH188 --- docker/Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/docker/Dockerfile b/docker/Dockerfile index b4ec526..82da41f 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -8,6 +8,7 @@ RUN apt-get update && apt-get upgrade -y \ sudo \ && rm -rf /var/lib/apt/lists/* +RUN conda update -n base conda RUN conda install -y -c conda-forge \ tensorflow \ jupyter_contrib_nbextensions From d9fbf7dd4c419898eaae1540f687cb4733f69557 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Geron?= Date: Thu, 15 Mar 2018 18:38:58 +0100 Subject: [PATCH 34/42] LinearRegression is based on SVD, not the Normal Equation (fixes #184), also fixes #179 (mini-batch gradient descent), and updates matplotlib code to latest version. --- 04_training_linear_models.ipynb | 253 +++++++++++++++++--------------- 1 file changed, 136 insertions(+), 117 deletions(-) diff --git a/04_training_linear_models.ipynb b/04_training_linear_models.ipynb index 1804968..8acdbfc 100644 --- a/04_training_linear_models.ipynb +++ b/04_training_linear_models.ipynb @@ -61,7 +61,11 @@ " print(\"Saving figure\", fig_id)\n", " if tight_layout:\n", " plt.tight_layout()\n", - " plt.savefig(path, format='png', dpi=300)\n" + " plt.savefig(path, format='png', dpi=300)\n", + "\n", + "# Ignore useless warnings (see SciPy issue #5998)\n", + "import warnings\n", + "warnings.filterwarnings(action=\"ignore\", module=\"scipy\", message=\"^internal gelsd\")" ] }, { @@ -188,7 +192,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Linear regression using batch gradient descent" + "The `LinearRegression` class is based on the `scipy.linalg.lstsq()` function (the name stands for \"least squares\"), which you could call directly:" ] }, { @@ -196,6 +200,46 @@ "execution_count": 11, "metadata": {}, "outputs": [], + "source": [ + "theta_best_svd, residuals, rank, s = np.linalg.lstsq(X_b, y, rcond=1e-6)\n", + "theta_best_svd" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This function computes $\\mathbf{X}^+\\mathbf{y}$, where $\\mathbf{X}^{+}$ is the _pseudoinverse_ of $\\mathbf{X}$ (specifically the Moore-Penrose inverse). You can use `np.linalg.pinv()` to compute the pseudoinverse directly:" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "np.linalg.pinv(X_b).dot(y)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Note**: the first releases of the book implied that the `LinearRegression` class was based on the Normal Equation. This was an error, my apologies: as explained above, it is based on the pseudoinverse, which ultimately relies on the SVD matrix decomposition of $\\mathbf{X}$ (see chapter 8 for details about the SVD decomposition). Its time complexity is $O(n^2)$ and it works even when $m < n$ or when some features are linear combinations of other features (in these cases, $\\mathbf{X}^T \\mathbf{X}$ is not invertible so the Normal Equation fails), see [issue #184](https://github.com/ageron/handson-ml/issues/184) for more details. 
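A short sketch of how the pseudoinverse mentioned above comes out of the SVD (the synthetic data mirrors the notebook's 4 + 3x example; the 1e-10 cut-off for negligible singular values is an illustrative assumption):

    import numpy as np

    rng = np.random.RandomState(42)
    X_b = np.c_[np.ones((100, 1)), 2 * rng.rand(100, 1)]    # add x0 = 1 to each instance
    y = 4 + 3 * X_b[:, 1:] + rng.randn(100, 1)

    U, s, Vt = np.linalg.svd(X_b, full_matrices=False)
    s_inv = np.where(s > 1e-10, 1 / s, 0.0)                 # invert only non-negligible singular values
    X_pinv = Vt.T.dot(np.diag(s_inv)).dot(U.T)              # Moore-Penrose pseudoinverse of X_b

    print(X_pinv.dot(y))               # roughly [[4.], [3.]]
    print(np.linalg.pinv(X_b).dot(y))  # same result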
However, this does not change the rest of the description of the `LinearRegression` class, in particular, it is based on an analytical solution, it does not scale well with the number of features, it scales linearly with the number of instances, all the data must fit in memory, it does not require feature scaling and the order of the instances in the training set does not matter." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Linear regression using batch gradient descent" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], "source": [ "eta = 0.1\n", "n_iterations = 1000\n", @@ -209,7 +253,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ @@ -218,7 +262,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ @@ -227,7 +271,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ @@ -253,7 +297,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 17, "metadata": {}, "outputs": [], "source": [ @@ -279,7 +323,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 18, "metadata": {}, "outputs": [], "source": [ @@ -290,7 +334,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 19, "metadata": {}, "outputs": [], "source": [ @@ -326,7 +370,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 20, "metadata": {}, "outputs": [], "source": [ @@ -335,7 +379,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 21, "metadata": {}, "outputs": [], "source": [ @@ -346,7 +390,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 22, "metadata": {}, "outputs": [], "source": [ @@ -362,7 +406,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 23, "metadata": {}, "outputs": [], "source": [ @@ -374,7 +418,7 @@ "np.random.seed(42)\n", "theta = np.random.randn(2,1) # random initialization\n", "\n", - "t0, t1 = 10, 1000\n", + "t0, t1 = 200, 1000\n", "def learning_schedule(t):\n", " return t0 / (t + t1)\n", "\n", @@ -387,7 +431,7 @@ " t += 1\n", " xi = X_b_shuffled[i:i+minibatch_size]\n", " yi = y_shuffled[i:i+minibatch_size]\n", - " gradients = 2 * xi.T.dot(xi.dot(theta) - yi)\n", + " gradients = 2/minibatch_size * xi.T.dot(xi.dot(theta) - yi)\n", " eta = learning_schedule(t)\n", " theta = theta - eta * gradients\n", " theta_path_mgd.append(theta)" @@ -395,7 +439,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 24, "metadata": {}, "outputs": [], "source": [ @@ -404,7 +448,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 25, "metadata": {}, "outputs": [], "source": [ @@ -415,7 +459,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 26, "metadata": {}, "outputs": [], "source": [ @@ -440,10 +484,8 @@ }, { "cell_type": "code", - "execution_count": 25, - "metadata": { - "collapsed": true - }, + "execution_count": 27, + "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", @@ -454,10 +496,8 @@ }, { "cell_type": "code", - "execution_count": 26, - "metadata": { - "collapsed": true - }, + "execution_count": 28, + "metadata": {}, "outputs": [], "source": [ "m = 100\n", @@ -467,7 +507,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 29, "metadata": {}, 
"outputs": [], "source": [ @@ -481,7 +521,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 30, "metadata": {}, "outputs": [], "source": [ @@ -493,7 +533,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 31, "metadata": {}, "outputs": [], "source": [ @@ -502,7 +542,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 32, "metadata": {}, "outputs": [], "source": [ @@ -513,7 +553,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 33, "metadata": {}, "outputs": [], "source": [ @@ -532,7 +572,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 34, "metadata": {}, "outputs": [], "source": [ @@ -563,7 +603,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 35, "metadata": {}, "outputs": [], "source": [ @@ -589,7 +629,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 36, "metadata": {}, "outputs": [], "source": [ @@ -602,7 +642,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 37, "metadata": {}, "outputs": [], "source": [ @@ -628,7 +668,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 38, "metadata": {}, "outputs": [], "source": [ @@ -671,7 +711,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 39, "metadata": {}, "outputs": [], "source": [ @@ -683,7 +723,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 40, "metadata": {}, "outputs": [], "source": [ @@ -694,7 +734,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 41, "metadata": {}, "outputs": [], "source": [ @@ -705,7 +745,7 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 42, "metadata": {}, "outputs": [], "source": [ @@ -724,7 +764,7 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 43, "metadata": {}, "outputs": [], "source": [ @@ -736,7 +776,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 44, "metadata": {}, "outputs": [], "source": [ @@ -748,7 +788,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 45, "metadata": { "scrolled": true }, @@ -809,7 +849,7 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 46, "metadata": {}, "outputs": [], "source": [ @@ -832,7 +872,7 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 47, "metadata": {}, "outputs": [], "source": [ @@ -841,7 +881,7 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 48, "metadata": {}, "outputs": [], "source": [ @@ -852,7 +892,7 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 49, "metadata": {}, "outputs": [], "source": [ @@ -879,7 +919,7 @@ }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 50, "metadata": {}, "outputs": [], "source": [ @@ -918,6 +958,9 @@ " plt.plot(t1_min, t2_min, \"rs\")\n", " plt.title(r\"$\\ell_{}$ penalty\".format(i + 1), fontsize=16)\n", " plt.axis([t1a, t1b, t2a, t2b])\n", + " if i == 1:\n", + " plt.xlabel(r\"$\\theta_1$\", fontsize=20)\n", + " plt.ylabel(r\"$\\theta_2$\", fontsize=20, rotation=0)\n", "\n", " plt.subplot(222 + i * 2)\n", " plt.grid(True)\n", @@ -928,14 +971,8 @@ " plt.plot(t1r_min, t2r_min, \"rs\")\n", " plt.title(title, fontsize=16)\n", " plt.axis([t1a, t1b, t2a, t2b])\n", - "\n", - "for subplot in (221, 223):\n", - " plt.subplot(subplot)\n", - " plt.ylabel(r\"$\\theta_2$\", fontsize=20, 
rotation=0)\n", - "\n", - "for subplot in (223, 224):\n", - " plt.subplot(subplot)\n", - " plt.xlabel(r\"$\\theta_1$\", fontsize=20)\n", + " if i == 1:\n", + " plt.xlabel(r\"$\\theta_1$\", fontsize=20)\n", "\n", "save_fig(\"lasso_vs_ridge_plot\")\n", "plt.show()" @@ -950,7 +987,7 @@ }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 51, "metadata": {}, "outputs": [], "source": [ @@ -971,7 +1008,7 @@ }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 52, "metadata": {}, "outputs": [], "source": [ @@ -982,7 +1019,7 @@ }, { "cell_type": "code", - "execution_count": 51, + "execution_count": 53, "metadata": {}, "outputs": [], "source": [ @@ -991,10 +1028,8 @@ }, { "cell_type": "code", - "execution_count": 52, - "metadata": { - "collapsed": true - }, + "execution_count": 54, + "metadata": {}, "outputs": [], "source": [ "X = iris[\"data\"][:, 3:] # petal width\n", @@ -1003,7 +1038,7 @@ }, { "cell_type": "code", - "execution_count": 53, + "execution_count": 55, "metadata": {}, "outputs": [], "source": [ @@ -1014,7 +1049,7 @@ }, { "cell_type": "code", - "execution_count": 54, + "execution_count": 56, "metadata": {}, "outputs": [], "source": [ @@ -1034,7 +1069,7 @@ }, { "cell_type": "code", - "execution_count": 55, + "execution_count": 57, "metadata": {}, "outputs": [], "source": [ @@ -1061,7 +1096,7 @@ }, { "cell_type": "code", - "execution_count": 56, + "execution_count": 58, "metadata": {}, "outputs": [], "source": [ @@ -1070,7 +1105,7 @@ }, { "cell_type": "code", - "execution_count": 57, + "execution_count": 59, "metadata": {}, "outputs": [], "source": [ @@ -1079,7 +1114,7 @@ }, { "cell_type": "code", - "execution_count": 58, + "execution_count": 60, "metadata": {}, "outputs": [], "source": [ @@ -1123,7 +1158,7 @@ }, { "cell_type": "code", - "execution_count": 59, + "execution_count": 61, "metadata": {}, "outputs": [], "source": [ @@ -1136,7 +1171,7 @@ }, { "cell_type": "code", - "execution_count": 60, + "execution_count": 62, "metadata": {}, "outputs": [], "source": [ @@ -1161,7 +1196,7 @@ "from matplotlib.colors import ListedColormap\n", "custom_cmap = ListedColormap(['#fafab0','#9898ff','#a0faa0'])\n", "\n", - "plt.contourf(x0, x1, zz, cmap=custom_cmap, linewidth=5)\n", + "plt.contourf(x0, x1, zz, cmap=custom_cmap)\n", "contour = plt.contour(x0, x1, zz1, cmap=plt.cm.brg)\n", "plt.clabel(contour, inline=1, fontsize=12)\n", "plt.xlabel(\"Petal length\", fontsize=14)\n", @@ -1174,7 +1209,7 @@ }, { "cell_type": "code", - "execution_count": 61, + "execution_count": 63, "metadata": {}, "outputs": [], "source": [ @@ -1183,7 +1218,7 @@ }, { "cell_type": "code", - "execution_count": 62, + "execution_count": 64, "metadata": {}, "outputs": [], "source": [ @@ -1228,10 +1263,8 @@ }, { "cell_type": "code", - "execution_count": 63, - "metadata": { - "collapsed": true - }, + "execution_count": 65, + "metadata": {}, "outputs": [], "source": [ "X = iris[\"data\"][:, (2, 3)] # petal length, petal width\n", @@ -1247,10 +1280,8 @@ }, { "cell_type": "code", - "execution_count": 64, - "metadata": { - "collapsed": true - }, + "execution_count": 66, + "metadata": {}, "outputs": [], "source": [ "X_with_bias = np.c_[np.ones([len(X), 1]), X]" @@ -1265,10 +1296,8 @@ }, { "cell_type": "code", - "execution_count": 65, - "metadata": { - "collapsed": true - }, + "execution_count": 67, + "metadata": {}, "outputs": [], "source": [ "np.random.seed(2042)" @@ -1283,7 +1312,7 @@ }, { "cell_type": "code", - "execution_count": 66, + "execution_count": 68, "metadata": {}, "outputs": [], 
"source": [ @@ -1314,10 +1343,8 @@ }, { "cell_type": "code", - "execution_count": 67, - "metadata": { - "collapsed": true - }, + "execution_count": 69, + "metadata": {}, "outputs": [], "source": [ "def to_one_hot(y):\n", @@ -1337,7 +1364,7 @@ }, { "cell_type": "code", - "execution_count": 68, + "execution_count": 70, "metadata": {}, "outputs": [], "source": [ @@ -1346,7 +1373,7 @@ }, { "cell_type": "code", - "execution_count": 69, + "execution_count": 71, "metadata": {}, "outputs": [], "source": [ @@ -1362,10 +1389,8 @@ }, { "cell_type": "code", - "execution_count": 70, - "metadata": { - "collapsed": true - }, + "execution_count": 72, + "metadata": {}, "outputs": [], "source": [ "Y_train_one_hot = to_one_hot(y_train)\n", @@ -1384,10 +1409,8 @@ }, { "cell_type": "code", - "execution_count": 71, - "metadata": { - "collapsed": true - }, + "execution_count": 73, + "metadata": {}, "outputs": [], "source": [ "def softmax(logits):\n", @@ -1405,10 +1428,8 @@ }, { "cell_type": "code", - "execution_count": 72, - "metadata": { - "collapsed": true - }, + "execution_count": 74, + "metadata": {}, "outputs": [], "source": [ "n_inputs = X_train.shape[1] # == 3 (2 features plus the bias term)\n", @@ -1435,7 +1456,7 @@ }, { "cell_type": "code", - "execution_count": 73, + "execution_count": 75, "metadata": {}, "outputs": [], "source": [ @@ -1466,7 +1487,7 @@ }, { "cell_type": "code", - "execution_count": 74, + "execution_count": 76, "metadata": {}, "outputs": [], "source": [ @@ -1482,7 +1503,7 @@ }, { "cell_type": "code", - "execution_count": 75, + "execution_count": 77, "metadata": {}, "outputs": [], "source": [ @@ -1503,7 +1524,7 @@ }, { "cell_type": "code", - "execution_count": 76, + "execution_count": 78, "metadata": {}, "outputs": [], "source": [ @@ -1537,7 +1558,7 @@ }, { "cell_type": "code", - "execution_count": 77, + "execution_count": 79, "metadata": {}, "outputs": [], "source": [ @@ -1565,7 +1586,7 @@ }, { "cell_type": "code", - "execution_count": 78, + "execution_count": 80, "metadata": {}, "outputs": [], "source": [ @@ -1605,7 +1626,7 @@ }, { "cell_type": "code", - "execution_count": 79, + "execution_count": 81, "metadata": {}, "outputs": [], "source": [ @@ -1633,7 +1654,7 @@ }, { "cell_type": "code", - "execution_count": 80, + "execution_count": 82, "metadata": {}, "outputs": [], "source": [ @@ -1659,7 +1680,7 @@ "from matplotlib.colors import ListedColormap\n", "custom_cmap = ListedColormap(['#fafab0','#9898ff','#a0faa0'])\n", "\n", - "plt.contourf(x0, x1, zz, cmap=custom_cmap, linewidth=5)\n", + "plt.contourf(x0, x1, zz, cmap=custom_cmap)\n", "contour = plt.contour(x0, x1, zz1, cmap=plt.cm.brg)\n", "plt.clabel(contour, inline=1, fontsize=12)\n", "plt.xlabel(\"Petal length\", fontsize=14)\n", @@ -1678,7 +1699,7 @@ }, { "cell_type": "code", - "execution_count": 81, + "execution_count": 83, "metadata": {}, "outputs": [], "source": [ @@ -1700,9 +1721,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [] } From eefe262dca78863c01a6789597b3153324992c76 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Geron?= Date: Thu, 15 Mar 2018 18:51:08 +0100 Subject: [PATCH 35/42] Fix typo (pca->rbf_pca), fixes #192 --- 08_dimensionality_reduction.ipynb | 108 ++++++++---------------------- 1 file changed, 28 insertions(+), 80 deletions(-) diff --git a/08_dimensionality_reduction.ipynb b/08_dimensionality_reduction.ipynb index d1bf0d2..5f0de13 100644 --- a/08_dimensionality_reduction.ipynb +++ 
b/08_dimensionality_reduction.ipynb @@ -31,9 +31,7 @@ { "cell_type": "code", "execution_count": 1, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "# To support both python 2 and python 3\n", @@ -77,9 +75,7 @@ { "cell_type": "code", "execution_count": 2, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "np.random.seed(4)\n", @@ -120,9 +116,7 @@ { "cell_type": "code", "execution_count": 3, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "X_centered = X - X.mean(axis=0)\n", @@ -134,9 +128,7 @@ { "cell_type": "code", "execution_count": 4, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "m, n = X.shape\n", @@ -157,9 +149,7 @@ { "cell_type": "code", "execution_count": 6, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "W2 = Vt.T[:, :2]\n", @@ -169,9 +159,7 @@ { "cell_type": "code", "execution_count": 7, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "X2D_using_svd = X2D" @@ -194,9 +182,7 @@ { "cell_type": "code", "execution_count": 8, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.decomposition import PCA\n", @@ -251,9 +237,7 @@ { "cell_type": "code", "execution_count": 12, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "X3D_inv = pca.inverse_transform(X2D)" @@ -301,9 +285,7 @@ { "cell_type": "code", "execution_count": 15, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "X3D_inv_using_svd = X2D_using_svd.dot(Vt[:2, :])" @@ -436,9 +418,7 @@ { "cell_type": "code", "execution_count": 22, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "from matplotlib.patches import FancyArrowPatch\n", @@ -466,9 +446,7 @@ { "cell_type": "code", "execution_count": 23, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "axes = [-1.8, 1.8, -1.3, 1.3, -1.0, 1.0]\n", @@ -563,9 +541,7 @@ { "cell_type": "code", "execution_count": 26, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.datasets import make_swiss_roll\n", @@ -785,9 +761,7 @@ { "cell_type": "code", "execution_count": 31, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "from six.moves import urllib\n", @@ -798,9 +772,7 @@ { "cell_type": "code", "execution_count": 32, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.model_selection import train_test_split\n", @@ -814,9 +786,7 @@ { "cell_type": "code", "execution_count": 33, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "pca = PCA()\n", @@ -837,9 +807,7 @@ { "cell_type": "code", "execution_count": 35, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "pca = PCA(n_components=0.95)\n", @@ -867,9 +835,7 @@ { "cell_type": "code", "execution_count": 38, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "pca = PCA(n_components = 154)\n", @@ -880,9 +846,7 @@ { "cell_type": "code", "execution_count": 39, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "def plot_digits(instances, images_per_row=5, **options):\n", @@ -921,9 +885,7 @@ { "cell_type": "code", "execution_count": 41, - "metadata": { - "collapsed": true - 
}, + "metadata": {}, "outputs": [], "source": [ "X_reduced_pca = X_reduced" @@ -956,9 +918,7 @@ { "cell_type": "code", "execution_count": 43, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "X_recovered_inc_pca = inc_pca.inverse_transform(X_reduced)" @@ -981,9 +941,7 @@ { "cell_type": "code", "execution_count": 45, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "X_reduced_inc_pca = X_reduced" @@ -1038,9 +996,7 @@ { "cell_type": "code", "execution_count": 48, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "filename = \"my_mnist.data\"\n", @@ -1060,9 +1016,7 @@ { "cell_type": "code", "execution_count": 49, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "del X_mm" @@ -1091,9 +1045,7 @@ { "cell_type": "code", "execution_count": 51, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "rnd_pca = PCA(n_components=154, svd_solver=\"randomized\", random_state=42)\n", @@ -1221,9 +1173,7 @@ { "cell_type": "code", "execution_count": 55, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "X, t = make_swiss_roll(n_samples=1000, noise=0.2, random_state=42)" @@ -1232,9 +1182,7 @@ { "cell_type": "code", "execution_count": 56, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.decomposition import KernelPCA\n", @@ -1285,7 +1233,7 @@ "source": [ "plt.figure(figsize=(6, 5))\n", "\n", - "X_inverse = pca.inverse_transform(X_reduced_rbf)\n", + "X_inverse = rbf_pca.inverse_transform(X_reduced_rbf)\n", "\n", "ax = plt.subplot(111, projection='3d')\n", "ax.view_init(10, -70)\n", @@ -2339,7 +2287,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.2" + "version": "3.6.4" }, "nav_menu": { "height": "352px", From e9c97ff3b8117ae584d8c52e57ab77535e031e8a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Geron?= Date: Thu, 15 Mar 2018 19:17:51 +0100 Subject: [PATCH 36/42] mean_squared_error(y_true, y_pred) instead of (y_pred, y_true), for clarity (result unchanged). 
Fixes #158 --- 04_training_linear_models.ipynb | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/04_training_linear_models.ipynb b/04_training_linear_models.ipynb index 8acdbfc..1845e8e 100644 --- a/04_training_linear_models.ipynb +++ b/04_training_linear_models.ipynb @@ -617,8 +617,8 @@ " model.fit(X_train[:m], y_train[:m])\n", " y_train_predict = model.predict(X_train[:m])\n", " y_val_predict = model.predict(X_val)\n", - " train_errors.append(mean_squared_error(y_train_predict, y_train[:m]))\n", - " val_errors.append(mean_squared_error(y_val_predict, y_val))\n", + " train_errors.append(mean_squared_error(y_train[:m], y_train_predict))\n", + " val_errors.append(mean_squared_error(y_val, y_val_predict))\n", "\n", " plt.plot(np.sqrt(train_errors), \"r-+\", linewidth=2, label=\"train\")\n", " plt.plot(np.sqrt(val_errors), \"b-\", linewidth=3, label=\"val\")\n", @@ -822,8 +822,8 @@ " sgd_reg.fit(X_train_poly_scaled, y_train)\n", " y_train_predict = sgd_reg.predict(X_train_poly_scaled)\n", " y_val_predict = sgd_reg.predict(X_val_poly_scaled)\n", - " train_errors.append(mean_squared_error(y_train_predict, y_train))\n", - " val_errors.append(mean_squared_error(y_val_predict, y_val))\n", + " train_errors.append(mean_squared_error(y_train, y_train_predict))\n", + " val_errors.append(mean_squared_error(y_val, y_val_predict))\n", "\n", "best_epoch = np.argmin(val_errors)\n", "best_val_rmse = np.sqrt(val_errors[best_epoch])\n", @@ -863,7 +863,7 @@ "for epoch in range(1000):\n", " sgd_reg.fit(X_train_poly_scaled, y_train) # continues where it left off\n", " y_val_predict = sgd_reg.predict(X_val_poly_scaled)\n", - " val_error = mean_squared_error(y_val_predict, y_val)\n", + " val_error = mean_squared_error(y_val, y_val_predict)\n", " if val_error < minimum_val_error:\n", " minimum_val_error = val_error\n", " best_epoch = epoch\n", From f9ac449f9761a4ea60592ce48a8f911752a50453 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Geron?= Date: Thu, 15 Mar 2018 23:26:18 +0100 Subject: [PATCH 37/42] Add cmap when plotting california image, fixes #65 --- 02_end_to_end_machine_learning_project.ipynb | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/02_end_to_end_machine_learning_project.ipynb b/02_end_to_end_machine_learning_project.ipynb index 14a7c20..9090550 100644 --- a/02_end_to_end_machine_learning_project.ipynb +++ b/02_end_to_end_machine_learning_project.ipynb @@ -66,7 +66,11 @@ " print(\"Saving figure\", fig_id)\n", " if tight_layout:\n", " plt.tight_layout()\n", - " plt.savefig(path, format=fig_extension, dpi=resolution)" + " plt.savefig(path, format=fig_extension, dpi=resolution)\n", + "\n", + "# Ignore useless warnings (see SciPy issue #5998)\n", + "import warnings\n", + "warnings.filterwarnings(action=\"ignore\", module=\"scipy\", message=\"^internal gelsd\")" ] }, { @@ -466,7 +470,8 @@ " c=\"median_house_value\", cmap=plt.get_cmap(\"jet\"),\n", " colorbar=False, alpha=0.4,\n", " )\n", - "plt.imshow(california_img, extent=[-124.55, -113.80, 32.45, 42.05], alpha=0.5)\n", + "plt.imshow(california_img, extent=[-124.55, -113.80, 32.45, 42.05], alpha=0.5,\n", + " cmap=plt.get_cmap(\"jet\"))\n", "plt.ylabel(\"Latitude\", fontsize=14)\n", "plt.xlabel(\"Longitude\", fontsize=14)\n", "\n", @@ -2241,7 +2246,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.3" + "version": "3.6.4" }, "nav_menu": { "height": "279px", From 82637087771c6735bead49e0150b9fa24aad7e80 Mon Sep 17 00:00:00 2001 From: 
ziembla Date: Wed, 21 Mar 2018 22:21:54 +0100 Subject: [PATCH 38/42] Dependencies for OpenAI gym in Chapter 16 --- docker/Dockerfile | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 82da41f..72a16f2 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -6,12 +6,15 @@ RUN apt-get update && apt-get upgrade -y \ build-essential \ git \ sudo \ + cmake zlib1g-dev libjpeg-dev xvfb libav-tools xorg-dev libboost-all-dev libsdl2-dev swig \ && rm -rf /var/lib/apt/lists/* RUN conda update -n base conda RUN conda install -y -c conda-forge \ tensorflow \ - jupyter_contrib_nbextensions + jupyter_contrib_nbextensions \ + pyopengl +RUN pip install "gym[atari,box2d,classic_control]" ARG username ARG userid From c4f82f3621cdae6cb08a129fec2838a850103b62 Mon Sep 17 00:00:00 2001 From: ziembla Date: Wed, 21 Mar 2018 22:35:10 +0100 Subject: [PATCH 39/42] Don't try OpenAI rendering in Chapter 16 Section 3 --- 16_reinforcement_learning.ipynb | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/16_reinforcement_learning.ipynb b/16_reinforcement_learning.ipynb index 15c258e..30f2ab2 100644 --- a/16_reinforcement_learning.ipynb +++ b/16_reinforcement_learning.ipynb @@ -574,6 +574,15 @@ " plt.show()" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "openai_cart_pole_rendering = False # don't try, just use the safe way?" + ] + }, { "cell_type": "code", "execution_count": 26, From b3991908465d5ac9a0d48891202fae91401a5c62 Mon Sep 17 00:00:00 2001 From: Vladimir Tikhonov Date: Sat, 24 Mar 2018 17:34:38 +0300 Subject: [PATCH 40/42] Fix small typo in numpy notebook --- tools_numpy.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools_numpy.ipynb b/tools_numpy.ipynb index 5ec032d..ed2d81b 100644 --- a/tools_numpy.ipynb +++ b/tools_numpy.ipynb @@ -459,7 +459,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "NumPy first creates three `ndarrays` (one per dimension), each of shape `(3, 2, 10)`. Each array has values equal to the coordinate along a specific axis. For example, all elements in the `z` array are equal to their z-coordinate:\n", + "NumPy first creates three `ndarrays` (one per dimension), each of shape `(2, 10)`. Each array has values equal to the coordinate along a specific axis. For example, all elements in the `z` array are equal to their z-coordinate:\n", "\n", " [[[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n", " [ 0. 0. 0. 0. 0. 0. 0. 0. 0. 
0.]]\n", From a164ffc699d38e1bb54cdd4ce42fd635b9a55710 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Geron?= Date: Sat, 24 Mar 2018 22:50:29 +0100 Subject: [PATCH 41/42] Remove unnecessary reuse_vars_dict in notebook 11 --- 11_deep_learning.ipynb | 214 +++++++++++------------------------ 1 file changed, 55 insertions(+), 159 deletions(-) diff --git a/11_deep_learning.ipynb b/11_deep_learning.ipynb index c002217..d83e660 100644 --- a/11_deep_learning.ipynb +++ b/11_deep_learning.ipynb @@ -31,9 +31,7 @@ { "cell_type": "code", "execution_count": 1, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "# To support both python 2 and python 3\n", @@ -79,9 +77,7 @@ { "cell_type": "code", "execution_count": 2, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "def logit(z):\n", @@ -134,9 +130,7 @@ { "cell_type": "code", "execution_count": 4, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "import tensorflow as tf" @@ -145,9 +139,7 @@ { "cell_type": "code", "execution_count": 5, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "reset_graph()\n", @@ -161,9 +153,7 @@ { "cell_type": "code", "execution_count": 6, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "he_init = tf.contrib.layers.variance_scaling_initializer()\n", @@ -188,9 +178,7 @@ { "cell_type": "code", "execution_count": 7, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "def leaky_relu(z, alpha=0.01):\n", @@ -226,9 +214,7 @@ { "cell_type": "code", "execution_count": 9, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "reset_graph()\n", @@ -239,9 +225,7 @@ { "cell_type": "code", "execution_count": 10, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "def leaky_relu(z, name=None):\n", @@ -260,9 +244,7 @@ { "cell_type": "code", "execution_count": 11, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "reset_graph()\n", @@ -276,9 +258,7 @@ { "cell_type": "code", "execution_count": 12, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "X = tf.placeholder(tf.float32, shape=(None, n_inputs), name=\"X\")\n", @@ -288,9 +268,7 @@ { "cell_type": "code", "execution_count": 13, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "with tf.name_scope(\"dnn\"):\n", @@ -302,9 +280,7 @@ { "cell_type": "code", "execution_count": 14, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "with tf.name_scope(\"loss\"):\n", @@ -315,9 +291,7 @@ { "cell_type": "code", "execution_count": 15, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "learning_rate = 0.01\n", @@ -330,9 +304,7 @@ { "cell_type": "code", "execution_count": 16, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "with tf.name_scope(\"eval\"):\n", @@ -343,9 +315,7 @@ { "cell_type": "code", "execution_count": 17, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "init = tf.global_variables_initializer()\n", @@ -404,9 +374,7 @@ { "cell_type": "code", "execution_count": 20, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "def elu(z, alpha=1):\n", @@ -441,9 +409,7 @@ { "cell_type": "code", "execution_count": 22, -
"metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "reset_graph()\n", @@ -454,9 +420,7 @@ { "cell_type": "code", "execution_count": 23, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "hidden1 = tf.layers.dense(X, n_hidden1, activation=tf.nn.elu, name=\"hidden1\")" @@ -479,9 +443,7 @@ { "cell_type": "code", "execution_count": 24, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "def selu(z,\n", @@ -543,9 +505,7 @@ { "cell_type": "code", "execution_count": 27, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "def selu(z,\n", @@ -571,9 +531,7 @@ { "cell_type": "code", "execution_count": 28, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "reset_graph()\n", @@ -668,9 +626,7 @@ { "cell_type": "code", "execution_count": 30, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "reset_graph()\n", @@ -702,9 +658,7 @@ { "cell_type": "code", "execution_count": 31, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "reset_graph()\n", @@ -723,9 +677,7 @@ { "cell_type": "code", "execution_count": 32, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "from functools import partial\n", @@ -753,9 +705,7 @@ { "cell_type": "code", "execution_count": 33, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "reset_graph()\n", @@ -811,9 +761,7 @@ { "cell_type": "code", "execution_count": 34, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "n_epochs = 20\n", @@ -912,9 +860,7 @@ { "cell_type": "code", "execution_count": 38, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "reset_graph()\n", @@ -946,9 +892,7 @@ { "cell_type": "code", "execution_count": 39, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "learning_rate = 0.01" @@ -964,9 +908,7 @@ { "cell_type": "code", "execution_count": 40, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "threshold = 1.0\n", @@ -988,9 +930,7 @@ { "cell_type": "code", "execution_count": 41, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "with tf.name_scope(\"eval\"):\n", @@ -1001,9 +941,7 @@ { "cell_type": "code", "execution_count": 42, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "init = tf.global_variables_initializer()\n", @@ -1013,9 +951,7 @@ { "cell_type": "code", "execution_count": 43, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "n_epochs = 20\n", @@ -1065,9 +1001,7 @@ { "cell_type": "code", "execution_count": 45, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "reset_graph()" @@ -1076,9 +1010,7 @@ { "cell_type": "code", "execution_count": 46, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "saver = tf.train.import_meta_graph(\"./my_model_final.ckpt.meta\")" @@ -1111,9 +1043,7 @@ { "cell_type": "code", "execution_count": 48, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "from IPython.display import clear_output, Image, display, HTML\n", @@ -1175,9 +1105,7 @@ { "cell_type": "code", "execution_count": 50, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], 
"source": [ "X = tf.get_default_graph().get_tensor_by_name(\"X:0\")\n", @@ -1198,9 +1126,7 @@ { "cell_type": "code", "execution_count": 51, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "for op in (X, y, accuracy, training_op):\n", @@ -1217,9 +1143,7 @@ { "cell_type": "code", "execution_count": 52, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "X, y, accuracy, training_op = tf.get_collection(\"my_important_ops\")" @@ -1280,9 +1204,7 @@ { "cell_type": "code", "execution_count": 55, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "reset_graph()\n", @@ -1363,9 +1285,7 @@ { "cell_type": "code", "execution_count": 57, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "reset_graph()\n", @@ -1437,9 +1357,7 @@ { "cell_type": "code", "execution_count": 59, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "reset_graph()\n", @@ -1489,8 +1407,7 @@ "source": [ "reuse_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,\n", " scope=\"hidden[123]\") # regular expression\n", - "reuse_vars_dict = dict([(var.op.name, var) for var in reuse_vars])\n", - "restore_saver = tf.train.Saver(reuse_vars_dict) # to restore layers 1-3\n", + "restore_saver = tf.train.Saver(reuse_vars) # to restore layers 1-3\n", "\n", "init = tf.global_variables_initializer()\n", "saver = tf.train.Saver()\n", @@ -1527,9 +1444,7 @@ { "cell_type": "code", "execution_count": 61, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "reset_graph()\n", @@ -1670,9 +1585,7 @@ { "cell_type": "code", "execution_count": 67, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "reset_graph()\n", @@ -1706,9 +1619,7 @@ { "cell_type": "code", "execution_count": 68, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "with tf.name_scope(\"train\"): # not shown in the book\n", @@ -1721,9 +1632,7 @@ { "cell_type": "code", "execution_count": 69, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "init = tf.global_variables_initializer()\n", @@ -1738,8 +1647,7 @@ "source": [ "reuse_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,\n", " scope=\"hidden[123]\") # regular expression\n", - "reuse_vars_dict = dict([(var.op.name, var) for var in reuse_vars])\n", - "restore_saver = tf.train.Saver(reuse_vars_dict) # to restore layers 1-3\n", + "restore_saver = tf.train.Saver(reuse_vars) # to restore layers 1-3\n", "\n", "init = tf.global_variables_initializer()\n", "saver = tf.train.Saver()\n", @@ -1762,9 +1670,7 @@ { "cell_type": "code", "execution_count": 71, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "reset_graph()\n", @@ -1783,9 +1689,7 @@ { "cell_type": "code", "execution_count": 72, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "with tf.name_scope(\"dnn\"):\n", @@ -1804,9 +1708,7 @@ { "cell_type": "code", "execution_count": 73, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "with tf.name_scope(\"loss\"):\n", @@ -1837,8 +1739,7 @@ "source": [ "reuse_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,\n", " scope=\"hidden[123]\") # regular expression\n", - "reuse_vars_dict = dict([(var.op.name, var) for var in reuse_vars])\n", - "restore_saver = tf.train.Saver(reuse_vars_dict) # to restore layers 1-3\n", + 
"restore_saver = tf.train.Saver(reuse_vars) # to restore layers 1-3\n", "\n", "init = tf.global_variables_initializer()\n", "saver = tf.train.Saver()\n", @@ -1868,9 +1769,7 @@ { "cell_type": "code", "execution_count": 75, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "reset_graph()\n", @@ -1913,15 +1812,12 @@ { "cell_type": "code", "execution_count": 76, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "reuse_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,\n", " scope=\"hidden[123]\") # regular expression\n", - "reuse_vars_dict = dict([(var.op.name, var) for var in reuse_vars])\n", - "restore_saver = tf.train.Saver(reuse_vars_dict) # to restore layers 1-3\n", + "restore_saver = tf.train.Saver(reuse_vars) # to restore layers 1-3\n", "\n", "init = tf.global_variables_initializer()\n", "saver = tf.train.Saver()" @@ -4941,7 +4837,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.2" + "version": "3.6.4" }, "nav_menu": { "height": "360px", From 8f6a28e6bc47ab6d6e5cff68de574d1b8366a067 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Geron?= Date: Tue, 3 Apr 2018 16:45:53 +0200 Subject: [PATCH 42/42] Improve the implementation of the test_set_check() function: faster, supports python 2 and 3, and more fine grain split (32 bits intead of 8) --- 02_end_to_end_machine_learning_project.ipynb | 269 ++++++++++--------- 1 file changed, 147 insertions(+), 122 deletions(-) diff --git a/02_end_to_end_machine_learning_project.ipynb b/02_end_to_end_machine_learning_project.ipynb index 9090550..2eed4e3 100644 --- a/02_end_to_end_machine_learning_project.ipynb +++ b/02_end_to_end_machine_learning_project.ipynb @@ -219,26 +219,41 @@ "metadata": {}, "outputs": [], "source": [ - "import hashlib\n", + "from zlib import crc32\n", "\n", - "def test_set_check(identifier, test_ratio, hash):\n", - " return hash(np.int64(identifier)).digest()[-1] < 256 * test_ratio\n", + "def test_set_check(identifier, test_ratio):\n", + " return crc32(np.int64(identifier)) & 0xffffffff < test_ratio * 2**32\n", "\n", - "def split_train_test_by_id(data, test_ratio, id_column, hash=hashlib.md5):\n", + "def split_train_test_by_id(data, test_ratio, id_column):\n", " ids = data[id_column]\n", - " in_test_set = ids.apply(lambda id_: test_set_check(id_, test_ratio, hash))\n", + " in_test_set = ids.apply(lambda id_: test_set_check(id_, test_ratio))\n", " return data.loc[~in_test_set], data.loc[in_test_set]" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The implementation of `test_set_check()` above works fine in both Python 2 and Python 3. 
In earlier releases, the following implementation was proposed, which supported any hash function, but was much slower and did not support Python 2:" + ] + }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ - "# This version supports both Python 2 and Python 3, instead of just Python 3.\n", - "def test_set_check(identifier, test_ratio, hash):\n", - " return bytearray(hash(np.int64(identifier)).digest())[-1] < 256 * test_ratio" + "import hashlib\n", + "\n", + "def test_set_check(identifier, test_ratio, hash=hashlib.md5):\n", + " return hash(np.int64(identifier)).digest()[-1] < 256 * test_ratio" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you want an implementation that supports any hash function and is compatible with both Python 2 and Python 3, here is one:" ] }, { @@ -246,6 +261,16 @@ "execution_count": 15, "metadata": {}, "outputs": [], + "source": [ + "def test_set_check(identifier, test_ratio, hash=hashlib.md5):\n", + " return bytearray(hash(np.int64(identifier)).digest())[-1] < 256 * test_ratio" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], "source": [ "housing_with_id = housing.reset_index() # adds an `index` column\n", "train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, \"index\")" @@ -253,7 +278,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 17, "metadata": {}, "outputs": [], "source": [ @@ -263,7 +288,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 18, "metadata": {}, "outputs": [], "source": [ @@ -272,7 +297,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 19, "metadata": {}, "outputs": [], "source": [ @@ -283,7 +308,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 20, "metadata": {}, "outputs": [], "source": [ @@ -292,7 +317,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 21, "metadata": {}, "outputs": [], "source": [ @@ -301,7 +326,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 22, "metadata": {}, "outputs": [], "source": [ @@ -313,7 +338,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 23, "metadata": {}, "outputs": [], "source": [ @@ -322,7 +347,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 24, "metadata": {}, "outputs": [], "source": [ @@ -331,7 +356,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 25, "metadata": {}, "outputs": [], "source": [ @@ -345,7 +370,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 26, "metadata": {}, "outputs": [], "source": [ @@ -354,7 +379,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 27, "metadata": {}, "outputs": [], "source": [ @@ -363,7 +388,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 28, "metadata": {}, "outputs": [], "source": [ @@ -383,7 +408,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 29, "metadata": {}, "outputs": [], "source": [ @@ -392,7 +417,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 30, "metadata": {}, "outputs": [], "source": [ @@ -409,7 +434,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 31, "metadata": {}, "outputs": [], "source": [ @@ -418,7 +443,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 32, "metadata": {}, 
"outputs": [], "source": [ @@ -428,7 +453,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 33, "metadata": {}, "outputs": [], "source": [ @@ -445,7 +470,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 34, "metadata": {}, "outputs": [], "source": [ @@ -459,7 +484,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 35, "metadata": {}, "outputs": [], "source": [ @@ -488,7 +513,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 36, "metadata": {}, "outputs": [], "source": [ @@ -497,7 +522,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 37, "metadata": {}, "outputs": [], "source": [ @@ -506,7 +531,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 38, "metadata": {}, "outputs": [], "source": [ @@ -521,7 +546,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 39, "metadata": {}, "outputs": [], "source": [ @@ -533,7 +558,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 40, "metadata": {}, "outputs": [], "source": [ @@ -551,7 +576,7 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 41, "metadata": {}, "outputs": [], "source": [ @@ -561,7 +586,7 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 42, "metadata": {}, "outputs": [], "source": [ @@ -573,7 +598,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 43, "metadata": {}, "outputs": [], "source": [ @@ -589,7 +614,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 44, "metadata": {}, "outputs": [], "source": [ @@ -599,7 +624,7 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 45, "metadata": {}, "outputs": [], "source": [ @@ -609,7 +634,7 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 46, "metadata": {}, "outputs": [], "source": [ @@ -618,7 +643,7 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 47, "metadata": {}, "outputs": [], "source": [ @@ -627,7 +652,7 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 48, "metadata": {}, "outputs": [], "source": [ @@ -638,7 +663,7 @@ }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 49, "metadata": {}, "outputs": [], "source": [ @@ -656,7 +681,7 @@ }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 50, "metadata": {}, "outputs": [], "source": [ @@ -666,7 +691,7 @@ }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 51, "metadata": {}, "outputs": [], "source": [ @@ -675,7 +700,7 @@ }, { "cell_type": "code", - "execution_count": 51, + "execution_count": 52, "metadata": {}, "outputs": [], "source": [ @@ -691,7 +716,7 @@ }, { "cell_type": "code", - "execution_count": 52, + "execution_count": 53, "metadata": {}, "outputs": [], "source": [ @@ -707,7 +732,7 @@ }, { "cell_type": "code", - "execution_count": 53, + "execution_count": 54, "metadata": {}, "outputs": [], "source": [ @@ -716,7 +741,7 @@ }, { "cell_type": "code", - "execution_count": 54, + "execution_count": 55, "metadata": {}, "outputs": [], "source": [ @@ -726,7 +751,7 @@ }, { "cell_type": "code", - "execution_count": 55, + "execution_count": 56, "metadata": {}, "outputs": [], "source": [ @@ -735,7 +760,7 @@ }, { "cell_type": "code", - "execution_count": 56, + "execution_count": 57, "metadata": {}, "outputs": [], "source": [ @@ -744,7 +769,7 @@ }, { "cell_type": "code", - 
"execution_count": 57, + "execution_count": 58, "metadata": {}, "outputs": [], "source": [ @@ -761,7 +786,7 @@ }, { "cell_type": "code", - "execution_count": 58, + "execution_count": 59, "metadata": {}, "outputs": [], "source": [ @@ -778,7 +803,7 @@ }, { "cell_type": "code", - "execution_count": 59, + "execution_count": 60, "metadata": {}, "outputs": [], "source": [ @@ -788,7 +813,7 @@ }, { "cell_type": "code", - "execution_count": 60, + "execution_count": 61, "metadata": {}, "outputs": [], "source": [ @@ -811,7 +836,7 @@ }, { "cell_type": "code", - "execution_count": 61, + "execution_count": 62, "metadata": {}, "outputs": [], "source": [ @@ -831,7 +856,7 @@ }, { "cell_type": "code", - "execution_count": 62, + "execution_count": 63, "metadata": {}, "outputs": [], "source": [ @@ -847,7 +872,7 @@ }, { "cell_type": "code", - "execution_count": 63, + "execution_count": 64, "metadata": {}, "outputs": [], "source": [ @@ -1048,7 +1073,7 @@ }, { "cell_type": "code", - "execution_count": 64, + "execution_count": 65, "metadata": {}, "outputs": [], "source": [ @@ -1069,7 +1094,7 @@ }, { "cell_type": "code", - "execution_count": 65, + "execution_count": 66, "metadata": {}, "outputs": [], "source": [ @@ -1085,7 +1110,7 @@ }, { "cell_type": "code", - "execution_count": 66, + "execution_count": 67, "metadata": {}, "outputs": [], "source": [ @@ -1096,7 +1121,7 @@ }, { "cell_type": "code", - "execution_count": 67, + "execution_count": 68, "metadata": {}, "outputs": [], "source": [ @@ -1112,7 +1137,7 @@ }, { "cell_type": "code", - "execution_count": 68, + "execution_count": 69, "metadata": {}, "outputs": [], "source": [ @@ -1142,7 +1167,7 @@ }, { "cell_type": "code", - "execution_count": 69, + "execution_count": 70, "metadata": {}, "outputs": [], "source": [ @@ -1159,7 +1184,7 @@ }, { "cell_type": "code", - "execution_count": 70, + "execution_count": 71, "metadata": {}, "outputs": [], "source": [ @@ -1177,7 +1202,7 @@ }, { "cell_type": "code", - "execution_count": 71, + "execution_count": 72, "metadata": {}, "outputs": [], "source": [ @@ -1193,7 +1218,7 @@ }, { "cell_type": "code", - "execution_count": 72, + "execution_count": 73, "metadata": {}, "outputs": [], "source": [ @@ -1219,7 +1244,7 @@ }, { "cell_type": "code", - "execution_count": 73, + "execution_count": 74, "metadata": {}, "outputs": [], "source": [ @@ -1241,7 +1266,7 @@ }, { "cell_type": "code", - "execution_count": 74, + "execution_count": 75, "metadata": {}, "outputs": [], "source": [ @@ -1255,7 +1280,7 @@ }, { "cell_type": "code", - "execution_count": 75, + "execution_count": 76, "metadata": {}, "outputs": [], "source": [ @@ -1265,7 +1290,7 @@ }, { "cell_type": "code", - "execution_count": 76, + "execution_count": 77, "metadata": {}, "outputs": [], "source": [ @@ -1281,7 +1306,7 @@ }, { "cell_type": "code", - "execution_count": 77, + "execution_count": 78, "metadata": {}, "outputs": [], "source": [ @@ -1293,7 +1318,7 @@ }, { "cell_type": "code", - "execution_count": 78, + "execution_count": 79, "metadata": {}, "outputs": [], "source": [ @@ -1314,7 +1339,7 @@ }, { "cell_type": "code", - "execution_count": 79, + "execution_count": 80, "metadata": {}, "outputs": [], "source": [ @@ -1323,7 +1348,7 @@ }, { "cell_type": "code", - "execution_count": 80, + "execution_count": 81, "metadata": {}, "outputs": [], "source": [ @@ -1332,7 +1357,7 @@ }, { "cell_type": "code", - "execution_count": 81, + "execution_count": 82, "metadata": {}, "outputs": [], "source": [ @@ -1346,7 +1371,7 @@ }, { "cell_type": "code", - "execution_count": 82, + 
"execution_count": 83, "metadata": {}, "outputs": [], "source": [ @@ -1358,7 +1383,7 @@ }, { "cell_type": "code", - "execution_count": 83, + "execution_count": 84, "metadata": {}, "outputs": [], "source": [ @@ -1370,7 +1395,7 @@ }, { "cell_type": "code", - "execution_count": 84, + "execution_count": 85, "metadata": {}, "outputs": [], "source": [ @@ -1389,7 +1414,7 @@ }, { "cell_type": "code", - "execution_count": 85, + "execution_count": 86, "metadata": {}, "outputs": [], "source": [ @@ -1402,7 +1427,7 @@ }, { "cell_type": "code", - "execution_count": 86, + "execution_count": 87, "metadata": {}, "outputs": [], "source": [ @@ -1416,7 +1441,7 @@ }, { "cell_type": "code", - "execution_count": 87, + "execution_count": 88, "metadata": {}, "outputs": [], "source": [ @@ -1428,7 +1453,7 @@ }, { "cell_type": "code", - "execution_count": 88, + "execution_count": 89, "metadata": {}, "outputs": [], "source": [ @@ -1440,7 +1465,7 @@ }, { "cell_type": "code", - "execution_count": 89, + "execution_count": 90, "metadata": {}, "outputs": [], "source": [ @@ -1452,7 +1477,7 @@ }, { "cell_type": "code", - "execution_count": 90, + "execution_count": 91, "metadata": {}, "outputs": [], "source": [ @@ -1466,7 +1491,7 @@ }, { "cell_type": "code", - "execution_count": 91, + "execution_count": 92, "metadata": {}, "outputs": [], "source": [ @@ -1476,7 +1501,7 @@ }, { "cell_type": "code", - "execution_count": 92, + "execution_count": 93, "metadata": {}, "outputs": [], "source": [ @@ -1492,7 +1517,7 @@ }, { "cell_type": "code", - "execution_count": 93, + "execution_count": 94, "metadata": {}, "outputs": [], "source": [ @@ -1521,7 +1546,7 @@ }, { "cell_type": "code", - "execution_count": 94, + "execution_count": 95, "metadata": {}, "outputs": [], "source": [ @@ -1530,7 +1555,7 @@ }, { "cell_type": "code", - "execution_count": 95, + "execution_count": 96, "metadata": {}, "outputs": [], "source": [ @@ -1546,7 +1571,7 @@ }, { "cell_type": "code", - "execution_count": 96, + "execution_count": 97, "metadata": {}, "outputs": [], "source": [ @@ -1557,7 +1582,7 @@ }, { "cell_type": "code", - "execution_count": 97, + "execution_count": 98, "metadata": {}, "outputs": [], "source": [ @@ -1566,7 +1591,7 @@ }, { "cell_type": "code", - "execution_count": 98, + "execution_count": 99, "metadata": {}, "outputs": [], "source": [ @@ -1586,7 +1611,7 @@ }, { "cell_type": "code", - "execution_count": 99, + "execution_count": 100, "metadata": {}, "outputs": [], "source": [ @@ -1597,7 +1622,7 @@ }, { "cell_type": "code", - "execution_count": 100, + "execution_count": 101, "metadata": {}, "outputs": [], "source": [ @@ -1607,7 +1632,7 @@ }, { "cell_type": "code", - "execution_count": 101, + "execution_count": 102, "metadata": {}, "outputs": [], "source": [ @@ -1620,7 +1645,7 @@ }, { "cell_type": "code", - "execution_count": 102, + "execution_count": 103, "metadata": {}, "outputs": [], "source": [ @@ -1638,7 +1663,7 @@ }, { "cell_type": "code", - "execution_count": 103, + "execution_count": 104, "metadata": {}, "outputs": [], "source": [ @@ -1661,7 +1686,7 @@ }, { "cell_type": "code", - "execution_count": 104, + "execution_count": 105, "metadata": {}, "outputs": [], "source": [ @@ -1683,7 +1708,7 @@ }, { "cell_type": "code", - "execution_count": 105, + "execution_count": 106, "metadata": {}, "outputs": [], "source": [ @@ -1692,7 +1717,7 @@ }, { "cell_type": "code", - "execution_count": 106, + "execution_count": 107, "metadata": {}, "outputs": [], "source": [ @@ -1711,7 +1736,7 @@ }, { "cell_type": "code", - "execution_count": 107, + 
"execution_count": 108, "metadata": {}, "outputs": [], "source": [ @@ -1749,7 +1774,7 @@ }, { "cell_type": "code", - "execution_count": 108, + "execution_count": 109, "metadata": {}, "outputs": [], "source": [ @@ -1775,7 +1800,7 @@ }, { "cell_type": "code", - "execution_count": 109, + "execution_count": 110, "metadata": {}, "outputs": [], "source": [ @@ -1793,7 +1818,7 @@ }, { "cell_type": "code", - "execution_count": 110, + "execution_count": 111, "metadata": {}, "outputs": [], "source": [ @@ -1823,7 +1848,7 @@ }, { "cell_type": "code", - "execution_count": 111, + "execution_count": 112, "metadata": {}, "outputs": [], "source": [ @@ -1856,7 +1881,7 @@ }, { "cell_type": "code", - "execution_count": 112, + "execution_count": 113, "metadata": {}, "outputs": [], "source": [ @@ -1874,7 +1899,7 @@ }, { "cell_type": "code", - "execution_count": 113, + "execution_count": 114, "metadata": {}, "outputs": [], "source": [ @@ -1897,7 +1922,7 @@ }, { "cell_type": "code", - "execution_count": 114, + "execution_count": 115, "metadata": {}, "outputs": [], "source": [ @@ -1922,7 +1947,7 @@ }, { "cell_type": "code", - "execution_count": 115, + "execution_count": 116, "metadata": {}, "outputs": [], "source": [ @@ -1961,7 +1986,7 @@ }, { "cell_type": "code", - "execution_count": 116, + "execution_count": 117, "metadata": {}, "outputs": [], "source": [ @@ -1997,7 +2022,7 @@ }, { "cell_type": "code", - "execution_count": 117, + "execution_count": 118, "metadata": {}, "outputs": [], "source": [ @@ -2013,7 +2038,7 @@ }, { "cell_type": "code", - "execution_count": 118, + "execution_count": 119, "metadata": {}, "outputs": [], "source": [ @@ -2023,7 +2048,7 @@ }, { "cell_type": "code", - "execution_count": 119, + "execution_count": 120, "metadata": {}, "outputs": [], "source": [ @@ -2039,7 +2064,7 @@ }, { "cell_type": "code", - "execution_count": 120, + "execution_count": 121, "metadata": {}, "outputs": [], "source": [ @@ -2055,7 +2080,7 @@ }, { "cell_type": "code", - "execution_count": 121, + "execution_count": 122, "metadata": {}, "outputs": [], "source": [ @@ -2067,7 +2092,7 @@ }, { "cell_type": "code", - "execution_count": 122, + "execution_count": 123, "metadata": {}, "outputs": [], "source": [ @@ -2083,7 +2108,7 @@ }, { "cell_type": "code", - "execution_count": 123, + "execution_count": 124, "metadata": {}, "outputs": [], "source": [ @@ -2099,7 +2124,7 @@ }, { "cell_type": "code", - "execution_count": 124, + "execution_count": 125, "metadata": {}, "outputs": [], "source": [ @@ -2129,7 +2154,7 @@ }, { "cell_type": "code", - "execution_count": 125, + "execution_count": 126, "metadata": {}, "outputs": [], "source": [ @@ -2142,7 +2167,7 @@ }, { "cell_type": "code", - "execution_count": 126, + "execution_count": 127, "metadata": {}, "outputs": [], "source": [ @@ -2158,7 +2183,7 @@ }, { "cell_type": "code", - "execution_count": 127, + "execution_count": 128, "metadata": {}, "outputs": [], "source": [ @@ -2192,7 +2217,7 @@ }, { "cell_type": "code", - "execution_count": 128, + "execution_count": 129, "metadata": {}, "outputs": [], "source": [ @@ -2208,7 +2233,7 @@ }, { "cell_type": "code", - "execution_count": 129, + "execution_count": 130, "metadata": {}, "outputs": [], "source": [