diff --git a/docker/.env b/docker/.env new file mode 100644 index 0000000..16adf41 --- /dev/null +++ b/docker/.env @@ -0,0 +1 @@ +COMPOSE_PROJECT_NAME=handson-ml diff --git a/docker/Dockerfile b/docker/Dockerfile index 54e5510..b4ec526 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -1,28 +1,29 @@ FROM continuumio/anaconda3 RUN apt-get update && apt-get upgrade -y \ - && apt-get install -y \ - libpq-dev \ - build-essential \ - git \ - sudo \ - && rm -rf /var/lib/apt/lists/* + && apt-get install -y \ + libpq-dev \ + build-essential \ + git \ + sudo \ + && rm -rf /var/lib/apt/lists/* RUN conda install -y -c conda-forge \ - tensorflow=1.0.0 \ - jupyter_contrib_nbextensions + tensorflow \ + jupyter_contrib_nbextensions ARG username ARG userid +ARG home=/home/${username} +ARG workdir=${home}/handson-ml + RUN adduser ${username} --uid ${userid} --gecos '' --disabled-password \ - && echo "${username} ALL=(root) NOPASSWD:ALL" > /etc/sudoers.d/${username} \ - && chmod 0440 /etc/sudoers.d/${username} + && echo "${username} ALL=(root) NOPASSWD:ALL" > /etc/sudoers.d/${username} \ + && chmod 0440 /etc/sudoers.d/${username} -ENV HOME /home/${username} - -WORKDIR ${HOME}/handson-ml -RUN chown ${username}:${username} ${HOME}/handson-ml +WORKDIR ${workdir} +RUN chown ${username}:${username} ${workdir} USER ${username} @@ -30,6 +31,55 @@ RUN jupyter contrib nbextension install --user RUN jupyter nbextension enable toc2/main +# INFO: Jupyter and nbdime extension are not totally integrated (anaconda image is py36, +# nbdime checks for py35 at the moment), still the config below enables diffing +# notebooks with nbdiff (and nbdiff support in git diff command) after connecting +# to the container by "make exec" (or "docker-compose exec handson-ml bash") +# You may also try running: +# nbd NOTEBOOK_NAME.ipynb +# to get nbdiff between checkpointed version and current version of the given notebook +USER root +WORKDIR / +RUN conda install -y -c conda-forge nbdime +USER ${username} 
+WORKDIR ${workdir} + +RUN git-nbdiffdriver config --enable --global + +# INFO: Optionally uncomment any (one) of the following RUN commands below to ignore either +# metadata or details in nbdiff within git diff +#RUN git config --global diff.jupyternotebook.command 'git-nbdiffdriver diff --ignore-metadata' +RUN git config --global diff.jupyternotebook.command 'git-nbdiffdriver diff --ignore-details' + + +# INFO: Dirty nbdime patching (ignored if not matching) +COPY docker/nbdime-*.patch /tmp/ +USER root +WORKDIR / +RUN patch -d /opt/conda/lib/python3.6/site-packages -p1 --forward --reject-file=- < \ + /tmp/nbdime-1-details.patch || true \ + && patch -d /opt/conda/lib/python3.6/site-packages -p1 --forward --reject-file=- < \ + /tmp/nbdime-2-toc.patch || true +RUN rm /tmp/nbdime-*.patch +USER ${username} +WORKDIR ${workdir} + + +COPY docker/bashrc.bash /tmp/ +RUN cat /tmp/bashrc.bash >> ${home}/.bashrc +RUN echo "export PATH=\"${workdir}/docker/bin:\$PATH\"" >> ${home}/.bashrc +RUN sudo rm /tmp/bashrc.bash + + +# INFO: Uncomment lines below to enable automatic save of python-only and html-only +# exports alongside the notebook +#COPY docker/jupyter_notebook_config.py /tmp/ +#RUN cat /tmp/jupyter_notebook_config.py >> ${home}/.jupyter/jupyter_notebook_config.py +#RUN sudo rm /tmp/jupyter_notebook_config.py + +# INFO: Uncomment the RUN command below to disable git diff paging +#RUN git config --global core.pager '' + # INFO: Uncomment the RUN command below for easy and constant notebook URL (just localhost:8888) # That will switch jupyter to using empty password instead of a token. 
# To avoid making a security hole you SHOULD in fact not only uncomment but @@ -38,34 +88,6 @@ RUN jupyter nbextension enable toc2/main # from notebook.auth import passwd # passwd() # and take the hash from the output -#RUN mkdir -p ${HOME}/.jupyter && \ -# echo 'c.NotebookApp.password = u"sha1:c6bbcba2d04b:f969e403db876dcfbe26f47affe41909bd53392e"' \ -# >> ${HOME}/.jupyter/jupyter_notebook_config.py - -# INFO: Uncomment the RUN command below to disable git diff paging -#RUN git config --global core.pager '' - - -# INFO: Below - work in progress, nbdime not totally integrated, still it enables diffing -# notebooks with nbdiff (and nbdiff support in git diff command) after connecting to -# the container by "make exec" (docker exec) -# Try: -# nbd NOTEBOOK_NAME.ipynb -# to get nbdiff between checkpointed version and current version of the given notebook -USER root -WORKDIR / - -RUN conda install -y -c conda-forge nbdime - -USER ${username} -WORKDIR ${HOME}/handson-ml - -RUN git-nbdiffdriver config --enable --global - -# INFO: Uncomment the RUN command below to ignore metadata in nbdiff within git diff -#RUN git config --global diff.jupyternotebook.command 'git-nbdiffdriver diff --ignore-metadata' - - -COPY docker/bashrc /tmp/bashrc -RUN cat /tmp/bashrc >> ${HOME}/.bashrc -RUN sudo rm -rf /tmp/bashrc +#RUN mkdir -p ${home}/.jupyter && \ +# echo 'c.NotebookApp.password = u"sha1:c6bbcba2d04b:f969e403db876dcfbe26f47affe41909bd53392e"' \ +# >> ${home}/.jupyter/jupyter_notebook_config.py diff --git a/docker/Makefile b/docker/Makefile index 6078fc9..f85c49a 100644 --- a/docker/Makefile +++ b/docker/Makefile @@ -4,7 +4,7 @@ help: run: docker-compose up exec: - docker-compose exec handson-ml /bin/bash + docker-compose exec handson-ml bash build: stop .FORCE docker-compose build rebuild: stop .FORCE diff --git a/docker/README.md b/docker/README.md index 50b6f12..037ae22 100644 --- a/docker/README.md +++ b/docker/README.md @@ -1,5 +1,5 @@ -# Hands-on Machine Learning in Docker 
:-) +# Hands-on Machine Learning in Docker This is the Docker configuration which allows you to run and tweak the book's notebooks without installing any dependencies on your machine!
OK, any except `docker`. With `docker-compose`. Well, you may also want `make` (but it is only used as thin layer to call a few simple `docker-compose` commands). @@ -32,7 +32,9 @@ You can close the server just by pressing `Ctrl-C` in terminal window. Run `make exec` (or `docker-compose exec handson-ml bash`) while the server is running to run an additional `bash` shell inside the `handson-ml` container. Now you're inside the environment prepared within the image. -One of the usefull things that can be done there may be comparing versions of the notebooks using the `nbdiff` command if you haven't got `nbdime` installed locally (it is **way** better than plain `diff` for notebooks). See [Tools for diffing and merging of Jupyter notebooks](https://github.com/jupyter/nbdime) for more details. +One of the useful things that can be done there is starting TensorBoard (for example with the simple `tb` command, see the `bashrc.bash` file). + +Another one may be comparing versions of the notebooks using the `nbdiff` command if you haven't got `nbdime` installed locally (it is **way** better than plain `diff` for notebooks). See [Tools for diffing and merging of Jupyter notebooks](https://github.com/jupyter/nbdime) for more details. You can see changes you made relative to the version in git using `git diff` which is integrated with `nbdiff`. 
diff --git a/docker/bashrc b/docker/bashrc deleted file mode 100644 index 3535389..0000000 --- a/docker/bashrc +++ /dev/null @@ -1,12 +0,0 @@ -alias ll="ls -l" - -nbd() { - DIRNAME=$(dirname "$1") - BASENAME=$(basename "$1" .ipynb) - - WORKING_COPY=$DIRNAME/$BASENAME.ipynb - CHECKPOINT_COPY=$DIRNAME/.ipynb_checkpoints/$BASENAME-checkpoint.ipynb - - # echo "How change $CHECKPOINT_COPY into $WORKING_COPY" - nbdiff "$CHECKPOINT_COPY" "$WORKING_COPY" -} diff --git a/docker/bashrc.bash b/docker/bashrc.bash new file mode 100644 index 0000000..ff19745 --- /dev/null +++ b/docker/bashrc.bash @@ -0,0 +1,3 @@ +alias ll="ls -alF" +alias nbd="nbdiff_checkpoint" +alias tb="tensorboard --logdir=tf_logs" diff --git a/docker/bin/nbclean_checkpoints b/docker/bin/nbclean_checkpoints new file mode 100755 index 0000000..ba4aaf9 --- /dev/null +++ b/docker/bin/nbclean_checkpoints @@ -0,0 +1,119 @@ +#!/usr/bin/env python + +import collections +import glob +import hashlib +import os +import subprocess + + +class NotebookAnalyser: + """Compare jupyter checkpoints with their working notebooks and delete the identical ones.""" + + def __init__(self, dry_run=False, verbose=False, colorful=False): + self._dry_run = dry_run + self._verbose = verbose + self._colors = collections.defaultdict(lambda: "") + if colorful: + for color in [ + NotebookAnalyser.COLOR_WHITE, + NotebookAnalyser.COLOR_RED, + NotebookAnalyser.COLOR_GREEN, + NotebookAnalyser.COLOR_YELLOW, + ]: + self._colors[color] = "\033[{}m".format(color) + + NOTEBOOK_SUFFIX = ".ipynb" + CHECKPOINT_DIR = NOTEBOOK_SUFFIX + "_checkpoints" + CHECKPOINT_MASK = "*-checkpoint" + NOTEBOOK_SUFFIX + CHECKPOINT_MASK_LEN = len(CHECKPOINT_MASK) - 1 + + @staticmethod + def get_hash(file_path): + """Return the hex MD5 digest of the file, read in 4096-byte chunks.""" + with open(file_path, "rb") as stream: + digest = hashlib.md5() + for chunk in iter(lambda: stream.read(4096), b""): + digest.update(chunk) + return digest.hexdigest() + + MESSAGE_ORPHANED = "missing " + MESSAGE_MODIFIED = "modified" + MESSAGE_DELETED = "DELETING" + + COLOR_WHITE = "0" + COLOR_RED = "31" + COLOR_GREEN = "32" + COLOR_YELLOW = "33" + + def log(self, 
message, file, color=COLOR_WHITE): + color_on = self._colors[color] + color_off = self._colors[NotebookAnalyser.COLOR_WHITE] + print("{}{}{}: {}".format(color_on, message, color_off, file)) + + def clean_checkpoints(self, directory): + """Delete checkpoints in directory that are byte-identical to their notebooks, then the directory itself if it is left empty (nothing is removed in dry-run mode).""" + for checkpoint_path in sorted(glob.glob(os.path.join(directory, NotebookAnalyser.CHECKPOINT_MASK))): + + workfile_dir = os.path.dirname(os.path.dirname(checkpoint_path)) + workfile_name = os.path.basename(checkpoint_path)[:-NotebookAnalyser.CHECKPOINT_MASK_LEN] + NotebookAnalyser.NOTEBOOK_SUFFIX + workfile_path = os.path.join(workfile_dir, workfile_name) + + if not os.path.isfile(workfile_path): + if self._verbose: + self.log(NotebookAnalyser.MESSAGE_ORPHANED, workfile_path, NotebookAnalyser.COLOR_RED) + else: + checkpoint_stat = os.stat(checkpoint_path) + workfile_stat = os.stat(workfile_path) + + modified = workfile_stat.st_size != checkpoint_stat.st_size + + if not modified: + checkpoint_hash = NotebookAnalyser.get_hash(checkpoint_path) + workfile_hash = NotebookAnalyser.get_hash(workfile_path) + modified = checkpoint_hash != workfile_hash + + if modified: + if self._verbose: + self.log(NotebookAnalyser.MESSAGE_MODIFIED, workfile_path, NotebookAnalyser.COLOR_YELLOW) + else: + self.log(NotebookAnalyser.MESSAGE_DELETED, checkpoint_path, NotebookAnalyser.COLOR_GREEN) + if not self._dry_run: + os.remove(checkpoint_path) + + if not self._dry_run and not os.listdir(directory): + self.log(NotebookAnalyser.MESSAGE_DELETED, directory, NotebookAnalyser.COLOR_GREEN) + os.rmdir(directory) + + def clean_checkpoints_recursively(self, directory): + """Run clean_checkpoints on every ".ipynb_checkpoints" directory found below directory.""" + for (root, subdirs, files) in os.walk(directory): + subdirs.sort() # INFO: traverse alphabetically + if NotebookAnalyser.CHECKPOINT_DIR in subdirs: + subdirs.remove(NotebookAnalyser.CHECKPOINT_DIR) # INFO: don't recurse there + self.clean_checkpoints(os.path.join(root, NotebookAnalyser.CHECKPOINT_DIR)) + + +def main(): + import argparse + parser = 
argparse.ArgumentParser(description="Remove checkpointed versions of those jupyter notebooks that are identical to their working copies.", + epilog="""Notebooks will be reported as either + "DELETING" if the working copy and checkpointed version are identical + (checkpoint will be deleted), + "missing" if there is a checkpoint but no corresponding working file can be found + or "modified" if notebook and the checkpoint are not byte-to-byte identical. + If removal of checkpoints results in empty ".ipynb_checkpoints" directory + that directory is also deleted. + """) #, formatter_class=argparse.RawDescriptionHelpFormatter) + parser.add_argument("dirs", metavar="DIR", type=str, nargs="*", default=["."], help="directories to search") + parser.add_argument("-d", "--dry-run", action="store_true", help="only print messages, don't perform any removals") + parser.add_argument("-v", "--verbose", action="store_true", help="verbose mode") + parser.add_argument("-c", "--color", action="store_true", help="colorful mode") + args = parser.parse_args() + + analyser = NotebookAnalyser(args.dry_run, args.verbose, args.color) + for directory in args.dirs: + analyser.clean_checkpoints_recursively(directory) + +if __name__ == "__main__": + main() diff --git a/docker/bin/nbdiff_checkpoint b/docker/bin/nbdiff_checkpoint new file mode 100755 index 0000000..9ce7cd0 --- /dev/null +++ b/docker/bin/nbdiff_checkpoint @@ -0,0 +1,17 @@ +#!/bin/bash +if [[ "$#" -lt 1 || "$1" =~ ^((-h)|(--help))$ ]] ; then + echo "usage: nbdiff_checkpoint NOTEBOOK.ipynb" + echo + echo "Show differences between given jupyter notebook and its checkpointed version (in .ipynb_checkpoints subdirectory)" + exit +fi + +DIRNAME=$(dirname "$1") +BASENAME=$(basename "$1" .ipynb) +shift + +WORKING_COPY=$DIRNAME/$BASENAME.ipynb +CHECKPOINT_COPY=$DIRNAME/.ipynb_checkpoints/$BASENAME-checkpoint.ipynb + +echo "----- Analysing how to change $CHECKPOINT_COPY into $WORKING_COPY -----" +nbdiff "$CHECKPOINT_COPY" "$WORKING_COPY" 
--ignore-details "$@" diff --git a/docker/bin/rm_empty_subdirs b/docker/bin/rm_empty_subdirs new file mode 100755 index 0000000..34f3ea9 --- /dev/null +++ b/docker/bin/rm_empty_subdirs @@ -0,0 +1,51 @@ +#!/usr/bin/env python + +import os + +def remove_empty_directories(initial_dir, + allow_initial_delete=False, ignore_nonexistant_initial=False, + dry_run=False, quiet=False): + """Remove empty directories below initial_dir bottom-up, skipping every path that has a ".git" component; raise RuntimeError if initial_dir is missing and not ignored.""" + + FORBIDDEN_SUBDIRS = set([".git"]) + + if not os.path.isdir(initial_dir) and not ignore_nonexistant_initial: + raise RuntimeError("Initial directory '{}' not found!".format(initial_dir)) + + message = "removed" + if dry_run: + message = "to be " + message + + deleted = set() + + for (directory, subdirs, files) in os.walk(initial_dir, topdown=False): + # INFO: test the path components directly; peeling them off with os.path.split() in a loop + # never terminates for absolute paths, because os.path.split("/") returns ("/", "") + components = os.path.normpath(directory).split(os.sep) + if FORBIDDEN_SUBDIRS.intersection(components): + continue + + is_empty = len(files) < 1 and len(set([os.path.join(directory, s) for s in subdirs]) - deleted) < 1 + + if is_empty and (initial_dir != directory or allow_initial_delete): + if not quiet: + print("{}: {}".format(message, directory)) + deleted.add(directory) + if not dry_run: + os.rmdir(directory) + +def main(): + import argparse + parser = argparse.ArgumentParser(description="Remove empty directories recursively in subtree.") + parser.add_argument("dir", metavar="DIR", type=str, nargs="+", help="directory to be searched") + parser.add_argument("-r", "--allow-dir-removal", action="store_true", help="allow deletion of DIR itself") + parser.add_argument("-i", "--ignore-nonexistent-dir", action="store_true", help="don't throw an error if DIR doesn't exist") + parser.add_argument("-d", "--dry-run", action="store_true", help="only print messages, don't perform any removals") + parser.add_argument("-q", "--quiet", action="store_true", help="don't print names of directories being removed") + args = parser.parse_args() + for directory in args.dir: + 
remove_empty_directories(directory, args.allow_dir_removal, args.ignore_nonexistent_dir, + args.dry_run, args.quiet) + +if __name__ == "__main__": + main() diff --git a/docker/bin/tensorboard b/docker/bin/tensorboard new file mode 100755 index 0000000..dd7294d --- /dev/null +++ b/docker/bin/tensorboard @@ -0,0 +1,2 @@ +#!/bin/bash +python -m tensorboard.main "$@" diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index 8a9718c..d4b46e4 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -15,6 +15,7 @@ services: max-size: 50m ports: - "8888:8888" + - "6006:6006" volumes: - ../:/home/devel/handson-ml command: /opt/conda/bin/jupyter notebook --ip='*' --port=8888 --no-browser diff --git a/docker/jupyter_notebook_config.py b/docker/jupyter_notebook_config.py new file mode 100644 index 0000000..971a49a --- /dev/null +++ b/docker/jupyter_notebook_config.py @@ -0,0 +1,15 @@ +import os +import subprocess + +def export_script_and_view(model, os_path, contents_manager): + if model["type"] != "notebook": + return + dir_name, file_name = os.path.split(os_path) + file_base, file_ext = os.path.splitext(file_name) + if file_base.startswith("Untitled"): + return + export_name = file_base if file_ext == ".ipynb" else file_name + subprocess.check_call(["jupyter", "nbconvert", "--to", "script", file_name, "--output", export_name + "_script"], cwd=dir_name) + subprocess.check_call(["jupyter", "nbconvert", "--to", "html", file_name, "--output", export_name + "_view"], cwd=dir_name) + +c.FileContentsManager.post_save_hook = export_script_and_view diff --git a/docker/nbdime-1-details.patch b/docker/nbdime-1-details.patch new file mode 100644 index 0000000..98f76d6 --- /dev/null +++ b/docker/nbdime-1-details.patch @@ -0,0 +1,17 @@ +--- a/nbdime/diffing/notebooks.py ++++ b/nbdime/diffing/notebooks.py +@@ -548,8 +548,12 @@ def set_notebook_diff_targets(sources=True, outputs=True, attachments=True, meta + metadata_keys = ("/cells/*/metadata", "/metadata", 
"/cells/*/outputs/*/metadata") + if metadata: + for key in metadata_keys: +- if key in notebook_differs: +- del notebook_differs[key] ++ if details: ++ if key in notebook_differs: ++ del notebook_differs[key] ++ else: ++ notebook_differs[key] = diff_ignore_keys( ++ inner_differ=diff, ignore_keys=['collapsed', 'autoscroll', 'deletable', 'editable']) + else: + for key in metadata_keys: + notebook_differs[key] = diff_ignore diff --git a/docker/nbdime-2-toc.patch b/docker/nbdime-2-toc.patch new file mode 100644 index 0000000..4924e66 --- /dev/null +++ b/docker/nbdime-2-toc.patch @@ -0,0 +1,11 @@ +--- a/nbdime/diffing/notebooks.py ++++ b/nbdime/diffing/notebooks.py +@@ -553,7 +553,7 @@ + del notebook_differs[key] + else: + notebook_differs[key] = diff_ignore_keys( +- inner_differ=diff, ignore_keys=['collapsed', 'autoscroll', 'deletable', 'editable']) ++ inner_differ=diff, ignore_keys=['toc', 'collapsed', 'autoscroll', 'deletable', 'editable']) + else: + for key in metadata_keys: + notebook_differs[key] = diff_ignore