From ed704b8e34db6a9f5adcd7f92c4aee9dbba025bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Geron?= Date: Sat, 19 Feb 2022 23:18:10 +1300 Subject: [PATCH] Use read_csv() to download the data directly --- 01_the_machine_learning_landscape.ipynb | 126 +++++++++--------------- 1 file changed, 49 insertions(+), 77 deletions(-) diff --git a/01_the_machine_learning_landscape.ipynb b/01_the_machine_learning_landscape.ipynb index fb172c7..8993776 100644 --- a/01_the_machine_learning_landscape.ipynb +++ b/01_the_machine_learning_landscape.ipynb @@ -111,41 +111,6 @@ "plt.rc('ytick', labelsize=10)" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Download `lifesat.csv` from github, unless it's already available locally:" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Downloading lifesat.csv\n" - ] - } - ], - "source": [ - "from pathlib import Path\n", - "import urllib.request\n", - "\n", - "datapath = Path() / \"datasets\" / \"lifesat\"\n", - "datapath.mkdir(parents=True, exist_ok=True)\n", - "\n", - "data_root = \"https://github.com/ageron/data/raw/main/\"\n", - "filename = \"lifesat.csv\"\n", - "if not (datapath / filename).is_file():\n", - " print(\"Downloading\", filename)\n", - " url = data_root + \"lifesat/\" + filename\n", - " urllib.request.urlretrieve(url, datapath / filename)" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -155,7 +120,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -179,15 +144,14 @@ } ], "source": [ - "from pathlib import Path\n", - "\n", "import matplotlib.pyplot as plt\n", "import numpy as np\n", "import pandas as pd\n", "from sklearn.linear_model import LinearRegression\n", "\n", - "# Load and prepare the data\n", - "lifesat = pd.read_csv(Path() / \"datasets\" / \"lifesat\" / \"lifesat.csv\")\n", + "# Download and prepare the data\n", + "data_root = \"https://github.com/ageron/data/raw/main/\"\n", + "lifesat = pd.read_csv(data_root + \"lifesat/lifesat.csv\")\n", "X = lifesat[[\"GDP per capita (USD)\"]].values\n", "y = lifesat[[\"Life satisfaction\"]].values\n", "\n", @@ -232,7 +196,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -279,10 +243,12 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ + "from pathlib import Path\n", + "\n", "# Where to save the figures\n", "IMAGES_PATH = Path() / \"images\" / \"fundamentals\"\n", "IMAGES_PATH.mkdir(parents=True, exist_ok=True)\n", @@ -312,7 +278,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -325,6 +291,12 @@ } ], "source": [ + "import urllib.request\n", + "\n", + "datapath = Path() / \"datasets\" / \"lifesat\"\n", + "datapath.mkdir(parents=True, exist_ok=True)\n", + "\n", + "data_root = \"https://github.com/ageron/data/raw/main/\"\n", "for filename in (\"oecd_bli.csv\", \"gdp_per_capita.csv\"):\n", " if not (datapath / filename).is_file():\n", " print(\"Downloading\", filename)\n", @@ -334,7 +306,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -351,7 +323,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -417,7 +389,7 @@ "Algeria 10681.679297" ] }, - "execution_count": 11, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -444,7 +416,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -717,7 +689,7 @@ "[5 rows x 24 columns]" ] }, - "execution_count": 12, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -738,7 +710,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -811,7 +783,7 @@ "Chile 23324.524751 6.5" ] }, - "execution_count": 13, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -834,7 +806,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -907,7 +879,7 @@ "Hungary 31007.768407 5.6" ] }, - "execution_count": 14, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -923,7 +895,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ @@ -933,7 +905,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -984,7 +956,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 16, "metadata": {}, "outputs": [ { @@ -1069,7 +1041,7 @@ "United States 60235.728492 6.9" ] }, - "execution_count": 17, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -1081,7 +1053,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 17, "metadata": {}, "outputs": [ { @@ -1126,7 +1098,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 18, "metadata": {}, "outputs": [ { @@ -1152,7 +1124,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 19, "metadata": {}, "outputs": [ { @@ -1188,7 +1160,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 20, "metadata": {}, "outputs": [ { @@ -1197,7 +1169,7 @@ "37655.1803457421" ] }, - "execution_count": 21, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -1209,7 +1181,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 21, "metadata": {}, "outputs": [ { @@ -1218,7 +1190,7 @@ "6.301656332738056" ] }, - "execution_count": 22, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -1230,7 +1202,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 22, "metadata": {}, "outputs": [ { @@ -1271,7 +1243,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 23, "metadata": {}, "outputs": [ { @@ -1368,7 +1340,7 @@ "Luxembourg 110261.157353 6.9" ] }, - "execution_count": 24, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } @@ -1381,7 +1353,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 24, "metadata": {}, "outputs": [], "source": [ @@ -1400,7 +1372,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 25, "metadata": {}, "outputs": [ { @@ -1448,7 +1420,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 26, "metadata": {}, "outputs": [ { @@ -1491,7 +1463,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 27, "metadata": {}, "outputs": [ { @@ -1505,7 +1477,7 @@ "Name: Life satisfaction, dtype: float64" ] }, - "execution_count": 28, + "execution_count": 27, "metadata": {}, "output_type": "execute_result" } @@ -1517,7 +1489,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 28, "metadata": {}, "outputs": [ { @@ -1633,7 +1605,7 @@ "Switzerland 68393.306004" ] }, - "execution_count": 29, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } @@ -1645,7 +1617,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 29, "metadata": {}, "outputs": [ { @@ -1727,9 +1699,9 @@ ], "metadata": { "kernelspec": { - "display_name": "homl3", + "display_name": "Python 3", "language": "python", - "name": "homl3" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -1741,7 +1713,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.12" + "version": "3.9.10" }, "metadata": { "interpreter": {