Use read_csv() to download the data directly

main
Aurélien Geron 2022-02-19 23:18:10 +13:00
parent 85171acd17
commit ed704b8e34
1 changed files with 49 additions and 77 deletions

View File

@ -111,41 +111,6 @@
"plt.rc('ytick', labelsize=10)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Download `lifesat.csv` from github, unless it's already available locally:"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloading lifesat.csv\n"
]
}
],
"source": [
"from pathlib import Path\n",
"import urllib.request\n",
"\n",
"datapath = Path() / \"datasets\" / \"lifesat\"\n",
"datapath.mkdir(parents=True, exist_ok=True)\n",
"\n",
"data_root = \"https://github.com/ageron/data/raw/main/\"\n",
"filename = \"lifesat.csv\"\n",
"if not (datapath / filename).is_file():\n",
" print(\"Downloading\", filename)\n",
" url = data_root + \"lifesat/\" + filename\n",
" urllib.request.urlretrieve(url, datapath / filename)"
]
},
{
"cell_type": "markdown",
"metadata": {},
@ -155,7 +120,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 5,
"metadata": {},
"outputs": [
{
@ -179,15 +144,14 @@
}
],
"source": [
"from pathlib import Path\n",
"\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"import pandas as pd\n",
"from sklearn.linear_model import LinearRegression\n",
"\n",
"# Load and prepare the data\n",
"lifesat = pd.read_csv(Path() / \"datasets\" / \"lifesat\" / \"lifesat.csv\")\n",
"# Download and prepare the data\n",
"data_root = \"https://github.com/ageron/data/raw/main/\"\n",
"lifesat = pd.read_csv(data_root + \"lifesat/lifesat.csv\")\n",
"X = lifesat[[\"GDP per capita (USD)\"]].values\n",
"y = lifesat[[\"Life satisfaction\"]].values\n",
"\n",
@ -232,7 +196,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 6,
"metadata": {},
"outputs": [
{
@ -279,10 +243,12 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"from pathlib import Path\n",
"\n",
"# Where to save the figures\n",
"IMAGES_PATH = Path() / \"images\" / \"fundamentals\"\n",
"IMAGES_PATH.mkdir(parents=True, exist_ok=True)\n",
@ -312,7 +278,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 8,
"metadata": {},
"outputs": [
{
@ -325,6 +291,12 @@
}
],
"source": [
"import urllib.request\n",
"\n",
"datapath = Path() / \"datasets\" / \"lifesat\"\n",
"datapath.mkdir(parents=True, exist_ok=True)\n",
"\n",
"data_root = \"https://github.com/ageron/data/raw/main/\"\n",
"for filename in (\"oecd_bli.csv\", \"gdp_per_capita.csv\"):\n",
" if not (datapath / filename).is_file():\n",
" print(\"Downloading\", filename)\n",
@ -334,7 +306,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
@ -351,7 +323,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 10,
"metadata": {},
"outputs": [
{
@ -417,7 +389,7 @@
"Algeria 10681.679297"
]
},
"execution_count": 11,
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
@ -444,7 +416,7 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 11,
"metadata": {},
"outputs": [
{
@ -717,7 +689,7 @@
"[5 rows x 24 columns]"
]
},
"execution_count": 12,
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
@ -738,7 +710,7 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 12,
"metadata": {},
"outputs": [
{
@ -811,7 +783,7 @@
"Chile 23324.524751 6.5"
]
},
"execution_count": 13,
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
@ -834,7 +806,7 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 13,
"metadata": {},
"outputs": [
{
@ -907,7 +879,7 @@
"Hungary 31007.768407 5.6"
]
},
"execution_count": 14,
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
@ -923,7 +895,7 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
@ -933,7 +905,7 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 15,
"metadata": {},
"outputs": [
{
@ -984,7 +956,7 @@
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": 16,
"metadata": {},
"outputs": [
{
@ -1069,7 +1041,7 @@
"United States 60235.728492 6.9"
]
},
"execution_count": 17,
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
@ -1081,7 +1053,7 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 17,
"metadata": {},
"outputs": [
{
@ -1126,7 +1098,7 @@
},
{
"cell_type": "code",
"execution_count": 19,
"execution_count": 18,
"metadata": {},
"outputs": [
{
@ -1152,7 +1124,7 @@
},
{
"cell_type": "code",
"execution_count": 20,
"execution_count": 19,
"metadata": {},
"outputs": [
{
@ -1188,7 +1160,7 @@
},
{
"cell_type": "code",
"execution_count": 21,
"execution_count": 20,
"metadata": {},
"outputs": [
{
@ -1197,7 +1169,7 @@
"37655.1803457421"
]
},
"execution_count": 21,
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
@ -1209,7 +1181,7 @@
},
{
"cell_type": "code",
"execution_count": 22,
"execution_count": 21,
"metadata": {},
"outputs": [
{
@ -1218,7 +1190,7 @@
"6.301656332738056"
]
},
"execution_count": 22,
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
@ -1230,7 +1202,7 @@
},
{
"cell_type": "code",
"execution_count": 23,
"execution_count": 22,
"metadata": {},
"outputs": [
{
@ -1271,7 +1243,7 @@
},
{
"cell_type": "code",
"execution_count": 24,
"execution_count": 23,
"metadata": {},
"outputs": [
{
@ -1368,7 +1340,7 @@
"Luxembourg 110261.157353 6.9"
]
},
"execution_count": 24,
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
@ -1381,7 +1353,7 @@
},
{
"cell_type": "code",
"execution_count": 25,
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
@ -1400,7 +1372,7 @@
},
{
"cell_type": "code",
"execution_count": 26,
"execution_count": 25,
"metadata": {},
"outputs": [
{
@ -1448,7 +1420,7 @@
},
{
"cell_type": "code",
"execution_count": 27,
"execution_count": 26,
"metadata": {},
"outputs": [
{
@ -1491,7 +1463,7 @@
},
{
"cell_type": "code",
"execution_count": 28,
"execution_count": 27,
"metadata": {},
"outputs": [
{
@ -1505,7 +1477,7 @@
"Name: Life satisfaction, dtype: float64"
]
},
"execution_count": 28,
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
@ -1517,7 +1489,7 @@
},
{
"cell_type": "code",
"execution_count": 29,
"execution_count": 28,
"metadata": {},
"outputs": [
{
@ -1633,7 +1605,7 @@
"Switzerland 68393.306004"
]
},
"execution_count": 29,
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
@ -1645,7 +1617,7 @@
},
{
"cell_type": "code",
"execution_count": 30,
"execution_count": 29,
"metadata": {},
"outputs": [
{
@ -1727,9 +1699,9 @@
],
"metadata": {
"kernelspec": {
"display_name": "homl3",
"display_name": "Python 3",
"language": "python",
"name": "homl3"
"name": "python3"
},
"language_info": {
"codemirror_mode": {
@ -1741,7 +1713,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.12"
"version": "3.9.10"
},
"metadata": {
"interpreter": {