Make notebook code match book examples more closely in chapter 1

main
Aurélien Geron 2017-06-01 09:57:58 +02:00
parent 88acd2b4b9
commit 1bc60fe315
1 changed files with 213 additions and 86 deletions

View File

@ -2,7 +2,10 @@
"cells": [ "cells": [
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {
"deletable": true,
"editable": true
},
"source": [ "source": [
"**Chapter 1 The Machine Learning landscape**\n", "**Chapter 1 The Machine Learning landscape**\n",
"\n", "\n",
@ -11,14 +14,20 @@
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {
"deletable": true,
"editable": true
},
"source": [ "source": [
"# Setup" "# Setup"
] ]
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {
"deletable": true,
"editable": true
},
"source": [ "source": [
"First, let's make sure this notebook works well in both python 2 and 3, import a few common modules, ensure MatplotLib plots figures inline and prepare a function to save the figures:" "First, let's make sure this notebook works well in both python 2 and 3, import a few common modules, ensure MatplotLib plots figures inline and prepare a function to save the figures:"
] ]
@ -28,6 +37,8 @@
"execution_count": 1, "execution_count": 1,
"metadata": { "metadata": {
"collapsed": false, "collapsed": false,
"deletable": true,
"editable": true,
"slideshow": { "slideshow": {
"slide_type": "-" "slide_type": "-"
} }
@ -67,7 +78,10 @@
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {
"deletable": true,
"editable": true
},
"source": [ "source": [
"# Load and prepare Life satisfaction data" "# Load and prepare Life satisfaction data"
] ]
@ -76,7 +90,9 @@
"cell_type": "code", "cell_type": "code",
"execution_count": 2, "execution_count": 2,
"metadata": { "metadata": {
"collapsed": false "collapsed": false,
"deletable": true,
"editable": true
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
@ -95,7 +111,9 @@
"cell_type": "code", "cell_type": "code",
"execution_count": 3, "execution_count": 3,
"metadata": { "metadata": {
"collapsed": false "collapsed": false,
"deletable": true,
"editable": true
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
@ -104,7 +122,10 @@
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {
"deletable": true,
"editable": true
},
"source": [ "source": [
"# Load and prepare GDP per capita data" "# Load and prepare GDP per capita data"
] ]
@ -113,7 +134,9 @@
"cell_type": "code", "cell_type": "code",
"execution_count": 4, "execution_count": 4,
"metadata": { "metadata": {
"collapsed": false "collapsed": false,
"deletable": true,
"editable": true
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
@ -129,7 +152,9 @@
"cell_type": "code", "cell_type": "code",
"execution_count": 5, "execution_count": 5,
"metadata": { "metadata": {
"collapsed": false "collapsed": false,
"deletable": true,
"editable": true
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
@ -142,7 +167,9 @@
"cell_type": "code", "cell_type": "code",
"execution_count": 6, "execution_count": 6,
"metadata": { "metadata": {
"collapsed": false "collapsed": false,
"deletable": true,
"editable": true
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
@ -153,7 +180,9 @@
"cell_type": "code", "cell_type": "code",
"execution_count": 7, "execution_count": 7,
"metadata": { "metadata": {
"collapsed": false "collapsed": false,
"deletable": true,
"editable": true
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
@ -166,9 +195,11 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 23, "execution_count": 8,
"metadata": { "metadata": {
"collapsed": false "collapsed": false,
"deletable": true,
"editable": true
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
@ -195,7 +226,22 @@
"cell_type": "code", "cell_type": "code",
"execution_count": 9, "execution_count": 9,
"metadata": { "metadata": {
"collapsed": false "collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"sample_data.to_csv(\"life_satisfaction_vs_gdp_per_capita.csv\")"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
@ -204,9 +250,11 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 10, "execution_count": 11,
"metadata": { "metadata": {
"collapsed": false "collapsed": false,
"deletable": true,
"editable": true
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
@ -230,9 +278,11 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 11, "execution_count": 12,
"metadata": { "metadata": {
"collapsed": false "collapsed": false,
"deletable": true,
"editable": true
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
@ -247,9 +297,11 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 12, "execution_count": 13,
"metadata": { "metadata": {
"collapsed": false "collapsed": false,
"deletable": true,
"editable": true
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
@ -265,9 +317,11 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 13, "execution_count": 14,
"metadata": { "metadata": {
"collapsed": false "collapsed": false,
"deletable": true,
"editable": true
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
@ -279,9 +333,11 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 14, "execution_count": 15,
"metadata": { "metadata": {
"collapsed": false "collapsed": false,
"deletable": true,
"editable": true
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
@ -300,9 +356,11 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 15, "execution_count": 16,
"metadata": { "metadata": {
"collapsed": false "collapsed": false,
"deletable": true,
"editable": true
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
@ -311,9 +369,11 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 16, "execution_count": 17,
"metadata": { "metadata": {
"collapsed": false "collapsed": false,
"deletable": true,
"editable": true
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
@ -322,26 +382,40 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 17, "execution_count": 18,
"metadata": { "metadata": {
"collapsed": false "collapsed": true,
"deletable": true,
"editable": true
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
"backup = oecd_bli, gdp_per_capita\n", "backup = oecd_bli, gdp_per_capita\n",
"\n", "\n",
"def prepare_country_stats(oecd_bli, gdp_per_capita):\n", "def prepare_country_stats(oecd_bli, gdp_per_capita):\n",
" return sample_data\n", " return sample_data"
"\n", ]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"# Code example\n", "# Code example\n",
"########################################################################\n", "import matplotlib\n",
"import sklearn\n", "import matplotlib.pyplot as plt\n",
"import numpy as np\n", "import numpy as np\n",
"import pandas as pd\n", "import pandas as pd\n",
"import sklearn\n",
"\n", "\n",
"# Load the data\n", "# Load the data\n",
"oecd_bli = pd.read_csv(datapath+\"oecd_bli_2015.csv\", thousands=',')\n", "oecd_bli = pd.read_csv(datapath + \"oecd_bli_2015.csv\", thousands=',')\n",
"gdp_per_capita = pd.read_csv(datapath+\"gdp_per_capita.csv\", thousands=',',delimiter='\\t',\n", "gdp_per_capita = pd.read_csv(datapath + \"gdp_per_capita.csv\",thousands=',',delimiter='\\t',\n",
" encoding='latin1', na_values=\"n/a\")\n", " encoding='latin1', na_values=\"n/a\")\n",
"\n", "\n",
"# Prepare the data\n", "# Prepare the data\n",
@ -354,24 +428,36 @@
"plt.show()\n", "plt.show()\n",
"\n", "\n",
"# Select a linear model\n", "# Select a linear model\n",
"lin_reg_model = sklearn.linear_model.LinearRegression()\n", "model = sklearn.linear_model.LinearRegression()\n",
"\n", "\n",
"# Train the model\n", "# Train the model\n",
"lin_reg_model.fit(X, y)\n", "model.fit(X, y)\n",
"\n", "\n",
"# Make a prediction for Cyprus\n", "# Make a prediction for Cyprus\n",
"X_new = [[22587]] # Cyprus' GDP per capita\n", "X_new = [[22587]] # Cyprus' GDP per capita\n",
"print(lin_reg_model.predict(X_new)) # outputs [[ 5.96242338]]\n", "print(model.predict(X_new)) # outputs [[ 5.96242338]]"
"########################################################################\n", ]
"\n", },
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"oecd_bli, gdp_per_capita = backup" "oecd_bli, gdp_per_capita = backup"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 18, "execution_count": 21,
"metadata": { "metadata": {
"collapsed": false "collapsed": false,
"deletable": true,
"editable": true
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
@ -380,9 +466,11 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 19, "execution_count": 22,
"metadata": { "metadata": {
"collapsed": true "collapsed": true,
"deletable": true,
"editable": true
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
@ -399,9 +487,11 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 20, "execution_count": 23,
"metadata": { "metadata": {
"collapsed": false "collapsed": false,
"deletable": true,
"editable": true
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
@ -432,9 +522,11 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 21, "execution_count": 24,
"metadata": { "metadata": {
"collapsed": false "collapsed": false,
"deletable": true,
"editable": true
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
@ -458,9 +550,11 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 22, "execution_count": 25,
"metadata": { "metadata": {
"collapsed": false "collapsed": false,
"deletable": true,
"editable": true
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
@ -469,9 +563,11 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 23, "execution_count": 26,
"metadata": { "metadata": {
"collapsed": false "collapsed": false,
"deletable": true,
"editable": true
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
@ -480,9 +576,11 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 24, "execution_count": 27,
"metadata": { "metadata": {
"collapsed": false "collapsed": false,
"deletable": true,
"editable": true
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
@ -513,50 +611,79 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 25, "execution_count": 28,
"metadata": { "metadata": {
"collapsed": false "collapsed": true,
"deletable": true,
"editable": true
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
"backup = oecd_bli, gdp_per_capita\n", "backup = oecd_bli, gdp_per_capita\n",
"\n", "\n",
"def prepare_country_stats(oecd_bli, gdp_per_capita):\n", "def prepare_country_stats(oecd_bli, gdp_per_capita):\n",
" return sample_data\n", " return sample_data"
"\n", ]
"# Code example\n", },
"########################################################################\n", {
"from sklearn import neighbors\n", "cell_type": "code",
"import numpy as np\n", "execution_count": 29,
"import pandas as pd\n", "metadata": {
"\n", "collapsed": true,
"# Load the data\n", "deletable": true,
"oecd_bli = pd.read_csv(datapath+\"oecd_bli_2015.csv\", thousands=',')\n", "editable": true
"gdp_per_capita = pd.read_csv(datapath+\"gdp_per_capita.csv\", thousands=',',delimiter='\\t',\n", },
" encoding='latin1', na_values=\"n/a\")\n", "outputs": [],
"\n", "source": [
"# Prepare the data\n", "# Replace this linear model:\n",
"country_stats = prepare_country_stats(oecd_bli, gdp_per_capita)\n", "model = sklearn.linear_model.LinearRegression()"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"# with this k-neighbors regression model:\n",
"model = sklearn.neighbors.KNeighborsRegressor(n_neighbors=3)"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"X = np.c_[country_stats[\"GDP per capita\"]]\n", "X = np.c_[country_stats[\"GDP per capita\"]]\n",
"y = np.c_[country_stats[\"Life satisfaction\"]]\n", "y = np.c_[country_stats[\"Life satisfaction\"]]\n",
"\n", "\n",
"# Visualize the data\n",
"country_stats.plot(kind='scatter', x=\"GDP per capita\", y='Life satisfaction')\n",
"plt.show()\n",
"\n",
"# Select a k-neighboors regression model\n",
"k_neigh_reg_model = neighbors.KNeighborsRegressor(n_neighbors=3)\n",
"\n",
"# Train the model\n", "# Train the model\n",
"k_neigh_reg_model.fit(X, y)\n", "model.fit(X, y)\n",
"\n", "\n",
"# Make a prediction for Cyprus\n", "# Make a prediction for Cyprus\n",
"X_new = [[22587]] # Cyprus' GDP per capita\n", "X_new = np.array([[22587.0]]) # Cyprus' GDP per capita\n",
"print(lin_reg_model.predict(X_new)) # outputs [[ 5.96242338]]\n", "print(model.predict(X_new)) # outputs [[ 5.76666667]]"
"########################################################################\n",
"\n",
"oecd_bli, gdp_per_capita = backup"
] ]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"outputs": [],
"source": []
} }
], ],
"metadata": { "metadata": {
@ -575,7 +702,7 @@
"name": "python", "name": "python",
"nbconvert_exporter": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "pygments_lexer": "ipython3",
"version": "3.5.1" "version": "3.5.3"
}, },
"nav_menu": {}, "nav_menu": {},
"toc": { "toc": {