handson-ml/01_the_machine_learning_lan...

601 lines
18 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Chapter 1 The Machine Learning landscape**\n",
"\n",
"_This is the code used to generate some of the figures in chapter 1._"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Setup"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"First, let's make sure this notebook works well in both Python 2 and 3, import a few common modules, ensure Matplotlib plots figures inline, and prepare a function to save the figures:"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false,
"slideshow": {
"slide_type": "-"
}
},
"outputs": [],
"source": [
"# To support both python 2 and python 3\n",
"from __future__ import division, print_function, unicode_literals\n",
"\n",
"# Common imports\n",
"import numpy as np\n",
"import numpy.random as rnd\n",
"import os\n",
"\n",
"# to make this notebook's output stable across runs\n",
"rnd.seed(42)\n",
"\n",
"# To plot pretty figures\n",
"%matplotlib inline\n",
"import matplotlib\n",
"import matplotlib.pyplot as plt\n",
"plt.rcParams['axes.labelsize'] = 14\n",
"plt.rcParams['xtick.labelsize'] = 12\n",
"plt.rcParams['ytick.labelsize'] = 12\n",
"\n",
"# Where to save the figures\n",
"PROJECT_ROOT_DIR = \".\"\n",
"CHAPTER_ID = \"fundamentals\"\n",
"\n",
"def save_fig(fig_id, tight_layout=True):\n",
"    \"\"\"Save the current Matplotlib figure as a 300-dpi PNG.\n",
"\n",
"    The file is written to PROJECT_ROOT_DIR/images/CHAPTER_ID/<fig_id>.png.\n",
"\n",
"    fig_id: base name of the image file (without the extension).\n",
"    tight_layout: if true, call plt.tight_layout() before saving.\n",
"    \"\"\"\n",
"    images_dir = os.path.join(PROJECT_ROOT_DIR, \"images\", CHAPTER_ID)\n",
"    # savefig() does not create missing directories, so make sure the target exists\n",
"    # (os.makedirs(..., exist_ok=True) is avoided because this notebook also supports Python 2)\n",
"    if not os.path.isdir(images_dir):\n",
"        os.makedirs(images_dir)\n",
"    path = os.path.join(images_dir, fig_id + \".png\")\n",
"    print(\"Saving figure\", fig_id)\n",
"    if tight_layout:\n",
"        plt.tight_layout()\n",
"    plt.savefig(path, format='png', dpi=300)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Load and prepare Life satisfaction data"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"# Download CSV from http://stats.oecd.org/index.aspx?DataSetCode=BLI\n",
"datapath = \"datasets/lifesat/\"\n",
"\n",
"oecd_bli = pd.read_csv(datapath+\"oecd_bli_2015.csv\", thousands=',')\n",
"oecd_bli = oecd_bli[oecd_bli[\"INEQUALITY\"]==\"TOT\"]\n",
"oecd_bli = oecd_bli.pivot(index=\"Country\", columns=\"Indicator\", values=\"Value\")\n",
"oecd_bli.head(2)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"oecd_bli[\"Life satisfaction\"].head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Load and prepare GDP per capita data"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Source data: http://goo.gl/j1MSKe (=> imf.org), read here as a tab-separated file.\n",
"gdp_per_capita = (\n",
"    pd.read_csv(datapath + \"gdp_per_capita.csv\", thousands=',', delimiter='\\t',\n",
"                encoding='latin1', na_values=\"n/a\")\n",
"    # the \"2015\" column is the one used as GDP per capita in the rest of the notebook\n",
"    .rename(columns={\"2015\": \"GDP per capita\"})\n",
"    .set_index(\"Country\")\n",
")\n",
"gdp_per_capita.head(2)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Join the two tables on their country index; pd.merge defaults to an inner join,\n",
"# so only countries present in both tables are kept.\n",
"full_country_stats = pd.merge(left=oecd_bli, right=gdp_per_capita, left_index=True, right_index=True)\n",
"# inplace must be a real boolean: pandas rejects the string \"True\" with a ValueError\n",
"full_country_stats.sort_values(by=\"GDP per capita\", inplace=True)\n",
"full_country_stats"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"full_country_stats[[\"GDP per capita\", 'Life satisfaction']].loc[\"United States\"]"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Row positions (in the GDP-sorted table) that are held out of the training sample\n",
"remove_indices = [0, 1, 6, 8, 33, 34, 35]\n",
"# Build the complement in ascending order: a list comprehension keeps the rows sorted by GDP\n",
"# without relying on set iteration order, and the row count is no longer hard-coded to 36\n",
"keep_indices = [i for i in range(len(full_country_stats)) if i not in remove_indices]\n",
"\n",
"sample_data = full_country_stats[[\"GDP per capita\", 'Life satisfaction']].iloc[keep_indices]\n",
"missing_data = full_country_stats[[\"GDP per capita\", 'Life satisfaction']].iloc[remove_indices]"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"sample_data.plot(kind='scatter', x=\"GDP per capita\", y='Life satisfaction', figsize=(5,3))\n",
"plt.axis([0, 60000, 0, 10])\n",
"position_text = {\n",
" \"Hungary\": (5000, 1),\n",
" \"Korea\": (18000, 1.7),\n",
" \"France\": (29000, 2.4),\n",
" \"Australia\": (40000, 3.0),\n",
" \"United States\": (52000, 3.8),\n",
"}\n",
"for country, pos_text in position_text.items():\n",
" pos_data_x, pos_data_y = sample_data.loc[country]\n",
" country = \"U.S.\" if country == \"United States\" else country\n",
" plt.annotate(country, xy=(pos_data_x, pos_data_y), xytext=pos_text,\n",
" arrowprops=dict(facecolor='black', width=0.5, shrink=0.1, headwidth=5))\n",
" plt.plot(pos_data_x, pos_data_y, \"ro\")\n",
"save_fig('money_happy_scatterplot')\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"sample_data.loc[list(position_text.keys())]"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import numpy as np\n",
"\n",
"sample_data.plot(kind='scatter', x=\"GDP per capita\", y='Life satisfaction', figsize=(5,3))\n",
"plt.axis([0, 60000, 0, 10])\n",
"X=np.linspace(0, 60000, 1000)\n",
"plt.plot(X, 2*X/100000, \"r\")\n",
"plt.text(40000, 2.7, r\"$\\theta_0 = 0$\", fontsize=14, color=\"r\")\n",
"plt.text(40000, 1.8, r\"$\\theta_1 = 2 \\times 10^{-5}$\", fontsize=14, color=\"r\")\n",
"plt.plot(X, 8 - 5*X/100000, \"g\")\n",
"plt.text(5000, 9.1, r\"$\\theta_0 = 8$\", fontsize=14, color=\"g\")\n",
"plt.text(5000, 8.2, r\"$\\theta_1 = -5 \\times 10^{-5}$\", fontsize=14, color=\"g\")\n",
"plt.plot(X, 4 + 5*X/100000, \"b\")\n",
"plt.text(5000, 3.5, r\"$\\theta_0 = 4$\", fontsize=14, color=\"b\")\n",
"plt.text(5000, 2.6, r\"$\\theta_1 = 5 \\times 10^{-5}$\", fontsize=14, color=\"b\")\n",
"save_fig('tweaking_model_params_plot')\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from sklearn import linear_model\n",
"lin1 = linear_model.LinearRegression()\n",
"Xsample = np.c_[sample_data[\"GDP per capita\"]]\n",
"ysample = np.c_[sample_data[\"Life satisfaction\"]]\n",
"lin1.fit(Xsample, ysample)\n",
"t0, t1 = lin1.intercept_[0], lin1.coef_[0][0]\n",
"t0, t1"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"sample_data.plot(kind='scatter', x=\"GDP per capita\", y='Life satisfaction', figsize=(5,3))\n",
"plt.axis([0, 60000, 0, 10])\n",
"X=np.linspace(0, 60000, 1000)\n",
"plt.plot(X, t0 + t1*X, \"b\")\n",
"plt.text(5000, 3.1, r\"$\\theta_0 = 4.85$\", fontsize=14, color=\"b\")\n",
"plt.text(5000, 2.2, r\"$\\theta_1 = 4.91 \\times 10^{-5}$\", fontsize=14, color=\"b\")\n",
"save_fig('best_fit_model_plot')\n",
"plt.show()\n"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Look up Cyprus' GDP per capita and feed it to the fitted linear model\n",
"cyprus_gdp_per_capita = gdp_per_capita.loc[\"Cyprus\"][\"GDP per capita\"]\n",
"print(cyprus_gdp_per_capita)\n",
"# predict() expects a 2D array of shape (n_samples, n_features), so wrap the scalar;\n",
"# the model was fitted on a 2D target, hence the [0][0] to unwrap the single prediction\n",
"cyprus_predicted_life_satisfaction = lin1.predict([[cyprus_gdp_per_capita]])[0][0]\n",
"cyprus_predicted_life_satisfaction"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"sample_data.plot(kind='scatter', x=\"GDP per capita\", y='Life satisfaction', figsize=(5,3), s=1)\n",
"X=np.linspace(0, 60000, 1000)\n",
"plt.plot(X, t0 + t1*X, \"b\")\n",
"plt.axis([0, 60000, 0, 10])\n",
"plt.text(5000, 7.5, r\"$\\theta_0 = 4.85$\", fontsize=14, color=\"b\")\n",
"plt.text(5000, 6.6, r\"$\\theta_1 = 4.91 \\times 10^{-5}$\", fontsize=14, color=\"b\")\n",
"plt.plot([cyprus_gdp_per_capita, cyprus_gdp_per_capita], [0, cyprus_predicted_life_satisfaction], \"r--\")\n",
"plt.text(25000, 5.0, r\"Prediction = 5.96\", fontsize=14, color=\"b\")\n",
"plt.plot(cyprus_gdp_per_capita, cyprus_predicted_life_satisfaction, \"ro\")\n",
"save_fig('cyprus_prediction_plot')\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"sample_data[7:10]"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"(5.1+5.7+6.5)/3"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# The code example below reloads the raw CSV files into oecd_bli / gdp_per_capita,\n",
"# so keep the prepared frames aside and restore them at the end of the cell.\n",
"backup = oecd_bli, gdp_per_capita\n",
"\n",
"def prepare_country_stats(oecd_bli, gdp_per_capita):\n",
"    \"\"\"Stand-in used by the code example: ignores its arguments and returns sample_data.\"\"\"\n",
"    return sample_data\n",
"\n",
"# Code example\n",
"########################################################################\n",
"# Import the submodule explicitly: a bare `import sklearn` does not load\n",
"# sklearn.linear_model, so the example would only work if another cell had imported it.\n",
"import sklearn.linear_model\n",
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"# Load the data\n",
"oecd_bli = pd.read_csv(datapath+\"oecd_bli_2015.csv\", thousands=',')\n",
"gdp_per_capita = pd.read_csv(datapath+\"gdp_per_capita.csv\", thousands=',',delimiter='\\t',\n",
"                             encoding='latin1', na_values=\"n/a\")\n",
"\n",
"# Prepare the data\n",
"country_stats = prepare_country_stats(oecd_bli, gdp_per_capita)\n",
"X = np.c_[country_stats[\"GDP per capita\"]]\n",
"y = np.c_[country_stats[\"Life satisfaction\"]]\n",
"\n",
"# Visualize the data\n",
"country_stats.plot(kind='scatter', x=\"GDP per capita\", y='Life satisfaction')\n",
"plt.show()\n",
"\n",
"# Select a linear model\n",
"lin_reg_model = sklearn.linear_model.LinearRegression()\n",
"\n",
"# Train the model\n",
"lin_reg_model.fit(X, y)\n",
"\n",
"# Make a prediction for Cyprus\n",
"X_new = [[22587]] # Cyprus' GDP per capita\n",
"print(lin_reg_model.predict(X_new)) # outputs [[ 5.96242338]]\n",
"########################################################################\n",
"\n",
"oecd_bli, gdp_per_capita = backup"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"missing_data"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"position_text2 = {\n",
" \"Brazil\": (1000, 9.0),\n",
" \"Mexico\": (11000, 9.0),\n",
" \"Chile\": (25000, 9.0),\n",
" \"Czech Republic\": (35000, 9.0),\n",
" \"Norway\": (60000, 3),\n",
" \"Switzerland\": (72000, 3.0),\n",
" \"Luxembourg\": (90000, 3.0),\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"sample_data.plot(kind='scatter', x=\"GDP per capita\", y='Life satisfaction', figsize=(8,3))\n",
"plt.axis([0, 110000, 0, 10])\n",
"\n",
"for country, pos_text in position_text2.items():\n",
" pos_data_x, pos_data_y = missing_data.loc[country]\n",
" plt.annotate(country, xy=(pos_data_x, pos_data_y), xytext=pos_text,\n",
" arrowprops=dict(facecolor='black', width=0.5, shrink=0.1, headwidth=5))\n",
" plt.plot(pos_data_x, pos_data_y, \"rs\")\n",
"\n",
"X=np.linspace(0, 110000, 1000)\n",
"plt.plot(X, t0 + t1*X, \"b:\")\n",
"\n",
"lin_reg_full = linear_model.LinearRegression()\n",
"Xfull = np.c_[full_country_stats[\"GDP per capita\"]]\n",
"yfull = np.c_[full_country_stats[\"Life satisfaction\"]]\n",
"lin_reg_full.fit(Xfull, yfull)\n",
"\n",
"t0full, t1full = lin_reg_full.intercept_[0], lin_reg_full.coef_[0][0]\n",
"X = np.linspace(0, 110000, 1000)\n",
"plt.plot(X, t0full + t1full * X, \"k\")\n",
"\n",
"save_fig('representative_training_data_scatterplot')\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"full_country_stats.plot(kind='scatter', x=\"GDP per capita\", y='Life satisfaction', figsize=(8,3))\n",
"plt.axis([0, 110000, 0, 10])\n",
"\n",
"from sklearn import preprocessing\n",
"from sklearn import pipeline\n",
"\n",
"poly = preprocessing.PolynomialFeatures(degree=60, include_bias=False)\n",
"scaler = preprocessing.StandardScaler()\n",
"lin_reg2 = linear_model.LinearRegression()\n",
"\n",
"pipeline_reg = pipeline.Pipeline([('poly', poly), ('scal', scaler), ('lin', lin_reg2)])\n",
"pipeline_reg.fit(Xfull, yfull)\n",
"curve = pipeline_reg.predict(X[:, np.newaxis])\n",
"plt.plot(X, curve)\n",
"save_fig('overfitting_model_plot')\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"full_country_stats.loc[[c for c in full_country_stats.index if \"W\" in c.upper()]][\"Life satisfaction\"]"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"gdp_per_capita.loc[[c for c in gdp_per_capita.index if \"W\" in c.upper()]].head()"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"plt.figure(figsize=(8,3))\n",
"\n",
"plt.xlabel(\"GDP per capita\")\n",
"plt.ylabel('Life satisfaction')\n",
"\n",
"plt.plot(list(sample_data[\"GDP per capita\"]), list(sample_data[\"Life satisfaction\"]), \"bo\")\n",
"plt.plot(list(missing_data[\"GDP per capita\"]), list(missing_data[\"Life satisfaction\"]), \"rs\")\n",
"\n",
"X = np.linspace(0, 110000, 1000)\n",
"plt.plot(X, t0full + t1full * X, \"r--\", label=\"Linear model on all data\")\n",
"plt.plot(X, t0 + t1*X, \"b:\", label=\"Linear model on partial data\")\n",
"\n",
"ridge = linear_model.Ridge(alpha=10**9.5)\n",
"Xsample = np.c_[sample_data[\"GDP per capita\"]]\n",
"ysample = np.c_[sample_data[\"Life satisfaction\"]]\n",
"ridge.fit(Xsample, ysample)\n",
"t0ridge, t1ridge = ridge.intercept_[0], ridge.coef_[0][0]\n",
"plt.plot(X, t0ridge + t1ridge * X, \"b\", label=\"Regularized linear model on partial data\")\n",
"\n",
"plt.legend(loc=\"lower right\")\n",
"plt.axis([0, 110000, 0, 10])\n",
"save_fig('ridge_model_plot')\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# The code example below reloads the raw CSV files into oecd_bli / gdp_per_capita,\n",
"# so keep the prepared frames aside and restore them at the end of the cell.\n",
"backup = oecd_bli, gdp_per_capita\n",
"\n",
"def prepare_country_stats(oecd_bli, gdp_per_capita):\n",
"    \"\"\"Stand-in used by the code example: ignores its arguments and returns sample_data.\"\"\"\n",
"    return sample_data\n",
"\n",
"# Code example\n",
"########################################################################\n",
"from sklearn import neighbors\n",
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"# Load the data\n",
"oecd_bli = pd.read_csv(datapath+\"oecd_bli_2015.csv\", thousands=',')\n",
"gdp_per_capita = pd.read_csv(datapath+\"gdp_per_capita.csv\", thousands=',',delimiter='\\t',\n",
"                             encoding='latin1', na_values=\"n/a\")\n",
"\n",
"# Prepare the data\n",
"country_stats = prepare_country_stats(oecd_bli, gdp_per_capita)\n",
"X = np.c_[country_stats[\"GDP per capita\"]]\n",
"y = np.c_[country_stats[\"Life satisfaction\"]]\n",
"\n",
"# Visualize the data\n",
"country_stats.plot(kind='scatter', x=\"GDP per capita\", y='Life satisfaction')\n",
"plt.show()\n",
"\n",
"# Select a k-nearest-neighbors regression model\n",
"k_neigh_reg_model = neighbors.KNeighborsRegressor(n_neighbors=3)\n",
"\n",
"# Train the model\n",
"k_neigh_reg_model.fit(X, y)\n",
"\n",
"# Make a prediction for Cyprus\n",
"X_new = [[22587]] # Cyprus' GDP per capita\n",
"# Predict with the k-NN model trained above (not the earlier linear model):\n",
"# the result is the mean of the 3 nearest countries' life satisfaction\n",
"print(k_neigh_reg_model.predict(X_new)) # outputs [[ 5.76666667]]\n",
"########################################################################\n",
"\n",
"oecd_bli, gdp_per_capita = backup"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.1"
},
"nav_menu": {},
"toc": {
"navigate_menu": true,
"number_sections": true,
"sideBar": true,
"threshold": 6,
"toc_cell": false,
"toc_section_display": "block",
"toc_window_display": true
},
"toc_position": {
"height": "616px",
"left": "0px",
"right": "20px",
"top": "106px",
"width": "213px"
}
},
"nbformat": 4,
"nbformat_minor": 0
}