Add fundamentals and training_linear_models notebooks
parent
373535d8e4
commit
240f3d7828
|
@ -27,11 +27,26 @@
|
|||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from __future__ import division, print_function, unicode_literals"
|
||||
"from __future__ import division, print_function, unicode_literals\n",
|
||||
"\n",
|
||||
"%matplotlib inline\n",
|
||||
"import matplotlib.pyplot as plt\n",
|
||||
"plt.rcParams['axes.labelsize'] = 14\n",
|
||||
"plt.rcParams['xtick.labelsize'] = 12\n",
|
||||
"plt.rcParams['ytick.labelsize'] = 12\n",
|
||||
"\n",
|
||||
"PROJECT_ROOT_DIR = \".\"\n",
|
||||
"CHAPTER_ID = \"end_to_end_project\"\n",
|
||||
"\n",
|
||||
"def save_fig(fig_id):\n",
|
||||
" path = os.path.join(PROJECT_ROOT_DIR, \"images\", CHAPTER_ID, fig_id + \".png\")\n",
|
||||
" print(\"Saving figure\", fig_id)\n",
|
||||
" plt.tight_layout()\n",
|
||||
" plt.savefig(path, format='png', dpi=300)"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -157,7 +172,8 @@
|
|||
"source": [
|
||||
"%matplotlib inline\n",
|
||||
"import matplotlib.pyplot as plt\n",
|
||||
"housing.hist(bins=50, figsize=(20,15))\n",
|
||||
"housing.hist(bins=50, figsize=(11,8))\n",
|
||||
"save_fig(\"attribute_histogram_plots\")\n",
|
||||
"plt.show()"
|
||||
]
|
||||
},
|
||||
|
@ -171,6 +187,7 @@
|
|||
"source": [
|
||||
"import numpy as np\n",
|
||||
"import numpy.random as rnd\n",
|
||||
"rnd.seed(42) # to make this notebook's output identical at every run\n",
|
||||
"\n",
|
||||
"def split_train_test(data, test_ratio):\n",
|
||||
" shuffled_indices = rnd.permutation(len(data))\n",
|
||||
|
@ -349,7 +366,8 @@
|
|||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"housing.plot(kind=\"scatter\", x=\"longitude\", y=\"latitude\")"
|
||||
"housing.plot(kind=\"scatter\", x=\"longitude\", y=\"latitude\")\n",
|
||||
"save_fig(\"bad_visualization\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -360,23 +378,27 @@
|
|||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"housing.plot(kind=\"scatter\", x=\"longitude\", y=\"latitude\", alpha=0.1)"
|
||||
"housing.plot(kind=\"scatter\", x=\"longitude\", y=\"latitude\", alpha=0.1)\n",
|
||||
"save_fig(\"better_visualization\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 25,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
"collapsed": false,
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"housing.plot(kind=\"scatter\", x=\"longitude\", y=\"latitude\",\n",
|
||||
" s=housing['population']/100, label=\"population\",\n",
|
||||
" c=\"median_house_value\", cmap=plt.get_cmap(\"jet\"),\n",
|
||||
" colorbar=True, alpha=0.4,\n",
|
||||
" colorbar=True, alpha=0.4, figsize=(10,7),\n",
|
||||
")\n",
|
||||
"plt.legend()"
|
||||
"plt.legend()\n",
|
||||
"save_fig(\"housing_prices_scatterplot\")\n",
|
||||
"plt.show()"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -386,6 +408,36 @@
|
|||
"collapsed": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import matplotlib.image as mpimg\n",
|
||||
"california_img=mpimg.imread(PROJECT_ROOT_DIR + '/images/end_to_end_project/california.png')\n",
|
||||
"ax = housing.plot(kind=\"scatter\", x=\"longitude\", y=\"latitude\", figsize=(10,7),\n",
|
||||
" s=housing['population']/100, label=\"Population\",\n",
|
||||
" c=\"median_house_value\", cmap=plt.get_cmap(\"jet\"),\n",
|
||||
" colorbar=False, alpha=0.4,\n",
|
||||
" )\n",
|
||||
"plt.imshow(california_img, extent=[-124.55, -113.80, 32.45, 42.05], alpha=0.5)\n",
|
||||
"plt.ylabel(\"Latitude\", fontsize=14)\n",
|
||||
"plt.xlabel(\"Longitude\", fontsize=14)\n",
|
||||
"\n",
|
||||
"prices = housing[\"median_house_value\"]\n",
|
||||
"tick_values = np.linspace(prices.min(), prices.max(), 11)\n",
|
||||
"cbar = plt.colorbar()\n",
|
||||
"cbar.ax.set_yticklabels([\"$%dk\"%(round(v/1000)) for v in tick_values], fontsize=14)\n",
|
||||
"cbar.set_label('Median House Value', fontsize=16)\n",
|
||||
"\n",
|
||||
"plt.legend(fontsize=16)\n",
|
||||
"save_fig(\"california_housing_prices\")\n",
|
||||
"plt.show()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 27,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"corr_matrix = housing.corr()\n",
|
||||
"corr_matrix[\"median_house_value\"].sort_values(ascending=False)"
|
||||
|
@ -393,7 +445,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 27,
|
||||
"execution_count": 28,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
|
@ -401,12 +453,14 @@
|
|||
"source": [
|
||||
"housing.plot(kind=\"scatter\", x=\"median_income\", y=\"median_house_value\",\n",
|
||||
" alpha=0.3)\n",
|
||||
"plt.axis([0, 16, 0, 550000])"
|
||||
"plt.axis([0, 16, 0, 550000])\n",
|
||||
"save_fig(\"income_vs_house_value_scatterplot\")\n",
|
||||
"plt.show()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 28,
|
||||
"execution_count": 29,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
|
@ -415,13 +469,14 @@
|
|||
"from pandas.tools.plotting import scatter_matrix\n",
|
||||
"\n",
|
||||
"attributes = [\"median_house_value\", \"median_income\", \"total_rooms\", \"housing_median_age\"]\n",
|
||||
"scatter_matrix(housing[attributes], figsize=(12, 8))\n",
|
||||
"scatter_matrix(housing[attributes], figsize=(11, 8))\n",
|
||||
"save_fig(\"scatter_matrix_plot\")\n",
|
||||
"plt.show()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 29,
|
||||
"execution_count": 30,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
|
@ -434,7 +489,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 30,
|
||||
"execution_count": 31,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
|
@ -446,7 +501,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 31,
|
||||
"execution_count": 32,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
|
@ -460,7 +515,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 32,
|
||||
"execution_count": 33,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
|
@ -478,7 +533,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 33,
|
||||
"execution_count": 34,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
|
@ -490,7 +545,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 34,
|
||||
"execution_count": 35,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
|
@ -502,7 +557,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 35,
|
||||
"execution_count": 36,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
|
@ -513,7 +568,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 36,
|
||||
"execution_count": 37,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
|
@ -525,7 +580,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 37,
|
||||
"execution_count": 38,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
|
@ -539,7 +594,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 38,
|
||||
"execution_count": 39,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
|
@ -557,7 +612,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 39,
|
||||
"execution_count": 40,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
|
@ -568,7 +623,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 40,
|
||||
"execution_count": 41,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
|
@ -579,7 +634,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 41,
|
||||
"execution_count": 42,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
|
@ -590,7 +645,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 42,
|
||||
"execution_count": 43,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
|
@ -602,7 +657,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 43,
|
||||
"execution_count": 44,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
|
@ -618,7 +673,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 44,
|
||||
"execution_count": 45,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
|
@ -629,7 +684,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 45,
|
||||
"execution_count": 46,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
|
@ -644,7 +699,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 46,
|
||||
"execution_count": 47,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
|
@ -655,7 +710,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 47,
|
||||
"execution_count": 48,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
|
@ -669,7 +724,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 48,
|
||||
"execution_count": 49,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
|
@ -702,7 +757,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 49,
|
||||
"execution_count": 50,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
|
@ -722,7 +777,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 50,
|
||||
"execution_count": 51,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
|
@ -761,7 +816,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 51,
|
||||
"execution_count": 52,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
|
@ -773,7 +828,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 52,
|
||||
"execution_count": 53,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
|
@ -791,7 +846,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 53,
|
||||
"execution_count": 54,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
|
@ -805,7 +860,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 54,
|
||||
"execution_count": 55,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
|
@ -822,7 +877,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 55,
|
||||
"execution_count": 56,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
|
@ -838,7 +893,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 56,
|
||||
"execution_count": 57,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
|
@ -852,7 +907,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 57,
|
||||
"execution_count": 58,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
|
@ -877,7 +932,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 58,
|
||||
"execution_count": 59,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
|
@ -892,7 +947,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 59,
|
||||
"execution_count": 60,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
|
@ -908,7 +963,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 60,
|
||||
"execution_count": 61,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
|
@ -922,7 +977,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 61,
|
||||
"execution_count": 62,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
|
@ -940,7 +995,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 62,
|
||||
"execution_count": 63,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
|
@ -956,7 +1011,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 63,
|
||||
"execution_count": 64,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
|
@ -968,7 +1023,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 64,
|
||||
"execution_count": 65,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
|
@ -986,7 +1041,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 65,
|
||||
"execution_count": 66,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
|
@ -1006,7 +1061,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 66,
|
||||
"execution_count": 67,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
|
@ -1017,7 +1072,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 67,
|
||||
"execution_count": 68,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
|
@ -1028,7 +1083,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 68,
|
||||
"execution_count": 69,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
|
@ -1040,7 +1095,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 69,
|
||||
"execution_count": 70,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
|
@ -1062,7 +1117,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 70,
|
||||
"execution_count": 71,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
|
@ -1074,7 +1129,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 71,
|
||||
"execution_count": 72,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
|
@ -1086,7 +1141,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 72,
|
||||
"execution_count": 73,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
|
@ -1100,7 +1155,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 73,
|
||||
"execution_count": 74,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
|
@ -1138,7 +1193,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 74,
|
||||
"execution_count": 75,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
|
@ -1170,7 +1225,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 75,
|
||||
"execution_count": 76,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
|
@ -1181,7 +1236,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 76,
|
||||
"execution_count": 77,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
|
@ -1192,7 +1247,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 77,
|
||||
"execution_count": 78,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
|
@ -1211,7 +1266,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 78,
|
||||
"execution_count": 79,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
|
|
|
@ -0,0 +1,569 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"**Fundamentals of Machine Learning**"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {
|
||||
"collapsed": false,
|
||||
"slideshow": {
|
||||
"slide_type": "-"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from __future__ import division, print_function, unicode_literals\n",
|
||||
"\n",
|
||||
"%matplotlib inline\n",
|
||||
"import matplotlib.pyplot as plt\n",
|
||||
"\n",
|
||||
"plt.rcParams['axes.labelsize'] = 14\n",
|
||||
"plt.rcParams['xtick.labelsize'] = 12\n",
|
||||
"plt.rcParams['ytick.labelsize'] = 12\n",
|
||||
"\n",
|
||||
"PROJECT_ROOT_DIR = \".\"\n",
|
||||
"CHAPTER_ID = \"fundamentals\"\n",
|
||||
"\n",
|
||||
"def save_fig(fig_id):\n",
|
||||
" path = os.path.join(PROJECT_ROOT_DIR, \"images\", CHAPTER_ID, fig_id + \".png\")\n",
|
||||
" print(\"Saving figure\", fig_id)\n",
|
||||
" plt.tight_layout()\n",
|
||||
" plt.savefig(path, format='png', dpi=300)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Load and prepare Life satisfaction data"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"\n",
|
||||
"# Download CSV from http://stats.oecd.org/index.aspx?DataSetCode=BLI\n",
|
||||
"datapath = \"datasets/lifesat/\"\n",
|
||||
"\n",
|
||||
"oecd_bli = pd.read_csv(datapath+\"oecd_bli_2015.csv\", thousands=',')\n",
|
||||
"oecd_bli = oecd_bli[oecd_bli[\"INEQUALITY\"]==\"TOT\"]\n",
|
||||
"oecd_bli = oecd_bli.pivot(index=\"Country\", columns=\"Indicator\", values=\"Value\")\n",
|
||||
"oecd_bli.head(2)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"oecd_bli[\"Life satisfaction\"].head()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Load and prepare GDP per capita data"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Download data from http://goo.gl/j1MSKe (=> imf.org)\n",
|
||||
"gdp_per_capita = pd.read_csv(datapath+\"gdp_per_capita.csv\", thousands=',', delimiter='\\t',\n",
|
||||
" encoding='latin1', na_values=\"n/a\")\n",
|
||||
"gdp_per_capita.rename(columns={\"2015\": \"GDP per capita\"}, inplace=True)\n",
|
||||
"gdp_per_capita.set_index(\"Country\", inplace=True)\n",
|
||||
"gdp_per_capita.head(2)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"full_country_stats = pd.merge(left=oecd_bli, right=gdp_per_capita, left_index=True, right_index=True)\n",
|
||||
"full_country_stats.sort_values(by=\"GDP per capita\", inplace=\"True\")\n",
|
||||
"full_country_stats"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"full_country_stats[[\"GDP per capita\", 'Life satisfaction']].loc[\"United States\"]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"remove_indices = [0, 1, 6, 8, 33, 34, 35]\n",
|
||||
"keep_indices = list(set(range(36)) - set(remove_indices))\n",
|
||||
"\n",
|
||||
"sample_data = full_country_stats[[\"GDP per capita\", 'Life satisfaction']].iloc[keep_indices]\n",
|
||||
"missing_data = full_country_stats[[\"GDP per capita\", 'Life satisfaction']].iloc[remove_indices]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"sample_data.plot(kind='scatter', x=\"GDP per capita\", y='Life satisfaction', figsize=(5,3))\n",
|
||||
"plt.axis([0, 60000, 0, 10])\n",
|
||||
"position_text = {\n",
|
||||
" \"Hungary\": (5000, 1),\n",
|
||||
" \"Korea\": (18000, 1.7),\n",
|
||||
" \"France\": (29000, 2.4),\n",
|
||||
" \"Australia\": (40000, 3.1),\n",
|
||||
" \"United States\": (52000, 3.8),\n",
|
||||
"}\n",
|
||||
"for country, pos_text in position_text.items():\n",
|
||||
" pos_data_x, pos_data_y = sample_data.loc[country]\n",
|
||||
" country = \"U.S.\" if country == \"United States\" else country\n",
|
||||
" plt.annotate(country, xy=(pos_data_x, pos_data_y), xytext=pos_text,\n",
|
||||
" arrowprops=dict(facecolor='black', width=0.5, shrink=0.1, headwidth=5))\n",
|
||||
" plt.plot(pos_data_x, pos_data_y, \"ro\")\n",
|
||||
"save_fig('money_happy_scatterplot')\n",
|
||||
"plt.show()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"sample_data.loc[list(position_text.keys())]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import numpy as np\n",
|
||||
"\n",
|
||||
"sample_data.plot(kind='scatter', x=\"GDP per capita\", y='Life satisfaction', figsize=(5,3))\n",
|
||||
"plt.axis([0, 60000, 0, 10])\n",
|
||||
"X=np.linspace(0, 60000, 1000)\n",
|
||||
"plt.plot(X, 2*X/100000, \"r\")\n",
|
||||
"plt.text(40000, 2.7, r\"$\\theta_0 = 0$\", fontsize=14, color=\"r\")\n",
|
||||
"plt.text(40000, 1.8, r\"$\\theta_1 = 2 \\times 10^{-5}$\", fontsize=14, color=\"r\")\n",
|
||||
"plt.plot(X, 8 - 5*X/100000, \"g\")\n",
|
||||
"plt.text(5000, 9.1, r\"$\\theta_0 = 8$\", fontsize=14, color=\"g\")\n",
|
||||
"plt.text(5000, 8.2, r\"$\\theta_1 = -5 \\times 10^{-5}$\", fontsize=14, color=\"g\")\n",
|
||||
"plt.plot(X, 4 + 5*X/100000, \"b\")\n",
|
||||
"plt.text(5000, 3.5, r\"$\\theta_0 = 4$\", fontsize=14, color=\"b\")\n",
|
||||
"plt.text(5000, 2.6, r\"$\\theta_1 = 5 \\times 10^{-5}$\", fontsize=14, color=\"b\")\n",
|
||||
"save_fig('tweaking_model_params_plot')\n",
|
||||
"plt.show()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from sklearn import linear_model\n",
|
||||
"lin1 = linear_model.LinearRegression()\n",
|
||||
"Xsample = np.c_[sample_data[\"GDP per capita\"]]\n",
|
||||
"ysample = np.c_[sample_data[\"Life satisfaction\"]]\n",
|
||||
"lin1.fit(Xsample, ysample)\n",
|
||||
"t0, t1 = lin1.intercept_[0], lin1.coef_[0][0]\n",
|
||||
"t0, t1"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"sample_data.plot(kind='scatter', x=\"GDP per capita\", y='Life satisfaction', figsize=(5,3))\n",
|
||||
"plt.axis([0, 60000, 0, 10])\n",
|
||||
"X=np.linspace(0, 60000, 1000)\n",
|
||||
"plt.plot(X, t0 + t1*X, \"b\")\n",
|
||||
"plt.text(5000, 3.1, r\"$\\theta_0 = 4.85$\", fontsize=14, color=\"b\")\n",
|
||||
"plt.text(5000, 2.2, r\"$\\theta_1 = 4.91 \\times 10^{-5}$\", fontsize=14, color=\"b\")\n",
|
||||
"save_fig('best_fit_model_plot')\n",
|
||||
"plt.show()\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"cyprus_gdp_per_capita = gdp_per_capita.loc[\"Cyprus\"][\"GDP per capita\"]\n",
|
||||
"print(cyprus_gdp_per_capita)\n",
|
||||
"cyprus_predicted_life_satisfaction = lin1.predict(cyprus_gdp_per_capita)[0][0]\n",
|
||||
"cyprus_predicted_life_satisfaction"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"sample_data.plot(kind='scatter', x=\"GDP per capita\", y='Life satisfaction', figsize=(5,3), s=1)\n",
|
||||
"X=np.linspace(0, 60000, 1000)\n",
|
||||
"plt.plot(X, t0 + t1*X, \"b\")\n",
|
||||
"plt.axis([0, 60000, 0, 10])\n",
|
||||
"plt.text(5000, 7.5, r\"$\\theta_0 = 4.85$\", fontsize=14, color=\"b\")\n",
|
||||
"plt.text(5000, 6.6, r\"$\\theta_1 = 4.91 \\times 10^{-5}$\", fontsize=14, color=\"b\")\n",
|
||||
"plt.plot([cyprus_gdp_per_capita, cyprus_gdp_per_capita], [0, cyprus_predicted_life_satisfaction], \"r--\")\n",
|
||||
"plt.text(25000, 5.0, r\"Prediction = 5.96\", fontsize=14, color=\"b\")\n",
|
||||
"plt.plot(cyprus_gdp_per_capita, cyprus_predicted_life_satisfaction, \"ro\")\n",
|
||||
"save_fig('cyprus_prediction_plot')\n",
|
||||
"plt.show()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"sample_data[7:10]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"(5.1+5.7+6.5)/3"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 17,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"backup = oecd_bli, gdp_per_capita\n",
|
||||
"\n",
|
||||
"def prepare_country_stats(oecd_bli, gdp_per_capita):\n",
|
||||
" return sample_data\n",
|
||||
"\n",
|
||||
"# Code example\n",
|
||||
"########################################################################\n",
|
||||
"import sklearn\n",
|
||||
"import numpy as np\n",
|
||||
"import pandas as pd\n",
|
||||
"\n",
|
||||
"# Load the data\n",
|
||||
"oecd_bli = pd.read_csv(datapath+\"oecd_bli_2015.csv\", thousands=',')\n",
|
||||
"gdp_per_capita = pd.read_csv(datapath+\"gdp_per_capita.csv\", thousands=',',delimiter='\\t',\n",
|
||||
" encoding='latin1', na_values=\"n/a\")\n",
|
||||
"\n",
|
||||
"# Prepare the data\n",
|
||||
"country_stats = prepare_country_stats(oecd_bli, gdp_per_capita)\n",
|
||||
"X = np.c_[country_stats[\"GDP per capita\"]]\n",
|
||||
"y = np.c_[country_stats[\"Life satisfaction\"]]\n",
|
||||
"\n",
|
||||
"# Visualize the data\n",
|
||||
"country_stats.plot(kind='scatter', x=\"GDP per capita\", y='Life satisfaction')\n",
|
||||
"plt.show()\n",
|
||||
"\n",
|
||||
"# Select a linear model\n",
|
||||
"lin_reg_model = sklearn.linear_model.LinearRegression()\n",
|
||||
"\n",
|
||||
"# Train the model\n",
|
||||
"lin_reg_model.fit(X, y)\n",
|
||||
"\n",
|
||||
"# Make a prediction for Cyprus\n",
|
||||
"X_new = [[22587]] # Cyprus' GDP per capita\n",
|
||||
"print(lin_reg_model.predict(X_new)) # outputs [[ 5.96242338]]\n",
|
||||
"########################################################################\n",
|
||||
"\n",
|
||||
"oecd_bli, gdp_per_capita = backup"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 18,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"missing_data"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 19,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"position_text2 = {\n",
|
||||
" \"Brazil\": (1000, 9.0),\n",
|
||||
" \"Mexico\": (11000, 9.0),\n",
|
||||
" \"Chile\": (25000, 9.0),\n",
|
||||
" \"Czech Republic\": (35000, 9.0),\n",
|
||||
" \"Norway\": (60000, 3),\n",
|
||||
" \"Switzerland\": (72000, 3.0),\n",
|
||||
" \"Luxembourg\": (90000, 3.0),\n",
|
||||
"}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 20,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"sample_data.plot(kind='scatter', x=\"GDP per capita\", y='Life satisfaction', figsize=(8,3))\n",
|
||||
"plt.axis([0, 110000, 0, 10])\n",
|
||||
"\n",
|
||||
"for country, pos_text in position_text2.items():\n",
|
||||
" pos_data_x, pos_data_y = missing_data.loc[country]\n",
|
||||
" plt.annotate(country, xy=(pos_data_x, pos_data_y), xytext=pos_text,\n",
|
||||
" arrowprops=dict(facecolor='black', width=0.5, shrink=0.1, headwidth=5))\n",
|
||||
" plt.plot(pos_data_x, pos_data_y, \"rs\")\n",
|
||||
"\n",
|
||||
"X=np.linspace(0, 110000, 1000)\n",
|
||||
"plt.plot(X, t0 + t1*X, \"b:\")\n",
|
||||
"\n",
|
||||
"lin_reg_full = linear_model.LinearRegression()\n",
|
||||
"Xfull = np.c_[full_country_stats[\"GDP per capita\"]]\n",
|
||||
"yfull = np.c_[full_country_stats[\"Life satisfaction\"]]\n",
|
||||
"lin_reg_full.fit(Xfull, yfull)\n",
|
||||
"\n",
|
||||
"t0full, t1full = lin_reg_full.intercept_[0], lin_reg_full.coef_[0][0]\n",
|
||||
"X = np.linspace(0, 110000, 1000)\n",
|
||||
"plt.plot(X, t0full + t1full * X, \"k\")\n",
|
||||
"\n",
|
||||
"save_fig('representative_training_data_scatterplot')\n",
|
||||
"plt.show()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 21,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"full_country_stats.plot(kind='scatter', x=\"GDP per capita\", y='Life satisfaction', figsize=(8,3))\n",
|
||||
"plt.axis([0, 110000, 0, 10])\n",
|
||||
"\n",
|
||||
"from sklearn import preprocessing\n",
|
||||
"from sklearn import pipeline\n",
|
||||
"\n",
|
||||
"poly = preprocessing.PolynomialFeatures(degree=60, include_bias=False)\n",
|
||||
"scaler = preprocessing.StandardScaler()\n",
|
||||
"lin_reg2 = linear_model.LinearRegression()\n",
|
||||
"\n",
|
||||
"pipeline_reg = pipeline.Pipeline([('poly', poly), ('scal', scaler), ('lin', lin_reg2)])\n",
|
||||
"pipeline_reg.fit(Xfull, yfull)\n",
|
||||
"curve = pipeline_reg.predict(X[:, np.newaxis])\n",
|
||||
"plt.plot(X, curve)\n",
|
||||
"save_fig('overfitting_model_plot')\n",
|
||||
"plt.show()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 22,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"full_country_stats.loc[[c for c in full_country_stats.index if \"W\" in c.upper()]][\"Life satisfaction\"]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 23,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"gdp_per_capita.loc[[c for c in gdp_per_capita.index if \"W\" in c.upper()]].head()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 24,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"plt.figure(figsize=(8,3))\n",
|
||||
"\n",
|
||||
"plt.xlabel(\"GDP per capita\")\n",
|
||||
"plt.ylabel('Life satisfaction')\n",
|
||||
"\n",
|
||||
"plt.plot(list(sample_data[\"GDP per capita\"]), list(sample_data[\"Life satisfaction\"]), \"bo\")\n",
|
||||
"plt.plot(list(missing_data[\"GDP per capita\"]), list(missing_data[\"Life satisfaction\"]), \"rs\")\n",
|
||||
"\n",
|
||||
"X = np.linspace(0, 110000, 1000)\n",
|
||||
"plt.plot(X, t0full + t1full * X, \"r--\", label=\"Linear model on all data\")\n",
|
||||
"plt.plot(X, t0 + t1*X, \"b:\", label=\"Linear model on partial data\")\n",
|
||||
"\n",
|
||||
"ridge = linear_model.Ridge(alpha=10**9.5)\n",
|
||||
"Xsample = np.c_[sample_data[\"GDP per capita\"]]\n",
|
||||
"ysample = np.c_[sample_data[\"Life satisfaction\"]]\n",
|
||||
"ridge.fit(Xsample, ysample)\n",
|
||||
"t0ridge, t1ridge = ridge.intercept_[0], ridge.coef_[0][0]\n",
|
||||
"plt.plot(X, t0ridge + t1ridge * X, \"b\", label=\"Regularized linear model on partial data\")\n",
|
||||
"\n",
|
||||
"plt.legend(loc=\"lower right\")\n",
|
||||
"plt.axis([0, 110000, 0, 10])\n",
|
||||
"save_fig('ridge_model_plot')\n",
|
||||
"plt.show()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 25,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"backup = oecd_bli, gdp_per_capita\n",
|
||||
"\n",
|
||||
"def prepare_country_stats(oecd_bli, gdp_per_capita):\n",
|
||||
" return sample_data\n",
|
||||
"\n",
|
||||
"# Code example\n",
|
||||
"########################################################################\n",
|
||||
"from sklearn import neighbors\n",
|
||||
"import numpy as np\n",
|
||||
"import pandas as pd\n",
|
||||
"\n",
|
||||
"# Load the data\n",
|
||||
"oecd_bli = pd.read_csv(datapath+\"oecd_bli_2015.csv\", thousands=',')\n",
|
||||
"gdp_per_capita = pd.read_csv(datapath+\"gdp_per_capita.csv\", thousands=',',delimiter='\\t',\n",
|
||||
" encoding='latin1', na_values=\"n/a\")\n",
|
||||
"\n",
|
||||
"# Prepare the data\n",
|
||||
"country_stats = prepare_country_stats(oecd_bli, gdp_per_capita)\n",
|
||||
"X = np.c_[country_stats[\"GDP per capita\"]]\n",
|
||||
"y = np.c_[country_stats[\"Life satisfaction\"]]\n",
|
||||
"\n",
|
||||
"# Visualize the data\n",
|
||||
"country_stats.plot(kind='scatter', x=\"GDP per capita\", y='Life satisfaction')\n",
|
||||
"plt.show()\n",
|
||||
"\n",
|
||||
"# Select a k-neighboors regression model\n",
|
||||
"k_neigh_reg_model = neighbors.KNeighborsRegressor(n_neighbors=3)\n",
|
||||
"\n",
|
||||
"# Train the model\n",
|
||||
"k_neigh_reg_model.fit(X, y)\n",
|
||||
"\n",
|
||||
"# Make a prediction for Cyprus\n",
|
||||
"X_new = [[22587]] # Cyprus' GDP per capita\n",
|
||||
"print(lin_reg_model.predict(X_new)) # outputs [[ 5.96242338]]\n",
|
||||
"########################################################################\n",
|
||||
"\n",
|
||||
"oecd_bli, gdp_per_capita = backup"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.5.1"
|
||||
},
|
||||
"toc": {
|
||||
"toc_cell": false,
|
||||
"toc_number_sections": false,
|
||||
"toc_section_display": "block",
|
||||
"toc_threshold": 6,
|
||||
"toc_window_display": true
|
||||
},
|
||||
"toc_position": {
|
||||
"height": "61px",
|
||||
"left": "1135.97px",
|
||||
"right": "20px",
|
||||
"top": "120px",
|
||||
"width": "213px"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 0
|
||||
}
|
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue