diff --git a/end_to_end_project.ipynb b/end_to_end_project.ipynb index 1f64da1..23927e4 100644 --- a/end_to_end_project.ipynb +++ b/end_to_end_project.ipynb @@ -27,11 +27,26 @@ "cell_type": "code", "execution_count": 1, "metadata": { - "collapsed": true + "collapsed": false }, "outputs": [], "source": [ - "from __future__ import division, print_function, unicode_literals" + "from __future__ import division, print_function, unicode_literals\n", + "\n", + "%matplotlib inline\n", + "import matplotlib.pyplot as plt\n", + "plt.rcParams['axes.labelsize'] = 14\n", + "plt.rcParams['xtick.labelsize'] = 12\n", + "plt.rcParams['ytick.labelsize'] = 12\n", + "\n", + "PROJECT_ROOT_DIR = \".\"\n", + "CHAPTER_ID = \"end_to_end_project\"\n", + "\n", + "def save_fig(fig_id):\n", + " path = os.path.join(PROJECT_ROOT_DIR, \"images\", CHAPTER_ID, fig_id + \".png\")\n", + " print(\"Saving figure\", fig_id)\n", + " plt.tight_layout()\n", + " plt.savefig(path, format='png', dpi=300)" ] }, { @@ -157,7 +172,8 @@ "source": [ "%matplotlib inline\n", "import matplotlib.pyplot as plt\n", - "housing.hist(bins=50, figsize=(20,15))\n", + "housing.hist(bins=50, figsize=(11,8))\n", + "save_fig(\"attribute_histogram_plots\")\n", "plt.show()" ] }, @@ -171,6 +187,7 @@ "source": [ "import numpy as np\n", "import numpy.random as rnd\n", + "rnd.seed(42) # to make this notebook's output identical at every run\n", "\n", "def split_train_test(data, test_ratio):\n", " shuffled_indices = rnd.permutation(len(data))\n", @@ -349,7 +366,8 @@ }, "outputs": [], "source": [ - "housing.plot(kind=\"scatter\", x=\"longitude\", y=\"latitude\")" + "housing.plot(kind=\"scatter\", x=\"longitude\", y=\"latitude\")\n", + "save_fig(\"bad_visualization\")" ] }, { @@ -360,23 +378,27 @@ }, "outputs": [], "source": [ - "housing.plot(kind=\"scatter\", x=\"longitude\", y=\"latitude\", alpha=0.1)" + "housing.plot(kind=\"scatter\", x=\"longitude\", y=\"latitude\", alpha=0.1)\n", + "save_fig(\"better_visualization\")" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { - "collapsed": false + "collapsed": false, + "scrolled": true }, "outputs": [], "source": [ "housing.plot(kind=\"scatter\", x=\"longitude\", y=\"latitude\",\n", " s=housing['population']/100, label=\"population\",\n", " c=\"median_house_value\", cmap=plt.get_cmap(\"jet\"),\n", - " colorbar=True, alpha=0.4,\n", + " colorbar=True, alpha=0.4, figsize=(10,7),\n", ")\n", - "plt.legend()" + "plt.legend()\n", + "save_fig(\"housing_prices_scatterplot\")\n", + "plt.show()" ] }, { @@ -386,6 +408,36 @@ "collapsed": false }, "outputs": [], + "source": [ + "import matplotlib.image as mpimg\n", + "california_img=mpimg.imread(PROJECT_ROOT_DIR + '/images/end_to_end_project/california.png')\n", + "ax = housing.plot(kind=\"scatter\", x=\"longitude\", y=\"latitude\", figsize=(10,7),\n", + " s=housing['population']/100, label=\"Population\",\n", + " c=\"median_house_value\", cmap=plt.get_cmap(\"jet\"),\n", + " colorbar=False, alpha=0.4,\n", + " )\n", + "plt.imshow(california_img, extent=[-124.55, -113.80, 32.45, 42.05], alpha=0.5)\n", + "plt.ylabel(\"Latitude\", fontsize=14)\n", + "plt.xlabel(\"Longitude\", fontsize=14)\n", + "\n", + "prices = housing[\"median_house_value\"]\n", + "tick_values = np.linspace(prices.min(), prices.max(), 11)\n", + "cbar = plt.colorbar()\n", + "cbar.ax.set_yticklabels([\"$%dk\"%(round(v/1000)) for v in tick_values], fontsize=14)\n", + "cbar.set_label('Median House Value', fontsize=16)\n", + "\n", + "plt.legend(fontsize=16)\n", + "save_fig(\"california_housing_prices\")\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "collapsed": false + }, + "outputs": [], "source": [ "corr_matrix = housing.corr()\n", "corr_matrix[\"median_house_value\"].sort_values(ascending=False)" @@ -393,7 +445,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 28, "metadata": { "collapsed": false }, @@ -401,12 +453,14 @@ "source": [ "housing.plot(kind=\"scatter\", x=\"median_income\", y=\"median_house_value\",\n", " alpha=0.3)\n", - "plt.axis([0, 16, 0, 550000])" + "plt.axis([0, 16, 0, 550000])\n", + "save_fig(\"income_vs_house_value_scatterplot\")\n", + "plt.show()" ] }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 29, "metadata": { "collapsed": false }, @@ -415,13 +469,14 @@ "from pandas.tools.plotting import scatter_matrix\n", "\n", "attributes = [\"median_house_value\", \"median_income\", \"total_rooms\", \"housing_median_age\"]\n", - "scatter_matrix(housing[attributes], figsize=(12, 8))\n", + "scatter_matrix(housing[attributes], figsize=(11, 8))\n", + "save_fig(\"scatter_matrix_plot\")\n", "plt.show()" ] }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 30, "metadata": { "collapsed": true }, @@ -434,7 +489,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 31, "metadata": { "collapsed": false }, @@ -446,7 +501,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 32, "metadata": { "collapsed": false }, @@ -460,7 +515,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 33, "metadata": { "collapsed": false }, @@ -478,7 +533,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 34, "metadata": { "collapsed": true }, @@ -490,7 +545,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 35, "metadata": { "collapsed": false }, @@ -502,7 +557,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 36, "metadata": { "collapsed": false }, @@ -513,7 +568,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 37, "metadata": { "collapsed": false }, @@ -525,7 +580,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 38, "metadata": { "collapsed": false }, @@ -539,7 +594,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 39, "metadata": { "collapsed": false }, @@ -557,7 +612,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 40, "metadata": { "collapsed": false }, @@ -568,7 +623,7 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 41, "metadata": { "collapsed": false }, @@ -579,7 +634,7 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 42, "metadata": { "collapsed": false }, @@ -590,7 +645,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 43, "metadata": { "collapsed": false }, @@ -602,7 +657,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 44, "metadata": { "collapsed": false }, @@ -618,7 +673,7 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 45, "metadata": { "collapsed": false }, @@ -629,7 +684,7 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 46, "metadata": { "collapsed": false }, @@ -644,7 +699,7 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 47, "metadata": { "collapsed": false }, @@ -655,7 +710,7 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 48, "metadata": { "collapsed": false }, @@ -669,7 +724,7 @@ }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 49, "metadata": { "collapsed": false }, @@ -702,7 +757,7 @@ }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 50, "metadata": { "collapsed": false }, @@ -722,7 +777,7 @@ }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 51, "metadata": { "collapsed": false }, @@ -761,7 +816,7 @@ }, { "cell_type": "code", - "execution_count": 51, + "execution_count": 52, "metadata": { "collapsed": false }, @@ -773,7 +828,7 @@ }, { "cell_type": "code", - "execution_count": 52, + "execution_count": 53, "metadata": { "collapsed": false }, @@ -791,7 +846,7 @@ }, { "cell_type": "code", - "execution_count": 53, + "execution_count": 54, "metadata": { "collapsed": false }, @@ -805,7 +860,7 @@ }, { "cell_type": "code", - "execution_count": 54, + "execution_count": 55, "metadata": { "collapsed": false }, @@ -822,7 +877,7 @@ }, { "cell_type": "code", - "execution_count": 55, + "execution_count": 56, "metadata": { "collapsed": false }, @@ -838,7 +893,7 @@ }, { "cell_type": "code", - "execution_count": 56, + "execution_count": 57, "metadata": { "collapsed": false }, @@ -852,7 +907,7 @@ }, { "cell_type": "code", - "execution_count": 57, + "execution_count": 58, "metadata": { "collapsed": false }, @@ -877,7 +932,7 @@ }, { "cell_type": "code", - "execution_count": 58, + "execution_count": 59, "metadata": { "collapsed": false }, @@ -892,7 +947,7 @@ }, { "cell_type": "code", - "execution_count": 59, + "execution_count": 60, "metadata": { "collapsed": false }, @@ -908,7 +963,7 @@ }, { "cell_type": "code", - "execution_count": 60, + "execution_count": 61, "metadata": { "collapsed": false }, @@ -922,7 +977,7 @@ }, { "cell_type": "code", - "execution_count": 61, + "execution_count": 62, "metadata": { "collapsed": false }, @@ -940,7 +995,7 @@ }, { "cell_type": "code", - "execution_count": 62, + "execution_count": 63, "metadata": { "collapsed": false }, @@ -956,7 +1011,7 @@ }, { "cell_type": "code", - "execution_count": 63, + "execution_count": 64, "metadata": { "collapsed": false }, @@ -968,7 +1023,7 @@ }, { "cell_type": "code", - "execution_count": 64, + "execution_count": 65, "metadata": { "collapsed": false }, @@ -986,7 +1041,7 @@ }, { "cell_type": "code", - "execution_count": 65, + "execution_count": 66, "metadata": { "collapsed": false }, @@ -1006,7 +1061,7 @@ }, { "cell_type": "code", - "execution_count": 66, + "execution_count": 67, "metadata": { "collapsed": false }, @@ -1017,7 +1072,7 @@ }, { "cell_type": "code", - "execution_count": 67, + "execution_count": 68, "metadata": { "collapsed": false }, @@ -1028,7 +1083,7 @@ }, { "cell_type": "code", - "execution_count": 68, + "execution_count": 69, "metadata": { "collapsed": false }, @@ -1040,7 +1095,7 @@ }, { "cell_type": "code", - "execution_count": 69, + "execution_count": 70, "metadata": { "collapsed": false }, @@ -1062,7 +1117,7 @@ }, { "cell_type": "code", - "execution_count": 70, + "execution_count": 71, "metadata": { "collapsed": false }, @@ -1074,7 +1129,7 @@ }, { "cell_type": "code", - "execution_count": 71, + "execution_count": 72, "metadata": { "collapsed": false }, @@ -1086,7 +1141,7 @@ }, { "cell_type": "code", - "execution_count": 72, + "execution_count": 73, "metadata": { "collapsed": false }, @@ -1100,7 +1155,7 @@ }, { "cell_type": "code", - "execution_count": 73, + "execution_count": 74, "metadata": { "collapsed": false }, @@ -1138,7 +1193,7 @@ }, { "cell_type": "code", - "execution_count": 74, + "execution_count": 75, "metadata": { "collapsed": false }, @@ -1170,7 +1225,7 @@ }, { "cell_type": "code", - "execution_count": 75, + "execution_count": 76, "metadata": { "collapsed": true }, @@ -1181,7 +1236,7 @@ }, { "cell_type": "code", - "execution_count": 76, + "execution_count": 77, "metadata": { "collapsed": false }, @@ -1192,7 +1247,7 @@ }, { "cell_type": "code", - "execution_count": 77, + "execution_count": 78, "metadata": { "collapsed": false }, @@ -1211,7 +1266,7 @@ }, { "cell_type": "code", - "execution_count": 78, + "execution_count": 79, "metadata": { "collapsed": false }, diff --git a/fundamentals.ipynb b/fundamentals.ipynb new file mode 100644 index 0000000..d3a1606 --- /dev/null +++ b/fundamentals.ipynb @@ -0,0 +1,569 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Fundamentals of Machine Learning**" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": false, + "slideshow": { + "slide_type": "-" + } + }, + "outputs": [], + "source": [ + "from __future__ import division, print_function, unicode_literals\n", + "\n", + "%matplotlib inline\n", + "import matplotlib.pyplot as plt\n", + "\n", + "plt.rcParams['axes.labelsize'] = 14\n", + "plt.rcParams['xtick.labelsize'] = 12\n", + "plt.rcParams['ytick.labelsize'] = 12\n", + "\n", + "PROJECT_ROOT_DIR = \".\"\n", + "CHAPTER_ID = \"fundamentals\"\n", + "\n", + "def save_fig(fig_id):\n", + " path = os.path.join(PROJECT_ROOT_DIR, \"images\", CHAPTER_ID, fig_id + \".png\")\n", + " print(\"Saving figure\", fig_id)\n", + " plt.tight_layout()\n", + " plt.savefig(path, format='png', dpi=300)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load and prepare Life satisfaction data" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "# Download CSV from http://stats.oecd.org/index.aspx?DataSetCode=BLI\n", + "datapath = \"datasets/lifesat/\"\n", + "\n", + "oecd_bli = pd.read_csv(datapath+\"oecd_bli_2015.csv\", thousands=',')\n", + "oecd_bli = oecd_bli[oecd_bli[\"INEQUALITY\"]==\"TOT\"]\n", + "oecd_bli = oecd_bli.pivot(index=\"Country\", columns=\"Indicator\", values=\"Value\")\n", + "oecd_bli.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "oecd_bli[\"Life satisfaction\"].head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load and prepare GDP per capita data" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# Download data from http://goo.gl/j1MSKe (=> imf.org)\n", + "gdp_per_capita = pd.read_csv(datapath+\"gdp_per_capita.csv\", thousands=',', delimiter='\\t',\n", + " encoding='latin1', na_values=\"n/a\")\n", + "gdp_per_capita.rename(columns={\"2015\": \"GDP per capita\"}, inplace=True)\n", + "gdp_per_capita.set_index(\"Country\", inplace=True)\n", + "gdp_per_capita.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "full_country_stats = pd.merge(left=oecd_bli, right=gdp_per_capita, left_index=True, right_index=True)\n", + "full_country_stats.sort_values(by=\"GDP per capita\", inplace=\"True\")\n", + "full_country_stats" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "full_country_stats[[\"GDP per capita\", 'Life satisfaction']].loc[\"United States\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "remove_indices = [0, 1, 6, 8, 33, 34, 35]\n", + "keep_indices = list(set(range(36)) - set(remove_indices))\n", + "\n", + "sample_data = full_country_stats[[\"GDP per capita\", 'Life satisfaction']].iloc[keep_indices]\n", + "missing_data = full_country_stats[[\"GDP per capita\", 'Life satisfaction']].iloc[remove_indices]" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "sample_data.plot(kind='scatter', x=\"GDP per capita\", y='Life satisfaction', figsize=(5,3))\n", + "plt.axis([0, 60000, 0, 10])\n", + "position_text = {\n", + " \"Hungary\": (5000, 1),\n", + " \"Korea\": (18000, 1.7),\n", + " \"France\": (29000, 2.4),\n", + " \"Australia\": (40000, 3.1),\n", + " \"United States\": (52000, 3.8),\n", + "}\n", + "for country, pos_text in position_text.items():\n", + " pos_data_x, pos_data_y = sample_data.loc[country]\n", + " country = \"U.S.\" if country == \"United States\" else country\n", + " plt.annotate(country, xy=(pos_data_x, pos_data_y), xytext=pos_text,\n", + " arrowprops=dict(facecolor='black', width=0.5, shrink=0.1, headwidth=5))\n", + " plt.plot(pos_data_x, pos_data_y, \"ro\")\n", + "save_fig('money_happy_scatterplot')\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "sample_data.loc[list(position_text.keys())]" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "\n", + "sample_data.plot(kind='scatter', x=\"GDP per capita\", y='Life satisfaction', figsize=(5,3))\n", + "plt.axis([0, 60000, 0, 10])\n", + "X=np.linspace(0, 60000, 1000)\n", + "plt.plot(X, 2*X/100000, \"r\")\n", + "plt.text(40000, 2.7, r\"$\\theta_0 = 0$\", fontsize=14, color=\"r\")\n", + "plt.text(40000, 1.8, r\"$\\theta_1 = 2 \\times 10^{-5}$\", fontsize=14, color=\"r\")\n", + "plt.plot(X, 8 - 5*X/100000, \"g\")\n", + "plt.text(5000, 9.1, r\"$\\theta_0 = 8$\", fontsize=14, color=\"g\")\n", + "plt.text(5000, 8.2, r\"$\\theta_1 = -5 \\times 10^{-5}$\", fontsize=14, color=\"g\")\n", + "plt.plot(X, 4 + 5*X/100000, \"b\")\n", + "plt.text(5000, 3.5, r\"$\\theta_0 = 4$\", fontsize=14, color=\"b\")\n", + "plt.text(5000, 2.6, r\"$\\theta_1 = 5 \\times 10^{-5}$\", fontsize=14, color=\"b\")\n", + "save_fig('tweaking_model_params_plot')\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from sklearn import linear_model\n", + "lin1 = linear_model.LinearRegression()\n", + "Xsample = np.c_[sample_data[\"GDP per capita\"]]\n", + "ysample = np.c_[sample_data[\"Life satisfaction\"]]\n", + "lin1.fit(Xsample, ysample)\n", + "t0, t1 = lin1.intercept_[0], lin1.coef_[0][0]\n", + "t0, t1" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "sample_data.plot(kind='scatter', x=\"GDP per capita\", y='Life satisfaction', figsize=(5,3))\n", + "plt.axis([0, 60000, 0, 10])\n", + "X=np.linspace(0, 60000, 1000)\n", + "plt.plot(X, t0 + t1*X, \"b\")\n", + "plt.text(5000, 3.1, r\"$\\theta_0 = 4.85$\", fontsize=14, color=\"b\")\n", + "plt.text(5000, 2.2, r\"$\\theta_1 = 4.91 \\times 10^{-5}$\", fontsize=14, color=\"b\")\n", + "save_fig('best_fit_model_plot')\n", + "plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "cyprus_gdp_per_capita = gdp_per_capita.loc[\"Cyprus\"][\"GDP per capita\"]\n", + "print(cyprus_gdp_per_capita)\n", + "cyprus_predicted_life_satisfaction = lin1.predict(cyprus_gdp_per_capita)[0][0]\n", + "cyprus_predicted_life_satisfaction" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "sample_data.plot(kind='scatter', x=\"GDP per capita\", y='Life satisfaction', figsize=(5,3), s=1)\n", + "X=np.linspace(0, 60000, 1000)\n", + "plt.plot(X, t0 + t1*X, \"b\")\n", + "plt.axis([0, 60000, 0, 10])\n", + "plt.text(5000, 7.5, r\"$\\theta_0 = 4.85$\", fontsize=14, color=\"b\")\n", + "plt.text(5000, 6.6, r\"$\\theta_1 = 4.91 \\times 10^{-5}$\", fontsize=14, color=\"b\")\n", + "plt.plot([cyprus_gdp_per_capita, cyprus_gdp_per_capita], [0, cyprus_predicted_life_satisfaction], \"r--\")\n", + "plt.text(25000, 5.0, r\"Prediction = 5.96\", fontsize=14, color=\"b\")\n", + "plt.plot(cyprus_gdp_per_capita, cyprus_predicted_life_satisfaction, \"ro\")\n", + "save_fig('cyprus_prediction_plot')\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "sample_data[7:10]" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "(5.1+5.7+6.5)/3" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "backup = oecd_bli, gdp_per_capita\n", + "\n", + "def prepare_country_stats(oecd_bli, gdp_per_capita):\n", + " return sample_data\n", + "\n", + "# Code example\n", + "########################################################################\n", + "import sklearn\n", + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "# Load the data\n", + "oecd_bli = pd.read_csv(datapath+\"oecd_bli_2015.csv\", thousands=',')\n", + "gdp_per_capita = pd.read_csv(datapath+\"gdp_per_capita.csv\", thousands=',',delimiter='\\t',\n", + " encoding='latin1', na_values=\"n/a\")\n", + "\n", + "# Prepare the data\n", + "country_stats = prepare_country_stats(oecd_bli, gdp_per_capita)\n", + "X = np.c_[country_stats[\"GDP per capita\"]]\n", + "y = np.c_[country_stats[\"Life satisfaction\"]]\n", + "\n", + "# Visualize the data\n", + "country_stats.plot(kind='scatter', x=\"GDP per capita\", y='Life satisfaction')\n", + "plt.show()\n", + "\n", + "# Select a linear model\n", + "lin_reg_model = sklearn.linear_model.LinearRegression()\n", + "\n", + "# Train the model\n", + "lin_reg_model.fit(X, y)\n", + "\n", + "# Make a prediction for Cyprus\n", + "X_new = [[22587]] # Cyprus' GDP per capita\n", + "print(lin_reg_model.predict(X_new)) # outputs [[ 5.96242338]]\n", + "########################################################################\n", + "\n", + "oecd_bli, gdp_per_capita = backup" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "missing_data" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "position_text2 = {\n", + " \"Brazil\": (1000, 9.0),\n", + " \"Mexico\": (11000, 9.0),\n", + " \"Chile\": (25000, 9.0),\n", + " \"Czech Republic\": (35000, 9.0),\n", + " \"Norway\": (60000, 3),\n", + " \"Switzerland\": (72000, 3.0),\n", + " \"Luxembourg\": (90000, 3.0),\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "sample_data.plot(kind='scatter', x=\"GDP per capita\", y='Life satisfaction', figsize=(8,3))\n", + "plt.axis([0, 110000, 0, 10])\n", + "\n", + "for country, pos_text in position_text2.items():\n", + " pos_data_x, pos_data_y = missing_data.loc[country]\n", + " plt.annotate(country, xy=(pos_data_x, pos_data_y), xytext=pos_text,\n", + " arrowprops=dict(facecolor='black', width=0.5, shrink=0.1, headwidth=5))\n", + " plt.plot(pos_data_x, pos_data_y, \"rs\")\n", + "\n", + "X=np.linspace(0, 110000, 1000)\n", + "plt.plot(X, t0 + t1*X, \"b:\")\n", + "\n", + "lin_reg_full = linear_model.LinearRegression()\n", + "Xfull = np.c_[full_country_stats[\"GDP per capita\"]]\n", + "yfull = np.c_[full_country_stats[\"Life satisfaction\"]]\n", + "lin_reg_full.fit(Xfull, yfull)\n", + "\n", + "t0full, t1full = lin_reg_full.intercept_[0], lin_reg_full.coef_[0][0]\n", + "X = np.linspace(0, 110000, 1000)\n", + "plt.plot(X, t0full + t1full * X, \"k\")\n", + "\n", + "save_fig('representative_training_data_scatterplot')\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "full_country_stats.plot(kind='scatter', x=\"GDP per capita\", y='Life satisfaction', figsize=(8,3))\n", + "plt.axis([0, 110000, 0, 10])\n", + "\n", + "from sklearn import preprocessing\n", + "from sklearn import pipeline\n", + "\n", + "poly = preprocessing.PolynomialFeatures(degree=60, include_bias=False)\n", + "scaler = preprocessing.StandardScaler()\n", + "lin_reg2 = linear_model.LinearRegression()\n", + "\n", + "pipeline_reg = pipeline.Pipeline([('poly', poly), ('scal', scaler), ('lin', lin_reg2)])\n", + "pipeline_reg.fit(Xfull, yfull)\n", + "curve = pipeline_reg.predict(X[:, np.newaxis])\n", + "plt.plot(X, curve)\n", + "save_fig('overfitting_model_plot')\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "full_country_stats.loc[[c for c in full_country_stats.index if \"W\" in c.upper()]][\"Life satisfaction\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "gdp_per_capita.loc[[c for c in gdp_per_capita.index if \"W\" in c.upper()]].head()" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "plt.figure(figsize=(8,3))\n", + "\n", + "plt.xlabel(\"GDP per capita\")\n", + "plt.ylabel('Life satisfaction')\n", + "\n", + "plt.plot(list(sample_data[\"GDP per capita\"]), list(sample_data[\"Life satisfaction\"]), \"bo\")\n", + "plt.plot(list(missing_data[\"GDP per capita\"]), list(missing_data[\"Life satisfaction\"]), \"rs\")\n", + "\n", + "X = np.linspace(0, 110000, 1000)\n", + "plt.plot(X, t0full + t1full * X, \"r--\", label=\"Linear model on all data\")\n", + "plt.plot(X, t0 + t1*X, \"b:\", label=\"Linear model on partial data\")\n", + "\n", + "ridge = linear_model.Ridge(alpha=10**9.5)\n", + "Xsample = np.c_[sample_data[\"GDP per capita\"]]\n", + "ysample = np.c_[sample_data[\"Life satisfaction\"]]\n", + "ridge.fit(Xsample, ysample)\n", + "t0ridge, t1ridge = ridge.intercept_[0], ridge.coef_[0][0]\n", + "plt.plot(X, t0ridge + t1ridge * X, \"b\", label=\"Regularized linear model on partial data\")\n", + "\n", + "plt.legend(loc=\"lower right\")\n", + "plt.axis([0, 110000, 0, 10])\n", + "save_fig('ridge_model_plot')\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "backup = oecd_bli, gdp_per_capita\n", + "\n", + "def prepare_country_stats(oecd_bli, gdp_per_capita):\n", + " return sample_data\n", + "\n", + "# Code example\n", + "########################################################################\n", + "from sklearn import neighbors\n", + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "# Load the data\n", + "oecd_bli = pd.read_csv(datapath+\"oecd_bli_2015.csv\", thousands=',')\n", + "gdp_per_capita = pd.read_csv(datapath+\"gdp_per_capita.csv\", thousands=',',delimiter='\\t',\n", + " encoding='latin1', na_values=\"n/a\")\n", + "\n", + "# Prepare the data\n", + "country_stats = prepare_country_stats(oecd_bli, gdp_per_capita)\n", + "X = np.c_[country_stats[\"GDP per capita\"]]\n", + "y = np.c_[country_stats[\"Life satisfaction\"]]\n", + "\n", + "# Visualize the data\n", + "country_stats.plot(kind='scatter', x=\"GDP per capita\", y='Life satisfaction')\n", + "plt.show()\n", + "\n", + "# Select a k-neighboors regression model\n", + "k_neigh_reg_model = neighbors.KNeighborsRegressor(n_neighbors=3)\n", + "\n", + "# Train the model\n", + "k_neigh_reg_model.fit(X, y)\n", + "\n", + "# Make a prediction for Cyprus\n", + "X_new = [[22587]] # Cyprus' GDP per capita\n", + "print(lin_reg_model.predict(X_new)) # outputs [[ 5.96242338]]\n", + "########################################################################\n", + "\n", + "oecd_bli, gdp_per_capita = backup" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.1" + }, + "toc": { + "toc_cell": false, + "toc_number_sections": false, + "toc_section_display": "block", + "toc_threshold": 6, + "toc_window_display": true + }, + "toc_position": { + "height": "61px", + "left": "1135.97px", + "right": "20px", + "top": "120px", + "width": "213px" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/training_linear_models.ipynb b/training_linear_models.ipynb new file mode 100644 index 0000000..e3e8078 --- /dev/null +++ b/training_linear_models.ipynb @@ -0,0 +1,1057 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Training Linear Models**" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from __future__ import division, print_function, unicode_literals\n", + "\n", + "import numpy as np\n", + "import numpy.random as rnd\n", + "rnd.seed(42) # to make this notebook's output stable across runs\n", + "\n", + "from sklearn.linear_model import LinearRegression\n", + "LinearRegression().fit([[5]], [3])\n", + "\n", + "%matplotlib inline\n", + "import matplotlib\n", + "import matplotlib.pyplot as plt\n", + "plt.rcParams['axes.labelsize'] = 14\n", + "plt.rcParams['xtick.labelsize'] = 12\n", + "plt.rcParams['ytick.labelsize'] = 12\n", + "\n", + "PROJECT_ROOT_DIR = \"/Users/ageron/dev/py/ml/handson-ml\"\n", + "CHAPTER_ID = \"training_linear_models\"\n", + "\n", + "def save_fig(fig_id, tight_layout=True):\n", + " path = os.path.join(PROJECT_ROOT_DIR, \"images\", CHAPTER_ID, fig_id + \".png\")\n", + " print(\"Saving figure\", fig_id)\n", + " if tight_layout:\n", + " plt.tight_layout()\n", + " plt.savefig(path, format='png', dpi=300)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Linear regression using the Normal Equation" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "X = 2 * rnd.rand(100, 1)\n", + "y = 4 + 3 * X + rnd.randn(100, 1)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "plt.plot(X, y, \"b.\")\n", + "plt.xlabel(\"$x_1$\", fontsize=18)\n", + "plt.ylabel(\"$y$\", rotation=0, fontsize=18)\n", + "plt.axis([0, 2, 0, 15])\n", + "save_fig(\"generated_data\")\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import numpy.linalg as LA\n", + "\n", + "Xb = np.c_[np.ones((100, 1)), X] # add x0 = 1 to each instance\n", + "theta_best = LA.inv(Xb.T.dot(Xb)).dot(Xb.T).dot(y)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "theta_best" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "X_new = np.array([[0], [2]])\n", + "X_newb = np.c_[np.ones((2, 1)), X_new] # add x0 = 1 to each instance\n", + "y_predict = X_newb.dot(theta_best)\n", + "y_predict" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "plt.plot(X_new, y_predict, \"r-\", linewidth=2, label=\"Predictions\")\n", + "plt.plot(X, y, \"b.\")\n", + "plt.xlabel(\"$x_1$\", fontsize=18)\n", + "plt.ylabel(\"$y$\", rotation=0, fontsize=18)\n", + "plt.legend(loc=\"upper left\", fontsize=14)\n", + "plt.axis([0, 2, 0, 15])\n", + "save_fig(\"linear_model_predictions\")\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from sklearn.linear_model import LinearRegression\n", + "lin_reg = LinearRegression()\n", + "lin_reg.fit(X, y)\n", + "lin_reg.intercept_, lin_reg.coef_" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "lin_reg.predict(X_new)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Linear regression using batch gradient descent" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "theta_path_bgd = []\n", + "\n", + "def plot_gradient_descent(theta, eta, theta_path=None):\n", + " m = len(Xb)\n", + " plt.plot(X, y, \"b.\")\n", + " n_iterations = 1000\n", + " for iteration in range(n_iterations):\n", + " if iteration < 10:\n", + " y_predict = X_newb.dot(theta)\n", + " style = \"b-\" if iteration > 0 else \"r--\"\n", + " plt.plot(X_new, y_predict, style)\n", + " gradients = 2/m * Xb.T.dot(Xb.dot(theta) - y)\n", + " theta = theta - eta * gradients\n", + " if theta_path is not None:\n", + " theta_path.append(theta)\n", + " plt.xlabel(\"$x_1$\", fontsize=18)\n", + " plt.axis([0, 2, 0, 15])\n", + " plt.title(r\"$\\eta = {}$\".format(eta), fontsize=16)\n", + "\n", + "rnd.seed(42)\n", + "theta = rnd.randn(2,1) # random initialization\n", + "\n", + "plt.figure(figsize=(10,4))\n", + "plt.subplot(131); plot_gradient_descent(theta, eta=0.02)\n", + "plt.ylabel(\"$y$\", rotation=0, fontsize=18)\n", + "plt.subplot(132); plot_gradient_descent(theta, eta=0.1, theta_path=theta_path_bgd)\n", + "plt.subplot(133); plot_gradient_descent(theta, eta=0.5)\n", + "\n", + "save_fig(\"gradient_descent_plot\")\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Stochastic Gradient Descent" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "theta_path_sgd = []\n", + "\n", + "n_iterations = 50\n", + "t0, t1 = 5, 50 # learning schedule hyperparameters\n", + "\n", + "rnd.seed(42)\n", + "theta = rnd.randn(2,1) # random initialization\n", + "\n", + "def learning_schedule(t):\n", + " return t0 / (t + t1)\n", + "\n", + "m = len(Xb)\n", + "\n", + "for epoch in range(n_iterations):\n", + " shuffled_indices = rnd.permutation(m)\n", + " Xb_shuffled = Xb[shuffled_indices]\n", + " y_shuffled = y[shuffled_indices]\n", + " for i in range(m):\n", + " if epoch == 0 and i < 20:\n", + " y_predict = X_newb.dot(theta)\n", + " style = \"b-\" if i > 0 else \"r--\"\n", + " plt.plot(X_new, y_predict, style)\n", + " xi = Xb_shuffled[i:i+1]\n", + " yi = y_shuffled[i:i+1]\n", + " gradients = 2 * xi.T.dot(xi.dot(theta) - yi)\n", + " eta = learning_schedule(epoch * m + i)\n", + " theta = theta - eta * gradients\n", + " theta_path_sgd.append(theta)\n", + "\n", + "plt.plot(X, y, \"b.\")\n", + "plt.xlabel(\"$x_1$\", fontsize=18)\n", + "plt.ylabel(\"$y$\", rotation=0, fontsize=18)\n", + "plt.axis([0, 2, 0, 15])\n", + "save_fig(\"sgd_plot\")\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "theta" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from sklearn.linear_model import SGDRegressor\n", + "sgd_reg = SGDRegressor(n_iter=50, penalty=None, eta0=0.1)\n", + "sgd_reg.fit(X, y.ravel())" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "sgd_reg.intercept_, sgd_reg.coef_" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Mini-batch gradient descent" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "theta_path_mgd = []\n", + "\n", + "n_iterations = 50\n", + "minibatch_size = 20\n", + "\n", + "rnd.seed(42)\n", + "theta = rnd.randn(2,1) # random initialization\n", + "\n", + "t0, t1 = 10, 1000\n", + "def learning_schedule(t):\n", + " return t0 / (t + t1)\n", + "\n", + "t = 0\n", + "for epoch in range(n_iterations):\n", + " shuffled_indices = rnd.permutation(m)\n", + " Xb_shuffled = Xb[shuffled_indices]\n", + " y_shuffled = y[shuffled_indices]\n", + " for i in range(0, m, minibatch_size):\n", + " t += 1\n", + " xi = Xb_shuffled[i:i+minibatch_size]\n", + " yi = y_shuffled[i:i+minibatch_size]\n", + " gradients = 2 * xi.T.dot(xi.dot(theta) - yi)\n", + " eta = learning_schedule(t)\n", + " theta = theta - eta * gradients\n", + " theta_path_mgd.append(theta)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "theta" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "theta_path_bgd = np.array(theta_path_bgd)\n", + "theta_path_sgd = np.array(theta_path_sgd)\n", + "theta_path_mgd = np.array(theta_path_mgd)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "plt.figure(figsize=(7,4))\n", + "plt.plot(theta_path_sgd[:, 0], theta_path_sgd[:, 1], \"r-s\", linewidth=1, label=\"Stochastic\")\n", + "plt.plot(theta_path_mgd[:, 0], theta_path_mgd[:, 1], \"g-+\", linewidth=2, label=\"Mini-batch\")\n", + "plt.plot(theta_path_bgd[:, 0], theta_path_bgd[:, 1], \"b-o\", linewidth=3, label=\"Batch\")\n", + "plt.legend(loc=\"upper left\", fontsize=16)\n", + "plt.xlabel(r\"$\\theta_0$\", fontsize=20)\n", + "plt.ylabel(r\"$\\theta_1$ \", fontsize=20, rotation=0)\n", + "plt.axis([2.5, 4.5, 2.3, 3.9])\n", + "save_fig(\"gradient_descent_paths_plot\")\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Polynomial regression" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import numpy.random as rnd\n", + "\n", + "rnd.seed(42)\n", + "m = 100\n", + "X = 6 * rnd.rand(m, 1) - 3\n", + "y = 2 + X + 0.5 * X**2 + rnd.randn(m, 1)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "plt.plot(X, y, \"b.\")\n", + "plt.xlabel(\"$x_1$\", fontsize=18)\n", + "plt.ylabel(\"$y$\", rotation=0, fontsize=18)\n", + "plt.axis([-3, 3, 0, 10])\n", + "save_fig(\"quadratic_data_plot\")\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from sklearn.preprocessing import PolynomialFeatures\n", + "poly_features = PolynomialFeatures(degree=2, include_bias=False)\n", + "X_poly = poly_features.fit_transform(X)\n", + "X[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "X_poly[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "lin_reg = LinearRegression()\n", + "lin_reg.fit(X_poly, y)\n", + "lin_reg.intercept_, lin_reg.coef_" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "X_new=np.linspace(-3, 3, 100).reshape(100, 1)\n", + "X_new_poly = poly_features.transform(X_new)\n", + "y_new = lin_reg.predict(X_new_poly)\n", + "plt.plot(X, y, \"b.\")\n", + "plt.plot(X_new, y_new, \"r-\", linewidth=2, label=\"Predictions\")\n", + "plt.xlabel(\"$x_1$\", fontsize=18)\n", + "plt.ylabel(\"$y$\", rotation=0, fontsize=18)\n", + "plt.legend(loc=\"upper left\", fontsize=14)\n", + "plt.axis([-3, 3, 0, 10])\n", + "save_fig(\"quadratic_predictions_plot\")\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.pipeline import Pipeline\n", + "\n", + "for style, width, degree in ((\"g-\", 1, 300), (\"b--\", 2, 2), (\"r-+\", 2, 1)):\n", + " polybig_features = PolynomialFeatures(degree=degree, include_bias=False)\n", + " std_scaler = StandardScaler()\n", + " lin_reg = LinearRegression()\n", + " polynomial_regression = Pipeline((\n", + " (\"poly_features\", polybig_features),\n", + " (\"std_scaler\", std_scaler),\n", + " (\"lin_reg\", lin_reg),\n", + " ))\n", + " polynomial_regression.fit(X, y)\n", + " y_newbig = polynomial_regression.predict(X_new)\n", + " plt.plot(X_new, y_newbig, style, label=str(degree), linewidth=width)\n", + "\n", + "plt.plot(X, y, \"b.\", linewidth=3)\n", + "plt.legend(loc=\"upper left\")\n", + "plt.xlabel(\"$x_1$\", fontsize=18)\n", + "plt.ylabel(\"$y$\", rotation=0, fontsize=18)\n", + "plt.axis([-3, 3, 0, 10])\n", + "save_fig(\"high_degree_polynomials_plot\")\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from sklearn.metrics import mean_squared_error\n", + "from sklearn.cross_validation import train_test_split\n", + "\n", + "def plot_learning_curves(model, X, y):\n", + " X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=10)\n", + " train_errors, val_errors = [], []\n", + " for m in range(1, len(X_train)):\n", + " model.fit(X_train[:m], y_train[:m])\n", + " y_train_predict = model.predict(X_train[:m])\n", + " y_val_predict = model.predict(X_val)\n", + " train_errors.append(mean_squared_error(y_train_predict, y_train[:m]))\n", + " val_errors.append(mean_squared_error(y_val_predict, y_val))\n", + "\n", + " plt.plot(np.sqrt(train_errors), \"r-+\", linewidth=2, label=\"Training set\")\n", + " plt.plot(np.sqrt(val_errors), \"b-\", linewidth=3, label=\"Validation set\")\n", + " plt.legend(loc=\"upper right\", fontsize=14)\n", + " plt.xlabel(\"Training set size\", fontsize=14)\n", + " plt.ylabel(\"RMSE\", fontsize=14)\n", + "\n", + "lin_reg = LinearRegression()\n", + "plot_learning_curves(lin_reg, X, y)\n", + "plt.axis([0, 80, 0, 3])\n", + "save_fig(\"underfitting_learning_curves_plot\")\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from sklearn.pipeline import Pipeline\n", + "\n", + "polynomial_regression = Pipeline((\n", + " (\"poly_features\", PolynomialFeatures(degree=10, include_bias=False)),\n", + " (\"sgd_reg\", LinearRegression()),\n", + " ))\n", + "\n", + "plot_learning_curves(polynomial_regression, X, y)\n", + "plt.axis([0, 80, 0, 3])\n", + "save_fig(\"learning_curves_plot\")\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Regularized models" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from sklearn.linear_model import Ridge\n", + "\n", + "rnd.seed(42)\n", + "m = 20\n", + "X = 3 * rnd.rand(m, 1)\n", + "y = 1 + 0.5 * X + rnd.randn(m, 1) / 1.5\n", + "X_new = np.linspace(0, 3, 100).reshape(100, 1)\n", + "\n", + "def plot_model(model_class, polynomial, alphas, **model_kargs):\n", + " for alpha, style in zip(alphas, (\"b-\", \"g--\", \"r:\")):\n", + " model = model_class(alpha, **model_kargs) if alpha > 0 else LinearRegression()\n", + " if polynomial:\n", + " model = Pipeline((\n", + " (\"poly_features\", PolynomialFeatures(degree=10, include_bias=False)),\n", + " (\"std_scaler\", StandardScaler()),\n", + " (\"regul_reg\", model),\n", + " ))\n", + " model.fit(X, y)\n", + " y_new_regul = model.predict(X_new)\n", + " lw = 2 if alpha > 0 else 1\n", + " plt.plot(X_new, y_new_regul, style, linewidth=lw, label=r\"$\\alpha = {}$\".format(alpha))\n", + " plt.plot(X, y, \"b.\", linewidth=3)\n", + " plt.legend(loc=\"upper left\", fontsize=15)\n", + " plt.xlabel(\"$x_1$\", fontsize=18)\n", + " plt.axis([0, 3, 0, 4])\n", + "\n", + "plt.figure(figsize=(8,4))\n", + "plt.subplot(121)\n", + "plot_model(Ridge, polynomial=False, alphas=(0, 10, 100))\n", + "plt.ylabel(\"$y$\", rotation=0, fontsize=18)\n", + "plt.subplot(122)\n", + "plot_model(Ridge, polynomial=True, alphas=(0, 10**-5, 1))\n", + "\n", + "save_fig(\"ridge_regression_plot\")\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from sklearn.linear_model import Ridge\n", + "ridge_reg = Ridge(alpha=1, solver=\"cholesky\")\n", + "ridge_reg.fit(X, y)\n", + "ridge_reg.predict([[1.5]])" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "sgd_reg = SGDRegressor(penalty=\"l2\", random_state=42)\n", + "sgd_reg.fit(X, y.ravel())\n", + "ridge_reg.predict([[1.5]])" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "ridge_reg = Ridge(alpha=1, solver=\"sag\")\n", + "ridge_reg.fit(X, y)\n", + "ridge_reg.predict([[1.5]])" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from sklearn.linear_model import Lasso\n", + "\n", + "plt.figure(figsize=(8,4))\n", + "plt.subplot(121)\n", + "plot_model(Lasso, polynomial=False, alphas=(0, 0.1, 1))\n", + "plt.ylabel(\"$y$\", rotation=0, fontsize=18)\n", + "plt.subplot(122)\n", + "plot_model(Lasso, polynomial=True, alphas=(0, 10**-7, 1), tol=1)\n", + "\n", + "save_fig(\"lasso_regression_plot\")\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from sklearn.linear_model import Lasso\n", + "lasso_reg = Lasso(alpha=0.1)\n", + "lasso_reg.fit(X, y)\n", + "lasso_reg.predict([[1.5]])" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from sklearn.linear_model import ElasticNet\n", + "elastic_net = ElasticNet(alpha=0.1, l1_ratio=0.5)\n", + "elastic_net.fit(X, y)\n", + "elastic_net.predict([[1.5]])" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": { + "collapsed": false, + "scrolled": true + }, + "outputs": [], + "source": [ + "rnd.seed(42)\n", + "m = 100\n", + "X = 6 * rnd.rand(m, 1) - 3\n", + "y = 2 + X + 0.5 * X**2 + rnd.randn(m, 1)\n", + "\n", + "X_train, X_val, y_train, y_val = train_test_split(X[:50], y[:50].ravel(), test_size=0.5, random_state=10)\n", + "\n", + "poly_scaler = Pipeline((\n", + " (\"poly_features\", PolynomialFeatures(degree=90, include_bias=False)),\n", + " (\"std_scaler\", StandardScaler()),\n", + " ))\n", + "\n", + "X_train_poly_scaled = poly_scaler.fit_transform(X_train)\n", + "X_val_poly_scaled = poly_scaler.transform(X_val)\n", + "\n", + "sgd_reg = SGDRegressor(n_iter=1,\n", + " penalty=None,\n", + " eta0=0.0005,\n", + " warm_start=True,\n", + " learning_rate=\"constant\",\n", + " random_state=42)\n", + "\n", + "n_epochs = 500\n", + "train_errors, val_errors = [], []\n", + "for epoch in range(n_epochs):\n", + " sgd_reg.fit(X_train_poly_scaled, y_train)\n", + " y_train_predict = sgd_reg.predict(X_train_poly_scaled)\n", + " y_val_predict = sgd_reg.predict(X_val_poly_scaled)\n", + " train_errors.append(mean_squared_error(y_train_predict, y_train))\n", + " val_errors.append(mean_squared_error(y_val_predict, y_val))\n", + "\n", + "best_epoch = np.argmin(val_errors)\n", + "best_val_rmse = np.sqrt(val_errors[best_epoch])\n", + "\n", + "plt.annotate('Best model',\n", + " xy=(best_epoch, best_val_rmse),\n", + " xytext=(best_epoch, best_val_rmse + 1),\n", + " ha=\"center\",\n", + " arrowprops=dict(facecolor='black', shrink=0.05),\n", + " fontsize=16,\n", + " )\n", + "\n", + "best_val_rmse -= 0.03 # just to make the graph look better\n", + "plt.plot([0, n_epochs], [best_val_rmse, best_val_rmse], \"k:\", linewidth=2)\n", + "plt.plot(np.sqrt(val_errors), \"b-\", linewidth=3, label=\"Validation set\")\n", + "plt.plot(np.sqrt(train_errors), \"r--\", linewidth=2, label=\"Training set\")\n", + "plt.legend(loc=\"upper right\", fontsize=14)\n", + "plt.xlabel(\"Epoch\", fontsize=14)\n", + "plt.ylabel(\"RMSE\", fontsize=14)\n", + "save_fig(\"early_stopping_plot\")\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from sklearn.base import clone\n", + "sgd_reg = SGDRegressor(n_iter=1, warm_start=True, penalty=None,\n", + " learning_rate=\"constant\", eta0=0.0005,\n", + " random_state=42)\n", + "\n", + "minimum_val_error = float(\"inf\")\n", + "best_epoch = None\n", + "best_model = None\n", + "for epoch in range(1000):\n", + " sgd_reg.fit(X_train_poly_scaled, y_train) # continues where it left off\n", + " y_val_predict = sgd_reg.predict(X_val_poly_scaled)\n", + " val_error = mean_squared_error(y_val_predict, y_val)\n", + " if val_error < minimum_val_error:\n", + " minimum_val_error = val_error\n", + " best_epoch = epoch\n", + " best_model = clone(sgd_reg)\n", + "\n", + "best_epoch, best_model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Logistic regression" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "t = np.linspace(-10, 10, 100)\n", + "sig = 1 / (1 + np.exp(-t))\n", + "plt.figure(figsize=(9, 3))\n", + "plt.plot([-10, 10], [0, 0], \"k-\")\n", + "plt.plot([-10, 10], [0.5, 0.5], \"k:\")\n", + "plt.plot([-10, 10], [1, 1], \"k:\")\n", + "plt.plot([0, 0], [-1.1, 1.1], \"k-\")\n", + "plt.plot(t, sig, \"b-\", linewidth=2, label=r\"$\\sigma(t) = \\frac{1}{1 + e^{-t}}$\")\n", + "plt.xlabel(\"t\")\n", + "plt.legend(loc=\"upper left\", fontsize=20)\n", + "plt.axis([-10, 10, -0.1, 1.1])\n", + "save_fig(\"logistic_function_plot\")\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from sklearn import datasets\n", + "iris = datasets.load_iris()\n", + "list(iris.keys())" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "print(iris.DESCR)" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from sklearn.linear_model import LogisticRegression\n", + "\n", + "X = iris[\"data\"][:, 3:] # petal width\n", + "y = (iris[\"target\"] == 2).astype(np.int) # 1 if Iris-Virginica, else 0\n", + "\n", + "log_reg = LogisticRegression()\n", + "log_reg.fit(X, y)\n", + "\n", + "X_new = np.linspace(0, 3, 1000).reshape(-1, 1)\n", + "y_proba = log_reg.predict_proba(X_new)\n", + "decision_boundary = X_new[y_proba[:, 1] >= 0.5][0]\n", + "\n", + "plt.figure(figsize=(8, 3))\n", + "plt.plot(X[y==0], y[y==0], \"bs\")\n", + "plt.plot(X[y==1], y[y==1], \"g^\")\n", + "plt.plot([decision_boundary, decision_boundary], [-1, 2], \"k:\", linewidth=2)\n", + "plt.plot(X_new, y_proba[:, 1], \"g-\", linewidth=2, label=\"Iris-Virginica\")\n", + "plt.plot(X_new, y_proba[:, 0], \"b--\", linewidth=2, label=\"Not Iris-Virginica\")\n", + "plt.text(decision_boundary+0.02, 0.15, \"Decision boundary\", fontsize=14, color=\"k\", ha=\"center\")\n", + "plt.arrow(decision_boundary, 0.08, -0.3, 0, head_width=0.05, head_length=0.1, fc='b', ec='b')\n", + "plt.arrow(decision_boundary, 0.92, 0.3, 0, head_width=0.05, head_length=0.1, fc='g', ec='g')\n", + "plt.xlabel(\"Petal width (cm)\", fontsize=14)\n", + "plt.ylabel(\"Probability\", fontsize=14)\n", + "plt.legend(loc=\"center left\", fontsize=14)\n", + "plt.axis([0, 3, -0.02, 1.02])\n", + "save_fig(\"logistic_regression_plot\")\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "decision_boundary" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "log_reg.predict([[1.7], [1.5]])" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from sklearn.linear_model import LogisticRegression\n", + "\n", + "X = iris[\"data\"][:, (2, 3)] # petal length, petal width\n", + "y = (iris[\"target\"] == 2).astype(np.int)\n", + "\n", + "log_reg = LogisticRegression(C=10**10)\n", + "log_reg.fit(X, y)\n", + "\n", + "x0, x1 = np.meshgrid(\n", + " np.linspace(2.9, 7, 500).reshape(-1, 1),\n", + " np.linspace(0.8, 2.7, 200).reshape(-1, 1),\n", + " )\n", + "X_new = np.c_[x0.ravel(), x1.ravel()]\n", + "\n", + "y_proba = log_reg.predict_proba(X_new)\n", + "\n", + "plt.figure(figsize=(10, 4))\n", + "plt.plot(X[y==0, 0], X[y==0, 1], \"bs\")\n", + "plt.plot(X[y==1, 0], X[y==1, 1], \"g^\")\n", + "\n", + "zz = y_proba[:, 1].reshape(x0.shape)\n", + "contour = plt.contour(x0, x1, zz, cmap=plt.cm.brg)\n", + "\n", + "\n", + "left_right = np.array([2.9, 7])\n", + "boundary = -(log_reg.coef_[0][0] * left_right + log_reg.intercept_[0]) / log_reg.coef_[0][1]\n", + "\n", + "plt.clabel(contour, inline=1, fontsize=12)\n", + "plt.plot(left_right, boundary, \"k--\", linewidth=3)\n", + "plt.text(3.5, 1.5, \"Not Iris-Virginica\", fontsize=14, color=\"b\", ha=\"center\")\n", + "plt.text(6.5, 2.3, \"Iris-Virginica\", fontsize=14, color=\"g\", ha=\"center\")\n", + "plt.xlabel(\"Petal length\", fontsize=14)\n", + "plt.ylabel(\"Petal width\", fontsize=14)\n", + "plt.axis([2.9, 7, 0.8, 2.7])\n", + "save_fig(\"logistic_regression_contour_plot\")\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from sklearn.linear_model import LogisticRegression\n", + "\n", + "X = iris[\"data\"][:, (2, 3)] # petal length, petal width\n", + "y = iris[\"target\"]\n", + "\n", + "softmax_reg = LogisticRegression(multi_class=\"multinomial\", solver=\"lbfgs\", C=10)\n", + "softmax_reg.fit(X, y)\n", + "\n", + "x0, x1 = np.meshgrid(\n", + " np.linspace(0, 8, 500).reshape(-1, 1),\n", + " np.linspace(0, 3.5, 200).reshape(-1, 1),\n", + " )\n", + "X_new = np.c_[x0.ravel(), x1.ravel()]\n", + "\n", + "\n", + "y_proba = softmax_reg.predict_proba(X_new)\n", + "y_predict = softmax_reg.predict(X_new)\n", + "\n", + "zz1 = y_proba[:, 1].reshape(x0.shape)\n", + "zz = y_predict.reshape(x0.shape)\n", + "\n", + "plt.figure(figsize=(10, 4))\n", + "plt.plot(X[y==2, 0], X[y==2, 1], \"g^\", label=\"Iris-Virginica\")\n", + "plt.plot(X[y==1, 0], X[y==1, 1], \"bs\", label=\"Iris-Versicolour\")\n", + "plt.plot(X[y==0, 0], X[y==0, 1], \"yo\", label=\"Iris-Setosa\")\n", + "\n", + "from matplotlib.colors import ListedColormap\n", + "custom_cmap = ListedColormap(['#fafab0','#9898ff','#a0faa0'])\n", + "\n", + "plt.contourf(x0, x1, zz, cmap=custom_cmap, linewidth=5)\n", + "contour = plt.contour(x0, x1, zz1, cmap=plt.cm.brg)\n", + "plt.clabel(contour, inline=1, fontsize=12)\n", + "plt.xlabel(\"Petal length\", fontsize=14)\n", + "plt.ylabel(\"Petal width\", fontsize=14)\n", + "plt.legend(loc=\"center left\", fontsize=14)\n", + "plt.axis([0, 7, 0, 3.5])\n", + "save_fig(\"softmax_regression_contour_plot\")\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "softmax_reg.predict([[5, 2]])" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "softmax_reg.predict_proba([[5, 2]])" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.1" + }, + "toc": { + "toc_cell": false, + "toc_number_sections": true, + "toc_threshold": 6, + "toc_window_display": false + } + }, + "nbformat": 4, + "nbformat_minor": 0 +}