handson-ml/04_training_linear_models.i...

1216 lines
34 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Chapter 4 Training Linear Models**"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"_This notebook contains all the sample code and solutions to the exercices in chapter 4._"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Setup"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"First, let's make sure this notebook works well in both python 2 and 3, import a few common modules, ensure MatplotLib plots figures inline and prepare a function to save the figures:"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# To support both python 2 and python 3\n",
"from __future__ import division, print_function, unicode_literals\n",
"\n",
"# Common imports\n",
"import numpy as np\n",
"import numpy.random as rnd\n",
"import os\n",
"\n",
"# to make this notebook's output stable across runs\n",
"rnd.seed(42)\n",
"\n",
"# To plot pretty figures\n",
"%matplotlib inline\n",
"import matplotlib\n",
"import matplotlib.pyplot as plt\n",
"plt.rcParams['axes.labelsize'] = 14\n",
"plt.rcParams['xtick.labelsize'] = 12\n",
"plt.rcParams['ytick.labelsize'] = 12\n",
"\n",
"# Where to save the figures\n",
"PROJECT_ROOT_DIR = \".\"\n",
"CHAPTER_ID = \"training_linear_models\"\n",
"\n",
"def save_fig(fig_id, tight_layout=True):\n",
" path = os.path.join(PROJECT_ROOT_DIR, \"images\", CHAPTER_ID, fig_id + \".png\")\n",
" print(\"Saving figure\", fig_id)\n",
" if tight_layout:\n",
" plt.tight_layout()\n",
" plt.savefig(path, format='png', dpi=300)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Linear regression using the Normal Equation"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"X = 2 * rnd.rand(100, 1)\n",
"y = 4 + 3 * X + rnd.randn(100, 1)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"plt.plot(X, y, \"b.\")\n",
"plt.xlabel(\"$x_1$\", fontsize=18)\n",
"plt.ylabel(\"$y$\", rotation=0, fontsize=18)\n",
"plt.axis([0, 2, 0, 15])\n",
"save_fig(\"generated_data_plot\")\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import numpy.linalg as LA\n",
"\n",
"X_b = np.c_[np.ones((100, 1)), X] # add x0 = 1 to each instance\n",
"theta_best = LA.inv(X_b.T.dot(X_b)).dot(X_b.T).dot(y)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"theta_best"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"X_new = np.array([[0], [2]])\n",
"X_new_b = np.c_[np.ones((2, 1)), X_new] # add x0 = 1 to each instance\n",
"y_predict = X_new_b.dot(theta_best)\n",
"y_predict"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"plt.plot(X_new, y_predict, \"r-\", linewidth=2, label=\"Predictions\")\n",
"plt.plot(X, y, \"b.\")\n",
"plt.xlabel(\"$x_1$\", fontsize=18)\n",
"plt.ylabel(\"$y$\", rotation=0, fontsize=18)\n",
"plt.legend(loc=\"upper left\", fontsize=14)\n",
"plt.axis([0, 2, 0, 15])\n",
"save_fig(\"linear_model_predictions\")\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from sklearn.linear_model import LinearRegression\n",
"lin_reg = LinearRegression()\n",
"lin_reg.fit(X, y)\n",
"lin_reg.intercept_, lin_reg.coef_"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"lin_reg.predict(X_new)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Linear regression using batch gradient descent"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"theta_path_bgd = []\n",
"\n",
"def plot_gradient_descent(theta, eta, theta_path=None):\n",
" m = len(X_b)\n",
" plt.plot(X, y, \"b.\")\n",
" n_iterations = 1000\n",
" for iteration in range(n_iterations):\n",
" if iteration < 10:\n",
" y_predict = X_new_b.dot(theta)\n",
" style = \"b-\" if iteration > 0 else \"r--\"\n",
" plt.plot(X_new, y_predict, style)\n",
" gradients = 2/m * X_b.T.dot(X_b.dot(theta) - y)\n",
" theta = theta - eta * gradients\n",
" if theta_path is not None:\n",
" theta_path.append(theta)\n",
" plt.xlabel(\"$x_1$\", fontsize=18)\n",
" plt.axis([0, 2, 0, 15])\n",
" plt.title(r\"$\\eta = {}$\".format(eta), fontsize=16)\n",
"\n",
"rnd.seed(42)\n",
"theta = rnd.randn(2,1) # random initialization\n",
"\n",
"plt.figure(figsize=(10,4))\n",
"plt.subplot(131); plot_gradient_descent(theta, eta=0.02)\n",
"plt.ylabel(\"$y$\", rotation=0, fontsize=18)\n",
"plt.subplot(132); plot_gradient_descent(theta, eta=0.1, theta_path=theta_path_bgd)\n",
"plt.subplot(133); plot_gradient_descent(theta, eta=0.5)\n",
"\n",
"save_fig(\"gradient_descent_plot\")\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Stochastic Gradient Descent"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"theta_path_sgd = []\n",
"\n",
"n_iterations = 50\n",
"t0, t1 = 5, 50 # learning schedule hyperparameters\n",
"\n",
"rnd.seed(42)\n",
"theta = rnd.randn(2,1) # random initialization\n",
"\n",
"def learning_schedule(t):\n",
" return t0 / (t + t1)\n",
"\n",
"m = len(X_b)\n",
"\n",
"for epoch in range(n_iterations):\n",
" for i in range(m):\n",
" if epoch == 0 and i < 20:\n",
" y_predict = X_new_b.dot(theta)\n",
" style = \"b-\" if i > 0 else \"r--\"\n",
" plt.plot(X_new, y_predict, style)\n",
" random_index = rnd.randint(m)\n",
" xi = X_b[random_index:random_index+1]\n",
" yi = y[random_index:random_index+1]\n",
" gradients = 2 * xi.T.dot(xi.dot(theta) - yi)\n",
" eta = learning_schedule(epoch * m + i)\n",
" theta = theta - eta * gradients\n",
" theta_path_sgd.append(theta)\n",
"\n",
"plt.plot(X, y, \"b.\")\n",
"plt.xlabel(\"$x_1$\", fontsize=18)\n",
"plt.ylabel(\"$y$\", rotation=0, fontsize=18)\n",
"plt.axis([0, 2, 0, 15])\n",
"save_fig(\"sgd_plot\")\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"theta"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from sklearn.linear_model import SGDRegressor\n",
"sgd_reg = SGDRegressor(n_iter=50, penalty=None, eta0=0.1)\n",
"sgd_reg.fit(X, y.ravel())"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"sgd_reg.intercept_, sgd_reg.coef_"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Mini-batch gradient descent"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"theta_path_mgd = []\n",
"\n",
"n_iterations = 50\n",
"minibatch_size = 20\n",
"\n",
"rnd.seed(42)\n",
"theta = rnd.randn(2,1) # random initialization\n",
"\n",
"t0, t1 = 10, 1000\n",
"def learning_schedule(t):\n",
" return t0 / (t + t1)\n",
"\n",
"t = 0\n",
"for epoch in range(n_iterations):\n",
" shuffled_indices = rnd.permutation(m)\n",
" X_b_shuffled = X_b[shuffled_indices]\n",
" y_shuffled = y[shuffled_indices]\n",
" for i in range(0, m, minibatch_size):\n",
" t += 1\n",
" xi = X_b_shuffled[i:i+minibatch_size]\n",
" yi = y_shuffled[i:i+minibatch_size]\n",
" gradients = 2 * xi.T.dot(xi.dot(theta) - yi)\n",
" eta = learning_schedule(t)\n",
" theta = theta - eta * gradients\n",
" theta_path_mgd.append(theta)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"theta"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"theta_path_bgd = np.array(theta_path_bgd)\n",
"theta_path_sgd = np.array(theta_path_sgd)\n",
"theta_path_mgd = np.array(theta_path_mgd)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"plt.figure(figsize=(7,4))\n",
"plt.plot(theta_path_sgd[:, 0], theta_path_sgd[:, 1], \"r-s\", linewidth=1, label=\"Stochastic\")\n",
"plt.plot(theta_path_mgd[:, 0], theta_path_mgd[:, 1], \"g-+\", linewidth=2, label=\"Mini-batch\")\n",
"plt.plot(theta_path_bgd[:, 0], theta_path_bgd[:, 1], \"b-o\", linewidth=3, label=\"Batch\")\n",
"plt.legend(loc=\"upper left\", fontsize=16)\n",
"plt.xlabel(r\"$\\theta_0$\", fontsize=20)\n",
"plt.ylabel(r\"$\\theta_1$ \", fontsize=20, rotation=0)\n",
"plt.axis([2.5, 4.5, 2.3, 3.9])\n",
"save_fig(\"gradient_descent_paths_plot\")\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Polynomial regression"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import numpy as np\n",
"import numpy.random as rnd\n",
"\n",
"rnd.seed(42)\n",
"m = 100\n",
"X = 6 * rnd.rand(m, 1) - 3\n",
"y = 2 + X + 0.5 * X**2 + rnd.randn(m, 1)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"plt.plot(X, y, \"b.\")\n",
"plt.xlabel(\"$x_1$\", fontsize=18)\n",
"plt.ylabel(\"$y$\", rotation=0, fontsize=18)\n",
"plt.axis([-3, 3, 0, 10])\n",
"save_fig(\"quadratic_data_plot\")\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from sklearn.preprocessing import PolynomialFeatures\n",
"poly_features = PolynomialFeatures(degree=2, include_bias=False)\n",
"X_poly = poly_features.fit_transform(X)\n",
"X[0]"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"X_poly[0]"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"lin_reg = LinearRegression()\n",
"lin_reg.fit(X_poly, y)\n",
"lin_reg.intercept_, lin_reg.coef_"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"X_new=np.linspace(-3, 3, 100).reshape(100, 1)\n",
"X_new_poly = poly_features.transform(X_new)\n",
"y_new = lin_reg.predict(X_new_poly)\n",
"plt.plot(X, y, \"b.\")\n",
"plt.plot(X_new, y_new, \"r-\", linewidth=2, label=\"Predictions\")\n",
"plt.xlabel(\"$x_1$\", fontsize=18)\n",
"plt.ylabel(\"$y$\", rotation=0, fontsize=18)\n",
"plt.legend(loc=\"upper left\", fontsize=14)\n",
"plt.axis([-3, 3, 0, 10])\n",
"save_fig(\"quadratic_predictions_plot\")\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.pipeline import Pipeline\n",
"\n",
"for style, width, degree in ((\"g-\", 1, 300), (\"b--\", 2, 2), (\"r-+\", 2, 1)):\n",
" polybig_features = PolynomialFeatures(degree=degree, include_bias=False)\n",
" std_scaler = StandardScaler()\n",
" lin_reg = LinearRegression()\n",
" polynomial_regression = Pipeline((\n",
" (\"poly_features\", polybig_features),\n",
" (\"std_scaler\", std_scaler),\n",
" (\"lin_reg\", lin_reg),\n",
" ))\n",
" polynomial_regression.fit(X, y)\n",
" y_newbig = polynomial_regression.predict(X_new)\n",
" plt.plot(X_new, y_newbig, style, label=str(degree), linewidth=width)\n",
"\n",
"plt.plot(X, y, \"b.\", linewidth=3)\n",
"plt.legend(loc=\"upper left\")\n",
"plt.xlabel(\"$x_1$\", fontsize=18)\n",
"plt.ylabel(\"$y$\", rotation=0, fontsize=18)\n",
"plt.axis([-3, 3, 0, 10])\n",
"save_fig(\"high_degree_polynomials_plot\")\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from sklearn.metrics import mean_squared_error\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"def plot_learning_curves(model, X, y):\n",
" X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=10)\n",
" train_errors, val_errors = [], []\n",
" for m in range(1, len(X_train)):\n",
" model.fit(X_train[:m], y_train[:m])\n",
" y_train_predict = model.predict(X_train[:m])\n",
" y_val_predict = model.predict(X_val)\n",
" train_errors.append(mean_squared_error(y_train_predict, y_train[:m]))\n",
" val_errors.append(mean_squared_error(y_val_predict, y_val))\n",
"\n",
" plt.plot(np.sqrt(train_errors), \"r-+\", linewidth=2, label=\"Training set\")\n",
" plt.plot(np.sqrt(val_errors), \"b-\", linewidth=3, label=\"Validation set\")\n",
" plt.legend(loc=\"upper right\", fontsize=14)\n",
" plt.xlabel(\"Training set size\", fontsize=14)\n",
" plt.ylabel(\"RMSE\", fontsize=14)\n",
"\n",
"lin_reg = LinearRegression()\n",
"plot_learning_curves(lin_reg, X, y)\n",
"plt.axis([0, 80, 0, 3])\n",
"save_fig(\"underfitting_learning_curves_plot\")\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from sklearn.pipeline import Pipeline\n",
"\n",
"polynomial_regression = Pipeline((\n",
" (\"poly_features\", PolynomialFeatures(degree=10, include_bias=False)),\n",
" (\"sgd_reg\", LinearRegression()),\n",
" ))\n",
"\n",
"plot_learning_curves(polynomial_regression, X, y)\n",
"plt.axis([0, 80, 0, 3])\n",
"save_fig(\"learning_curves_plot\")\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Regularized models"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from sklearn.linear_model import Ridge\n",
"\n",
"rnd.seed(42)\n",
"m = 20\n",
"X = 3 * rnd.rand(m, 1)\n",
"y = 1 + 0.5 * X + rnd.randn(m, 1) / 1.5\n",
"X_new = np.linspace(0, 3, 100).reshape(100, 1)\n",
"\n",
"def plot_model(model_class, polynomial, alphas, **model_kargs):\n",
" for alpha, style in zip(alphas, (\"b-\", \"g--\", \"r:\")):\n",
" model = model_class(alpha, **model_kargs) if alpha > 0 else LinearRegression()\n",
" if polynomial:\n",
" model = Pipeline((\n",
" (\"poly_features\", PolynomialFeatures(degree=10, include_bias=False)),\n",
" (\"std_scaler\", StandardScaler()),\n",
" (\"regul_reg\", model),\n",
" ))\n",
" model.fit(X, y)\n",
" y_new_regul = model.predict(X_new)\n",
" lw = 2 if alpha > 0 else 1\n",
" plt.plot(X_new, y_new_regul, style, linewidth=lw, label=r\"$\\alpha = {}$\".format(alpha))\n",
" plt.plot(X, y, \"b.\", linewidth=3)\n",
" plt.legend(loc=\"upper left\", fontsize=15)\n",
" plt.xlabel(\"$x_1$\", fontsize=18)\n",
" plt.axis([0, 3, 0, 4])\n",
"\n",
"plt.figure(figsize=(8,4))\n",
"plt.subplot(121)\n",
"plot_model(Ridge, polynomial=False, alphas=(0, 10, 100))\n",
"plt.ylabel(\"$y$\", rotation=0, fontsize=18)\n",
"plt.subplot(122)\n",
"plot_model(Ridge, polynomial=True, alphas=(0, 10**-5, 1))\n",
"\n",
"save_fig(\"ridge_regression_plot\")\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from sklearn.linear_model import Ridge\n",
"ridge_reg = Ridge(alpha=1, solver=\"cholesky\")\n",
"ridge_reg.fit(X, y)\n",
"ridge_reg.predict([[1.5]])"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"sgd_reg = SGDRegressor(penalty=\"l2\", random_state=42)\n",
"sgd_reg.fit(X, y.ravel())\n",
"ridge_reg.predict([[1.5]])"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"ridge_reg = Ridge(alpha=1, solver=\"sag\")\n",
"ridge_reg.fit(X, y)\n",
"ridge_reg.predict([[1.5]])"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from sklearn.linear_model import Lasso\n",
"\n",
"plt.figure(figsize=(8,4))\n",
"plt.subplot(121)\n",
"plot_model(Lasso, polynomial=False, alphas=(0, 0.1, 1))\n",
"plt.ylabel(\"$y$\", rotation=0, fontsize=18)\n",
"plt.subplot(122)\n",
"plot_model(Lasso, polynomial=True, alphas=(0, 10**-7, 1), tol=1)\n",
"\n",
"save_fig(\"lasso_regression_plot\")\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from sklearn.linear_model import Lasso\n",
"lasso_reg = Lasso(alpha=0.1)\n",
"lasso_reg.fit(X, y)\n",
"lasso_reg.predict([[1.5]])"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from sklearn.linear_model import ElasticNet\n",
"elastic_net = ElasticNet(alpha=0.1, l1_ratio=0.5)\n",
"elastic_net.fit(X, y)\n",
"elastic_net.predict([[1.5]])"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [],
"source": [
"rnd.seed(42)\n",
"m = 100\n",
"X = 6 * rnd.rand(m, 1) - 3\n",
"y = 2 + X + 0.5 * X**2 + rnd.randn(m, 1)\n",
"\n",
"X_train, X_val, y_train, y_val = train_test_split(X[:50], y[:50].ravel(), test_size=0.5, random_state=10)\n",
"\n",
"poly_scaler = Pipeline((\n",
" (\"poly_features\", PolynomialFeatures(degree=90, include_bias=False)),\n",
" (\"std_scaler\", StandardScaler()),\n",
" ))\n",
"\n",
"X_train_poly_scaled = poly_scaler.fit_transform(X_train)\n",
"X_val_poly_scaled = poly_scaler.transform(X_val)\n",
"\n",
"sgd_reg = SGDRegressor(n_iter=1,\n",
" penalty=None,\n",
" eta0=0.0005,\n",
" warm_start=True,\n",
" learning_rate=\"constant\",\n",
" random_state=42)\n",
"\n",
"n_epochs = 500\n",
"train_errors, val_errors = [], []\n",
"for epoch in range(n_epochs):\n",
" sgd_reg.fit(X_train_poly_scaled, y_train)\n",
" y_train_predict = sgd_reg.predict(X_train_poly_scaled)\n",
" y_val_predict = sgd_reg.predict(X_val_poly_scaled)\n",
" train_errors.append(mean_squared_error(y_train_predict, y_train))\n",
" val_errors.append(mean_squared_error(y_val_predict, y_val))\n",
"\n",
"best_epoch = np.argmin(val_errors)\n",
"best_val_rmse = np.sqrt(val_errors[best_epoch])\n",
"\n",
"plt.annotate('Best model',\n",
" xy=(best_epoch, best_val_rmse),\n",
" xytext=(best_epoch, best_val_rmse + 1),\n",
" ha=\"center\",\n",
" arrowprops=dict(facecolor='black', shrink=0.05),\n",
" fontsize=16,\n",
" )\n",
"\n",
"best_val_rmse -= 0.03 # just to make the graph look better\n",
"plt.plot([0, n_epochs], [best_val_rmse, best_val_rmse], \"k:\", linewidth=2)\n",
"plt.plot(np.sqrt(val_errors), \"b-\", linewidth=3, label=\"Validation set\")\n",
"plt.plot(np.sqrt(train_errors), \"r--\", linewidth=2, label=\"Training set\")\n",
"plt.legend(loc=\"upper right\", fontsize=14)\n",
"plt.xlabel(\"Epoch\", fontsize=14)\n",
"plt.ylabel(\"RMSE\", fontsize=14)\n",
"save_fig(\"early_stopping_plot\")\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from sklearn.base import clone\n",
"sgd_reg = SGDRegressor(n_iter=1, warm_start=True, penalty=None,\n",
" learning_rate=\"constant\", eta0=0.0005,\n",
" random_state=42)\n",
"\n",
"minimum_val_error = float(\"inf\")\n",
"best_epoch = None\n",
"best_model = None\n",
"for epoch in range(1000):\n",
" sgd_reg.fit(X_train_poly_scaled, y_train) # continues where it left off\n",
" y_val_predict = sgd_reg.predict(X_val_poly_scaled)\n",
" val_error = mean_squared_error(y_val_predict, y_val)\n",
" if val_error < minimum_val_error:\n",
" minimum_val_error = val_error\n",
" best_epoch = epoch\n",
" best_model = clone(sgd_reg)\n",
"\n",
"best_epoch, best_model"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"%matplotlib inline\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"t1a, t1b, t2a, t2b = -1, 3, -1.5, 1.5\n",
"\n",
"# ignoring bias term\n",
"t1s = np.linspace(t1a, t1b, 500)\n",
"t2s = np.linspace(t2a, t2b, 500)\n",
"t1, t2 = np.meshgrid(t1s, t2s)\n",
"T = np.c_[t1.ravel(), t2.ravel()]\n",
"Xr = np.array([[-1, 1], [-0.3, -1], [1, 0.1]])\n",
"yr = 2 * Xr[:, :1] + 0.5 * Xr[:, 1:]\n",
"\n",
"J = (1/len(Xr) * np.sum((T.dot(Xr.T) - yr.T)**2, axis=1)).reshape(t1.shape)\n",
"\n",
"N1 = np.linalg.norm(T, ord=1, axis=1).reshape(t1.shape)\n",
"N2 = np.linalg.norm(T, ord=2, axis=1).reshape(t1.shape)\n",
"\n",
"t_min_idx = np.unravel_index(np.argmin(J), J.shape)\n",
"t1_min, t2_min = t1[t_min_idx], t2[t_min_idx]\n",
"\n",
"t_init = np.array([[0.25], [-1]])"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def bgd_path(theta, X, y, l1, l2, core = 1, eta = 0.1, n_iterations = 50):\n",
" path = [theta]\n",
" for iteration in range(n_iterations):\n",
" gradients = core * 2/len(X) * X.T.dot(X.dot(theta) - y) + l1 * np.sign(theta) + 2 * l2 * theta\n",
"\n",
" theta = theta - eta * gradients\n",
" path.append(theta)\n",
" return np.array(path)\n",
"\n",
"plt.figure(figsize=(12, 8))\n",
"for i, N, l1, l2, title in ((0, N1, 0.5, 0, \"Lasso\"), (1, N2, 0, 0.1, \"Ridge\")):\n",
" JR = J + l1 * N1 + l2 * N2**2\n",
" \n",
" tr_min_idx = np.unravel_index(np.argmin(JR), JR.shape)\n",
" t1r_min, t2r_min = t1[tr_min_idx], t2[tr_min_idx]\n",
"\n",
" levelsJ=(np.exp(np.linspace(0, 1, 20)) - 1) * (np.max(J) - np.min(J)) + np.min(J)\n",
" levelsJR=(np.exp(np.linspace(0, 1, 20)) - 1) * (np.max(JR) - np.min(JR)) + np.min(JR)\n",
" levelsN=np.linspace(0, np.max(N), 10)\n",
" \n",
" path_J = bgd_path(t_init, Xr, yr, l1=0, l2=0)\n",
" path_JR = bgd_path(t_init, Xr, yr, l1, l2)\n",
" path_N = bgd_path(t_init, Xr, yr, np.sign(l1)/3, np.sign(l2), core=0)\n",
"\n",
" plt.subplot(221 + i * 2)\n",
" plt.grid(True)\n",
" plt.axhline(y=0, color='k')\n",
" plt.axvline(x=0, color='k')\n",
" plt.contourf(t1, t2, J, levels=levelsJ, alpha=0.9)\n",
" plt.contour(t1, t2, N, levels=levelsN)\n",
" plt.plot(path_J[:, 0], path_J[:, 1], \"w-o\")\n",
" plt.plot(path_N[:, 0], path_N[:, 1], \"y-^\")\n",
" plt.plot(t1_min, t2_min, \"rs\")\n",
" plt.title(r\"$\\ell_{}$ penalty\".format(i + 1), fontsize=16)\n",
" plt.axis([t1a, t1b, t2a, t2b])\n",
"\n",
" plt.subplot(222 + i * 2)\n",
" plt.grid(True)\n",
" plt.axhline(y=0, color='k')\n",
" plt.axvline(x=0, color='k')\n",
" plt.contourf(t1, t2, JR, levels=levelsJR, alpha=0.9)\n",
" plt.plot(path_JR[:, 0], path_JR[:, 1], \"w-o\")\n",
" plt.plot(t1r_min, t2r_min, \"rs\")\n",
" plt.title(title, fontsize=16)\n",
" plt.axis([t1a, t1b, t2a, t2b])\n",
"\n",
"for subplot in (221, 223):\n",
" plt.subplot(subplot)\n",
" plt.ylabel(r\"$\\theta_2$\", fontsize=20, rotation=0)\n",
"\n",
"for subplot in (223, 224):\n",
" plt.subplot(subplot)\n",
" plt.xlabel(r\"$\\theta_1$\", fontsize=20)\n",
"\n",
"save_fig(\"lasso_vs_ridge_plot\")\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Logistic regression"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"t = np.linspace(-10, 10, 100)\n",
"sig = 1 / (1 + np.exp(-t))\n",
"plt.figure(figsize=(9, 3))\n",
"plt.plot([-10, 10], [0, 0], \"k-\")\n",
"plt.plot([-10, 10], [0.5, 0.5], \"k:\")\n",
"plt.plot([-10, 10], [1, 1], \"k:\")\n",
"plt.plot([0, 0], [-1.1, 1.1], \"k-\")\n",
"plt.plot(t, sig, \"b-\", linewidth=2, label=r\"$\\sigma(t) = \\frac{1}{1 + e^{-t}}$\")\n",
"plt.xlabel(\"t\")\n",
"plt.legend(loc=\"upper left\", fontsize=20)\n",
"plt.axis([-10, 10, -0.1, 1.1])\n",
"save_fig(\"logistic_function_plot\")\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from sklearn import datasets\n",
"iris = datasets.load_iris()\n",
"list(iris.keys())"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"print(iris.DESCR)"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from sklearn.linear_model import LogisticRegression\n",
"\n",
"X = iris[\"data\"][:, 3:] # petal width\n",
"y = (iris[\"target\"] == 2).astype(np.int) # 1 if Iris-Virginica, else 0\n",
"\n",
"log_reg = LogisticRegression()\n",
"log_reg.fit(X, y)\n",
"\n",
"X_new = np.linspace(0, 3, 1000).reshape(-1, 1)\n",
"y_proba = log_reg.predict_proba(X_new)\n",
"decision_boundary = X_new[y_proba[:, 1] >= 0.5][0]\n",
"\n",
"plt.figure(figsize=(8, 3))\n",
"plt.plot(X[y==0], y[y==0], \"bs\")\n",
"plt.plot(X[y==1], y[y==1], \"g^\")\n",
"plt.plot([decision_boundary, decision_boundary], [-1, 2], \"k:\", linewidth=2)\n",
"plt.plot(X_new, y_proba[:, 1], \"g-\", linewidth=2, label=\"Iris-Virginica\")\n",
"plt.plot(X_new, y_proba[:, 0], \"b--\", linewidth=2, label=\"Not Iris-Virginica\")\n",
"plt.text(decision_boundary+0.02, 0.15, \"Decision boundary\", fontsize=14, color=\"k\", ha=\"center\")\n",
"plt.arrow(decision_boundary, 0.08, -0.3, 0, head_width=0.05, head_length=0.1, fc='b', ec='b')\n",
"plt.arrow(decision_boundary, 0.92, 0.3, 0, head_width=0.05, head_length=0.1, fc='g', ec='g')\n",
"plt.xlabel(\"Petal width (cm)\", fontsize=14)\n",
"plt.ylabel(\"Probability\", fontsize=14)\n",
"plt.legend(loc=\"center left\", fontsize=14)\n",
"plt.axis([0, 3, -0.02, 1.02])\n",
"save_fig(\"logistic_regression_plot\")\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"decision_boundary"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"log_reg.predict([[1.7], [1.5]])"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from sklearn.linear_model import LogisticRegression\n",
"\n",
"X = iris[\"data\"][:, (2, 3)] # petal length, petal width\n",
"y = (iris[\"target\"] == 2).astype(np.int)\n",
"\n",
"log_reg = LogisticRegression(C=10**10)\n",
"log_reg.fit(X, y)\n",
"\n",
"x0, x1 = np.meshgrid(\n",
" np.linspace(2.9, 7, 500).reshape(-1, 1),\n",
" np.linspace(0.8, 2.7, 200).reshape(-1, 1),\n",
" )\n",
"X_new = np.c_[x0.ravel(), x1.ravel()]\n",
"\n",
"y_proba = log_reg.predict_proba(X_new)\n",
"\n",
"plt.figure(figsize=(10, 4))\n",
"plt.plot(X[y==0, 0], X[y==0, 1], \"bs\")\n",
"plt.plot(X[y==1, 0], X[y==1, 1], \"g^\")\n",
"\n",
"zz = y_proba[:, 1].reshape(x0.shape)\n",
"contour = plt.contour(x0, x1, zz, cmap=plt.cm.brg)\n",
"\n",
"\n",
"left_right = np.array([2.9, 7])\n",
"boundary = -(log_reg.coef_[0][0] * left_right + log_reg.intercept_[0]) / log_reg.coef_[0][1]\n",
"\n",
"plt.clabel(contour, inline=1, fontsize=12)\n",
"plt.plot(left_right, boundary, \"k--\", linewidth=3)\n",
"plt.text(3.5, 1.5, \"Not Iris-Virginica\", fontsize=14, color=\"b\", ha=\"center\")\n",
"plt.text(6.5, 2.3, \"Iris-Virginica\", fontsize=14, color=\"g\", ha=\"center\")\n",
"plt.xlabel(\"Petal length\", fontsize=14)\n",
"plt.ylabel(\"Petal width\", fontsize=14)\n",
"plt.axis([2.9, 7, 0.8, 2.7])\n",
"save_fig(\"logistic_regression_contour_plot\")\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from sklearn.linear_model import LogisticRegression\n",
"\n",
"X = iris[\"data\"][:, (2, 3)] # petal length, petal width\n",
"y = iris[\"target\"]\n",
"\n",
"softmax_reg = LogisticRegression(multi_class=\"multinomial\", solver=\"lbfgs\", C=10)\n",
"softmax_reg.fit(X, y)\n",
"\n",
"x0, x1 = np.meshgrid(\n",
" np.linspace(0, 8, 500).reshape(-1, 1),\n",
" np.linspace(0, 3.5, 200).reshape(-1, 1),\n",
" )\n",
"X_new = np.c_[x0.ravel(), x1.ravel()]\n",
"\n",
"\n",
"y_proba = softmax_reg.predict_proba(X_new)\n",
"y_predict = softmax_reg.predict(X_new)\n",
"\n",
"zz1 = y_proba[:, 1].reshape(x0.shape)\n",
"zz = y_predict.reshape(x0.shape)\n",
"\n",
"plt.figure(figsize=(10, 4))\n",
"plt.plot(X[y==2, 0], X[y==2, 1], \"g^\", label=\"Iris-Virginica\")\n",
"plt.plot(X[y==1, 0], X[y==1, 1], \"bs\", label=\"Iris-Versicolour\")\n",
"plt.plot(X[y==0, 0], X[y==0, 1], \"yo\", label=\"Iris-Setosa\")\n",
"\n",
"from matplotlib.colors import ListedColormap\n",
"custom_cmap = ListedColormap(['#fafab0','#9898ff','#a0faa0'])\n",
"\n",
"plt.contourf(x0, x1, zz, cmap=custom_cmap, linewidth=5)\n",
"contour = plt.contour(x0, x1, zz1, cmap=plt.cm.brg)\n",
"plt.clabel(contour, inline=1, fontsize=12)\n",
"plt.xlabel(\"Petal length\", fontsize=14)\n",
"plt.ylabel(\"Petal width\", fontsize=14)\n",
"plt.legend(loc=\"center left\", fontsize=14)\n",
"plt.axis([0, 7, 0, 3.5])\n",
"save_fig(\"softmax_regression_contour_plot\")\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"softmax_reg.predict([[5, 2]])"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"softmax_reg.predict_proba([[5, 2]])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Exercise solutions"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Coming soon**"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
},
"nav_menu": {},
"toc": {
"navigate_menu": true,
"number_sections": true,
"sideBar": true,
"threshold": 6,
"toc_cell": false,
"toc_section_display": "block",
"toc_window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 0
}