Use 'np.random' rather than 'import numpy.random as rnd', and add random_state to make notebook's output constant

main
Aurélien Geron 2017-06-06 17:20:38 +02:00
parent 1a165e2864
commit e9a8883531
1 changed files with 166 additions and 56 deletions

View File

@ -55,11 +55,10 @@
"\n", "\n",
"# Common imports\n", "# Common imports\n",
"import numpy as np\n", "import numpy as np\n",
"import numpy.random as rnd\n",
"import os\n", "import os\n",
"\n", "\n",
"# to make this notebook's output stable across runs\n", "# to make this notebook's output stable across runs\n",
"rnd.seed(42)\n", "np.random.seed(42)\n",
"\n", "\n",
"# To plot pretty figures\n", "# To plot pretty figures\n",
"%matplotlib inline\n", "%matplotlib inline\n",
@ -102,16 +101,16 @@
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
"rnd.seed(4)\n", "np.random.seed(4)\n",
"m = 60\n", "m = 60\n",
"w1, w2 = 0.1, 0.3\n", "w1, w2 = 0.1, 0.3\n",
"noise = 0.1\n", "noise = 0.1\n",
"\n", "\n",
"angles = rnd.rand(m) * 3 * np.pi / 2 - 0.5\n", "angles = np.random.rand(m) * 3 * np.pi / 2 - 0.5\n",
"X = np.empty((m, 3))\n", "X = np.empty((m, 3))\n",
"X[:, 0] = np.cos(angles) + np.sin(angles)/2 + noise * rnd.randn(m) / 2\n", "X[:, 0] = np.cos(angles) + np.sin(angles)/2 + noise * np.random.randn(m) / 2\n",
"X[:, 1] = np.sin(angles) * 0.7 + noise * rnd.randn(m) / 2\n", "X[:, 1] = np.sin(angles) * 0.7 + noise * np.random.randn(m) / 2\n",
"X[:, 2] = X[:, 0] * w1 + X[:, 1] * w2 + noise * rnd.randn(m)" "X[:, 2] = X[:, 0] * w1 + X[:, 1] * w2 + noise * np.random.randn(m)"
] ]
}, },
{ {
@ -144,7 +143,9 @@
"cell_type": "code", "cell_type": "code",
"execution_count": 4, "execution_count": 4,
"metadata": { "metadata": {
"collapsed": false "collapsed": false,
"deletable": true,
"editable": true
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
@ -185,7 +186,9 @@
"cell_type": "code", "cell_type": "code",
"execution_count": 7, "execution_count": 7,
"metadata": { "metadata": {
"collapsed": true "collapsed": true,
"deletable": true,
"editable": true
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
@ -194,14 +197,20 @@
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {
"deletable": true,
"editable": true
},
"source": [ "source": [
"## PCA using Scikit-Learn" "## PCA using Scikit-Learn"
] ]
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {
"deletable": true,
"editable": true
},
"source": [ "source": [
"With Scikit-Learn, PCA is really trivial. It even takes care of mean centering for you:" "With Scikit-Learn, PCA is really trivial. It even takes care of mean centering for you:"
] ]
@ -226,7 +235,9 @@
"cell_type": "code", "cell_type": "code",
"execution_count": 9, "execution_count": 9,
"metadata": { "metadata": {
"collapsed": false "collapsed": false,
"deletable": true,
"editable": true
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
@ -237,7 +248,9 @@
"cell_type": "code", "cell_type": "code",
"execution_count": 10, "execution_count": 10,
"metadata": { "metadata": {
"collapsed": false "collapsed": false,
"deletable": true,
"editable": true
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
@ -259,7 +272,9 @@
"cell_type": "code", "cell_type": "code",
"execution_count": 11, "execution_count": 11,
"metadata": { "metadata": {
"collapsed": false "collapsed": false,
"deletable": true,
"editable": true
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
@ -291,7 +306,10 @@
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {
"deletable": true,
"editable": true
},
"source": [ "source": [
"Of course, there was some loss of information during the projection step, so the recovered 3D points are not exactly equal to the original 3D points:" "Of course, there was some loss of information during the projection step, so the recovered 3D points are not exactly equal to the original 3D points:"
] ]
@ -300,7 +318,9 @@
"cell_type": "code", "cell_type": "code",
"execution_count": 13, "execution_count": 13,
"metadata": { "metadata": {
"collapsed": false "collapsed": false,
"deletable": true,
"editable": true
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
@ -309,7 +329,10 @@
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {
"deletable": true,
"editable": true
},
"source": [ "source": [
"We can compute the reconstruction error:" "We can compute the reconstruction error:"
] ]
@ -318,7 +341,9 @@
"cell_type": "code", "cell_type": "code",
"execution_count": 14, "execution_count": 14,
"metadata": { "metadata": {
"collapsed": false "collapsed": false,
"deletable": true,
"editable": true
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
@ -327,7 +352,10 @@
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {
"deletable": true,
"editable": true
},
"source": [ "source": [
"The inverse transform in the SVD approach looks like this:" "The inverse transform in the SVD approach looks like this:"
] ]
@ -347,7 +375,10 @@
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {
"deletable": true,
"editable": true
},
"source": [ "source": [
"The reconstructions from both methods are not identical because Scikit-Learn's `PCA` class automatically takes care of reversing the mean centering, but if we subtract the mean, we get the same reconstruction:" "The reconstructions from both methods are not identical because Scikit-Learn's `PCA` class automatically takes care of reversing the mean centering, but if we subtract the mean, we get the same reconstruction:"
] ]
@ -367,7 +398,10 @@
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {
"deletable": true,
"editable": true
},
"source": [ "source": [
"The `PCA` object gives access to the principal components that it computed:" "The `PCA` object gives access to the principal components that it computed:"
] ]
@ -387,7 +421,10 @@
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {
"deletable": true,
"editable": true
},
"source": [ "source": [
"Compare to the first two principal components computed using the SVD method:" "Compare to the first two principal components computed using the SVD method:"
] ]
@ -407,14 +444,20 @@
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {
"deletable": true,
"editable": true
},
"source": [ "source": [
"Notice how the axes are flipped." "Notice how the axes are flipped."
] ]
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {
"deletable": true,
"editable": true
},
"source": [ "source": [
"Now let's look at the explained variance ratio:" "Now let's look at the explained variance ratio:"
] ]
@ -429,19 +472,25 @@
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
"print(pca.explained_variance_ratio_)" "pca.explained_variance_ratio_"
] ]
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {
"deletable": true,
"editable": true
},
"source": [ "source": [
"The first dimension explains 84.2% of the variance, while the second explains 14.6%." "The first dimension explains 84.2% of the variance, while the second explains 14.6%."
] ]
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {
"deletable": true,
"editable": true
},
"source": [ "source": [
"By projecting down to 2D, we lost about 1.1% of the variance:" "By projecting down to 2D, we lost about 1.1% of the variance:"
] ]
@ -461,7 +510,10 @@
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {
"deletable": true,
"editable": true
},
"source": [ "source": [
"Here is how to compute the explained variance ratio using the SVD approach (recall that `s` is the diagonal of the matrix `S`):" "Here is how to compute the explained variance ratio using the SVD approach (recall that `s` is the diagonal of the matrix `S`):"
] ]
@ -470,7 +522,9 @@
"cell_type": "code", "cell_type": "code",
"execution_count": 21, "execution_count": 21,
"metadata": { "metadata": {
"collapsed": false "collapsed": false,
"deletable": true,
"editable": true
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
@ -479,7 +533,10 @@
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {
"deletable": true,
"editable": true
},
"source": [ "source": [
"Next, let's generate some nice figures! :)" "Next, let's generate some nice figures! :)"
] ]
@ -822,8 +879,8 @@
"stretch = 5\n", "stretch = 5\n",
"m = 200\n", "m = 200\n",
"\n", "\n",
"rnd.seed(3)\n", "np.random.seed(3)\n",
"X = rnd.randn(m, 2) / 10\n", "X = np.random.randn(m, 2) / 10\n",
"X = X.dot(np.array([[stretch, 0],[0, 1]])) # stretch\n", "X = X.dot(np.array([[stretch, 0],[0, 1]])) # stretch\n",
"X = X.dot([[np.cos(angle), np.sin(angle)], [-np.sin(angle), np.cos(angle)]]) # rotate\n", "X = X.dot([[np.cos(angle), np.sin(angle)], [-np.sin(angle), np.cos(angle)]]) # rotate\n",
"\n", "\n",
@ -941,7 +998,9 @@
"cell_type": "code", "cell_type": "code",
"execution_count": 34, "execution_count": 34,
"metadata": { "metadata": {
"collapsed": false "collapsed": false,
"deletable": true,
"editable": true
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
@ -966,7 +1025,9 @@
"cell_type": "code", "cell_type": "code",
"execution_count": 36, "execution_count": 36,
"metadata": { "metadata": {
"collapsed": false "collapsed": false,
"deletable": true,
"editable": true
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
@ -1052,7 +1113,9 @@
"cell_type": "code", "cell_type": "code",
"execution_count": 41, "execution_count": 41,
"metadata": { "metadata": {
"collapsed": true "collapsed": true,
"deletable": true,
"editable": true
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
@ -1061,7 +1124,10 @@
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {
"deletable": true,
"editable": true
},
"source": [ "source": [
"## Incremental PCA" "## Incremental PCA"
] ]
@ -1122,7 +1188,9 @@
"cell_type": "code", "cell_type": "code",
"execution_count": 45, "execution_count": 45,
"metadata": { "metadata": {
"collapsed": true "collapsed": true,
"deletable": true,
"editable": true
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
@ -1131,7 +1199,10 @@
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {
"deletable": true,
"editable": true
},
"source": [ "source": [
"Let's compare the results of transforming MNIST using regular PCA and incremental PCA. First, the means are equal: " "Let's compare the results of transforming MNIST using regular PCA and incremental PCA. First, the means are equal: "
] ]
@ -1151,7 +1222,10 @@
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {
"deletable": true,
"editable": true
},
"source": [ "source": [
"But the results are not exactly identical. Incremental PCA gives a very good approximate solution, but it's not perfect:" "But the results are not exactly identical. Incremental PCA gives a very good approximate solution, but it's not perfect:"
] ]
@ -1171,14 +1245,20 @@
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {
"deletable": true,
"editable": true
},
"source": [ "source": [
"### Using `memmap()`" "### Using `memmap()`"
] ]
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {
"deletable": true,
"editable": true
},
"source": [ "source": [
"Let's create the `memmap()` structure and copy the MNIST data into it. This would typically be done by a first program:" "Let's create the `memmap()` structure and copy the MNIST data into it. This would typically be done by a first program:"
] ]
@ -1202,7 +1282,10 @@
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {
"deletable": true,
"editable": true
},
"source": [ "source": [
"Now deleting the `memmap()` object will trigger its Python finalizer, which ensures that the data is saved to disk." "Now deleting the `memmap()` object will trigger its Python finalizer, which ensures that the data is saved to disk."
] ]
@ -1211,7 +1294,9 @@
"cell_type": "code", "cell_type": "code",
"execution_count": 49, "execution_count": 49,
"metadata": { "metadata": {
"collapsed": false "collapsed": false,
"deletable": true,
"editable": true
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
@ -1220,7 +1305,10 @@
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {
"deletable": true,
"editable": true
},
"source": [ "source": [
"Next, another program would load the data and use it for training:" "Next, another program would load the data and use it for training:"
] ]
@ -1258,14 +1346,20 @@
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {
"deletable": true,
"editable": true
},
"source": [ "source": [
"## Time complexity" "## Time complexity"
] ]
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {
"deletable": true,
"editable": true
},
"source": [ "source": [
"Let's time regular PCA against Incremental PCA and Randomized PCA, for various number of principal components:" "Let's time regular PCA against Incremental PCA and Randomized PCA, for various number of principal components:"
] ]
@ -1297,7 +1391,10 @@
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {
"deletable": true,
"editable": true
},
"source": [ "source": [
"Now let's compare PCA and Randomized PCA for datasets of different sizes (number of instances):" "Now let's compare PCA and Randomized PCA for datasets of different sizes (number of instances):"
] ]
@ -1316,7 +1413,7 @@
"times_pca = []\n", "times_pca = []\n",
"sizes = [1000, 10000, 20000, 30000, 40000, 50000, 70000, 100000, 200000, 500000]\n", "sizes = [1000, 10000, 20000, 30000, 40000, 50000, 70000, 100000, 200000, 500000]\n",
"for n_samples in sizes:\n", "for n_samples in sizes:\n",
" X = rnd.randn(n_samples, 5)\n", " X = np.random.randn(n_samples, 5)\n",
" pca = PCA(n_components = 2, svd_solver=\"randomized\", random_state=42)\n", " pca = PCA(n_components = 2, svd_solver=\"randomized\", random_state=42)\n",
" t1 = time.time()\n", " t1 = time.time()\n",
" pca.fit(X)\n", " pca.fit(X)\n",
@ -1338,7 +1435,10 @@
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {
"deletable": true,
"editable": true
},
"source": [ "source": [
"And now let's compare their performance on datasets of 2,000 instances with various numbers of features:" "And now let's compare their performance on datasets of 2,000 instances with various numbers of features:"
] ]
@ -1358,7 +1458,7 @@
"times_pca = []\n", "times_pca = []\n",
"sizes = [1000, 2000, 3000, 4000, 5000, 6000]\n", "sizes = [1000, 2000, 3000, 4000, 5000, 6000]\n",
"for n_features in sizes:\n", "for n_features in sizes:\n",
" X = rnd.randn(2000, n_features)\n", " X = np.random.randn(2000, n_features)\n",
" pca = PCA(n_components = 2, random_state=42, svd_solver=\"randomized\")\n", " pca = PCA(n_components = 2, random_state=42, svd_solver=\"randomized\")\n",
" t1 = time.time()\n", " t1 = time.time()\n",
" pca.fit(X)\n", " pca.fit(X)\n",
@ -1392,7 +1492,9 @@
"cell_type": "code", "cell_type": "code",
"execution_count": 55, "execution_count": 55,
"metadata": { "metadata": {
"collapsed": true "collapsed": true,
"deletable": true,
"editable": true
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
@ -1403,7 +1505,9 @@
"cell_type": "code", "cell_type": "code",
"execution_count": 56, "execution_count": 56,
"metadata": { "metadata": {
"collapsed": true "collapsed": true,
"deletable": true,
"editable": true
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
@ -1531,7 +1635,9 @@
"cell_type": "code", "cell_type": "code",
"execution_count": 61, "execution_count": 61,
"metadata": { "metadata": {
"collapsed": false "collapsed": false,
"deletable": true,
"editable": true
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
@ -1583,7 +1689,9 @@
"cell_type": "code", "cell_type": "code",
"execution_count": 64, "execution_count": 64,
"metadata": { "metadata": {
"collapsed": true "collapsed": true,
"deletable": true,
"editable": true
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
@ -1610,7 +1718,9 @@
"cell_type": "code", "cell_type": "code",
"execution_count": 66, "execution_count": 66,
"metadata": { "metadata": {
"collapsed": false "collapsed": false,
"deletable": true,
"editable": true
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
@ -1679,7 +1789,7 @@
"source": [ "source": [
"from sklearn.manifold import TSNE\n", "from sklearn.manifold import TSNE\n",
"\n", "\n",
"tsne = TSNE(n_components=2)\n", "tsne = TSNE(n_components=2, random_state=42)\n",
"X_reduced_tsne = tsne.fit_transform(X)" "X_reduced_tsne = tsne.fit_transform(X)"
] ]
}, },