Use 'np.random' rather than 'import numpy.random as rnd', and add random_state to make notebook's output constant
parent
1a165e2864
commit
e9a8883531
|
@ -55,11 +55,10 @@
|
|||
"\n",
|
||||
"# Common imports\n",
|
||||
"import numpy as np\n",
|
||||
"import numpy.random as rnd\n",
|
||||
"import os\n",
|
||||
"\n",
|
||||
"# to make this notebook's output stable across runs\n",
|
||||
"rnd.seed(42)\n",
|
||||
"np.random.seed(42)\n",
|
||||
"\n",
|
||||
"# To plot pretty figures\n",
|
||||
"%matplotlib inline\n",
|
||||
|
@ -102,16 +101,16 @@
|
|||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"rnd.seed(4)\n",
|
||||
"np.random.seed(4)\n",
|
||||
"m = 60\n",
|
||||
"w1, w2 = 0.1, 0.3\n",
|
||||
"noise = 0.1\n",
|
||||
"\n",
|
||||
"angles = rnd.rand(m) * 3 * np.pi / 2 - 0.5\n",
|
||||
"angles = np.random.rand(m) * 3 * np.pi / 2 - 0.5\n",
|
||||
"X = np.empty((m, 3))\n",
|
||||
"X[:, 0] = np.cos(angles) + np.sin(angles)/2 + noise * rnd.randn(m) / 2\n",
|
||||
"X[:, 1] = np.sin(angles) * 0.7 + noise * rnd.randn(m) / 2\n",
|
||||
"X[:, 2] = X[:, 0] * w1 + X[:, 1] * w2 + noise * rnd.randn(m)"
|
||||
"X[:, 0] = np.cos(angles) + np.sin(angles)/2 + noise * np.random.randn(m) / 2\n",
|
||||
"X[:, 1] = np.sin(angles) * 0.7 + noise * np.random.randn(m) / 2\n",
|
||||
"X[:, 2] = X[:, 0] * w1 + X[:, 1] * w2 + noise * np.random.randn(m)"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -144,7 +143,9 @@
|
|||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
"collapsed": false,
|
||||
"deletable": true,
|
||||
"editable": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -185,7 +186,9 @@
|
|||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
"collapsed": true,
|
||||
"deletable": true,
|
||||
"editable": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -194,14 +197,20 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"deletable": true,
|
||||
"editable": true
|
||||
},
|
||||
"source": [
|
||||
"## PCA using Scikit-Learn"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"deletable": true,
|
||||
"editable": true
|
||||
},
|
||||
"source": [
|
||||
"With Scikit-Learn, PCA is really trivial. It even takes care of mean centering for you:"
|
||||
]
|
||||
|
@ -226,7 +235,9 @@
|
|||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
"collapsed": false,
|
||||
"deletable": true,
|
||||
"editable": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -237,7 +248,9 @@
|
|||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
"collapsed": false,
|
||||
"deletable": true,
|
||||
"editable": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -259,7 +272,9 @@
|
|||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
"collapsed": false,
|
||||
"deletable": true,
|
||||
"editable": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -291,7 +306,10 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"deletable": true,
|
||||
"editable": true
|
||||
},
|
||||
"source": [
|
||||
"Of course, there was some loss of information during the projection step, so the recovered 3D points are not exactly equal to the original 3D points:"
|
||||
]
|
||||
|
@ -300,7 +318,9 @@
|
|||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
"collapsed": false,
|
||||
"deletable": true,
|
||||
"editable": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -309,7 +329,10 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"deletable": true,
|
||||
"editable": true
|
||||
},
|
||||
"source": [
|
||||
"We can compute the reconstruction error:"
|
||||
]
|
||||
|
@ -318,7 +341,9 @@
|
|||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
"collapsed": false,
|
||||
"deletable": true,
|
||||
"editable": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -327,7 +352,10 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"deletable": true,
|
||||
"editable": true
|
||||
},
|
||||
"source": [
|
||||
"The inverse transform in the SVD approach looks like this:"
|
||||
]
|
||||
|
@ -347,7 +375,10 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"deletable": true,
|
||||
"editable": true
|
||||
},
|
||||
"source": [
|
||||
"The reconstructions from both methods are not identical because Scikit-Learn's `PCA` class automatically takes care of reversing the mean centering, but if we subtract the mean, we get the same reconstruction:"
|
||||
]
|
||||
|
@ -367,7 +398,10 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"deletable": true,
|
||||
"editable": true
|
||||
},
|
||||
"source": [
|
||||
"The `PCA` object gives access to the principal components that it computed:"
|
||||
]
|
||||
|
@ -387,7 +421,10 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"deletable": true,
|
||||
"editable": true
|
||||
},
|
||||
"source": [
|
||||
"Compare to the first two principal components computed using the SVD method:"
|
||||
]
|
||||
|
@ -407,14 +444,20 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"deletable": true,
|
||||
"editable": true
|
||||
},
|
||||
"source": [
|
||||
"Notice how the axes are flipped."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"deletable": true,
|
||||
"editable": true
|
||||
},
|
||||
"source": [
|
||||
"Now let's look at the explained variance ratio:"
|
||||
]
|
||||
|
@ -429,19 +472,25 @@
|
|||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(pca.explained_variance_ratio_)"
|
||||
"pca.explained_variance_ratio_"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"deletable": true,
|
||||
"editable": true
|
||||
},
|
||||
"source": [
|
||||
"The first dimension explains 84.2% of the variance, while the second explains 14.6%."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"deletable": true,
|
||||
"editable": true
|
||||
},
|
||||
"source": [
|
||||
"By projecting down to 2D, we lost about 1.1% of the variance:"
|
||||
]
|
||||
|
@ -461,7 +510,10 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"deletable": true,
|
||||
"editable": true
|
||||
},
|
||||
"source": [
|
||||
"Here is how to compute the explained variance ratio using the SVD approach (recall that `s` is the diagonal of the matrix `S`):"
|
||||
]
|
||||
|
@ -470,7 +522,9 @@
|
|||
"cell_type": "code",
|
||||
"execution_count": 21,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
"collapsed": false,
|
||||
"deletable": true,
|
||||
"editable": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -479,7 +533,10 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"deletable": true,
|
||||
"editable": true
|
||||
},
|
||||
"source": [
|
||||
"Next, let's generate some nice figures! :)"
|
||||
]
|
||||
|
@ -822,8 +879,8 @@
|
|||
"stretch = 5\n",
|
||||
"m = 200\n",
|
||||
"\n",
|
||||
"rnd.seed(3)\n",
|
||||
"X = rnd.randn(m, 2) / 10\n",
|
||||
"np.random.seed(3)\n",
|
||||
"X = np.random.randn(m, 2) / 10\n",
|
||||
"X = X.dot(np.array([[stretch, 0],[0, 1]])) # stretch\n",
|
||||
"X = X.dot([[np.cos(angle), np.sin(angle)], [-np.sin(angle), np.cos(angle)]]) # rotate\n",
|
||||
"\n",
|
||||
|
@ -941,7 +998,9 @@
|
|||
"cell_type": "code",
|
||||
"execution_count": 34,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
"collapsed": false,
|
||||
"deletable": true,
|
||||
"editable": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -966,7 +1025,9 @@
|
|||
"cell_type": "code",
|
||||
"execution_count": 36,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
"collapsed": false,
|
||||
"deletable": true,
|
||||
"editable": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -1052,7 +1113,9 @@
|
|||
"cell_type": "code",
|
||||
"execution_count": 41,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
"collapsed": true,
|
||||
"deletable": true,
|
||||
"editable": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -1061,7 +1124,10 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"deletable": true,
|
||||
"editable": true
|
||||
},
|
||||
"source": [
|
||||
"## Incremental PCA"
|
||||
]
|
||||
|
@ -1122,7 +1188,9 @@
|
|||
"cell_type": "code",
|
||||
"execution_count": 45,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
"collapsed": true,
|
||||
"deletable": true,
|
||||
"editable": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -1131,7 +1199,10 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"deletable": true,
|
||||
"editable": true
|
||||
},
|
||||
"source": [
|
||||
"Let's compare the results of transforming MNIST using regular PCA and incremental PCA. First, the means are equal: "
|
||||
]
|
||||
|
@ -1151,7 +1222,10 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"deletable": true,
|
||||
"editable": true
|
||||
},
|
||||
"source": [
|
||||
"But the results are not exactly identical. Incremental PCA gives a very good approximate solution, but it's not perfect:"
|
||||
]
|
||||
|
@ -1171,14 +1245,20 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"deletable": true,
|
||||
"editable": true
|
||||
},
|
||||
"source": [
|
||||
"### Using `memmap()`"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"deletable": true,
|
||||
"editable": true
|
||||
},
|
||||
"source": [
|
||||
"Let's create the `memmap()` structure and copy the MNIST data into it. This would typically be done by a first program:"
|
||||
]
|
||||
|
@ -1202,7 +1282,10 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"deletable": true,
|
||||
"editable": true
|
||||
},
|
||||
"source": [
|
||||
"Now deleting the `memmap()` object will trigger its Python finalizer, which ensures that the data is saved to disk."
|
||||
]
|
||||
|
@ -1211,7 +1294,9 @@
|
|||
"cell_type": "code",
|
||||
"execution_count": 49,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
"collapsed": false,
|
||||
"deletable": true,
|
||||
"editable": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -1220,7 +1305,10 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"deletable": true,
|
||||
"editable": true
|
||||
},
|
||||
"source": [
|
||||
"Next, another program would load the data and use it for training:"
|
||||
]
|
||||
|
@ -1258,14 +1346,20 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"deletable": true,
|
||||
"editable": true
|
||||
},
|
||||
"source": [
|
||||
"## Time complexity"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"deletable": true,
|
||||
"editable": true
|
||||
},
|
||||
"source": [
|
||||
"Let's time regular PCA against Incremental PCA and Randomized PCA, for various number of principal components:"
|
||||
]
|
||||
|
@ -1297,7 +1391,10 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"deletable": true,
|
||||
"editable": true
|
||||
},
|
||||
"source": [
|
||||
"Now let's compare PCA and Randomized PCA for datasets of different sizes (number of instances):"
|
||||
]
|
||||
|
@ -1316,7 +1413,7 @@
|
|||
"times_pca = []\n",
|
||||
"sizes = [1000, 10000, 20000, 30000, 40000, 50000, 70000, 100000, 200000, 500000]\n",
|
||||
"for n_samples in sizes:\n",
|
||||
" X = rnd.randn(n_samples, 5)\n",
|
||||
" X = np.random.randn(n_samples, 5)\n",
|
||||
" pca = PCA(n_components = 2, svd_solver=\"randomized\", random_state=42)\n",
|
||||
" t1 = time.time()\n",
|
||||
" pca.fit(X)\n",
|
||||
|
@ -1338,7 +1435,10 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"deletable": true,
|
||||
"editable": true
|
||||
},
|
||||
"source": [
|
||||
"And now let's compare their performance on datasets of 2,000 instances with various numbers of features:"
|
||||
]
|
||||
|
@ -1358,7 +1458,7 @@
|
|||
"times_pca = []\n",
|
||||
"sizes = [1000, 2000, 3000, 4000, 5000, 6000]\n",
|
||||
"for n_features in sizes:\n",
|
||||
" X = rnd.randn(2000, n_features)\n",
|
||||
" X = np.random.randn(2000, n_features)\n",
|
||||
" pca = PCA(n_components = 2, random_state=42, svd_solver=\"randomized\")\n",
|
||||
" t1 = time.time()\n",
|
||||
" pca.fit(X)\n",
|
||||
|
@ -1392,7 +1492,9 @@
|
|||
"cell_type": "code",
|
||||
"execution_count": 55,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
"collapsed": true,
|
||||
"deletable": true,
|
||||
"editable": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -1403,7 +1505,9 @@
|
|||
"cell_type": "code",
|
||||
"execution_count": 56,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
"collapsed": true,
|
||||
"deletable": true,
|
||||
"editable": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -1531,7 +1635,9 @@
|
|||
"cell_type": "code",
|
||||
"execution_count": 61,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
"collapsed": false,
|
||||
"deletable": true,
|
||||
"editable": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -1583,7 +1689,9 @@
|
|||
"cell_type": "code",
|
||||
"execution_count": 64,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
"collapsed": true,
|
||||
"deletable": true,
|
||||
"editable": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -1610,7 +1718,9 @@
|
|||
"cell_type": "code",
|
||||
"execution_count": 66,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
"collapsed": false,
|
||||
"deletable": true,
|
||||
"editable": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -1679,7 +1789,7 @@
|
|||
"source": [
|
||||
"from sklearn.manifold import TSNE\n",
|
||||
"\n",
|
||||
"tsne = TSNE(n_components=2)\n",
|
||||
"tsne = TSNE(n_components=2, random_state=42)\n",
|
||||
"X_reduced_tsne = tsne.fit_transform(X)"
|
||||
]
|
||||
},
|
||||
|
|
Loading…
Reference in New Issue