Clarify stratified sampling paragraph in ch02

main
Aurélien Geron 2017-10-16 14:19:08 +02:00
parent 80e59e1dd1
commit 830d6e4751
1 changed files with 129 additions and 150 deletions

View File

@ -35,9 +35,7 @@
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"# To support both python 2 and python 3\n",
@ -80,9 +78,7 @@
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"import os\n",
@ -106,9 +102,7 @@
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"fetch_housing_data()"
@ -117,9 +111,7 @@
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
@ -182,9 +174,7 @@
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"# to make this notebook's output identical at every run\n",
@ -194,9 +184,7 @@
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
@ -223,9 +211,7 @@
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"import hashlib\n",
@ -242,9 +228,7 @@
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"# This version supports both Python 2 and Python 3, instead of just Python 3.\n",
@ -255,9 +239,7 @@
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"housing_with_id = housing.reset_index() # adds an `index` column\n",
@ -267,9 +249,7 @@
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"housing_with_id[\"id\"] = housing[\"longitude\"] * 1000 + housing[\"latitude\"]\n",
@ -288,9 +268,7 @@
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split\n",
@ -319,9 +297,7 @@
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"# Divide by 1.5 to limit the number of income categories\n",
@ -351,9 +327,7 @@
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"from sklearn.model_selection import StratifiedShuffleSplit\n",
@ -370,15 +344,22 @@
"metadata": {},
"outputs": [],
"source": [
"housing[\"income_cat\"].value_counts() / len(housing)"
"strat_test_set[\"income_cat\"].value_counts() / len(strat_test_set)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"housing[\"income_cat\"].value_counts() / len(housing)"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"def income_cat_proportions(data):\n",
@ -397,7 +378,7 @@
},
{
"cell_type": "code",
"execution_count": 27,
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
@ -406,10 +387,8 @@
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {
"collapsed": true
},
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"for set_ in (strat_train_set, strat_test_set):\n",
@ -425,7 +404,7 @@
},
{
"cell_type": "code",
"execution_count": 29,
"execution_count": 30,
"metadata": {
"collapsed": true
},
@ -436,7 +415,7 @@
},
{
"cell_type": "code",
"execution_count": 30,
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
@ -446,7 +425,7 @@
},
{
"cell_type": "code",
"execution_count": 31,
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
@ -463,7 +442,7 @@
},
{
"cell_type": "code",
"execution_count": 32,
"execution_count": 33,
"metadata": {},
"outputs": [],
"source": [
@ -477,7 +456,7 @@
},
{
"cell_type": "code",
"execution_count": 33,
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
@ -505,7 +484,7 @@
},
{
"cell_type": "code",
"execution_count": 34,
"execution_count": 35,
"metadata": {
"collapsed": true
},
@ -516,7 +495,7 @@
},
{
"cell_type": "code",
"execution_count": 35,
"execution_count": 36,
"metadata": {},
"outputs": [],
"source": [
@ -525,7 +504,7 @@
},
{
"cell_type": "code",
"execution_count": 36,
"execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
@ -540,7 +519,7 @@
},
{
"cell_type": "code",
"execution_count": 37,
"execution_count": 38,
"metadata": {},
"outputs": [],
"source": [
@ -552,7 +531,7 @@
},
{
"cell_type": "code",
"execution_count": 38,
"execution_count": 39,
"metadata": {
"collapsed": true
},
@ -572,7 +551,7 @@
},
{
"cell_type": "code",
"execution_count": 39,
"execution_count": 40,
"metadata": {},
"outputs": [],
"source": [
@ -582,7 +561,7 @@
},
{
"cell_type": "code",
"execution_count": 40,
"execution_count": 41,
"metadata": {},
"outputs": [],
"source": [
@ -594,7 +573,7 @@
},
{
"cell_type": "code",
"execution_count": 41,
"execution_count": 42,
"metadata": {},
"outputs": [],
"source": [
@ -610,7 +589,7 @@
},
{
"cell_type": "code",
"execution_count": 42,
"execution_count": 43,
"metadata": {
"collapsed": true
},
@ -622,7 +601,7 @@
},
{
"cell_type": "code",
"execution_count": 43,
"execution_count": 44,
"metadata": {},
"outputs": [],
"source": [
@ -632,7 +611,7 @@
},
{
"cell_type": "code",
"execution_count": 44,
"execution_count": 45,
"metadata": {},
"outputs": [],
"source": [
@ -641,7 +620,7 @@
},
{
"cell_type": "code",
"execution_count": 45,
"execution_count": 46,
"metadata": {},
"outputs": [],
"source": [
@ -650,7 +629,7 @@
},
{
"cell_type": "code",
"execution_count": 46,
"execution_count": 47,
"metadata": {},
"outputs": [],
"source": [
@ -661,7 +640,7 @@
},
{
"cell_type": "code",
"execution_count": 47,
"execution_count": 48,
"metadata": {
"collapsed": true
},
@ -681,7 +660,7 @@
},
{
"cell_type": "code",
"execution_count": 48,
"execution_count": 49,
"metadata": {
"collapsed": true
},
@ -692,7 +671,7 @@
},
{
"cell_type": "code",
"execution_count": 49,
"execution_count": 50,
"metadata": {},
"outputs": [],
"source": [
@ -701,7 +680,7 @@
},
{
"cell_type": "code",
"execution_count": 50,
"execution_count": 51,
"metadata": {},
"outputs": [],
"source": [
@ -717,7 +696,7 @@
},
{
"cell_type": "code",
"execution_count": 51,
"execution_count": 52,
"metadata": {},
"outputs": [],
"source": [
@ -733,7 +712,7 @@
},
{
"cell_type": "code",
"execution_count": 52,
"execution_count": 53,
"metadata": {
"collapsed": true
},
@ -744,7 +723,7 @@
},
{
"cell_type": "code",
"execution_count": 53,
"execution_count": 54,
"metadata": {
"collapsed": true
},
@ -756,7 +735,7 @@
},
{
"cell_type": "code",
"execution_count": 54,
"execution_count": 55,
"metadata": {},
"outputs": [],
"source": [
@ -765,7 +744,7 @@
},
{
"cell_type": "code",
"execution_count": 55,
"execution_count": 56,
"metadata": {},
"outputs": [],
"source": [
@ -774,7 +753,7 @@
},
{
"cell_type": "code",
"execution_count": 56,
"execution_count": 57,
"metadata": {},
"outputs": [],
"source": [
@ -791,7 +770,7 @@
},
{
"cell_type": "code",
"execution_count": 57,
"execution_count": 58,
"metadata": {},
"outputs": [],
"source": [
@ -808,7 +787,7 @@
},
{
"cell_type": "code",
"execution_count": 58,
"execution_count": 59,
"metadata": {},
"outputs": [],
"source": [
@ -818,7 +797,7 @@
},
{
"cell_type": "code",
"execution_count": 59,
"execution_count": 60,
"metadata": {},
"outputs": [],
"source": [
@ -841,7 +820,7 @@
},
{
"cell_type": "code",
"execution_count": 60,
"execution_count": 61,
"metadata": {},
"outputs": [],
"source": [
@ -861,7 +840,7 @@
},
{
"cell_type": "code",
"execution_count": 61,
"execution_count": 62,
"metadata": {},
"outputs": [],
"source": [
@ -877,7 +856,7 @@
},
{
"cell_type": "code",
"execution_count": 62,
"execution_count": 63,
"metadata": {
"collapsed": true
},
@ -1080,7 +1059,7 @@
},
{
"cell_type": "code",
"execution_count": 63,
"execution_count": 64,
"metadata": {},
"outputs": [],
"source": [
@ -1101,7 +1080,7 @@
},
{
"cell_type": "code",
"execution_count": 64,
"execution_count": 65,
"metadata": {},
"outputs": [],
"source": [
@ -1117,7 +1096,7 @@
},
{
"cell_type": "code",
"execution_count": 65,
"execution_count": 66,
"metadata": {},
"outputs": [],
"source": [
@ -1128,7 +1107,7 @@
},
{
"cell_type": "code",
"execution_count": 66,
"execution_count": 67,
"metadata": {},
"outputs": [],
"source": [
@ -1144,7 +1123,7 @@
},
{
"cell_type": "code",
"execution_count": 67,
"execution_count": 68,
"metadata": {
"collapsed": true
},
@ -1176,7 +1155,7 @@
},
{
"cell_type": "code",
"execution_count": 68,
"execution_count": 69,
"metadata": {},
"outputs": [],
"source": [
@ -1193,7 +1172,7 @@
},
{
"cell_type": "code",
"execution_count": 69,
"execution_count": 70,
"metadata": {
"collapsed": true
},
@ -1213,7 +1192,7 @@
},
{
"cell_type": "code",
"execution_count": 70,
"execution_count": 71,
"metadata": {},
"outputs": [],
"source": [
@ -1229,7 +1208,7 @@
},
{
"cell_type": "code",
"execution_count": 71,
"execution_count": 72,
"metadata": {
"collapsed": true
},
@ -1257,7 +1236,7 @@
},
{
"cell_type": "code",
"execution_count": 72,
"execution_count": 73,
"metadata": {},
"outputs": [],
"source": [
@ -1279,7 +1258,7 @@
},
{
"cell_type": "code",
"execution_count": 73,
"execution_count": 74,
"metadata": {
"collapsed": true
},
@ -1295,7 +1274,7 @@
},
{
"cell_type": "code",
"execution_count": 74,
"execution_count": 75,
"metadata": {},
"outputs": [],
"source": [
@ -1305,7 +1284,7 @@
},
{
"cell_type": "code",
"execution_count": 75,
"execution_count": 76,
"metadata": {},
"outputs": [],
"source": [
@ -1321,7 +1300,7 @@
},
{
"cell_type": "code",
"execution_count": 76,
"execution_count": 77,
"metadata": {},
"outputs": [],
"source": [
@ -1333,7 +1312,7 @@
},
{
"cell_type": "code",
"execution_count": 77,
"execution_count": 78,
"metadata": {},
"outputs": [],
"source": [
@ -1354,7 +1333,7 @@
},
{
"cell_type": "code",
"execution_count": 78,
"execution_count": 79,
"metadata": {},
"outputs": [],
"source": [
@ -1363,7 +1342,7 @@
},
{
"cell_type": "code",
"execution_count": 79,
"execution_count": 80,
"metadata": {},
"outputs": [],
"source": [
@ -1372,7 +1351,7 @@
},
{
"cell_type": "code",
"execution_count": 80,
"execution_count": 81,
"metadata": {},
"outputs": [],
"source": [
@ -1386,7 +1365,7 @@
},
{
"cell_type": "code",
"execution_count": 81,
"execution_count": 82,
"metadata": {},
"outputs": [],
"source": [
@ -1398,7 +1377,7 @@
},
{
"cell_type": "code",
"execution_count": 82,
"execution_count": 83,
"metadata": {},
"outputs": [],
"source": [
@ -1410,7 +1389,7 @@
},
{
"cell_type": "code",
"execution_count": 83,
"execution_count": 84,
"metadata": {},
"outputs": [],
"source": [
@ -1429,7 +1408,7 @@
},
{
"cell_type": "code",
"execution_count": 84,
"execution_count": 85,
"metadata": {
"collapsed": true
},
@ -1444,7 +1423,7 @@
},
{
"cell_type": "code",
"execution_count": 85,
"execution_count": 86,
"metadata": {},
"outputs": [],
"source": [
@ -1458,7 +1437,7 @@
},
{
"cell_type": "code",
"execution_count": 86,
"execution_count": 87,
"metadata": {},
"outputs": [],
"source": [
@ -1470,7 +1449,7 @@
},
{
"cell_type": "code",
"execution_count": 87,
"execution_count": 88,
"metadata": {},
"outputs": [],
"source": [
@ -1482,7 +1461,7 @@
},
{
"cell_type": "code",
"execution_count": 88,
"execution_count": 89,
"metadata": {},
"outputs": [],
"source": [
@ -1494,7 +1473,7 @@
},
{
"cell_type": "code",
"execution_count": 89,
"execution_count": 90,
"metadata": {},
"outputs": [],
"source": [
@ -1508,7 +1487,7 @@
},
{
"cell_type": "code",
"execution_count": 90,
"execution_count": 91,
"metadata": {},
"outputs": [],
"source": [
@ -1518,7 +1497,7 @@
},
{
"cell_type": "code",
"execution_count": 91,
"execution_count": 92,
"metadata": {},
"outputs": [],
"source": [
@ -1534,7 +1513,7 @@
},
{
"cell_type": "code",
"execution_count": 92,
"execution_count": 93,
"metadata": {},
"outputs": [],
"source": [
@ -1563,7 +1542,7 @@
},
{
"cell_type": "code",
"execution_count": 93,
"execution_count": 94,
"metadata": {},
"outputs": [],
"source": [
@ -1572,7 +1551,7 @@
},
{
"cell_type": "code",
"execution_count": 94,
"execution_count": 95,
"metadata": {},
"outputs": [],
"source": [
@ -1588,7 +1567,7 @@
},
{
"cell_type": "code",
"execution_count": 95,
"execution_count": 96,
"metadata": {},
"outputs": [],
"source": [
@ -1599,7 +1578,7 @@
},
{
"cell_type": "code",
"execution_count": 96,
"execution_count": 97,
"metadata": {},
"outputs": [],
"source": [
@ -1608,7 +1587,7 @@
},
{
"cell_type": "code",
"execution_count": 97,
"execution_count": 98,
"metadata": {},
"outputs": [],
"source": [
@ -1628,7 +1607,7 @@
},
{
"cell_type": "code",
"execution_count": 98,
"execution_count": 99,
"metadata": {},
"outputs": [],
"source": [
@ -1639,7 +1618,7 @@
},
{
"cell_type": "code",
"execution_count": 99,
"execution_count": 100,
"metadata": {},
"outputs": [],
"source": [
@ -1649,7 +1628,7 @@
},
{
"cell_type": "code",
"execution_count": 100,
"execution_count": 101,
"metadata": {},
"outputs": [],
"source": [
@ -1662,7 +1641,7 @@
},
{
"cell_type": "code",
"execution_count": 101,
"execution_count": 102,
"metadata": {
"collapsed": true
},
@ -1682,7 +1661,7 @@
},
{
"cell_type": "code",
"execution_count": 102,
"execution_count": 103,
"metadata": {},
"outputs": [],
"source": [
@ -1705,7 +1684,7 @@
},
{
"cell_type": "code",
"execution_count": 103,
"execution_count": 104,
"metadata": {},
"outputs": [],
"source": [
@ -1727,7 +1706,7 @@
},
{
"cell_type": "code",
"execution_count": 104,
"execution_count": 105,
"metadata": {
"collapsed": true
},
@ -1738,7 +1717,7 @@
},
{
"cell_type": "code",
"execution_count": 105,
"execution_count": 106,
"metadata": {
"collapsed": true
},
@ -1759,7 +1738,7 @@
},
{
"cell_type": "code",
"execution_count": 106,
"execution_count": 107,
"metadata": {},
"outputs": [],
"source": [
@ -1797,7 +1776,7 @@
},
{
"cell_type": "code",
"execution_count": 107,
"execution_count": 108,
"metadata": {},
"outputs": [],
"source": [
@ -1823,7 +1802,7 @@
},
{
"cell_type": "code",
"execution_count": 108,
"execution_count": 109,
"metadata": {},
"outputs": [],
"source": [
@ -1841,7 +1820,7 @@
},
{
"cell_type": "code",
"execution_count": 109,
"execution_count": 110,
"metadata": {},
"outputs": [],
"source": [
@ -1871,7 +1850,7 @@
},
{
"cell_type": "code",
"execution_count": 110,
"execution_count": 111,
"metadata": {},
"outputs": [],
"source": [
@ -1904,7 +1883,7 @@
},
{
"cell_type": "code",
"execution_count": 111,
"execution_count": 112,
"metadata": {},
"outputs": [],
"source": [
@ -1922,7 +1901,7 @@
},
{
"cell_type": "code",
"execution_count": 112,
"execution_count": 113,
"metadata": {},
"outputs": [],
"source": [
@ -1945,7 +1924,7 @@
},
{
"cell_type": "code",
"execution_count": 113,
"execution_count": 114,
"metadata": {},
"outputs": [],
"source": [
@ -1970,7 +1949,7 @@
},
{
"cell_type": "code",
"execution_count": 114,
"execution_count": 115,
"metadata": {},
"outputs": [],
"source": [
@ -2009,7 +1988,7 @@
},
{
"cell_type": "code",
"execution_count": 115,
"execution_count": 116,
"metadata": {
"collapsed": true
},
@ -2047,7 +2026,7 @@
},
{
"cell_type": "code",
"execution_count": 116,
"execution_count": 117,
"metadata": {
"collapsed": true
},
@ -2065,7 +2044,7 @@
},
{
"cell_type": "code",
"execution_count": 117,
"execution_count": 118,
"metadata": {},
"outputs": [],
"source": [
@ -2075,7 +2054,7 @@
},
{
"cell_type": "code",
"execution_count": 118,
"execution_count": 119,
"metadata": {},
"outputs": [],
"source": [
@ -2091,7 +2070,7 @@
},
{
"cell_type": "code",
"execution_count": 119,
"execution_count": 120,
"metadata": {},
"outputs": [],
"source": [
@ -2107,7 +2086,7 @@
},
{
"cell_type": "code",
"execution_count": 120,
"execution_count": 121,
"metadata": {
"collapsed": true
},
@ -2121,7 +2100,7 @@
},
{
"cell_type": "code",
"execution_count": 121,
"execution_count": 122,
"metadata": {
"collapsed": true
},
@ -2139,7 +2118,7 @@
},
{
"cell_type": "code",
"execution_count": 122,
"execution_count": 123,
"metadata": {},
"outputs": [],
"source": [
@ -2155,7 +2134,7 @@
},
{
"cell_type": "code",
"execution_count": 123,
"execution_count": 124,
"metadata": {},
"outputs": [],
"source": [
@ -2185,7 +2164,7 @@
},
{
"cell_type": "code",
"execution_count": 124,
"execution_count": 125,
"metadata": {
"collapsed": true
},
@ -2200,7 +2179,7 @@
},
{
"cell_type": "code",
"execution_count": 125,
"execution_count": 126,
"metadata": {},
"outputs": [],
"source": [
@ -2216,7 +2195,7 @@
},
{
"cell_type": "code",
"execution_count": 126,
"execution_count": 127,
"metadata": {},
"outputs": [],
"source": [
@ -2250,7 +2229,7 @@
},
{
"cell_type": "code",
"execution_count": 127,
"execution_count": 128,
"metadata": {},
"outputs": [],
"source": [
@ -2266,7 +2245,7 @@
},
{
"cell_type": "code",
"execution_count": 128,
"execution_count": 129,
"metadata": {},
"outputs": [],
"source": [
@ -2282,7 +2261,7 @@
},
{
"cell_type": "code",
"execution_count": 129,
"execution_count": 130,
"metadata": {},
"outputs": [],
"source": [
@ -2313,7 +2292,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
"version": "3.6.2"
},
"nav_menu": {
"height": "279px",