From f558bf43e52a62c65ac5c36c613f11923b06e4ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Geron?= Date: Fri, 5 Jan 2018 14:36:11 +0100 Subject: [PATCH] Upgrade to latest pandas version, update resampling API --- tools_pandas.ipynb | 923 +++++++++++++++++---------------------------- 1 file changed, 340 insertions(+), 583 deletions(-) diff --git a/tools_pandas.ipynb b/tools_pandas.ipynb index 379443e..6580f20 100644 --- a/tools_pandas.ipynb +++ b/tools_pandas.ipynb @@ -23,9 +23,7 @@ { "cell_type": "code", "execution_count": 1, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "from __future__ import division, print_function, unicode_literals" @@ -41,9 +39,7 @@ { "cell_type": "code", "execution_count": 2, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "import pandas as pd" @@ -71,9 +67,7 @@ { "cell_type": "code", "execution_count": 3, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "s = pd.Series([2,-1,3,5])\n", @@ -91,9 +85,7 @@ { "cell_type": "code", "execution_count": 4, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", @@ -110,9 +102,7 @@ { "cell_type": "code", "execution_count": 5, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "s + [1000,2000,3000,4000]" @@ -128,9 +118,7 @@ { "cell_type": "code", "execution_count": 6, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "s + 1000" @@ -146,9 +134,7 @@ { "cell_type": "code", "execution_count": 7, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "s < 0" @@ -165,9 +151,7 @@ { "cell_type": "code", "execution_count": 8, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "s2 = pd.Series([68, 83, 112, 68], index=[\"alice\", \"bob\", \"charles\", \"darwin\"])\n", @@ -184,9 +168,7 @@ { "cell_type": "code", "execution_count": 9, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "s2[\"bob\"]" @@ -202,9 +184,7 @@ { "cell_type": "code", "execution_count": 10, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "s2[1]" @@ -220,9 +200,7 @@ { "cell_type": "code", "execution_count": 11, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "s2.loc[\"bob\"]" @@ -231,9 +209,7 @@ { "cell_type": "code", "execution_count": 12, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "s2.iloc[1]" @@ -249,9 +225,7 @@ { "cell_type": "code", "execution_count": 13, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "s2.iloc[1:3]" @@ -267,9 +241,7 @@ { "cell_type": "code", "execution_count": 14, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "surprise = pd.Series([1000, 1001, 1002, 1003])\n", @@ -279,9 +251,7 @@ { "cell_type": "code", "execution_count": 15, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "surprise_slice = surprise[2:]\n", @@ -298,9 +268,7 @@ { "cell_type": "code", "execution_count": 16, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "try:\n", @@ -319,9 +287,7 @@ { "cell_type": "code", "execution_count": 17, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "surprise_slice.iloc[0]" @@ -338,9 +304,7 
@@ { "cell_type": "code", "execution_count": 18, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "weights = {\"alice\": 68, \"bob\": 83, \"colin\": 86, \"darwin\": 68}\n", @@ -358,9 +322,7 @@ { "cell_type": "code", "execution_count": 19, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "s4 = pd.Series(weights, index = [\"colin\", \"alice\"])\n", @@ -378,9 +340,7 @@ { "cell_type": "code", "execution_count": 20, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "print(s2.keys())\n", @@ -401,9 +361,7 @@ { "cell_type": "code", "execution_count": 21, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "s5 = pd.Series([1000,1000,1000,1000])\n", @@ -431,9 +389,7 @@ { "cell_type": "code", "execution_count": 22, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "meaning = pd.Series(42, [\"life\", \"universe\", \"everything\"])\n", @@ -451,9 +407,7 @@ { "cell_type": "code", "execution_count": 23, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "s6 = pd.Series([83, 68], index=[\"bob\", \"alice\"], name=\"weights\")\n", @@ -465,14 +419,13 @@ "metadata": {}, "source": [ "## Plotting a `Series`\n", - "Pandas makes it easy to plot `Series` data using matplotlib (for more details on matplotlib, check out the [matplotlib tutorial](tools_matplotlib.ipynb)). Just import matplotlib and call the `plot` method:" + "Pandas makes it easy to plot `Series` data using matplotlib (for more details on matplotlib, check out the [matplotlib tutorial](tools_matplotlib.ipynb)). Just import matplotlib and call the `plot()` method:" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { - "collapsed": false, "scrolled": true }, "outputs": [], @@ -504,15 +457,13 @@ "* it can handle timezones.\n", "\n", "## Time range\n", - "Let's start by creating a time series using `timerange`. This returns a `DatetimeIndex` containing one datetime per hour for 12 hours starting on October 29th 2016 at 5:30pm." + "Let's start by creating a time series using `pd.date_range()`. This returns a `DatetimeIndex` containing one datetime per hour for 12 hours starting on October 29th 2016 at 5:30pm." ] }, { "cell_type": "code", "execution_count": 25, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "dates = pd.date_range('2016/10/29 5:30pm', periods=12, freq='H')\n", @@ -529,9 +480,7 @@ { "cell_type": "code", "execution_count": 26, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "temp_series = pd.Series(temperatures, dates)\n", @@ -548,9 +497,7 @@ { "cell_type": "code", "execution_count": 27, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "temp_series.plot(kind=\"bar\")\n", @@ -564,15 +511,13 @@ "metadata": {}, "source": [ "## Resampling\n", - "Pandas let's us resample a time series very simply. Just call the `resample` method and specify a new frequency:" + "Pandas lets us resample a time series very simply. 
Just call the `resample()` method and specify a new frequency:" ] }, { "cell_type": "code", "execution_count": 28, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "temp_series_freq_2H = temp_series.resample(\"2H\")\n", @@ -583,15 +528,29 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Let's take a look at the result:" + "The resampling operation is actually a deferred operation, which is why we did not get a `Series` object, but a `DatetimeIndexResampler` object instead. To actually perform the resampling operation, we can simply call the `mean()` method: Pandas will compute the mean of every pair of consecutive hours:" ] }, { "cell_type": "code", "execution_count": 29, - "metadata": { - "collapsed": false - }, + "metadata": {}, + "outputs": [], + "source": [ + "temp_series_freq_2H = temp_series_freq_2H.mean()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's plot the result:" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, "outputs": [], "source": [ "temp_series_freq_2H.plot(kind=\"bar\")\n", @@ -602,18 +561,33 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Note how the values have automatically been aggregated into 2-hour periods. If we look at the 6-8pm period, for example, we had a value of `5.1` at 6:30pm, and `6.1` at 7:30pm. After resampling, we just have one value of `5.6`, which is the mean of `5.1` and `6.1`. Computing the mean is the default behavior, but it is also possible to use a different aggregation function, for example we can decide to keep the minimum value of each period:" + "Note how the values have automatically been aggregated into 2-hour periods. If we look at the 6-8pm period, for example, we had a value of `5.1` at 6:30pm, and `6.1` at 7:30pm. After resampling, we just have one value of `5.6`, which is the mean of `5.1` and `6.1`. Rather than computing the mean, we could have used any other aggregation function, for example we can decide to keep the minimum value of each period:" ] }, { "cell_type": "code", - "execution_count": 30, - "metadata": { - "collapsed": false - }, + "execution_count": 31, + "metadata": {}, "outputs": [], "source": [ - "temp_series_freq_2H = temp_series.resample(\"2H\", how=np.min)\n", + "temp_series_freq_2H = temp_series.resample(\"2H\").min()\n", + "temp_series_freq_2H" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Or, equivalently, we could use the `apply()` method instead:" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [], + "source": [ + "temp_series_freq_2H = temp_series.resample(\"2H\").apply(np.min)\n", "temp_series_freq_2H" ] }, @@ -627,13 +601,11 @@ }, { "cell_type": "code", - "execution_count": 31, - "metadata": { - "collapsed": false - }, + "execution_count": 33, + "metadata": {}, "outputs": [], "source": [ - "temp_series_freq_15min = temp_series.resample(\"15Min\")\n", + "temp_series_freq_15min = temp_series.resample(\"15Min\").mean()\n", "temp_series_freq_15min.head(n=10) # `head` displays the top n values" ] }, @@ -641,14 +613,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "One solution is to fill the gaps by interpolating. We just call the `interpolate` method. The default is to use linear interpolation, but we can also select another method, such as cubic interpolation:" + "One solution is to fill the gaps by interpolating. We just call the `interpolate()` method. 
The default is to use linear interpolation, but we can also select another method, such as cubic interpolation:" ] }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 34, "metadata": { - "collapsed": false, "scrolled": true }, "outputs": [], @@ -659,10 +630,8 @@ }, { "cell_type": "code", - "execution_count": 33, - "metadata": { - "collapsed": false - }, + "execution_count": 35, + "metadata": {}, "outputs": [], "source": [ "temp_series.plot(label=\"Period: 1 hour\")\n", @@ -676,15 +645,13 @@ "metadata": {}, "source": [ "## Timezones\n", - "By default datetimes are *naive*: they are not aware of timezones, so 2016-10-30 02:30 might mean October 30th 2016 at 2:30am in Paris or in New York. We can make datetimes timezone *aware* by calling the `tz_localize` method:" + "By default datetimes are *naive*: they are not aware of timezones, so 2016-10-30 02:30 might mean October 30th 2016 at 2:30am in Paris or in New York. We can make datetimes timezone *aware* by calling the `tz_localize()` method:" ] }, { "cell_type": "code", - "execution_count": 34, - "metadata": { - "collapsed": false - }, + "execution_count": 36, + "metadata": {}, "outputs": [], "source": [ "temp_series_ny = temp_series.tz_localize(\"America/New_York\")\n", @@ -702,10 +669,8 @@ }, { "cell_type": "code", - "execution_count": 35, - "metadata": { - "collapsed": false - }, + "execution_count": 37, + "metadata": {}, "outputs": [], "source": [ "temp_series_paris = temp_series_ny.tz_convert(\"Europe/Paris\")\n", @@ -721,10 +686,8 @@ }, { "cell_type": "code", - "execution_count": 36, - "metadata": { - "collapsed": false - }, + "execution_count": 38, + "metadata": {}, "outputs": [], "source": [ "temp_series_paris_naive = temp_series_paris.tz_localize(None)\n", @@ -740,10 +703,8 @@ }, { "cell_type": "code", - "execution_count": 37, - "metadata": { - "collapsed": false - }, + "execution_count": 39, + "metadata": {}, "outputs": [], "source": [ "try:\n", @@ -762,10 +723,8 @@ }, { "cell_type": "code", - "execution_count": 38, - "metadata": { - "collapsed": false - }, + "execution_count": 40, + "metadata": {}, "outputs": [], "source": [ "temp_series_paris_naive.tz_localize(\"Europe/Paris\", ambiguous=\"infer\")" @@ -776,15 +735,13 @@ "metadata": {}, "source": [ "## Periods\n", - "The `period_range` function returns a `PeriodIndex` instead of a `DatetimeIndex`. For example, let's get all quarters in 2016 and 2017:" + "The `pd.period_range()` function returns a `PeriodIndex` instead of a `DatetimeIndex`. For example, let's get all quarters in 2016 and 2017:" ] }, { "cell_type": "code", - "execution_count": 39, - "metadata": { - "collapsed": false - }, + "execution_count": 41, + "metadata": {}, "outputs": [], "source": [ "quarters = pd.period_range('2016Q1', periods=8, freq='Q')\n", @@ -800,10 +757,8 @@ }, { "cell_type": "code", - "execution_count": 40, - "metadata": { - "collapsed": false - }, + "execution_count": 42, + "metadata": {}, "outputs": [], "source": [ "quarters + 3" @@ -813,15 +768,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The `asfreq` method lets us change the frequency of the `PeriodIndex`. All periods are lengthened or shortened accordingly. For example, let's convert all the quarterly periods to monthly periods (zooming in):" + "The `asfreq()` method lets us change the frequency of the `PeriodIndex`. All periods are lengthened or shortened accordingly. 
For example, let's convert all the quarterly periods to monthly periods (zooming in):" ] }, { "cell_type": "code", - "execution_count": 41, - "metadata": { - "collapsed": false - }, + "execution_count": 43, + "metadata": {}, "outputs": [], "source": [ "quarters.asfreq(\"M\")" @@ -836,10 +789,8 @@ }, { "cell_type": "code", - "execution_count": 42, - "metadata": { - "collapsed": false - }, + "execution_count": 44, + "metadata": {}, "outputs": [], "source": [ "quarters.asfreq(\"M\", how=\"start\")" @@ -854,10 +805,8 @@ }, { "cell_type": "code", - "execution_count": 43, - "metadata": { - "collapsed": false - }, + "execution_count": 45, + "metadata": {}, "outputs": [], "source": [ "quarters.asfreq(\"A\")" @@ -872,10 +821,8 @@ }, { "cell_type": "code", - "execution_count": 44, - "metadata": { - "collapsed": false - }, + "execution_count": 46, + "metadata": {}, "outputs": [], "source": [ "quarterly_revenue = pd.Series([300, 320, 290, 390, 320, 360, 310, 410], index = quarters)\n", @@ -884,10 +831,8 @@ }, { "cell_type": "code", - "execution_count": 45, - "metadata": { - "collapsed": false - }, + "execution_count": 47, + "metadata": {}, "outputs": [], "source": [ "quarterly_revenue.plot(kind=\"line\")\n", @@ -903,10 +848,8 @@ }, { "cell_type": "code", - "execution_count": 46, - "metadata": { - "collapsed": false - }, + "execution_count": 48, + "metadata": {}, "outputs": [], "source": [ "last_hours = quarterly_revenue.to_timestamp(how=\"end\", freq=\"H\")\n", @@ -922,10 +865,8 @@ }, { "cell_type": "code", - "execution_count": 47, - "metadata": { - "collapsed": false - }, + "execution_count": 49, + "metadata": {}, "outputs": [], "source": [ "last_hours.to_period()" @@ -940,10 +881,8 @@ }, { "cell_type": "code", - "execution_count": 48, - "metadata": { - "collapsed": false - }, + "execution_count": 50, + "metadata": {}, "outputs": [], "source": [ "months_2016 = pd.period_range(\"2016\", periods=12, freq=\"M\")\n", @@ -965,10 +904,8 @@ }, { "cell_type": "code", - "execution_count": 49, - "metadata": { - "collapsed": false - }, + "execution_count": 51, + "metadata": {}, "outputs": [], "source": [ "people_dict = {\n", @@ -1001,10 +938,8 @@ }, { "cell_type": "code", - "execution_count": 50, - "metadata": { - "collapsed": false - }, + "execution_count": 52, + "metadata": {}, "outputs": [], "source": [ "people[\"birthyear\"]" @@ -1019,10 +954,8 @@ }, { "cell_type": "code", - "execution_count": 51, - "metadata": { - "collapsed": false - }, + "execution_count": 53, + "metadata": {}, "outputs": [], "source": [ "people[[\"birthyear\", \"hobby\"]]" @@ -1037,10 +970,8 @@ }, { "cell_type": "code", - "execution_count": 52, - "metadata": { - "collapsed": false - }, + "execution_count": 54, + "metadata": {}, "outputs": [], "source": [ "d2 = pd.DataFrame(\n", @@ -1060,10 +991,8 @@ }, { "cell_type": "code", - "execution_count": 53, - "metadata": { - "collapsed": false - }, + "execution_count": 55, + "metadata": {}, "outputs": [], "source": [ "values = [\n", @@ -1088,16 +1017,14 @@ }, { "cell_type": "code", - "execution_count": 54, - "metadata": { - "collapsed": false - }, + "execution_count": 56, + "metadata": {}, "outputs": [], "source": [ "masked_array = np.ma.asarray(values, dtype=np.object)\n", "masked_array[(0, 2), (1, 2)] = np.ma.masked\n", "d3 = pd.DataFrame(\n", - " values,\n", + " masked_array,\n", " columns=[\"birthyear\", \"children\", \"hobby\", \"weight\"],\n", " index=[\"alice\", \"bob\", \"charles\"]\n", " )\n", @@ -1113,10 +1040,8 @@ }, { "cell_type": "code", - "execution_count": 55, - "metadata": { - 
"collapsed": false - }, + "execution_count": 57, + "metadata": {}, "outputs": [], "source": [ "d4 = pd.DataFrame(\n", @@ -1136,10 +1061,8 @@ }, { "cell_type": "code", - "execution_count": 56, - "metadata": { - "collapsed": false - }, + "execution_count": 58, + "metadata": {}, "outputs": [], "source": [ "people = pd.DataFrame({\n", @@ -1161,10 +1084,8 @@ }, { "cell_type": "code", - "execution_count": 57, - "metadata": { - "collapsed": false - }, + "execution_count": 59, + "metadata": {}, "outputs": [], "source": [ "d5 = pd.DataFrame(\n", @@ -1191,10 +1112,8 @@ }, { "cell_type": "code", - "execution_count": 58, - "metadata": { - "collapsed": false - }, + "execution_count": 60, + "metadata": {}, "outputs": [], "source": [ "d5[\"public\"]" @@ -1202,13 +1121,11 @@ }, { "cell_type": "code", - "execution_count": 59, - "metadata": { - "collapsed": false - }, + "execution_count": 61, + "metadata": {}, "outputs": [], "source": [ - "d5[\"public\", \"hobby\"] # Same result as d4[\"public\"][\"hobby\"]" + "d5[\"public\", \"hobby\"] # Same result as d5[\"public\"][\"hobby\"]" ] }, { @@ -1221,10 +1138,8 @@ }, { "cell_type": "code", - "execution_count": 60, - "metadata": { - "collapsed": false - }, + "execution_count": 62, + "metadata": {}, "outputs": [], "source": [ "d5" @@ -1234,15 +1149,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "There are two levels of columns, and two levels of indices. We can drop a column level by calling `droplevel` (the same goes for indices):" + "There are two levels of columns, and two levels of indices. We can drop a column level by calling `droplevel()` (the same goes for indices):" ] }, { "cell_type": "code", - "execution_count": 61, - "metadata": { - "collapsed": false - }, + "execution_count": 63, + "metadata": {}, "outputs": [], "source": [ "d5.columns = d5.columns.droplevel(level = 0)\n", @@ -1259,10 +1172,8 @@ }, { "cell_type": "code", - "execution_count": 62, - "metadata": { - "collapsed": false - }, + "execution_count": 64, + "metadata": {}, "outputs": [], "source": [ "d6 = d5.T\n", @@ -1274,15 +1185,13 @@ "metadata": {}, "source": [ "## Stacking and unstacking levels\n", - "Calling the `stack` method will push the lowest column level after the lowest index:" + "Calling the `stack()` method will push the lowest column level after the lowest index:" ] }, { "cell_type": "code", - "execution_count": 63, - "metadata": { - "collapsed": false - }, + "execution_count": 65, + "metadata": {}, "outputs": [], "source": [ "d7 = d6.stack()\n", @@ -1295,15 +1204,13 @@ "source": [ "Note that many `NaN` values appeared. This makes sense because many new combinations did not exist before (eg. there was no `bob` in `London`).\n", "\n", - "Calling `unstack` will do the reverse, once again creating many `NaN` values." + "Calling `unstack()` will do the reverse, once again creating many `NaN` values." ] }, { "cell_type": "code", - "execution_count": 64, - "metadata": { - "collapsed": false - }, + "execution_count": 66, + "metadata": {}, "outputs": [], "source": [ "d8 = d7.unstack()\n", @@ -1319,10 +1226,8 @@ }, { "cell_type": "code", - "execution_count": 65, - "metadata": { - "collapsed": false - }, + "execution_count": 67, + "metadata": {}, "outputs": [], "source": [ "d9 = d8.unstack()\n", @@ -1333,14 +1238,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The `stack` and `unstack` methods let you select the `level` to stack/unstack. 
You can even stack/unstack multiple levels at once:" + "The `stack()` and `unstack()` methods let you select the `level` to stack/unstack. You can even stack/unstack multiple levels at once:" ] }, { "cell_type": "code", - "execution_count": 66, + "execution_count": 68, "metadata": { - "collapsed": false, "scrolled": true }, "outputs": [], @@ -1354,7 +1258,7 @@ "metadata": {}, "source": [ "## Most methods return modified copies\n", - "As you may have noticed, the `stack` and `unstack` methods do not modify the object they apply to. Instead, they work on a copy and return that copy. This is true of most methods in pandas." + "As you may have noticed, the `stack()` and `unstack()` methods do not modify the object they apply to. Instead, they work on a copy and return that copy. This is true of most methods in pandas." ] }, { @@ -1367,10 +1271,8 @@ }, { "cell_type": "code", - "execution_count": 67, - "metadata": { - "collapsed": false - }, + "execution_count": 69, + "metadata": {}, "outputs": [], "source": [ "people" @@ -1380,15 +1282,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The `loc` attribute lets you access rows instead of columns. The result is `Series` object in which the `DataFrame`'s column names are mapped to row index labels:" + "The `loc` attribute lets you access rows instead of columns. The result is a `Series` object in which the `DataFrame`'s column names are mapped to row index labels:" ] }, { "cell_type": "code", - "execution_count": 68, - "metadata": { - "collapsed": false - }, + "execution_count": 70, + "metadata": {}, "outputs": [], "source": [ "people.loc[\"charles\"]" @@ -1403,10 +1303,8 @@ }, { "cell_type": "code", - "execution_count": 69, - "metadata": { - "collapsed": false - }, + "execution_count": 71, + "metadata": {}, "outputs": [], "source": [ "people.iloc[2]" @@ -1421,10 +1319,8 @@ }, { "cell_type": "code", - "execution_count": 70, - "metadata": { - "collapsed": false - }, + "execution_count": 72, + "metadata": {}, "outputs": [], "source": [ "people.iloc[1:3]" @@ -1439,10 +1335,8 @@ }, { "cell_type": "code", - "execution_count": 71, - "metadata": { - "collapsed": false - }, + "execution_count": 73, + "metadata": {}, "outputs": [], "source": [ "people[np.array([True, False, True])]" @@ -1457,10 +1351,8 @@ }, { "cell_type": "code", - "execution_count": 72, - "metadata": { - "collapsed": false - }, + "execution_count": 74, + "metadata": {}, "outputs": [], "source": [ "people[people[\"birthyear\"] < 1990]" @@ -1476,10 +1368,8 @@ }, { "cell_type": "code", - "execution_count": 73, - "metadata": { - "collapsed": false - }, + "execution_count": 75, + "metadata": {}, "outputs": [], "source": [ "people" @@ -1487,13 +1377,11 @@ }, { "cell_type": "code", - "execution_count": 74, - "metadata": { - "collapsed": false - }, + "execution_count": 76, + "metadata": {}, "outputs": [], "source": [ - "people[\"age\"] = 2016 - people[\"birthyear\"] # adds a new column \"age\"\n", + "people[\"age\"] = 2018 - people[\"birthyear\"] # adds a new column \"age\"\n", "people[\"over 30\"] = people[\"age\"] > 30 # adds another column \"over 30\"\n", "birthyears = people.pop(\"birthyear\")\n", "del people[\"children\"]\n", @@ -1503,10 +1391,8 @@ }, { "cell_type": "code", - "execution_count": 75, - "metadata": { - "collapsed": false - }, + "execution_count": 77, + "metadata": {}, "outputs": [], "source": [ "birthyears" @@ -1521,10 +1407,8 @@ }, { "cell_type": "code", - "execution_count": 76, - "metadata": { - "collapsed": false - }, + "execution_count": 78, + "metadata": {}, 
"outputs": [], "source": [ "people[\"pets\"] = pd.Series({\"bob\": 0, \"charles\": 5, \"eugene\":1}) # alice is missing, eugene is ignored\n", @@ -1535,15 +1419,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "When adding a new column, it is added at the end (on the right) by default. You can also insert a column anywhere else using the `insert` method:" + "When adding a new column, it is added at the end (on the right) by default. You can also insert a column anywhere else using the `insert()` method:" ] }, { "cell_type": "code", - "execution_count": 77, - "metadata": { - "collapsed": false - }, + "execution_count": 79, + "metadata": {}, "outputs": [], "source": [ "people.insert(1, \"height\", [172, 181, 185])\n", @@ -1555,15 +1437,13 @@ "metadata": {}, "source": [ "## Assigning new columns\n", - "You can also create new columns by calling the `assign` method. Note that this returns a new `DataFrame` object, the original is not modified:" + "You can also create new columns by calling the `assign()` method. Note that this returns a new `DataFrame` object, the original is not modified:" ] }, { "cell_type": "code", - "execution_count": 78, - "metadata": { - "collapsed": false - }, + "execution_count": 80, + "metadata": {}, "outputs": [], "source": [ "people.assign(\n", @@ -1581,10 +1461,8 @@ }, { "cell_type": "code", - "execution_count": 79, - "metadata": { - "collapsed": false - }, + "execution_count": 81, + "metadata": {}, "outputs": [], "source": [ "try:\n", @@ -1605,10 +1483,8 @@ }, { "cell_type": "code", - "execution_count": 80, - "metadata": { - "collapsed": false - }, + "execution_count": 82, + "metadata": {}, "outputs": [], "source": [ "d6 = people.assign(body_mass_index = people[\"weight\"] / (people[\"height\"] / 100) ** 2)\n", @@ -1624,10 +1500,8 @@ }, { "cell_type": "code", - "execution_count": 81, - "metadata": { - "collapsed": false - }, + "execution_count": 83, + "metadata": {}, "outputs": [], "source": [ "try:\n", @@ -1643,15 +1517,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "But fear not, there is a simple solution. You can pass a function to the `assign` method (typically a `lambda` function), and this function will be called with the `DataFrame` as a parameter:" + "But fear not, there is a simple solution. You can pass a function to the `assign()` method (typically a `lambda` function), and this function will be called with the `DataFrame` as a parameter:" ] }, { "cell_type": "code", - "execution_count": 82, - "metadata": { - "collapsed": false - }, + "execution_count": 84, + "metadata": {}, "outputs": [], "source": [ "(people\n", @@ -1677,10 +1549,8 @@ }, { "cell_type": "code", - "execution_count": 83, - "metadata": { - "collapsed": false - }, + "execution_count": 85, + "metadata": {}, "outputs": [], "source": [ "people.eval(\"weight / (height/100) ** 2 > 25\")" @@ -1690,18 +1560,16 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Assignment expressions are also supported, and contrary to the `assign` method, this does not create a copy of the `DataFrame`, instead it directly modifies it:" + "Assignment expressions are also supported. 
Let's set `inplace=True` to directly modify the `DataFrame` rather than getting a modified copy:" ] }, { "cell_type": "code", - "execution_count": 84, - "metadata": { - "collapsed": false - }, + "execution_count": 86, + "metadata": {}, "outputs": [], "source": [ - "people.eval(\"body_mass_index = weight / (height/100) ** 2\")\n", + "people.eval(\"body_mass_index = weight / (height/100) ** 2\", inplace=True)\n", "people" ] }, @@ -1714,14 +1582,12 @@ }, { "cell_type": "code", - "execution_count": 85, - "metadata": { - "collapsed": false - }, + "execution_count": 87, + "metadata": {}, "outputs": [], "source": [ "overweight_threshold = 30\n", - "people.eval(\"overweight = body_mass_index > @overweight_threshold\")\n", + "people.eval(\"overweight = body_mass_index > @overweight_threshold\", inplace=True)\n", "people" ] }, @@ -1730,15 +1596,13 @@ "metadata": {}, "source": [ "## Querying a `DataFrame`\n", - "The `query` method lets you filter a `DataFrame` based on a query expression:" + "The `query()` method lets you filter a `DataFrame` based on a query expression:" ] }, { "cell_type": "code", - "execution_count": 86, - "metadata": { - "collapsed": false - }, + "execution_count": 88, + "metadata": {}, "outputs": [], "source": [ "people.query(\"age > 30 and pets == 0\")" @@ -1754,10 +1618,8 @@ }, { "cell_type": "code", - "execution_count": 87, - "metadata": { - "collapsed": false - }, + "execution_count": 89, + "metadata": {}, "outputs": [], "source": [ "people.sort_index(ascending=False)" @@ -1772,10 +1634,8 @@ }, { "cell_type": "code", - "execution_count": 88, - "metadata": { - "collapsed": false - }, + "execution_count": 90, + "metadata": {}, "outputs": [], "source": [ "people.sort_index(axis=1, inplace=True)\n", @@ -1791,10 +1651,8 @@ }, { "cell_type": "code", - "execution_count": 89, - "metadata": { - "collapsed": false - }, + "execution_count": 91, + "metadata": {}, "outputs": [], "source": [ "people.sort_values(by=\"age\", inplace=True)\n", @@ -1813,10 +1671,8 @@ }, { "cell_type": "code", - "execution_count": 90, - "metadata": { - "collapsed": false - }, + "execution_count": 92, + "metadata": {}, "outputs": [], "source": [ "people.plot(kind = \"line\", x = \"body_mass_index\", y = [\"height\", \"weight\"])\n", @@ -1827,14 +1683,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "You can pass extra arguments supported by matplotlib's functions. For example, we can create scatterplot and pass it a list of sizes using the `s` argument of matplotlib's `scatter` function:" + "You can pass extra arguments supported by matplotlib's functions. 
For example, we can create a scatterplot and pass it a list of sizes using the `s` argument of matplotlib's `scatter()` function:" ] }, { "cell_type": "code", - "execution_count": 91, + "execution_count": 93, "metadata": { - "collapsed": false, "scrolled": true }, "outputs": [], "source": [ "people.plot(kind = \"scatter\", x = \"height\", y = \"weight\", s=[40, 120, 200])\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Operations on `DataFrame`s\n", "Although `DataFrame`s do not try to mimick NumPy arrays, there are a few similarities. Let's create a `DataFrame` to demonstrate this:" ] }, { "cell_type": "code", - "execution_count": 92, - "metadata": { - "collapsed": false - }, + "execution_count": 94, + "metadata": {}, "outputs": [], "source": [ "grades_array = np.array([[8,8,9],[10,9,9],[4, 8, 2], [9, 10, 10]])\n", "grades = pd.DataFrame(grades_array, columns=[\"sep\", \"oct\", \"nov\"], index=[\"alice\",\"bob\",\"charles\",\"darwin\"])\n", "grades" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "You can apply NumPy mathematical functions on a `DataFrame`: the function is applied to all values:" ] }, { "cell_type": "code", - "execution_count": 93, - "metadata": { - "collapsed": false - }, + "execution_count": 95, + "metadata": {}, "outputs": [], "source": [ "np.sqrt(grades)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Similarly, adding a single value to a `DataFrame` will add that value to all elements in the `DataFrame`. This is called *broadcasting*:" ] }, { "cell_type": "code", - "execution_count": 94, - "metadata": { - "collapsed": false - }, + "execution_count": 96, + "metadata": {}, "outputs": [], "source": [ "grades + 1" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Of course, the same is true for all other binary operations, including arithmetic (`*`,`/`,`**`...) and conditional (`>`, `==`...) operations:" ] }, { "cell_type": "code", - "execution_count": 95, + "execution_count": 97, "metadata": { - "collapsed": false, "scrolled": false }, "outputs": [], "source": [ "grades >= 5" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Aggregation operations, such as computing the `max`, the `sum` or the `mean` of a `DataFrame`, apply to each column, and you get back a `Series` object:" ] }, { "cell_type": "code", - "execution_count": 96, - "metadata": { - "collapsed": false - }, + "execution_count": 98, + "metadata": {}, "outputs": [], "source": [ "grades.mean()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The `all` method is also an aggregation operation: it checks whether all values are `True` or not. Let's see during which months all students got a grade greater than `5`:" ] }, { "cell_type": "code", - "execution_count": 97, - "metadata": { - "collapsed": false - }, + "execution_count": 99, + "metadata": {}, "outputs": [], "source": [ "(grades > 5).all()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Most of these functions take an optional `axis` parameter which lets you specify along which axis of the `DataFrame` you want the operation executed. It defaults to `axis=0`, meaning that the operation is executed vertically (on each column). You can set `axis=1` to execute the operation horizontally (on each row). For example, let's find out which students had all grades greater than `5`:" ] }, { "cell_type": "code", - "execution_count": 98, - "metadata": { - "collapsed": false - }, + "execution_count": 100, + "metadata": {}, "outputs": [], "source": [ "(grades > 5).all(axis = 1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Similarly, the `any` method returns `True` if any value is True. Let's see who got at least one grade 10:" ] }, { "cell_type": "code", - "execution_count": 99, - "metadata": { - "collapsed": false - }, + "execution_count": 101, + "metadata": {}, "outputs": [], "source": [ "(grades == 10).any(axis = 1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "If you add a `Series` object to a `DataFrame` (or execute any other binary operation between a `DataFrame` and a `Series`), pandas attempts to broadcast the operation to all *rows* in the `DataFrame`. This only works if the `Series` has the same size as the `DataFrame`s rows. For example, let's subtract the `mean` of the `DataFrame` (a `Series` object) from every row in the `DataFrame`:" ] }, { "cell_type": "code", - "execution_count": 100, - "metadata": { - "collapsed": false - }, + "execution_count": 102, + "metadata": {}, "outputs": [], "source": [ "grades - grades.mean() # equivalent to: grades - [7.75, 8.75, 7.50]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We subtracted `7.75` from all September grades, `8.75` from October grades and `7.50` from November grades. It is equivalent to subtracting this `DataFrame`:" ] }, { "cell_type": "code", - "execution_count": 101, - "metadata": { - "collapsed": false - }, + "execution_count": 103, + "metadata": {}, "outputs": [], "source": [ "pd.DataFrame([[7.75, 8.75, 7.50]]*4, index=grades.index, columns=grades.columns)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "If you want to subtract the global mean from every grade, here is one way to do it:" ] }, { "cell_type": "code", - "execution_count": 102, + "execution_count": 104, "metadata": { - "collapsed": false, "scrolled": true }, "outputs": [], "source": [ "grades - grades.values.mean() # subtracts the global mean (8.00) from all grades" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Automatic alignment\n", "Similar to `Series`, when operating on multiple `DataFrame`s, pandas automatically aligns them by row index label, but also by column names. Let's create a `DataFrame` with bonus points for each person from October to December:" ] }, { "cell_type": "code", - "execution_count": 103, - "metadata": { - "collapsed": false - }, + "execution_count": 105, + "metadata": {}, "outputs": [], "source": [ "bonus_array = np.array([[0,np.nan,2],[np.nan,1,0],[0, 1, 0], [3, 3, 0]])\n", "bonus_points = pd.DataFrame(bonus_array, columns=[\"oct\", \"nov\", \"dec\"], index=[\"bob\",\"colin\", \"darwin\", \"charles\"])\n", "bonus_points" ] }, { "cell_type": "code", - "execution_count": 104, + "execution_count": 106, "metadata": { - "collapsed": false, "scrolled": true }, "outputs": [], "source": [ "grades + bonus_points" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Handling missing data\n", "Dealing with missing data is a frequent task when working with real life data. Pandas offers a few tools to handle missing data.\n", " \n", "Let's try to fix the problem above. 
For example, we can decide that missing data should result in a zero, instead of `NaN`. We can replace all `NaN` values with any value using the `fillna()` method:" ] }, { "cell_type": "code", - "execution_count": 105, + "execution_count": 107, "metadata": { - "collapsed": false, "scrolled": true }, "outputs": [], "source": [ "(grades + bonus_points).fillna(0)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "It's a bit unfair that we're setting grades to zero though, perhaps we should decide that missing grades are missing grades, but missing bonus points should be replaced by zeros:" ] }, { "cell_type": "code", - "execution_count": 106, - "metadata": { - "collapsed": false - }, + "execution_count": 108, + "metadata": {}, "outputs": [], "source": [ "fixed_bonus_points = bonus_points.fillna(0)\n", "fixed_bonus_points.insert(0, \"sep\", 0)\n", "fixed_bonus_points.loc[\"alice\"] = 0\n", "grades + fixed_bonus_points" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Another way to handle missing data is to interpolate. Let's look at the `bonus_points` `DataFrame` again:" ] }, { "cell_type": "code", - "execution_count": 107, - "metadata": { - "collapsed": false - }, + "execution_count": 109, + "metadata": {}, "outputs": [], "source": [ "bonus_points" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now let's call the `interpolate` method. By default, it interpolates vertically (`axis=0`), so let's tell it to interpolate horizontally (`axis=1`):" ] }, { "cell_type": "code", - "execution_count": 108, + "execution_count": 110, "metadata": { - "collapsed": false, "scrolled": false }, "outputs": [], "source": [ "bonus_points.interpolate(axis=1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Bob had 0 bonus points in October, and 2 in December. When we interpolate for November, we get the mean: 1 bonus point. Colin had 1 bonus point in November, but we do not know how many bonus points he had in September, so we cannot interpolate, this is why there is still a missing value in October after interpolation. To fix this, we can set the September bonus points to 0 before interpolation:" ] }, { "cell_type": "code", - "execution_count": 109, - "metadata": { - "collapsed": false - }, + "execution_count": 111, + "metadata": {}, "outputs": [], "source": [ "better_bonus_points = bonus_points.copy()\n", "better_bonus_points.insert(0, \"sep\", 0)\n", "better_bonus_points.loc[\"alice\"] = 0\n", "better_bonus_points = better_bonus_points.interpolate(axis=1)\n", "better_bonus_points" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Great, now we have reasonable bonus points everywhere. Let's find out the final grades:" ] }, { "cell_type": "code", - "execution_count": 110, - "metadata": { - "collapsed": false - }, + "execution_count": 112, + "metadata": {}, "outputs": [], "source": [ "grades + better_bonus_points" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "It is slightly annoying that the September column ends up on the right. This is because the `DataFrame`s we are adding do not have the exact same columns (the `grades` `DataFrame` is missing the `\"dec\"` column), so to make things predictable, pandas orders the final columns alphabetically. To fix this, we can simply add the missing column before adding:" ] }, { "cell_type": "code", - "execution_count": 111, + "execution_count": 113, "metadata": { - "collapsed": false, "scrolled": true }, "outputs": [], "source": [ "grades[\"dec\"] = np.nan\n", "final_grades = grades + better_bonus_points\n", "final_grades" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "There's not much we can do about December and Colin: it's bad enough that we are making up bonus points, but we can't reasonably make up grades (well I guess some teachers probably do). 
So let's call the `dropna()` method to get rid of rows that are full of `NaN`s:" ] }, { "cell_type": "code", - "execution_count": 112, - "metadata": { - "collapsed": false - }, + "execution_count": 114, + "metadata": {}, "outputs": [], "source": [ "final_grades_clean = final_grades.dropna(how=\"all\")\n", @@ -2259,10 +2078,8 @@ }, { "cell_type": "code", - "execution_count": 113, - "metadata": { - "collapsed": false - }, + "execution_count": 115, + "metadata": {}, "outputs": [], "source": [ "final_grades_clean = final_grades_clean.dropna(axis=1, how=\"all\")\n", @@ -2281,9 +2098,8 @@ }, { "cell_type": "code", - "execution_count": 114, + "execution_count": 116, "metadata": { - "collapsed": false, "scrolled": true }, "outputs": [], @@ -2301,10 +2117,8 @@ }, { "cell_type": "code", - "execution_count": 115, - "metadata": { - "collapsed": false - }, + "execution_count": 117, + "metadata": {}, "outputs": [], "source": [ "grouped_grades = final_grades.groupby(\"hobby\")\n", @@ -2320,10 +2134,8 @@ }, { "cell_type": "code", - "execution_count": 116, - "metadata": { - "collapsed": false - }, + "execution_count": 118, + "metadata": {}, "outputs": [], "source": [ "grouped_grades.mean()" @@ -2346,10 +2158,8 @@ }, { "cell_type": "code", - "execution_count": 117, - "metadata": { - "collapsed": false - }, + "execution_count": 119, + "metadata": {}, "outputs": [], "source": [ "bonus_points" @@ -2357,10 +2167,8 @@ }, { "cell_type": "code", - "execution_count": 118, - "metadata": { - "collapsed": false - }, + "execution_count": 120, + "metadata": {}, "outputs": [], "source": [ "more_grades = final_grades_clean.stack().reset_index()\n", @@ -2373,15 +2181,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Now we can call the `pivot_table` function for this `DataFrame`, asking to group by the `name` column. By default, `pivot_table` computes the `mean` of each numeric column:" + "Now we can call the `pd.pivot_table()` function for this `DataFrame`, asking to group by the `name` column. 
By default, `pivot_table()` computes the mean of each numeric column:" ] }, { "cell_type": "code", - "execution_count": 119, - "metadata": { - "collapsed": false - }, + "execution_count": 121, + "metadata": {}, "outputs": [], "source": [ "pd.pivot_table(more_grades, index=\"name\")" @@ -2391,15 +2197,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We can change the aggregation function by setting the `aggfunc` attribute, and we can also specify the list of columns whose values will be aggregated:" + "We can change the aggregation function by setting the `aggfunc` argument, and we can also specify the list of columns whose values will be aggregated:" ] }, { "cell_type": "code", - "execution_count": 120, - "metadata": { - "collapsed": false - }, + "execution_count": 122, + "metadata": {}, "outputs": [], "source": [ "pd.pivot_table(more_grades, index=\"name\", values=[\"grade\",\"bonus\"], aggfunc=np.max)" @@ -2414,10 +2218,8 @@ }, { "cell_type": "code", - "execution_count": 121, - "metadata": { - "collapsed": false - }, + "execution_count": 123, + "metadata": {}, "outputs": [], "source": [ "pd.pivot_table(more_grades, index=\"name\", values=\"grade\", columns=\"month\", margins=True)" @@ -2432,10 +2234,8 @@ }, { "cell_type": "code", - "execution_count": 122, - "metadata": { - "collapsed": false - }, + "execution_count": 124, + "metadata": {}, "outputs": [], "source": [ "pd.pivot_table(more_grades, index=(\"name\", \"month\"), margins=True)" @@ -2451,9 +2251,8 @@ }, { "cell_type": "code", - "execution_count": 123, + "execution_count": 125, "metadata": { - "collapsed": false, "scrolled": false }, "outputs": [], @@ -2469,14 +2268,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The `head` method returns the top 5 rows:" + "The `head()` method returns the top 5 rows:" ] }, { "cell_type": "code", - "execution_count": 124, + "execution_count": 126, "metadata": { - "collapsed": false, "scrolled": false }, "outputs": [], @@ -2488,15 +2286,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Of course there's also a `tail` function to view the bottom 5 rows. You can pass the number of rows you want:" + "Of course there's also a `tail()` function to view the bottom 5 rows. 
You can pass the number of rows you want:" ] }, { "cell_type": "code", - "execution_count": 125, - "metadata": { - "collapsed": false - }, + "execution_count": 127, + "metadata": {}, "outputs": [], "source": [ "large_df.tail(n=2)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "The `info` method prints out a summary of each columns contents:" + "The `info()` method prints out a summary of each column's contents:" ] }, { "cell_type": "code", - "execution_count": 126, + "execution_count": 128, "metadata": { - "collapsed": false, "scrolled": false }, "outputs": [], "source": [ "large_df.info()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Finally, the `describe` method gives a nice overview of the main aggregated values over each column:\n", + "Finally, the `describe()` method gives a nice overview of the main aggregated values over each column:\n", "* `count`: number of non-null (not NaN) values\n", "* `mean`: mean of non-null values\n", "* `std`: [standard deviation](https://en.wikipedia.org/wiki/Standard_deviation) of non-null values\n", "* `min`: minimum of non-null values\n", "* `25%`, `50%`, `75%`: 25th, 50th and 75th [percentile](https://en.wikipedia.org/wiki/Percentile) of non-null values\n", "* `max`: maximum of non-null values" ] }, { "cell_type": "code", - "execution_count": 127, + "execution_count": 129, "metadata": { - "collapsed": false, "scrolled": false }, "outputs": [], "source": [ "large_df.describe()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Saving & loading\n", "Pandas can save `DataFrame`s to various backends, including file formats such as CSV, Excel, JSON, HTML and HDF5, or to a SQL database. Let's create a `DataFrame` to demonstrate this:" ] }, { "cell_type": "code", - "execution_count": 128, - "metadata": { - "collapsed": false - }, + "execution_count": 130, + "metadata": {}, "outputs": [], "source": [ "my_df = pd.DataFrame(\n", "    [[\"Biking\", 68.5, 1985, np.nan], [\"Dancing\", 83.1, 1984, 3]], \n", "    columns=[\"hobby\",\"weight\",\"birthyear\",\"children\"],\n", "    index=[\"alice\", \"bob\"]\n", ")\n", "my_df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Saving\n", "Let's save it to CSV, HTML and JSON:" ] }, { "cell_type": "code", - "execution_count": 129, - "metadata": { - "collapsed": true - }, + "execution_count": 131, + "metadata": {}, "outputs": [], "source": [ "my_df.to_csv(\"my_df.csv\")\n", "my_df.to_html(\"my_df.html\")\n", "my_df.to_json(\"my_df.json\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Done! Let's take a peek at what was saved:" ] }, { "cell_type": "code", - "execution_count": 130, - "metadata": { - "collapsed": false - }, + "execution_count": 132, + "metadata": {}, "outputs": [], "source": [ "for filename in (\"my_df.csv\", \"my_df.html\", \"my_df.json\"):\n", "    print(\"#\", filename)\n", "    with open(filename, \"rt\") as f:\n", "        print(f.read())\n", "        print()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Note that the index is saved as the first column (with no name) in a CSV file, as `<th>` tags in HTML and as keys in JSON.\n", "\n", "Saving to other formats works very similarly, but some formats require extra libraries to be installed. For example, saving to Excel requires the openpyxl library:" ] }, { "cell_type": "code", - "execution_count": 131, - "metadata": { - "collapsed": false - }, + "execution_count": 133, + "metadata": {}, "outputs": [], "source": [ "try:\n", "    my_df.to_excel(\"my_df.xlsx\", sheet_name='People')\n", "except ImportError as e:\n", "    print(e)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Loading\n", "Now let's load our CSV file back into a `DataFrame`:" ] }, { "cell_type": "code", - "execution_count": 132, - "metadata": { - "collapsed": false - }, + "execution_count": 134, + "metadata": {}, "outputs": [], "source": [ "my_df_loaded = pd.read_csv(\"my_df.csv\", index_col=0)\n", "my_df_loaded" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "As you might guess, there are similar `read_json`, `read_html`, `read_excel` functions as well.  We can also read data straight from the Internet. For example, let's load the top 1,000 U.S. cities from github:" ] }, { "cell_type": "code", - "execution_count": 133, - "metadata": { - "collapsed": false - }, + "execution_count": 135, + "metadata": {}, "outputs": [], "source": [ "us_cities = None\n", "try:\n", "    csv_url = \"https://raw.githubusercontent.com/plotly/datasets/master/us-cities-top-1k.csv\"\n", "    us_cities = pd.read_csv(csv_url, index_col=0)\n", "    us_cities = us_cities.head()\n", "except IOError as e:\n", "    print(e)\n", "us_cities" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "There are more options available, in particular regarding datetime format. Check out the [documentation](http://pandas.pydata.org/pandas-docs/stable/io.html) for more details.\n", "\n", "# Combining `DataFrame`s\n", "## SQL-like joins\n", "One powerful feature of pandas is it's ability to perform SQL-like joins on `DataFrame`s. Various types of joins are supported: inner joins, left/right outer joins and full joins. To illustrate this, let's start by creating a couple simple `DataFrame`s:" ] }, { "cell_type": "code", - "execution_count": 134, - "metadata": { - "collapsed": false - }, + "execution_count": 136, + "metadata": {}, "outputs": [], "source": [ "city_loc = pd.DataFrame(\n", "    [\n", "        [\"CA\", \"San Francisco\", 37.781334, -122.416728],\n", "        [\"NY\", \"New York\", 40.705649, -74.008344],\n", "        [\"FL\", \"Miami\", 25.791100, -80.320733],\n", "        [\"OH\", \"Cleveland\", 41.473508, -81.739791],\n", "        [\"UT\", \"Salt Lake City\", 40.755851, -111.896657]\n", "    ], columns=[\"state\", \"city\", \"lat\", \"lng\"])\n", "city_loc" ] }, { "cell_type": "code", - "execution_count": 135, - "metadata": { - "collapsed": false - }, + "execution_count": 137, + "metadata": {}, "outputs": [], "source": [ "city_pop = pd.DataFrame(\n", "    [\n", "        [808976, \"San Francisco\", \"California\"],\n", "        [8363710, \"New York\", \"New-York\"],\n", "        [413201, \"Miami\", \"Florida\"],\n", "        [2242193, \"Houston\", \"Texas\"]\n", "    ], index=[3,4,5,6], columns=[\"population\", \"city\", \"state\"])\n", "city_pop" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Now let's join these `DataFrame`s using the `merge` function:" + "Now let's join these `DataFrame`s using the `merge()` function:" ] }, { "cell_type": "code", - "execution_count": 136, - "metadata": { - "collapsed": false - }, + "execution_count": 138, + "metadata": {}, "outputs": [], "source": [ "pd.merge(left=city_loc, right=city_pop, on=\"city\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Note that both `DataFrame`s have a column named `state`, so in the result they got renamed to `state_x` and `state_y`.\n", "\n", "Also, note that Cleveland, Salt Lake City and Houston were dropped because they don't exist in *both* `DataFrame`s. This is the equivalent of a SQL `INNER JOIN`. If you want a `FULL OUTER JOIN`, where no city gets dropped and `NaN` values are added, you must specify `how=\"outer\"`:" ] }, { "cell_type": "code", - "execution_count": 137, - 
"metadata": { - "collapsed": false - }, + "execution_count": 139, + "metadata": {}, "outputs": [], "source": [ "all_cities = pd.merge(left=city_loc, right=city_pop, on=\"city\", how=\"outer\")\n", @@ -2783,10 +2557,8 @@ }, { "cell_type": "code", - "execution_count": 138, - "metadata": { - "collapsed": false - }, + "execution_count": 140, + "metadata": {}, "outputs": [], "source": [ "pd.merge(left=city_loc, right=city_pop, on=\"city\", how=\"right\")" @@ -2801,10 +2573,8 @@ }, { "cell_type": "code", - "execution_count": 139, - "metadata": { - "collapsed": false - }, + "execution_count": 141, + "metadata": {}, "outputs": [], "source": [ "city_pop2 = city_pop.copy()\n", @@ -2817,15 +2587,13 @@ "metadata": {}, "source": [ "## Concatenation\n", - "Rather than joining `DataFrame`s, we may just want to concatenate them. That's what `concat` is for:" + "Rather than joining `DataFrame`s, we may just want to concatenate them. That's what `concat()` is for:" ] }, { "cell_type": "code", - "execution_count": 140, - "metadata": { - "collapsed": false - }, + "execution_count": 142, + "metadata": {}, "outputs": [], "source": [ "result_concat = pd.concat([city_loc, city_pop])\n", @@ -2841,10 +2609,8 @@ }, { "cell_type": "code", - "execution_count": 141, - "metadata": { - "collapsed": false - }, + "execution_count": 143, + "metadata": {}, "outputs": [], "source": [ "result_concat.loc[3]" @@ -2859,10 +2625,8 @@ }, { "cell_type": "code", - "execution_count": 142, - "metadata": { - "collapsed": false - }, + "execution_count": 144, + "metadata": {}, "outputs": [], "source": [ "pd.concat([city_loc, city_pop], ignore_index=True)" @@ -2877,10 +2641,8 @@ }, { "cell_type": "code", - "execution_count": 143, - "metadata": { - "collapsed": false - }, + "execution_count": 145, + "metadata": {}, "outputs": [], "source": [ "pd.concat([city_loc, city_pop], join=\"inner\")" @@ -2895,9 +2657,8 @@ }, { "cell_type": "code", - "execution_count": 144, + "execution_count": 146, "metadata": { - "collapsed": false, "scrolled": true }, "outputs": [], @@ -2914,9 +2675,8 @@ }, { "cell_type": "code", - "execution_count": 145, + "execution_count": 147, "metadata": { - "collapsed": false, "scrolled": true }, "outputs": [], @@ -2935,15 +2695,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The `append` method is a useful shorthand for concatenating `DataFrame`s vertically:" + "The `append()` method is a useful shorthand for concatenating `DataFrame`s vertically:" ] }, { "cell_type": "code", - "execution_count": 146, - "metadata": { - "collapsed": false - }, + "execution_count": 148, + "metadata": {}, "outputs": [], "source": [ "city_loc.append(city_pop)" @@ -2953,7 +2711,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "As always in pandas, the `append` method does *not* actually modify `city_loc`: it works on a copy and returns the modified copy." + "As always in pandas, the `append()` method does *not* actually modify `city_loc`: it works on a copy and returns the modified copy." 
] }, { @@ -2966,10 +2724,8 @@ }, { "cell_type": "code", - "execution_count": 147, - "metadata": { - "collapsed": false - }, + "execution_count": 149, + "metadata": {}, "outputs": [], "source": [ "city_eco = city_pop.copy()\n", @@ -2986,10 +2742,8 @@ }, { "cell_type": "code", - "execution_count": 148, - "metadata": { - "collapsed": false - }, + "execution_count": 150, + "metadata": {}, "outputs": [], "source": [ "city_eco[\"economy\"] = city_eco[\"eco_code\"].astype('category')\n", @@ -3005,10 +2759,8 @@ }, { "cell_type": "code", - "execution_count": 149, - "metadata": { - "collapsed": false - }, + "execution_count": 151, + "metadata": {}, "outputs": [], "source": [ "city_eco[\"economy\"].cat.categories = [\"Finance\", \"Energy\", \"Tourism\"]\n", @@ -3024,10 +2776,8 @@ }, { "cell_type": "code", - "execution_count": 150, - "metadata": { - "collapsed": false - }, + "execution_count": 152, + "metadata": {}, "outputs": [], "source": [ "city_eco.sort_values(by=\"economy\", ascending=False)" @@ -3042,25 +2792,32 @@ "# What next?\n", "As you probably noticed by now, pandas is quite a large library with *many* features. Although we went through the most important features, there is still a lot to discover. Probably the best way to learn more is to get your hands dirty with some real-life data. It is also a good idea to go through pandas' excellent [documentation](http://pandas.pydata.org/pandas-docs/stable/index.html), in particular the [Cookbook](http://pandas.pydata.org/pandas-docs/stable/cookbook.html)." ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { "kernelspec": { - "display_name": "Python 2", + "display_name": "Python 3", "language": "python", - "name": "python2" + "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", - "version": 2 + "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.11" + "pygments_lexer": "ipython3", + "version": "3.6.3" }, "toc": { "toc_cell": false, @@ -3071,5 +2828,5 @@ } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 }
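
Note on the API change this patch applies: since pandas 0.18, `resample()` is a deferred, groupby-like operation. It returns a `Resampler` object rather than a resampled `Series`, and the old `how=...` argument is deprecated in favor of calling an aggregation method on that object. A minimal before/after sketch (the toy series below is illustrative, not taken from the notebook):

    import numpy as np
    import pandas as pd

    # An hourly series, similar to the notebook's temperature example.
    dates = pd.date_range('2016/10/29 5:30pm', periods=12, freq='H')
    temp_series = pd.Series(np.arange(12.0), index=dates)

    # Old API (pre-0.18), which this patch removes from the notebook:
    #     temp_series.resample("2H", how=np.min)

    # New API: resample() returns a lazy Resampler; calling an
    # aggregation method materializes the result.
    resampler = temp_series.resample("2H")
    print(resampler.min())          # same result as the old how=np.min
    print(resampler.apply(np.min))  # equivalent, via apply()

The `eval()` changes in the patch follow the same theme: an assignment expression now returns a modified copy unless `inplace=True` is passed, which is why that argument is added wherever the notebook relies on in-place modification.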