diff --git a/Chapter 5. Getting Started with Pandas.ipynb b/Chapter 5. Getting Started with Pandas.ipynb
new file mode 100644
index 0000000..4d44ba4
--- /dev/null
+++ b/Chapter 5. Getting Started with Pandas.ipynb
@@ -0,0 +1,3965 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Chapter 5. Getting Started with pandas"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 5.1 Introduction to pandas Data Structures"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Series"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "import pandas as pd"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "obj = pd.Series([i for i in range(4)])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0 0\n",
+ "1 1\n",
+ "2 2\n",
+ "3 3\n",
+ "dtype: int64"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "obj"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The index and the values of a Series are available through its `index` and `values` attributes."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "RangeIndex(start=0, stop=4, step=1)"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "obj.index"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array([0, 1, 2, 3], dtype=int64)"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "obj.values"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "You can specify the index labels when creating a Series:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "import numpy as np\n",
+ "obj_1 = pd.Series(np.arange(4), ['a', 'b', 'c', 'd'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "a 0\n",
+ "b 1\n",
+ "c 2\n",
+ "d 3\n",
+ "dtype: int32"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "obj_1"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0"
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "obj_1['a']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "b 1\n",
+ "c 2\n",
+ "d 3\n",
+ "dtype: int32"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "obj_1[obj_1 > 0]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Arithmetic calculation with NumPy built-in functions"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "a 1.000000\n",
+ "b 2.718282\n",
+ "c 7.389056\n",
+ "d 20.085537\n",
+ "dtype: float64"
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "np.exp(obj_1)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Create a Series from a Python dictionary"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}\n",
+ "obj_3 = pd.Series(sdata)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "If you pass an explicit index together with a dictionary, index labels that have no matching dictionary key get NaN, and dictionary keys that are not in the index are dropped:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "states = ['California', 'Ohio', 'Oregon', 'Texas']\n",
+ "obj_4 = pd.Series(sdata, index = states)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "California NaN\n",
+ "Ohio 35000.0\n",
+ "Oregon 16000.0\n",
+ "Texas 71000.0\n",
+ "dtype: float64"
+ ]
+ },
+ "execution_count": 32,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "obj_4"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Use the `isnull` and `notnull` functions to detect missing data:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 33,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "California True\n",
+ "Ohio False\n",
+ "Oregon False\n",
+ "Texas False\n",
+ "dtype: bool"
+ ]
+ },
+ "execution_count": 33,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "pd.isnull(obj_4)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 34,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "California False\n",
+ "Ohio True\n",
+ "Oregon True\n",
+ "Texas True\n",
+ "dtype: bool"
+ ]
+ },
+ "execution_count": 34,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "pd.notnull(obj_4)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "A useful Series feature for many applications is that it automatically aligns by index label in arithmetic operations:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 36,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "California NaN\n",
+ "Ohio 70000.0\n",
+ "Oregon 32000.0\n",
+ "Texas 142000.0\n",
+ "Utah NaN\n",
+ "dtype: float64"
+ ]
+ },
+ "execution_count": 36,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "obj_3 + obj_4"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Both the Series object and its index have a `name` attribute."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 37,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "state\n",
+ "California NaN\n",
+ "Ohio 35000.0\n",
+ "Oregon 16000.0\n",
+ "Texas 71000.0\n",
+ "Name: Pop, dtype: float64"
+ ]
+ },
+ "execution_count": 37,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "obj_4.name = 'Pop'\n",
+ "obj_4.index.name = 'state'\n",
+ "obj_4"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "You can also assign index after the Series object is created."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 38,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0 0\n",
+ "1 1\n",
+ "2 2\n",
+ "3 3\n",
+ "dtype: int64"
+ ]
+ },
+ "execution_count": 38,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "obj"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 40,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "bob 0\n",
+ "Steve 1\n",
+ "Jeff 2\n",
+ "Ryan 3\n",
+ "dtype: int64"
+ ]
+ },
+ "execution_count": 40,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "obj.index = ['bob', 'Steve', 'Jeff', 'Ryan']\n",
+ "obj"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### DataFrame"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 43,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " pop | \n",
+ " state | \n",
+ " year | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 1.5 | \n",
+ " Ohio | \n",
+ " 2000 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 1.7 | \n",
+ " Ohio | \n",
+ " 2001 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 3.6 | \n",
+ " Ohio | \n",
+ " 2002 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 2.4 | \n",
+ " Nevada | \n",
+ " 2001 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 2.9 | \n",
+ " Nevada | \n",
+ " 2002 | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " 3.2 | \n",
+ " Nevada | \n",
+ " 2003 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " pop state year\n",
+ "0 1.5 Ohio 2000\n",
+ "1 1.7 Ohio 2001\n",
+ "2 3.6 Ohio 2002\n",
+ "3 2.4 Nevada 2001\n",
+ "4 2.9 Nevada 2002\n",
+ "5 3.2 Nevada 2003"
+ ]
+ },
+ "execution_count": 43,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'], 'year': [2000, 2001, 2002, 2001, 2002, 2003], 'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}\n",
+ "frame = pd.DataFrame(data)\n",
+ "frame"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 44,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " pop | \n",
+ " state | \n",
+ " year | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 1.5 | \n",
+ " Ohio | \n",
+ " 2000 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " pop state year\n",
+ "0 1.5 Ohio 2000"
+ ]
+ },
+ "execution_count": 44,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "frame.head(1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 47,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " pop | \n",
+ " state | \n",
+ " year | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 5 | \n",
+ " 3.2 | \n",
+ " Nevada | \n",
+ " 2003 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " pop state year\n",
+ "5 3.2 Nevada 2003"
+ ]
+ },
+ "execution_count": 47,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "frame.tail(1)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The order of a DataFrame's columns can be set with the `columns` argument of the `DataFrame` constructor."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 61,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "frame2 = pd.DataFrame(data, columns = ['year', 'state', 'pop'],\n",
+ " index = ['one', 'two', 'three', 'four', 'five', 'six'])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The values in each column can be retrieved as a Series, either by dict-like notation or by attribute:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 62,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "one Ohio\n",
+ "two Ohio\n",
+ "three Ohio\n",
+ "four Nevada\n",
+ "five Nevada\n",
+ "six Nevada\n",
+ "Name: state, dtype: object"
+ ]
+ },
+ "execution_count": 62,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "frame2['state']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 63,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "one 2000\n",
+ "two 2001\n",
+ "three 2002\n",
+ "four 2001\n",
+ "five 2002\n",
+ "six 2003\n",
+ "Name: year, dtype: int64"
+ ]
+ },
+ "execution_count": 63,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "frame2.year"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Rows can also be retrieved by label with the special `loc` attribute, or by integer position with `iloc`:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 64,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "year 2002\n",
+ "state Ohio\n",
+ "pop 3.6\n",
+ "Name: three, dtype: object"
+ ]
+ },
+ "execution_count": 64,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "frame2.loc['three']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 66,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "year 2002\n",
+ "state Ohio\n",
+ "pop 3.6\n",
+ "Name: three, dtype: object"
+ ]
+ },
+ "execution_count": 66,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "frame2.iloc[2]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Please note that it is the older `ix` indexer that is deprecated (and removed in pandas 1.0); `loc` and `iloc` are the recommended ways to select rows."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "If you assign a Series, its labels will be realigned exactly to the DataFrame’s index, inserting missing values in any holes:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 68,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " year | \n",
+ " state | \n",
+ " pop | \n",
+ " debt | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | one | \n",
+ " 2000 | \n",
+ " Ohio | \n",
+ " 1.5 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | two | \n",
+ " 2001 | \n",
+ " Ohio | \n",
+ " 1.7 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | three | \n",
+ " 2002 | \n",
+ " Ohio | \n",
+ " 3.6 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | four | \n",
+ " 2001 | \n",
+ " Nevada | \n",
+ " 2.4 | \n",
+ " -1.5 | \n",
+ "
\n",
+ " \n",
+ " | five | \n",
+ " 2002 | \n",
+ " Nevada | \n",
+ " 2.9 | \n",
+ " -1.7 | \n",
+ "
\n",
+ " \n",
+ " | six | \n",
+ " 2003 | \n",
+ " Nevada | \n",
+ " 3.2 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " year state pop debt\n",
+ "one 2000 Ohio 1.5 NaN\n",
+ "two 2001 Ohio 1.7 NaN\n",
+ "three 2002 Ohio 3.6 NaN\n",
+ "four 2001 Nevada 2.4 -1.5\n",
+ "five 2002 Nevada 2.9 -1.7\n",
+ "six 2003 Nevada 3.2 NaN"
+ ]
+ },
+ "execution_count": 68,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "val = pd.Series([-1.2, -1.5, -1.7], index =[' two', 'four', 'five'])\n",
+ "frame2['debt'] = val\n",
+ "frame2"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The `del` keyword deletes a column from a DataFrame, just as it deletes a key from a dict."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 69,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " year | \n",
+ " state | \n",
+ " pop | \n",
+ " debt | \n",
+ " eastern | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | one | \n",
+ " 2000 | \n",
+ " Ohio | \n",
+ " 1.5 | \n",
+ " NaN | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | two | \n",
+ " 2001 | \n",
+ " Ohio | \n",
+ " 1.7 | \n",
+ " NaN | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | three | \n",
+ " 2002 | \n",
+ " Ohio | \n",
+ " 3.6 | \n",
+ " NaN | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | four | \n",
+ " 2001 | \n",
+ " Nevada | \n",
+ " 2.4 | \n",
+ " -1.5 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " | five | \n",
+ " 2002 | \n",
+ " Nevada | \n",
+ " 2.9 | \n",
+ " -1.7 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " | six | \n",
+ " 2003 | \n",
+ " Nevada | \n",
+ " 3.2 | \n",
+ " NaN | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " year state pop debt eastern\n",
+ "one 2000 Ohio 1.5 NaN True\n",
+ "two 2001 Ohio 1.7 NaN True\n",
+ "three 2002 Ohio 3.6 NaN True\n",
+ "four 2001 Nevada 2.4 -1.5 False\n",
+ "five 2002 Nevada 2.9 -1.7 False\n",
+ "six 2003 Nevada 3.2 NaN False"
+ ]
+ },
+ "execution_count": 69,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "frame2['eastern'] = frame2['state'] == 'Ohio'\n",
+ "frame2"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 70,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "del frame2['eastern']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 71,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " year | \n",
+ " state | \n",
+ " pop | \n",
+ " debt | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | one | \n",
+ " 2000 | \n",
+ " Ohio | \n",
+ " 1.5 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | two | \n",
+ " 2001 | \n",
+ " Ohio | \n",
+ " 1.7 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | three | \n",
+ " 2002 | \n",
+ " Ohio | \n",
+ " 3.6 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | four | \n",
+ " 2001 | \n",
+ " Nevada | \n",
+ " 2.4 | \n",
+ " -1.5 | \n",
+ "
\n",
+ " \n",
+ " | five | \n",
+ " 2002 | \n",
+ " Nevada | \n",
+ " 2.9 | \n",
+ " -1.7 | \n",
+ "
\n",
+ " \n",
+ " | six | \n",
+ " 2003 | \n",
+ " Nevada | \n",
+ " 3.2 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " year state pop debt\n",
+ "one 2000 Ohio 1.5 NaN\n",
+ "two 2001 Ohio 1.7 NaN\n",
+ "three 2002 Ohio 3.6 NaN\n",
+ "four 2001 Nevada 2.4 -1.5\n",
+ "five 2002 Nevada 2.9 -1.7\n",
+ "six 2003 Nevada 3.2 NaN"
+ ]
+ },
+ "execution_count": 71,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "frame2"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The column returned from indexing a DataFrame is a view on the underlying data, not a copy. Thus, any in-place modifications to the Series will be reflected in the DataFrame."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "When a nested dict is passed to `pd.DataFrame`, pandas interprets:\n",
+ "- the outer dict keys as the columns\n",
+ "- the inner dict keys as the row index"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 72,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Nevada | \n",
+ " Ohio | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 2000 | \n",
+ " NaN | \n",
+ " 1.5 | \n",
+ "
\n",
+ " \n",
+ " | 2001 | \n",
+ " 2.4 | \n",
+ " 1.7 | \n",
+ "
\n",
+ " \n",
+ " | 2002 | \n",
+ " 2.9 | \n",
+ " 3.6 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Nevada Ohio\n",
+ "2000 NaN 1.5\n",
+ "2001 2.4 1.7\n",
+ "2002 2.9 3.6"
+ ]
+ },
+ "execution_count": 72,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "pop = {' Nevada': {2001: 2.4, 2002: 2.9},'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}\n",
+ "frame3 = pd.DataFrame(pop)\n",
+ "frame3"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Transpose the DataFrame with the `T` attribute:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 74,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " 2000 | \n",
+ " 2001 | \n",
+ " 2002 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | Nevada | \n",
+ " NaN | \n",
+ " 2.4 | \n",
+ " 2.9 | \n",
+ "
\n",
+ " \n",
+ " | Ohio | \n",
+ " 1.5 | \n",
+ " 1.7 | \n",
+ " 3.6 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " 2000 2001 2002\n",
+ " Nevada NaN 2.4 2.9\n",
+ "Ohio 1.5 1.7 3.6"
+ ]
+ },
+ "execution_count": 74,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "frame3.T"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 76,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Nevada | \n",
+ " Ohio | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 2001 | \n",
+ " 2.4 | \n",
+ " 1.7 | \n",
+ "
\n",
+ " \n",
+ " | 2002 | \n",
+ " 2.9 | \n",
+ " 3.6 | \n",
+ "
\n",
+ " \n",
+ " | 2003 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Nevada Ohio\n",
+ "2001 2.4 1.7\n",
+ "2002 2.9 3.6\n",
+ "2003 NaN NaN"
+ ]
+ },
+ "execution_count": 76,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "pd.DataFrame(pop, index = [2001,2002,2003])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "You can assign a name to the row index and to the columns:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 78,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "frame3.index.name = 'year'; frame3.columns.name = 'state'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 79,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | state | \n",
+ " Nevada | \n",
+ " Ohio | \n",
+ "
\n",
+ " \n",
+ " | year | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 2000 | \n",
+ " NaN | \n",
+ " 1.5 | \n",
+ "
\n",
+ " \n",
+ " | 2001 | \n",
+ " 2.4 | \n",
+ " 1.7 | \n",
+ "
\n",
+ " \n",
+ " | 2002 | \n",
+ " 2.9 | \n",
+ " 3.6 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ "state Nevada Ohio\n",
+ "year \n",
+ "2000 NaN 1.5\n",
+ "2001 2.4 1.7\n",
+ "2002 2.9 3.6"
+ ]
+ },
+ "execution_count": 79,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "frame3"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 81,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array([[ nan, 1.5],\n",
+ " [ 2.4, 1.7],\n",
+ " [ 2.9, 3.6]])"
+ ]
+ },
+ "execution_count": 81,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "frame3.values"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 83,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array([[2000, 'Ohio', 1.5, nan],\n",
+ " [2001, 'Ohio', 1.7, nan],\n",
+ " [2002, 'Ohio', 3.6, nan],\n",
+ " [2001, 'Nevada', 2.4, -1.5],\n",
+ " [2002, 'Nevada', 2.9, -1.7],\n",
+ " [2003, 'Nevada', 3.2, nan]], dtype=object)"
+ ]
+ },
+ "execution_count": 83,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "frame2.values"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Index Objects"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 84,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "obj = pd.Series( range( 3), index =[' a', 'b', 'c'])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Index objects are immutable and thus can't be modified by the user"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 86,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "ename": "TypeError",
+ "evalue": "Index does not support mutable operations",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
+ "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mobj\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mindex\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;36m3\u001b[0m \u001b[1;31m# Type Error\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
+ "\u001b[0;32mC:\\Users\\DELL\\Anaconda3\\lib\\site-packages\\pandas\\indexes\\base.py\u001b[0m in \u001b[0;36m__setitem__\u001b[0;34m(self, key, value)\u001b[0m\n\u001b[1;32m 1402\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m 1403\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m__setitem__\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mkey\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1404\u001b[0;31m \u001b[1;32mraise\u001b[0m \u001b[0mTypeError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"Index does not support mutable operations\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1405\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m 1406\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m__getitem__\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mkey\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
+ "\u001b[0;31mTypeError\u001b[0m: Index does not support mutable operations"
+ ]
+ }
+ ],
+ "source": [
+ "obj.index[1] = 3 # Type Error"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 88,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0 1.5\n",
+ "1 -2.5\n",
+ "2 0.0\n",
+ "dtype: float64"
+ ]
+ },
+ "execution_count": 88,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "labels = pd.Index(np.arange(3))\n",
+ "obj2 = pd.Series([1.5, -2.5, 0], index = labels)\n",
+ "obj2"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 91,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "True"
+ ]
+ },
+ "execution_count": 91,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "'Ohio' in frame3.columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 93,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "False"
+ ]
+ },
+ "execution_count": 93,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "2003 in frame3.index"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 95,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Index(['foo', 'foo', 'bar', 'bar'], dtype='object')"
+ ]
+ },
+ "execution_count": 95,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "dup_labels = pd.Index(['foo', 'foo', 'bar', 'bar'])\n",
+ "dup_labels"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 106,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array([ True, True, False], dtype=bool)"
+ ]
+ },
+ "execution_count": 106,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "frame3.index.isin([2000, 2001])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 5.2 Essential Functionality"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Reindexing"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 110,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "obj = pd.Series([ 4.5, 7.2, -5.3, 3.6], index =['d', 'b', 'a', 'c'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 112,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "a -5.3\n",
+ "b 7.2\n",
+ "c 3.6\n",
+ "d 4.5\n",
+ "e NaN\n",
+ "dtype: float64"
+ ]
+ },
+ "execution_count": 112,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])\n",
+ "obj2"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Reindex the rows, then forward-fill the resulting missing values with `method='ffill'`:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 114,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0 blue\n",
+ "1 blue\n",
+ "2 purple\n",
+ "3 purple\n",
+ "4 yellow\n",
+ "5 yellow\n",
+ "dtype: object"
+ ]
+ },
+ "execution_count": 114,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "obj3 = pd.Series([' blue', 'purple', 'yellow'], index =[ 0, 2, 4])\n",
+ "obj3.reindex(range(6), method = 'ffill')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
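+ "`reindex` can also be applied to the columns with the `columns` keyword; columns that do not exist in the original DataFrame are filled with NaN:"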
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 117,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | state | \n",
+ " Texas | \n",
+ " Utah | \n",
+ " California | \n",
+ "
\n",
+ " \n",
+ " | year | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 2000 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | 2001 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | 2002 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ "state Texas Utah California\n",
+ "year \n",
+ "2000 NaN NaN NaN\n",
+ "2001 NaN NaN NaN\n",
+ "2002 NaN NaN NaN"
+ ]
+ },
+ "execution_count": 117,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "states = ['Texas', 'Utah', 'California']\n",
+ "frame3.reindex(columns = states)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Dropping Entries from an Axis"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 119,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "a 0.0\n",
+ "b 1.0\n",
+ "d 3.0\n",
+ "e 4.0\n",
+ "dtype: float64"
+ ]
+ },
+ "execution_count": 119,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "obj = pd.Series(np.arange(5.), index = ['a', 'b', 'c', 'd', 'e'])\n",
+ "new_obj = obj.drop('c')\n",
+ "new_obj"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 120,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " one | \n",
+ " two | \n",
+ " three | \n",
+ " four | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | Ohio | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " | Colorado | \n",
+ " 4 | \n",
+ " 5 | \n",
+ " 6 | \n",
+ " 7 | \n",
+ "
\n",
+ " \n",
+ " | Utah | \n",
+ " 8 | \n",
+ " 9 | \n",
+ " 10 | \n",
+ " 11 | \n",
+ "
\n",
+ " \n",
+ " | New York | \n",
+ " 12 | \n",
+ " 13 | \n",
+ " 14 | \n",
+ " 15 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " one two three four\n",
+ "Ohio 0 1 2 3\n",
+ "Colorado 4 5 6 7\n",
+ "Utah 8 9 10 11\n",
+ "New York 12 13 14 15"
+ ]
+ },
+ "execution_count": 120,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "data = pd.DataFrame(np.arange(16). reshape((4,4)), index =['Ohio', 'Colorado', 'Utah', 'New York'], columns =['one', 'two', 'three', 'four'])\n",
+ "data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 123,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " one | \n",
+ " two | \n",
+ " three | \n",
+ " four | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | Ohio | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 2.0 | \n",
+ " 3.0 | \n",
+ "
\n",
+ " \n",
+ " | Colorado | \n",
+ " 4.0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | Utah | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | New York | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " one two three four\n",
+ "Ohio 0.0 1.0 2.0 3.0\n",
+ "Colorado 4.0 NaN NaN NaN\n",
+ "Utah NaN NaN NaN NaN\n",
+ "New York NaN NaN NaN NaN"
+ ]
+ },
+ "execution_count": 123,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "data[data < 5] "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Selection with loc and iloc"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 124,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "two 5\n",
+ "three 6\n",
+ "Name: Colorado, dtype: int32"
+ ]
+ },
+ "execution_count": 124,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "data.loc['Colorado', ['two', 'three']]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Integer Indexes"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
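+ "For more precise handling, use loc (for labels) or iloc (for integer positions). Note in the cells below that slicing with loc includes the end label, while slicing with iloc excludes the end position."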
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "ser = pd.Series(np.arange(3.))\n",
+ "ser2 = pd.Series( np.arange( 3.), index =[' a', 'b', 'c'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 128,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0 0.0\n",
+ "dtype: float64"
+ ]
+ },
+ "execution_count": 128,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "ser[:1]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 130,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0 0.0\n",
+ "1 1.0\n",
+ "dtype: float64"
+ ]
+ },
+ "execution_count": 130,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "ser.loc[:1]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 132,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0 0.0\n",
+ "dtype: float64"
+ ]
+ },
+ "execution_count": 132,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "ser.iloc[:1]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Arithmetic and Data Alignment"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 134,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ " a 5.2\n",
+ "c 1.1\n",
+ "d NaN\n",
+ "e 0.0\n",
+ "f NaN\n",
+ "g NaN\n",
+ "dtype: float64"
+ ]
+ },
+ "execution_count": 134,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "s1 = pd.Series([ 7.3, -2.5, 3.4, 1.5], index =[' a', 'c', 'd', 'e'])\n",
+ "s2 = pd.Series([-2.1, 3.6, -1.5, 4, 3.1], index =[' a', 'c', 'e', 'f', 'g'])\n",
+ "s1 + s2"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 138,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "df1 = pd.DataFrame(np.arange(9.). reshape(( 3, 3)), columns = list('bcd'), index =['Ohio', 'Texas', 'Colorado'])\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 139,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "df2 = pd.DataFrame( np.arange(12.). reshape((4, 3)), columns = list('bde'), index =['Utah', 'Ohio', 'Texas', 'Oregon'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 141,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " b | \n",
+ " c | \n",
+ " d | \n",
+ " e | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | Colorado | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | Ohio | \n",
+ " 3.0 | \n",
+ " NaN | \n",
+ " 6.0 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | Oregon | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | Texas | \n",
+ " 9.0 | \n",
+ " NaN | \n",
+ " 12.0 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | Utah | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " b c d e\n",
+ "Colorado NaN NaN NaN NaN\n",
+ "Ohio 3.0 NaN 6.0 NaN\n",
+ "Oregon NaN NaN NaN NaN\n",
+ "Texas 9.0 NaN 12.0 NaN\n",
+ "Utah NaN NaN NaN NaN"
+ ]
+ },
+ "execution_count": 141,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df1 + df2"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "If you add DataFrame objects with no column or row labels in common, the result will contain all nulls:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 142,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "df1 = pd.DataFrame({' A': [1, 2]})\n",
+ "df2 = pd.DataFrame({' B': [3, 4]})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 143,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " A | \n",
+ " B | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " A B\n",
+ "0 NaN NaN\n",
+ "1 NaN NaN"
+ ]
+ },
+ "execution_count": 143,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df1 - df2"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 146,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " a | \n",
+ " b | \n",
+ " c | \n",
+ " d | \n",
+ " e | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 0.0 | \n",
+ " 2.0 | \n",
+ " 4.0 | \n",
+ " 6.0 | \n",
+ " 4.0 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 9.0 | \n",
+ " 11.0 | \n",
+ " 13.0 | \n",
+ " 15.0 | \n",
+ " 9.0 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 18.0 | \n",
+ " 20.0 | \n",
+ " 22.0 | \n",
+ " 24.0 | \n",
+ " 14.0 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 15.0 | \n",
+ " 16.0 | \n",
+ " 17.0 | \n",
+ " 18.0 | \n",
+ " 19.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " a b c d e\n",
+ "0 0.0 2.0 4.0 6.0 4.0\n",
+ "1 9.0 11.0 13.0 15.0 9.0\n",
+ "2 18.0 20.0 22.0 24.0 14.0\n",
+ "3 15.0 16.0 17.0 18.0 19.0"
+ ]
+ },
+ "execution_count": 146,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df1 = pd.DataFrame(np.arange(12.). reshape((3, 4)), columns = list('abcd'))\n",
+ "df2 = pd.DataFrame(np.arange( 20.). reshape((4, 5)),columns = list('abcde'))\n",
+ "\n",
+ "df1.add(df2, fill_value = 0)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Arithmetic methods prefixed with 'r' (radd, rsub, rdiv, ...) have their arguments reversed: for example, df.rdiv(1) is equivalent to 1 / df"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Function Application and Mapping"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "NumPy ufuncs (element-wise array methods) also work with pandas objects"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 148,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " b | \n",
+ " d | \n",
+ " e | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | Utah | \n",
+ " 0.404903 | \n",
+ " 1.199043 | \n",
+ " 0.815118 | \n",
+ "
\n",
+ " \n",
+ " | Ohio | \n",
+ " 0.002818 | \n",
+ " 0.479237 | \n",
+ " 0.942592 | \n",
+ "
\n",
+ " \n",
+ " | Texas | \n",
+ " 1.240834 | \n",
+ " -1.286434 | \n",
+ " -1.919315 | \n",
+ "
\n",
+ " \n",
+ " | Oregon | \n",
+ " -0.419946 | \n",
+ " -1.189319 | \n",
+ " -1.215549 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " b d e\n",
+ "Utah 0.404903 1.199043 0.815118\n",
+ "Ohio 0.002818 0.479237 0.942592\n",
+ "Texas 1.240834 -1.286434 -1.919315\n",
+ "Oregon -0.419946 -1.189319 -1.215549"
+ ]
+ },
+ "execution_count": 148,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "frame = pd.DataFrame(np.random.randn(4, 3), columns = list('bde'), index =['Utah', 'Ohio', 'Texas', 'Oregon'])\n",
+ "frame"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 150,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " b | \n",
+ " d | \n",
+ " e | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | Utah | \n",
+ " 0.404903 | \n",
+ " 1.199043 | \n",
+ " 0.815118 | \n",
+ "
\n",
+ " \n",
+ " | Ohio | \n",
+ " 0.002818 | \n",
+ " 0.479237 | \n",
+ " 0.942592 | \n",
+ "
\n",
+ " \n",
+ " | Texas | \n",
+ " 1.240834 | \n",
+ " 1.286434 | \n",
+ " 1.919315 | \n",
+ "
\n",
+ " \n",
+ " | Oregon | \n",
+ " 0.419946 | \n",
+ " 1.189319 | \n",
+ " 1.215549 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " b d e\n",
+ "Utah 0.404903 1.199043 0.815118\n",
+ "Ohio 0.002818 0.479237 0.942592\n",
+ "Texas 1.240834 1.286434 1.919315\n",
+ "Oregon 0.419946 1.189319 1.215549"
+ ]
+ },
+ "execution_count": 150,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "np.abs(frame)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
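+ "Besides NumPy ufuncs, you can pass your own function (for example a lambda) to apply: by default it is called once per column, and with axis='columns' it is called once per row"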
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 152,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "b 1.660779\n",
+ "d 2.485477\n",
+ "e 2.861908\n",
+ "dtype: float64"
+ ]
+ },
+ "execution_count": 152,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "f = lambda x: x.max() - x.min()\n",
+ "frame.apply(f)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 154,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Utah 0.794140\n",
+ "Ohio 0.939774\n",
+ "Texas 3.160149\n",
+ "Oregon 0.795603\n",
+ "dtype: float64"
+ ]
+ },
+ "execution_count": 154,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "frame.apply(f, axis = 'columns')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The function passed to apply need not return a scalar value; it can also return a Series with multiple values"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 157,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "def f(x):\n",
+ " return pd.Series([x.min(), x.max()], index = ['min', 'max'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 159,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " b | \n",
+ " d | \n",
+ " e | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | min | \n",
+ " -0.419946 | \n",
+ " -1.286434 | \n",
+ " -1.919315 | \n",
+ "
\n",
+ " \n",
+ " | max | \n",
+ " 1.240834 | \n",
+ " 1.199043 | \n",
+ " 0.942592 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " b d e\n",
+ "min -0.419946 -1.286434 -1.919315\n",
+ "max 1.240834 1.199043 0.942592"
+ ]
+ },
+ "execution_count": 159,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "frame.apply(f)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 161,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " min | \n",
+ " max | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | Utah | \n",
+ " 0.404903 | \n",
+ " 1.199043 | \n",
+ "
\n",
+ " \n",
+ " | Ohio | \n",
+ " 0.002818 | \n",
+ " 0.942592 | \n",
+ "
\n",
+ " \n",
+ " | Texas | \n",
+ " -1.919315 | \n",
+ " 1.240834 | \n",
+ "
\n",
+ " \n",
+ " | Oregon | \n",
+ " -1.215549 | \n",
+ " -0.419946 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " min max\n",
+ "Utah 0.404903 1.199043\n",
+ "Ohio 0.002818 0.942592\n",
+ "Texas -1.919315 1.240834\n",
+ "Oregon -1.215549 -0.419946"
+ ]
+ },
+ "execution_count": 161,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "frame.apply(f, axis = 1)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Element-wise Python functions can be used too!"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 165,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " b | \n",
+ " d | \n",
+ " e | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | Utah | \n",
+ " 0.40 | \n",
+ " 1.20 | \n",
+ " 0.82 | \n",
+ "
\n",
+ " \n",
+ " | Ohio | \n",
+ " 0.00 | \n",
+ " 0.48 | \n",
+ " 0.94 | \n",
+ "
\n",
+ " \n",
+ " | Texas | \n",
+ " 1.24 | \n",
+ " -1.29 | \n",
+ " -1.92 | \n",
+ "
\n",
+ " \n",
+ " | Oregon | \n",
+ " -0.42 | \n",
+ " -1.19 | \n",
+ " -1.22 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " b d e\n",
+ "Utah 0.40 1.20 0.82\n",
+ "Ohio 0.00 0.48 0.94\n",
+ "Texas 1.24 -1.29 -1.92\n",
+ "Oregon -0.42 -1.19 -1.22"
+ ]
+ },
+ "execution_count": 165,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "form = lambda x: '%.2f' % x\n",
+ "frame.applymap(form)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The reason for the name applymap is that Series has a map method for applying an element-wise function"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 167,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Utah 0.82\n",
+ "Ohio 0.94\n",
+ "Texas -1.92\n",
+ "Oregon -1.22\n",
+ "Name: e, dtype: object"
+ ]
+ },
+ "execution_count": 167,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "frame['e'].map(form)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Sorting and Ranking"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 168,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "a 1\n",
+ "b 2\n",
+ "c 3\n",
+ "d 0\n",
+ "dtype: int32"
+ ]
+ },
+ "execution_count": 168,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "obj = pd.Series(range(4), index = ['d', 'a', 'b', 'c'])\n",
+ "obj.sort_index()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 169,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " d | \n",
+ " a | \n",
+ " b | \n",
+ " c | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | one | \n",
+ " 4 | \n",
+ " 5 | \n",
+ " 6 | \n",
+ " 7 | \n",
+ "
\n",
+ " \n",
+ " | three | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " d a b c\n",
+ "one 4 5 6 7\n",
+ "three 0 1 2 3"
+ ]
+ },
+ "execution_count": 169,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "frame = pd.DataFrame(np.arange(8). reshape((2, 4)), index =['three', 'one'], columns =['d', 'a', 'b', 'c'])\n",
+ "frame.sort_index()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Since the columns are also a kind of index, you can use the sort_index method with axis=1 to sort by column labels"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 171,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " a | \n",
+ " b | \n",
+ " c | \n",
+ " d | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | three | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | one | \n",
+ " 5 | \n",
+ " 6 | \n",
+ " 7 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " a b c d\n",
+ "three 1 2 3 0\n",
+ "one 5 6 7 4"
+ ]
+ },
+ "execution_count": 171,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "frame.sort_index(1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 172,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " d | \n",
+ " c | \n",
+ " b | \n",
+ " a | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | three | \n",
+ " 0 | \n",
+ " 3 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | one | \n",
+ " 4 | \n",
+ " 7 | \n",
+ " 6 | \n",
+ " 5 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " d c b a\n",
+ "three 0 3 2 1\n",
+ "one 4 7 6 5"
+ ]
+ },
+ "execution_count": 172,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "frame.sort_index(1, ascending = False)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
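+ "You can also sort a DataFrame or Series by its values with the sort_values method; missing values are placed at the end by default, in both ascending and descending order"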
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 173,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "4 -3.0\n",
+ "5 2.0\n",
+ "0 4.0\n",
+ "2 7.0\n",
+ "1 NaN\n",
+ "3 NaN\n",
+ "dtype: float64"
+ ]
+ },
+ "execution_count": 173,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "obj = pd.Series([ 4, np.nan, 7, np.nan, -3, 2])\n",
+ "obj.sort_values()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 175,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "2 7.0\n",
+ "0 4.0\n",
+ "5 2.0\n",
+ "4 -3.0\n",
+ "1 NaN\n",
+ "3 NaN\n",
+ "dtype: float64"
+ ]
+ },
+ "execution_count": 175,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "obj.sort_values(ascending = False)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "You can also sort a DataFrame by the values of one or more columns by passing the column name(s) to the by option of sort_values"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 177,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " a | \n",
+ " b | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 0 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 1 | \n",
+ " 7 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 0 | \n",
+ " -3 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " a b\n",
+ "0 0 4\n",
+ "1 1 7\n",
+ "2 0 -3\n",
+ "3 1 2"
+ ]
+ },
+ "execution_count": 177,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "frame = pd.DataFrame({'b': [4,7,-3,2], 'a': [0,1,0,1]})\n",
+ "frame"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 179,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " a | \n",
+ " b | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 2 | \n",
+ " 0 | \n",
+ " -3 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " | 0 | \n",
+ " 0 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 1 | \n",
+ " 7 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " a b\n",
+ "2 0 -3\n",
+ "3 1 2\n",
+ "0 0 4\n",
+ "1 1 7"
+ ]
+ },
+ "execution_count": 179,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "frame.sort_values(by = 'b')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 182,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " a | \n",
+ " b | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 2 | \n",
+ " 0 | \n",
+ " -3 | \n",
+ "
\n",
+ " \n",
+ " | 0 | \n",
+ " 0 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 1 | \n",
+ " 7 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " a b\n",
+ "2 0 -3\n",
+ "0 0 4\n",
+ "3 1 2\n",
+ "1 1 7"
+ ]
+ },
+ "execution_count": 182,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "frame.sort_values(by = ['a', 'b'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 187,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0 6.5\n",
+ "1 1.0\n",
+ "2 6.5\n",
+ "3 4.5\n",
+ "4 3.0\n",
+ "5 2.0\n",
+ "6 4.5\n",
+ "dtype: float64"
+ ]
+ },
+ "execution_count": 187,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "obj = pd.Series([7,-5,7,4,2,0,4])\n",
+ "obj.rank()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Ranks can also be assigned according to the order in which they’re observed in the data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 188,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0 6.0\n",
+ "1 1.0\n",
+ "2 7.0\n",
+ "3 4.0\n",
+ "4 3.0\n",
+ "5 2.0\n",
+ "6 5.0\n",
+ "dtype: float64"
+ ]
+ },
+ "execution_count": 188,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "obj.rank(method = 'first')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 191,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0 2.0\n",
+ "1 7.0\n",
+ "2 2.0\n",
+ "3 4.0\n",
+ "4 5.0\n",
+ "5 6.0\n",
+ "6 4.0\n",
+ "dtype: float64"
+ ]
+ },
+ "execution_count": 191,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Assign tie values the maximum rank in the group\n",
+ "obj.rank(ascending = False, method = 'max')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 190,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0 1.0\n",
+ "1 7.0\n",
+ "2 1.0\n",
+ "3 3.0\n",
+ "4 5.0\n",
+ "5 6.0\n",
+ "6 3.0\n",
+ "dtype: float64"
+ ]
+ },
+ "execution_count": 190,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "obj.rank(ascending = False, method = 'min')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Not surprisingly, a DataFrame can compute ranks over the rows or over the columns; axis='columns' ranks the values within each row"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 192,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " b | \n",
+ " a | \n",
+ " c | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 3.0 | \n",
+ " 2.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 3.0 | \n",
+ " 1.0 | \n",
+ " 2.0 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 1.0 | \n",
+ " 2.0 | \n",
+ " 3.0 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 3.0 | \n",
+ " 2.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " b a c\n",
+ "0 3.0 2.0 1.0\n",
+ "1 3.0 1.0 2.0\n",
+ "2 1.0 2.0 3.0\n",
+ "3 3.0 2.0 1.0"
+ ]
+ },
+ "execution_count": 192,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "frame = pd.DataFrame({' b': [4.3, 7, -3, 2], 'a': [0, 1, 0, 1],'c': [-2, 5, 8, -2.5]})\n",
+ "frame.rank(axis = 'columns')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 5.3 Summarizing and Computing Descriptive Statistics"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 200,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " one | \n",
+ " two | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | a | \n",
+ " 1.40 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | b | \n",
+ " 7.10 | \n",
+ " -4.5 | \n",
+ "
\n",
+ " \n",
+ " | c | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | d | \n",
+ " 0.75 | \n",
+ " -1.3 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " one two\n",
+ "a 1.40 NaN\n",
+ "b 7.10 -4.5\n",
+ "c NaN NaN\n",
+ "d 0.75 -1.3"
+ ]
+ },
+ "execution_count": 200,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df = pd.DataFrame([[ 1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]], index =['a', 'b', 'c', 'd'], columns =['one', 'two'])\n",
+ "df "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 201,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "one 9.25\n",
+ "two -5.80\n",
+ "dtype: float64"
+ ]
+ },
+ "execution_count": 201,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.sum()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Row-wise calculation \n",
+ "- Passing axis='columns' or axis=1 sums across the columns, giving one total per row"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 202,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "a 1.40\n",
+ "b 2.60\n",
+ "c 0.00\n",
+ "d -0.55\n",
+ "dtype: float64"
+ ]
+ },
+ "execution_count": 202,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.sum(axis = 'columns')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "NA values are excluded unless the entire slice (row or column in this case) is NA. This can be disabled with the skipna option:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 203,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "a NaN\n",
+ "b 1.300\n",
+ "c NaN\n",
+ "d -0.275\n",
+ "dtype: float64"
+ ]
+ },
+ "execution_count": 203,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.mean(axis = 'columns', skipna = False)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Some methods, like idxmin and idxmax, return indirect statistics like the index value where the minimum or maximum values are attained:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 209,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "one b\n",
+ "two d\n",
+ "dtype: object"
+ ]
+ },
+ "execution_count": 209,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.idxmax()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 210,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "one d\n",
+ "two b\n",
+ "dtype: object"
+ ]
+ },
+ "execution_count": 210,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.idxmin()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 223,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "25% one 1.075\n",
+ " two -3.700\n",
+ "75% one 4.250\n",
+ " two -2.100\n",
+ "dtype: float64"
+ ]
+ },
+ "execution_count": 223,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "pd.concat([(df.describe()).loc['25%'],(df.describe()).loc['75%']], keys = ['25%', '75%'])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Unique Values, Value Counts, and Membership"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 233,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array(['c', 'a', 'd', 'b'], dtype=object)"
+ ]
+ },
+ "execution_count": 233,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "obj = pd.Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])\n",
+ "uniques = obj.unique()\n",
+ "uniques"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 234,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "c 3\n",
+ "a 3\n",
+ "b 2\n",
+ "d 1\n",
+ "dtype: int64"
+ ]
+ },
+ "execution_count": 234,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "obj.value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 235,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "b 2\n",
+ "a 3\n",
+ "c 3\n",
+ "d 1\n",
+ "dtype: int64"
+ ]
+ },
+ "execution_count": 235,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "pd.value_counts(obj.values, sort = False)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "'isin' performs a vectorized set membership check and can be useful in filtering a dataset down to a subset of values in a Series or column in a DataFrame:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 237,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0 c\n",
+ "5 b\n",
+ "6 b\n",
+ "7 c\n",
+ "8 c\n",
+ "dtype: object"
+ ]
+ },
+ "execution_count": 237,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "mask = obj.isin(['b', 'c'])\n",
+ "obj[mask]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Related to isin is the Index.get_indexer method, which gives you an index array from an array of possibly non-distinct values into another array of distinct values:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 239,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array([0, 2, 1, 1, 0, 2], dtype=int64)"
+ ]
+ },
+ "execution_count": 239,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "to_match = pd.Series(['c', 'a', 'b', 'b', 'c', 'a'])\n",
+ "unique_vals = pd.Series(['c', 'b', 'a'])\n",
+ "pd.Index(unique_vals).get_indexer(to_match)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
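+ "Passing pandas.value_counts to a DataFrame's apply function computes the value counts of each column; the row labels of the result are the distinct values occurring in any of the columns, and fillna(0) fills in the counts for values a column does not contain"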
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 240,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Qu1 | \n",
+ " Qu2 | \n",
+ " Qu3 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 1 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 0.0 | \n",
+ " 2.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 2.0 | \n",
+ " 2.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 2.0 | \n",
+ " 0.0 | \n",
+ " 2.0 | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Qu1 Qu2 Qu3\n",
+ "1 1.0 1.0 1.0\n",
+ "2 0.0 2.0 1.0\n",
+ "3 2.0 2.0 0.0\n",
+ "4 2.0 0.0 2.0\n",
+ "5 0.0 0.0 1.0"
+ ]
+ },
+ "execution_count": 240,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "data = pd.DataFrame({'Qu1': [1, 3, 4, 3, 4], \n",
+ " 'Qu2': [2, 3, 1, 2, 3], \n",
+ " 'Qu3': [1, 5, 2, 4, 4]})\n",
+ "data.apply(pd.value_counts).fillna(0)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.0"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}