# Embed a YouTube player for the video with id 'hKOkdgv6RkI', sized 560x315.
from IPython.display import YouTubeVideo
YouTubeVideo('hKOkdgv6RkI', width=560, height=315)


import numpy as np
import pandas
import matplotlib.pyplot as plt


# Download the dataset (if it's not already downloaded), and show download
# progress. This uses the standard library rather than a shell command, so it
# runs outside IPython and on machines that do not have wget installed.
import os.path
import urllib.request

if os.path.exists('stop-and-search.csv'):
    print("file already downloaded")
else:
    def _show_progress(blocks_done, block_size, total_size):
        """Print a one-line percentage each time urlretrieve reports a block."""
        # total_size is not positive when the server does not report a length
        if total_size > 0:
            percent = min(100, blocks_done * block_size * 100 // total_size)
            print(f"\rdownloading stop-and-search.csv: {percent}%", end="")

    # Fetch to a temporary name and rename on success, so an interrupted
    # download is never mistaken for a complete file on the next run.
    urllib.request.urlretrieve(
        "https://www.cl.cam.ac.uk/teaching/2021/DataSci/data/stop-and-search.csv",
        'stop-and-search.csv.part',
        reporthook=_show_progress,
    )
    os.replace('stop-and-search.csv.part', 'stop-and-search.csv')
    print()  # finish the progress line


# Import a dataframe using the pandas library
# (read_csv parses the local CSV file into a DataFrame)
stopsearch = pandas.read_csv('stop-and-search.csv')

# How many rows are there?
print(f"This dataset has {len(stopsearch)} rows")

# What are the columns?
print(stopsearch.columns)

# Display the first 5 rows. iloc[:5] means "select the first five rows"
# (not all columns fit on this page)
stopsearch.iloc[:5]


# isna gives a boolean Series that is True wherever the value is missing
pandas.isna(stopsearch.age_range[:5]) # returns [False,False,True,False,False]

# Count the number of missing values. The Series' own .sum() adds up the
# booleans in one vectorised step, rather than looping over every row in
# Python as the builtin sum() would.
stopsearch.age_range.isna().sum()


# read_csv accepts a URL as well as a local filename
url = 'https://www.cl.cam.ac.uk/teaching/2021/DataSci/data/iris.csv'
iris = pandas.read_csv(url)


# Write the dataframe to a local CSV file; index=False leaves out the row index
iris.to_csv('iris.csv', index=False)


# Build a dataframe from a dictionary mapping column name -> list of values.
# The column order is set by the `columns` argument, not by the dictionary.
iris = pandas.DataFrame(
    data={
        'Petal.width': [0.2, 1.9, 1.6, 0.5, 1.2],
        'Petal.length': [1.0, 5.0, 5.8, 1.7, 4.2],
        'species': ['setosa', 'virginica', 'virginica', 'setosa', 'versicolor'],
    },
    columns=['Petal.length', 'Petal.width', 'species'],
)


# Build a dataframe from a list of rows, one tuple per record;
# `columns` gives a name to each position within the tuples.
iris = pandas.DataFrame(
    data=[
        ('setosa', 1.0, 0.2),
        ('virginica', 5.0, 1.9),
        ('virginica', 5.8, 1.6),
        ('setosa', 1.7, 0.5),
        ('versicolor', 4.2, 1.2),
    ],
    columns=['species', 'Petal.length', 'Petal.width'],
)


# A dataframe can be used like a dictionary that maps column name -> column
stopsearch.columns          # get the column names (as a pandas Index)
stopsearch.keys()           # … and another way to do the same
x = stopsearch['outcome']   # get a single column
x = stopsearch.outcome      # … and another way to do the same
del stopsearch['location']  # delete a column
# add or modify a column:
# outcome_N is 0 where outcome equals the string 'False', and 1 everywhere else
stopsearch['outcome_N'] = np.where(stopsearch.outcome == 'False', 0, 1)


# list all the non-NA values, and their counts
stopsearch['age_range'].value_counts()
# replace all the string age-ranges with numbers
# (map looks each value up in the dictionary and produces a numeric column
# directly; missing values stay NaN, and any label that is not a key of the
# dictionary also becomes NaN)
r = {'18-24':21.5, '10-17':14, '25-34':30, 'over 34':40, 'under 10':8}
stopsearch['age'] = stopsearch['age_range'].map(r)


# .loc[rows, cols]: the row selector (here ':' or a boolean mask) comes first,
# followed by an optional list of column names
stopsearch.loc[:, ['force','datetime','outcome']]  # all rows, some cols
stopsearch[['force','datetime','outcome']]         # … the same thing
stopsearch.loc[stopsearch.force=='cambridgeshire'] # some rows, all cols
stopsearch.loc[stopsearch.force=='cambridgeshire', # some rows, some cols
               ['force','datetime','outcome']]


# .iloc selects rows by integer position
stopsearch.iloc[:3]      # the first 3 rows
stopsearch[:3]           # … and another way to do the same
stopsearch.iloc[[0,3,5]] # select several rows
stopsearch.iloc[[5]]     # returns a one-row dataframe
stopsearch.sample(4)     # select 4 rows at random


# Selections can be chained: pick rows/columns with .loc or [...],
# then take the first three rows of the result with .iloc
wantcols = ['force','datetime','outcome']
stopsearch.loc[stopsearch.force=='cambridgeshire', wantcols]
stopsearch[wantcols].iloc[:3]
stopsearch.loc[stopsearch.force=='cambridgeshire', wantcols].iloc[:3]


stopsearch['force'].iat[5] # a scalar for the specified column and row
stopsearch.iloc[5]         # a Series holding the values of row 5, indexed by column name


# Set outcome_N to 2 in the row labelled 0.
# A single .loc[row, column] assignment writes into the dataframe itself.
# Chained indexing (frame[column][row] = value) may instead write to a
# temporary copy: pandas warns about it, and with Copy-on-Write enabled the
# dataframe is left unchanged.
stopsearch.loc[0, 'outcome_N'] = 2


# A small dataframe whose rows are labelled 'a'..'e'
df = pandas.DataFrame(data={'x': [3, 3, 4, 8, 2]}, index=list('abcde'))


# This looks like it should add [3,3,4] and [4,8,2] … but it doesn't!
# Addition pairs values up by index label, not by position.
df['x'].iloc[:3] + df['x'].iloc[2:]


# To get the answer we were probably expecting,
# strip off the index and add the underlying numpy arrays
df['x'].iloc[:3].to_numpy() + df['x'].iloc[-3:].to_numpy()


# Select cambridgeshire records, then tabulate by ethnicity and gender.
# .size() counts the rows in each (ethnicity, gender) group and returns a
# Series indexed by those two keys; it avoids calling a Python function on
# every group's sub-dataframe.
df = stopsearch.loc[stopsearch.force=='cambridgeshire']
x = df.groupby(['officer_defined_ethnicity', 'gender']).size()


# Apply the np.mean function to the 'age' column, separately for each gender
# (This assumes you've run the commands in 4.1 to define the age column, before defining df.)
# The result has one entry per gender.
df.groupby('gender')['age'].apply(lambda x: np.mean(x))


# Here x is expected to be the array of counts indexed by (ethnicity, gender);
# .loc selects from it by those index labels
x.loc['Asian']           # select the sub-array of ethnicity Asian
x.loc[:, 'Other']        # select the sub-array of gender Other
x.loc[['Black','White']] # select two ethnicities, all genders


# Turn the inner index level into columns, filling in 0 for absent combinations
x.unstack(fill_value=0)


# Convert an indexed array into a long-form dataframe
# -- the index levels become ordinary columns, and the array values go into
# a new column whose name we supply
x.loc[['Black', 'White']].reset_index(name='count')


# Convert an indexed array into a wide-form dataframe.
# (The chain is wrapped in parentheses so that each step sits on its own line,
# rather than being a confusing one-liner)
(
    x.loc[['Black', 'White']]
    .unstack(fill_value=0)
    .reset_index()
    .rename_axis(None, axis=1)
)


# Take a private copy of the cambridgeshire records, so the outcome column can
# be overwritten without touching stopsearch
df = stopsearch.loc[stopsearch.force=='cambridgeshire'].copy()
# Recode outcome: the string 'False' becomes 'nothing', anything else 'find'
df['outcome'] = np.where(df.outcome == 'False', 'nothing', 'find')
# Count the rows in each (ethnicity, outcome) group, then flatten the result
# into a dataframe with the counts in column 'n'. .size() does the counting
# directly, without calling a Python function on every group.
x = df.groupby(['officer_defined_ethnicity', 'outcome']) \
.size() \
.reset_index(name='n')


# Total count per ethnicity, as a dataframe with the totals in column 'ntot'.
# The groupby's own .sum() is used instead of passing Python's builtin sum,
# which pandas treats as a deprecated alias and warns about.
y = x.groupby('officer_defined_ethnicity')['n'].sum().reset_index(name='ntot')


# Database-style join: attach each ethnicity's total (ntot) to every row of x
z = pandas.merge(x, y, on='officer_defined_ethnicity')
# p is the fraction of that ethnicity's records that fall in the row's outcome
p = z['n'] / z['ntot']
z['percent_find'] = (p * 100).round(1)
# Also compute a margin for error; see IB Data Science for the theory
z['err'] = (1.96 * np.sqrt(p * (1 - p) / z['ntot']) * 100).round(1)


# Add the per-ethnicity total to every row without a merge: transform returns
# a result aligned with the rows of x, so it can be assigned straight to a
# new column. The string 'sum' names the built-in groupby aggregation;
# passing Python's builtin sum is a deprecated alias that pandas warns about.
x['ntot'] = x.groupby('officer_defined_ethnicity')['n'].transform('sum')

	officer_defined_ethnicity	outcome	n	n_tot	n / n_tot
0	Asian	find	116	192	60.4%
1	Asian	nothing	76	192	39.6%
2	Black	find	170	270	63.0%
3	Black	nothing	100	270	37.0%
4	Other	find	28	37	75.7%
5	Other	nothing	9	37	24.3%
6	White	find	1060	1740	60.9%
7	White	nothing	680	1740	39.1%

	officer_defined_ethnicity	outcome	n	ntot	percent_find	err
0	Asian	find	116	192	60.4	6.9
1	Asian	nothing	76	192	39.6	6.9
2	Black	find	170	270	63.0	5.8
3	Black	nothing	100	270	37.0	5.8
4	Other	find	28	37	75.7	13.8
5	Other	nothing	9	37	24.3	13.8
6	White	find	1060	1740	60.9	2.3
7	White	nothing	680	1740	39.1	2.3

Handling data with pandas

Contents

1. Preamble

2. What data looks like

2.1 MISSING VALUES

3. Importing, exporting, and creating dataframes

4. Selecting and modifying data

4.1 LIKE A DICTIONARY

4.2 LIKE A DATABASE TABLE

4.3 LIKE AN ARRAY

5. Tabulations and indexed arrays

5.1 DATAFRAME → INDEXED ARRAY

5.2 INDEXED ARRAY → DATAFRAME

6. Merging dataframes

6.1 WITH DATABASE-STYLE JOINS

6.2 WITH PANDAS INDEXING

	officer_defined_ethnicity	gender	count
0	Black	Female	10
1	Black	Male	257
2	White	Female	253
3	White	Male	1465
4	White	Other	5