from IPython.display import YouTubeVideo
# Embed the YouTube video with ID 'UO-d42qQqZU' as a 560x315 player.
# NOTE(review): presumably this relies on being the last expression of a
# notebook cell for the player to be displayed -- confirm before running the
# file as a plain script, where the returned object would just be discarded.
YouTubeVideo('UO-d42qQqZU', width=560, height=315)


import numpy as np
import pandas
import matplotlib.pyplot as plt


import os.path
import urllib.request

# Local cache of the stop-and-search dataset, and where to fetch it from.
DATA_FILE = 'stop-and-search.csv'
DATA_URL = "https://www.cl.cam.ac.uk/teaching/2021/DataSci/data/stop-and-search.csv"

# Download the CSV only if it is not already present in the working directory.
# urllib is used instead of the `!wget` shell escape so that the code is valid
# Python outside IPython and does not depend on a wget binary being installed.
if os.path.exists(DATA_FILE):
    print("file already downloaded")
else:
    urllib.request.urlretrieve(DATA_URL, DATA_FILE)

# The full dataset; every plot below is derived from this DataFrame.
stopsearch = pandas.read_csv(DATA_FILE)

file already downloaded


# Number of stops for each value of officer_defined_ethnicity, as a two-column
# frame (officer_defined_ethnicity, n). size() counts the rows of each group
# directly rather than calling len() on every group's sub-frame via apply().
df = stopsearch.groupby('officer_defined_ethnicity').size().reset_index(name='n')

# One vertical bar per ethnicity, bar height = number of stops.
fig, ax = plt.subplots(figsize=(5, 3))
ax.bar(df['officer_defined_ethnicity'], df['n'])
plt.show()


# Count the stops for every (object_of_search, gender) pair, then pivot so that
# each row is one object of search and each gender is a column of counts
# (pairs that never occur are filled with 0).
x = stopsearch.groupby(['object_of_search', 'gender']).size()
df = x.unstack(fill_value=0).reset_index().rename_axis(None, axis=1)

# The gender columns to plot, one panel each, and the bar positions on the y-axis.
genders = ['Female', 'Male', 'Other']
bar_positions = np.arange(len(df))

fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(6, 3), sharey=True)

for ax, gender in zip(axes, genders):
    # 1. Draw a horizontal bar chart of the counts for this gender
    ax.barh(bar_positions, df[gender])

    # 2. Limits: sharey=True already gives the three panels a common y-axis.
    # 3. No annotations needed.

    # 4. Rotate the x tick labels so that neighbouring panels don't collide
    for lbl in ax.get_xticklabels():
        lbl.set_rotation(-60)
        lbl.set_ha('left')

    # 5. Title
    ax.set_title(gender)

# 4 (continued). The y-axis is shared, so the object-of-search labels only
# need to be set on the first panel.
axes[0].set_yticks(bar_positions)
axes[0].set_yticklabels(df.object_of_search)

# Uncomment to also write the figure to disk:
#plt.savefig('res/plot0.png', transparent=False, bbox_inches='tight', pad_inches=0.1)
plt.show()


import scipy.stats

# Latitude of every stop that has one recorded (missing values dropped).
latitudes = stopsearch.location_latitude.dropna()

# Fitting the kernel density estimate is slow, and a subset gives just as good
# a curve, so fit it on 50000 randomly drawn values (np.random.choice samples
# with replacement by default) instead of the whole column.
subsample = np.random.choice(latitudes, 50000)
density = scipy.stats.gaussian_kde(subsample)

# Points at which the smoothed density is evaluated for the line plot.
grid = np.linspace(50, 55, 200)

fig, ax = plt.subplots()
# Normalised histogram of all latitudes, with the smoothed density on top.
ax.hist(latitudes, bins=30, density=True, alpha=0.2, edgecolor='steelblue')
ax.plot(grid, density(grid), color='steelblue')
ax.set_xlabel('latitude')
ax.set_title('Distribution of latitude')
plt.show()


# Cambridgeshire stops only; just the columns needed for this plot.
df = stopsearch.loc[stopsearch.force == 'cambridgeshire', ['datetime', 'outcome']].copy()

# Rows whose outcome is the string 'False' are labelled 'nothing'; every other
# value (including a missing one) is labelled 'find'.
df['outcome'] = np.where(df.outcome == 'False', 'nothing', 'find')

# The first 10 characters of the datetime string are the YYYY-MM-DD date.
df['date'] = pandas.to_datetime(df.datetime.str.slice(stop=10), format='%Y-%m-%d')

# Number of events per date, one column per outcome. The reindex guarantees
# that both columns exist even if one of the outcomes never occurs.
df = (df.groupby(['date', 'outcome']).size()
        .unstack(fill_value=0)
        .reindex(columns=['find', 'nothing'], fill_value=0)
        .reset_index())

# Sorted by timestamp
# (if timestamps were unsorted, the line would wiggle backwards and forwards)
df = df.sort_values('date')

fig, ax = plt.subplots(figsize=(5, 1.5))
# Thick line: all stops on that date. Thin line: the stops labelled 'find'.
ax.plot(df['date'], df['find'] + df['nothing'], label='stops', linewidth=3)
ax.plot(df['date'], df['find'], label='find', linewidth=1)
ax.legend()

# Some magic to improve tick labels for an entire figure
fig.autofmt_xdate(bottom=0.2, rotation=-30, ha='left')
plt.show()


# There's no point plotting more data than there are pixels on the output,
# so work with 100000 randomly chosen rows (positions drawn with replacement,
# which is the np.random.choice default).
sampled_positions = np.random.choice(len(stopsearch), size=100000)
df = stopsearch.iloc[sampled_positions]

# The distinct police forces in the sample, and a discrete colour scale with
# one entry per force.
forces = np.unique(df.force)
cols = plt.get_cmap('Set2', len(forces))

fig, ax = plt.subplots()

for i, police_force in enumerate(forces):
    stops_in_force = df[df.force == police_force]
    # One scatter call per force: tiny, mostly transparent points in the
    # colour assigned to that force
    ax.scatter(stops_in_force.location_longitude,
               stops_in_force.location_latitude,
               s=1, alpha=.1, color=cols(i))

# Set the aspect ratio, based on the UK's average latitude (54 degrees,
# converted to radians for np.cos)
ax.set_aspect(1/np.cos(54/360*2*np.pi))

# Pick coordinates to show (I chose these after seeing the plot first)
ax.set_xlim([-5, 2])
ax.set_ylim([50.2, 55.8])

# Get rid of the tick marks and the outer frame
ax.set_xticks([])
ax.set_yticks([])
ax.axis('off')

plt.show()


# Cambridgeshire stops only, with the calendar date and day of week of each stop.
df = stopsearch.loc[stopsearch.force == 'cambridgeshire'].copy()
df['date'] = pandas.to_datetime(df.datetime.str.slice(stop=10), format='%Y-%m-%d')
df['weekday'] = df.date.dt.weekday  # 0=Mon, 6=Sun

# Get the number of stops, grouped by date. Also include the weekday variable,
# which is redundant, but we'll use it the next step to plot a histogram per day
# of week.
df2 = df.groupby(['date', 'weekday']).size().reset_index(name='n')

# Values that are the same for every panel, computed once up front.
weekday_names = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
overall_median = np.median(df2.n)   # median daily count across ALL days

# Create the figure with extra vertical space between the rows of subplots.
with plt.rc_context({'figure.subplot.hspace': 0.35}):
    fig = plt.figure(figsize=(8, 5))

# Seven panels laid out on a 3x3 grid, filled row by row (Mon at top-left).
for weekday, name in enumerate(weekday_names):
    ax = fig.add_subplot(3, 3, weekday + 1)
    # 1. Draw the data: distribution of the daily count for this weekday
    ax.hist(df2.loc[df2.weekday == weekday, 'n'].values, bins=range(15), alpha=.3)
    # 2. Configure limits
    ax.set_ylim([0, 30])
    # 3. Add annotations: the same reference line (overall median) in every panel
    ax.axvline(x=overall_median, linestyle='dotted', color='black')
    # 4. Configure ticks: panels 0-3 have another panel directly below them, so
    #    hide their x labels; only the first column keeps its y labels
    if weekday < 4:
        ax.set_xticklabels([])
    if weekday % 3 != 0:
        ax.set_yticklabels([])
    # 5. Legend, axis, titles
    ax.set_title(name)

fig.suptitle('Number of stops')
plt.show()


df = stopsearch.copy()

# Datetime operations are always a mystery, and need frequent documentation checks.
df['date'] = pandas.to_datetime(df.datetime.str.slice(stop=10), format='%Y-%m-%d')
df2 = df.date.dt.isocalendar() # gets the year, week of year, day of week (mon=1, sun=7)
for k in ['year', 'week', 'day']:
    df[k] = df2[k]

# Number of stops for every (ISO year, day of week, ISO week) combination.
x = df.groupby(['year', 'day', 'week']).size()

years = x.index.levels[0]

# One heatmap per year, stacked vertically. squeeze=False keeps `axes` an array
# even when the data covers a single year, so the indexing below always works.
fig, axes = plt.subplots(len(years), 1, figsize=(10, 6),
                         sharex=True, sharey=True, squeeze=False)
axes = axes[:, 0]

for y, ax in zip(years, axes):
    # Rows = day of week, columns = week number
    xy = x.loc[y].unstack(fill_value=0)
    xy.index = xy.index.astype('int64')
    xy.columns = xy.columns.astype('int64')
    first_week, last_week = xy.columns.min(), xy.columns.max()
    # The extent passed to imshow assumes a complete grid: all seven days and
    # every week between the first and the last. Fill in any day or week that
    # has no stops at all, so that each cell is drawn at its true position.
    xy = xy.reindex(index=range(1, 8),
                    columns=range(first_week, last_week + 1),
                    fill_value=0)
    im = ax.imshow(xy, origin='lower',
                   extent=(first_week-.5, last_week+.5, 0.5, 7.5),
                   cmap='Blues', vmin=0, vmax=4000)
    ax.set_title(f"year {y}")

# Only need to set the xlim for one axis: we used sharex=True, so they all use the same
weeks = x.index.levels[2]
axes[0].set_xlim(weeks.min()-.5, weeks.max()+.5)

# Likewise the y ticks: sharey=True applies them to every panel
axes[0].set_yticks(np.arange(1, 8))
axes[0].set_yticklabels(['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'], fontsize=8)
axes[-1].set_xlabel('ISO week number')

# All panels use the same vmin/vmax, so a single colour bar serves all of them
fig.colorbar(im, ax=axes)

fig.suptitle('Total number of stops per day')
plt.show()

Plotting data with matplotlib

Contents

0. Preamble

1. Code structure for plotting

2. Gallery

2.1 MULTIPANEL BAR CHART

2.2 HISTOGRAM AND DENSITY PLOT

2.3 LINE PLOTS + LEGEND

2.4 SCATTER PLOT + DISCRETE COLOUR SCALE

2.5 MULTIPANEL PLOT AGAIN

2.6 HEATMAP + CONTINUOUS COLOUR SCALE