import numpy as np
import pandas
import matplotlib.pyplot as plt
import sklearn.linear_model
π = np.pi
Your task is to answer these questions using appropriate linear models, and to produce elegant plots to communicate your findings. Please submit a Jupyter notebook, or a pdf. Include explanations of what your models are, and of what your plots show.
The dataset is from https://www.metoffice.gov.uk/pub/data/weather/uk/climate/. Code for retrieving the dataset is given at the bottom.
url = 'https://www.cl.cam.ac.uk/teaching/2223/DataSci/data/climate.csv'
#climate = pandas.read_csv(url)
# Or run the scraping code at the bottom
# (Warning: the scraping code may break if the Met Office changes its webpages. If it breaks,
# just use the csv above.)
# Define two extra covariates
climate['t'] = climate.yyyy + (climate.mm-1)/12 # timestamp in years
climate['temp'] = (climate.tmin + climate.tmax)/2 # monthly avg temp
To illustrate fitting a linear model (see section 2 of the course), we'll fit a simple periodic model: $$ Y_i \sim \alpha + \beta_1 \sin(2\pi t_i) + \beta_2 \cos(2\pi t_i) + \operatorname{Normal}(0,\sigma^2) $$
# Restrict attention to Cambridge, 1985 onwards, for illustration
df = climate.loc[(climate.station=='Cambridge') & (climate.yyyy>=1985)]
X = np.column_stack([np.sin(2*π*df.t), np.cos(2*π*df.t)])
model = sklearn.linear_model.LinearRegression()
model.fit(X, df.temp)
α,(β1,β2) = (model.intercept_, model.coef_)
# Plot a curve to depict the fitted values
tnew = np.linspace(1985, 2024, 1000)
Xnew = np.column_stack([np.sin(2*π*tnew), np.cos(2*π*tnew)])
temp_pred = model.predict(Xnew)
fig,ax = plt.subplots(figsize=(15,1.7))
ax.scatter(df.t, df.temp, s=15, alpha=0.7)
ax.plot(tnew, temp_pred, color='0.7', zorder=0)
ax.set_xlim([1984,2025])
ax.set_ylim([-3,23])
ax.set_title('Monthly average temperature [°C]')
plt.show()
resid = df.temp - model.predict(X)
fig,ax = plt.subplots(figsize=(15,1.7))
ax.scatter(df.t, resid, c=np.where(resid<=0,'blue','red'), s=15)
ax.axhline(0, color='0.6', zorder=0)
ax.set_xlim([1984,2025])
ax.set_title('Monthly average temperature -- model residuals [°C]')
plt.show()
import numpy as np
import pandas
import requests, requests_cache
import urllib
import io, re
import lxml.html
import numbers
requests_cache.install_cache(backend='memory')
# Fetch a list of all measurement stations
stations_url = 'https://www.metoffice.gov.uk/research/climate/maps-and-data/historic-station-data'
stations = requests.get(stations_url).content
stations = lxml.html.fromstring(stations)
stations = stations.xpath("//h2[contains(text(),'List of all the historic stations')]/following-sibling::table/tbody/tr")
stations = [tr.xpath('./td/a/@href')[0] for tr in stations]
# Fetch historical data from each station
def parse_station_data(txt, i=None):
ls = txt.splitlines()
g = (i for i,l in enumerate(ls) if ' '.join(l.split()).startswith('yyyy mm tmax tmin'))
header_line = next(g)
d = '\n'.join([ls[header_line]+' status X1 X2 X3 X4 X5 X6'] + ls[(header_line+2):])
df = pandas.read_csv(io.StringIO(d), delim_whitespace=True)
df['station'] = [ls[0]] * len(df)
s = ls[1] + ' ' + ls[2]
m = re.search(r'Lat ([-\d.]+) Lon ([-\d.]+),? ([-\d.]+)\s?m.* amsl', s)
assert m is not None, f"Can't read details for {i}: {s}"
df['lat'] = [m.group(1)] * len(df)
df['lng'] = [m.group(2)] * len(df)
df['height'] = [m.group(3)] * len(df)
return df
df = [requests.get(station_url).content.decode('utf8') for station_url in stations]
df = [parse_station_data(txt,i) for i,txt in enumerate(df)]
# Concatenate all the station together
df = pandas.concat(df, ignore_index=True)
# Sometimes there are comment rows, and df.yyy is string
# Sometimes there are no comment rows, and df.yyy is int
# This code removes any comment rows, safely in either case
df = df.loc[df.yyyy.astype(str) != 'Site']
# Delete some dummy columns (which were only there for rows with comments)
for i in range(1,7): del df[f'X{i}']
# Normalize some long-winded station names
for old,new in [('Heathrow (London Airport)','Heathrow'),
('Lowestoft / Lowestoft Monckton Ave from Sept 2007','Lowestoft'),
('Whitby Coastguard / Whitby(from 2000 and $sunshine)','Whitby'),
('Nairn there is a site change in 1998','Nairn'),
('Cambridge NIAB','Cambridge')]:
df['station'] = np.where(df['station']==old, new, df['station'])
# Tidy the numerical columns, to only have numbers or NaN
def clean_num(x):
if isinstance(x, numbers.Number):
return x
elif x == '---':
return np.nan
else:
return float(re.match(r'([-\d.]+)', x).group(1))
df['tmin'] = np.array([clean_num(x) for x in df['tmin']])
df['tmax'] = np.array([clean_num(x) for x in df['tmax']])
df['yyyy'] = np.array([clean_num(x) for x in df['yyyy']]).astype(np.int)
df['mm'] = np.array([clean_num(x) for x in df['mm']]).astype(np.int)
climate = df