Recipes for data import and cleanup

In my experience, around 75% of the time you spend working with data will be fighting to import it and clean it up. For the most part this is just general-purpose programming, but there are a few library routines that will save you from reinventing the wheel.

Treat this section as a collection of recipes and pointers to useful library routines. If you find yourself needing them, you should read the recipe, try it out, then look online for more information about the library functions it suggests.

Reading from a csv file

If our data is a simple comma-separated value (CSV) file, it's very easy to import with pandas.read_csv. We can specify either a filename or a url.
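
For example (the filename and url here are placeholders for your own data source):

import pandas as pd

# placeholder paths -- substitute your own file or url
df = pd.read_csv('mydata.csv')
df = pd.read_csv('https://example.com/mydata.csv')
df.head()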

If our file is nearly a CSV but has some quirks such as comments or a missing header row, there are plenty of options in pandas.read_csv or pandas.read_table. For extreme quirks we may need to use the raw Python csv.reader.
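
Here's a sketch of a few of the options I reach for most often (the option values are purely illustrative):

import pandas as pd

df = pd.read_csv('mydata.csv',
                 comment='#',               # skip comment lines
                 header=None,               # the file has no header row ...
                 names=['date', 'value'],   # ... so supply column names ourselves
                 skiprows=2,                # ignore the first two lines
                 na_values=['?'])           # treat '?' as missing data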

Reading from a string

If we have a string that we want to treat as a file, we can use io.StringIO:
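
import io
import pandas as pd

# a small made-up csv-formatted string, just for illustration
s = '''date,value
2017-01-01,3
2017-01-02,5'''

df = pd.read_csv(io.StringIO(s))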

Reading from an http request

If we want to read from a url but need more control over the http request, for example to send a POST request, modify the request headers, or read the response headers, we can use the requests library to fetch the data as a string.
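
For example (the url, payload, and headers below are made up; the general shape is what matters):

import io
import pandas as pd
import requests

resp = requests.post('https://example.com/api/data',          # made-up url
                     data={'format': 'csv'},                   # made-up payload
                     headers={'User-Agent': 'my-data-script'})
print(resp.status_code)
print(resp.headers.get('Content-Type'))     # inspect the response headers
df = pd.read_csv(io.StringIO(resp.text))    # then treat the body as a file, as above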

Parsing log files with regular expressions

A typical line from a web server log might look like this

207.46.13.169 - - [27/Aug/2017:06:52:11 +0000] "GET /marcus/essay/st&h2.html HTTP/1.1" 200 3881 "-" "Mozilla/5.0 (iPhone; CPU iPhone OS 7_0 like Mac OS X) AppleWebKit/537.51.1 (KHTML, like Gecko) Version/7.0 Mobile/11A465 Safari/9537.53 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)"

where (according to the Apache web server documentation) the pieces are the client's IP address, the remote logname, the remote user, the timestamp of the request, the request line, the HTTP status code, the size of the response in bytes, the referer, and the user agent string.

To extract these pieces from a single line of the log file, the best tool is regular expressions, a mini-language for string matching that is common across many programming languages. The syntax is terse and takes a lot of practice. I like to start with a small string pattern and incrementally build it up, testing as I go.
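
Here's a sketch of that incremental style, applied to the log line above (the group names are my own choices):

import re

# the log line from above (user agent truncated for brevity)
line = '207.46.13.169 - - [27/Aug/2017:06:52:11 +0000] "GET /marcus/essay/st&h2.html HTTP/1.1" 200 3881 "-" "Mozilla/5.0 ..."'

# start small: just the IP address at the beginning of the line
m = re.match(r'(?P<ip>\S+)', line)
print(m.groupdict())

# ... then build up field by field, testing as we go
pattern = (r'(?P<ip>\S+) (?P<logname>\S+) (?P<user>\S+) '
           r'\[(?P<time>[^\]]+)\] "(?P<request>[^"]*)" '
           r'(?P<status>\d+) (?P<size>\S+)')
m = re.match(pattern, line)
print(m.groupdict())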

How do we extract the fields from a full log file? The vanilla Python code is

import re

# `pattern` is the regular expression built up above; `myfile` is the path to the log file
with open(myfile) as f:
    for line in f:
        m = re.match(pattern, line)
        # store the fields from m.groups() or m.groupdict() somewhere appropriate

Alternatively, numpy has a handy shortcut for reading in an entire file and splitting it via a regular expression:
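
I believe the function in question is numpy.fromregex; here's a sketch (the filename, field names, and dtypes are my own choices):

import numpy as np
import pandas as pd

# each group in the pattern becomes a field of a structured array
pattern = r'(\S+) \S+ \S+ \[([^\]]+)\] "([^"]*)" (\d+) (\S+)'
dtype = [('ip', 'U15'), ('time', 'U26'), ('request', 'U200'),
         ('status', int), ('size', 'U10')]
records = np.fromregex('access.log', pattern, dtype)
df = pd.DataFrame(records)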

Reading json from a web data service

More and more forward-thinking companies and government services make data available by simple web requests. Here is an example: stop-and-search data from data.police.uk.

The first step is to import the Python module for making web requests. When I'm developing data code I like to build it up in small steps, which means lots of repeated requests, so I also like to use another Python module which caches responses. This means I don't hammer the service unnecessarily.
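
The text above doesn't name the caching module; requests_cache is one module that works this way, so I'll assume it here:

import requests
import requests_cache

# cache responses in a local sqlite file, so repeated requests during
# development don't hit the service again
requests_cache.install_cache('police_api_cache')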

The API documentation tells us the URL for fetching a list of available data. Let's try it.
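
A sketch, assuming the availability endpoint is the crimes-street-dates url described in the API documentation:

import requests

response = requests.get('https://data.police.uk/api/crimes-street-dates')
response.status_code    # 200 means the request succeeded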

Now let's look at the body of the response. It's likely to be very long, so we'll only print out the first 300 characters.
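
# `response` is the object returned by requests.get above
print(response.text[:300])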

It looks like JSON, "JavaScript Object Notation", a common format for web data services.

It's easy to convert it into Python dictionaries and lists, with requests.get(...).json(). Now we can explore what it contains. (Alternatively, just read the web service documentation, if we trust it!)
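
For example, continuing with the response fetched above (I'm assuming the availability endpoint returns a json list, one entry per month):

available = response.json()
print(type(available), len(available))   # a list of dictionaries, one per month
print(available[0])                      # peek at the first entry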

Pandas has a pretty flexible command for converting nested JSON into nice sane dataframes, pandas.json_normalize.
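
Applied to the availability list we just fetched, something like:

import pandas as pd

availability = pd.json_normalize(available)   # columns depend on the json structure
availability.head()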

The police API documentation tells us how to request records for a given police force and month: using a url of the form

https://data.police.uk/api/stops-force?force=avon-and-somerset&date=2017-01

Let's fetch them all. For each item we'll fetch the data, turn it into a dataframe (again using pandas.json_normalize), and then we'll bind everything together.
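
Here's a sketch. The list of force/month pairs is illustrative; in practice we'd build it from the availability data above.

import requests
import pandas as pd

wanted = [('avon-and-somerset', '2017-01'), ('avon-and-somerset', '2017-02')]

frames = []
for force, month in wanted:
    r = requests.get('https://data.police.uk/api/stops-force',
                     params={'force': force, 'date': month})
    df = pd.json_normalize(r.json())
    df['force'], df['month'] = force, month   # remember where each row came from
    frames.append(df)

stopsearch = pd.concat(frames, ignore_index=True)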

Scraping a website with xpath

There are fascinating stories to be discovered from public data, and sometimes you have to work to scrape it from web pages. Here's an account by a BBC data journalist. We'll work with a very simple example: extracting results of the Oxford / Cambridge boat race from the Wikipedia table.

I recommend using XPath queries from the lxml module. XPath is a powerful mini-language for extracting data from hierarchical documents, with wide support across many programming languages — think of it as regular expressions but for html rather than plain text. If you want to scrape websites then it's worth finding a tutorial and learning XPath. For this course, we'll just see how to use XPath in Python.

The first step is to install lxml, which is not included with Python.

!pip install lxml

Now we'll fetch the web page and parse the contents. Most web pages are badly-formatted html (sections not properly closed, etc.), and lxml.html.fromstring makes a reasonable attempt to make sense of it.
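
For example (I'm assuming the page is the 'List of The Boat Race results' Wikipedia article; substitute whichever page you're scraping):

import requests
import lxml.html

r = requests.get('https://en.wikipedia.org/wiki/List_of_The_Boat_Race_results')
doc = lxml.html.fromstring(r.content)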

This gives us doc, the root <html> element, which we can inspect.

We want to pull out a particular element from the document, namely the table with boat race results, so we need to work out how to refer to it in XPath. The Chrome web browser has a handy tool to help with this.

1. Go to the page you're interested in, and click on … | More tools | Developer Tools.
2. Click on the element-selector button at the top left of the developer tools panel.
3. Go back to the web page, and click on a piece close to what you want to select. I clicked on the top-left cell of the table.
4. Go back to the developer tools window, and navigate to the exact element you want. Here, we want the table.
5. Right-click on the element and choose Copy | Copy XPath.

It gave me the XPath location "//*[@id="mw-content-text"]/div[1]/table[2]". Now we can extract the data.
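
For example (handing the element back to pandas via read_html is just one convenient way to turn it into a dataframe, and the table index in the XPath may change as the Wikipedia page is edited):

import io
import lxml.etree
import pandas as pd

# doc.xpath returns a list of matching elements; take the first match
table = doc.xpath('//*[@id="mw-content-text"]/div[1]/table[2]')[0]

# serialise the element back to html and let pandas parse it into a dataframe
results = pd.read_html(io.StringIO(lxml.etree.tostring(table, encoding='unicode')))[0]
results.head()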

You should consider the ethics of your web scraping. Here are some thoughts from Sophie Chou at the MIT Media Lab, and from the data journalist Naël Shiab.

Querying an SQL database

Once your data is in an SQL database, access is easy. Here's an example with PostgreSQL, which speaks its own dialect of SQL.

Databases are usually secured, and you need various credentials to log in. It's good practice to store our code in a repository, but bad practice to store credentials there too. Instead, we can store credentials in a separate file that isn't checked in to the repository. I like to store credentials in a JSON file, something like this:

{"user": "SPQR", "password": "TOPSECRET", "host": "***", "dbname": "***"}

which is easy to load into Python as a dictionary. Then I can use the fields of this dictionary as arguments to psycopg2.connect, to establish the connection.
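
A sketch, assuming the credentials file is called secret_creds.json:

import json
import psycopg2

# load the credentials from a file that is NOT checked in to the repository
with open('secret_creds.json') as f:
    creds = json.load(f)

conn = psycopg2.connect(**creds)   # user, password, host, dbname become keyword arguments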

We can run arbitrary SQL queries, and retrieve the results as a pandas dataframe. Pass in parameters with the %(name)s quoting mechanism, to keep ourselves safe from SQL injection attacks.
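
A sketch (the table and column names are made up):

import pandas as pd

sql = 'SELECT * FROM stopsearch WHERE force = %(force)s AND event_date >= %(start)s'
df = pd.read_sql(sql, conn,
                 params={'force': 'avon-and-somerset', 'start': '2017-01-01'})
df.head()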