Recipes for data import and cleanup¶

In my experience, around 75% of the time you spend working with data will go on fighting to import it and clean it up. For the most part this is just general-purpose programming, but there are a few library routines that will save you from reinventing the wheel.

  • Preamble
  • Reading from a csv file
  • Reading from a string
  • Reading from an http request
  • Parsing log files with regular expressions
  • Reading json from a web data service
  • Scraping a website with xpath
  • Querying an SQL database

Treat this section as a collection of recipes and pointers to useful library routines. If you find yourself needing them, you should read the recipe, try it out, then look online for more information about the library functions it suggests.

Preamble¶

In [1]:
import numpy as np
import pandas

Reading from a csv file¶

When our data is a simple comma-separated value (CSV) file, it's very easy to import with pandas.read_csv. We can specify either a filename or a url.

In [2]:
df = pandas.read_csv('data/iris.csv')
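
We can equally pass a url instead of a filename; for example, the course copy of the iris dataset (the same url used in the http example below):

df = pandas.read_csv('https://www.cl.cam.ac.uk/teaching/current/DataSci/data/iris.csv')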

If our file is nearly a CSV but has some quirks such as comments or a missing header row, there are plenty of options in pandas.read_csv or pandas.read_table. For extreme quirks we may need to use the raw Python csv.reader.
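
As a sketch of the kind of options available (the filename and column names here are invented for illustration), suppose the file starts with # comment lines, uses ; as the separator, and has no header row:

df = pandas.read_csv('data/quirky.csv',       # hypothetical file, for illustration
                     comment='#',             # ignore anything after a '#'
                     sep=';',                 # field separator
                     header=None,             # the file has no header row...
                     names=['x', 'y', 'z'],   # ...so supply our own column names
                     skipinitialspace=True)   # tolerate spaces after the separator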

Reading from a string¶

If we have a string that we want to treat as a file, we can use io.StringIO:

In [3]:
import io

data = '''
x,y
10,3
2,5
'''

df = pandas.read_csv(io.StringIO(data))

Reading from an http request¶

If we want to read from a url but need more control over the http request, for example to send a POST request, modify the request headers, or read the response headers, we can use the requests library to fetch the data as a string.

In [4]:
import requests
import io

my_url = "https://www.cl.cam.ac.uk/teaching/current/DataSci/data/iris.csv"
data = requests.get(my_url).content.decode('utf8')
df = pandas.read_csv(io.StringIO(data))
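
As a sketch of that extra control (the User-Agent value here is just illustrative), we can set a request header and inspect the response headers; a POST request works the same way with requests.post:

import requests
import io

my_url = "https://www.cl.cam.ac.uk/teaching/current/DataSci/data/iris.csv"

resp = requests.get(my_url, headers={'User-Agent': 'DataSci-notebook'})  # custom request header
print(resp.status_code)                  # 200 means OK
print(resp.headers.get('Content-Type'))  # a response header
# requests.post(my_url, data={'key': 'value'}) would send a POST request instead

df = pandas.read_csv(io.StringIO(resp.text))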

Parsing log files with regular expressions¶

A typical line from a web server log might look like this

207.46.13.169 - - [27/Aug/2017:06:52:11 +0000] "GET /marcus/essay/st&h2.html HTTP/1.1" 200 3881 "-" "Mozilla/5.0 (iPhone; CPU iPhone OS 7_0 like Mac OS X) AppleWebKit/537.51.1 (KHTML, like Gecko) Version/7.0 Mobile/11A465 Safari/9537.53 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)"

where (according to the Apache web server documentation) the pieces are

  • 207.46.13.169

The IP address that made the request

  • -

The identity of the client; - means not available

  • -

The userid of the logged-in user who made the request; - means not available

  • [27/Aug/2017:06:52:11 +0000]

The time the request was received

  • "GET /marcus/essay/st&h2.html HTTP/1.1"

The type of request, what was requested, and the connection type

  • 200

The http status code (200 means OK)

  • 3881

The size of the object returned to the client, in bytes

  • -

The referrer URL; - means not available

  • "Mozilla/5.0 (...)"

The browser type. The substring bingbot here tells us that the request comes from Microsoft Bing's web crawler.

To extract these pieces from a single line of the log file, the best tool is regular expressions, a mini-language for string matching that is common across many programming languages. The syntax is terse and takes a lot of practice. I like to start with a small string pattern and incrementally build it up, testing as I go.

In [5]:
import re    # standard Python module for regular expressions

s = """
207.46.13.169 - - [27/Aug/2017:06:52:11 +0000] "GET /marcus/essay/st&h2.html HTTP/1.1" 
200 3881 "-" "Mozilla/5.0 (iPhone; CPU iPhone OS 7_0 like Mac OS X)
AppleWebKit/537.51.1 (KHTML, like Gecko) Version/7.0 Mobile/11A465 Safari/9537.53
(compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)"
"""

# First attempt: match the first two items in the log line.
# If my pattern is right, re.match returns an object.
# If my pattern is wrong, re.match returns None.
pattern_test = r'\s*(\S+)\s*(\S+)'
re.match(pattern_test, s)
Out[5]:
<re.Match object; span=(0, 16), match='\n207.46.13.169 -'>
In [6]:
# This is the full pattern I built up to. Python lets us add verbose comments
# to the pattern, which is handy for remembering what your code does when you look
# at it the next morning.
pattern = r'''(?x)  # verbose-mode flag: whitespace in the pattern is ignored and # starts a comment
\s*                 #   any whitespace at the start of the string
(?P<ip>\S+)         # one or more non-space characters: the IP address
\s+                 #   one or more spaces
(?P<client>\S+)     # the client identity
\s+
(?P<user>\S+)       # the userid
\s+
\[(?P<t>[^\]]*)\]   # [, then any number of not-] characters, then ]: the timestamp
\s+
"(?P<req>[^"]*)"    # ", then any number of not-" characters, then ": the request string
\s+
(?P<status>\d+)     # one or more numerical digits: the http status code
\s+
(?P<size>\d+)       # one or more numerical digits: the size
\s+
"(?P<ref>[^"]*)"    # the referrer URL
\s+
"(?P<ua>[^"]*)"     # the user agent i.e. browser type
'''
m = re.match(pattern, s)
m.groupdict()       # returns a dictionary of all the named sub-patterns
Out[6]:
{'ip': '207.46.13.169',
 'client': '-',
 'user': '-',
 't': '27/Aug/2017:06:52:11 +0000',
 'req': 'GET /marcus/essay/st&h2.html HTTP/1.1',
 'status': '200',
 'size': '3881',
 'ref': '-',
 'ua': 'Mozilla/5.0 (iPhone; CPU iPhone OS 7_0 like Mac OS X)\nAppleWebKit/537.51.1 (KHTML, like Gecko) Version/7.0 Mobile/11A465 Safari/9537.53\n(compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)'}

How do we extract the fields from a full log file? The vanilla Python code is

with open(myfile) as f:
    for line in f:
        m = re.match(pattern, line)
        # store the fields from m.groups() or m.groupdict() somewhere appropriate
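
One way to store them is to build a list of dictionaries and then a dataframe (a sketch; the filename is the same log file used in the numpy example below):

rows = []
with open('data/webaccess_short.log') as f:
    for line in f:
        m = re.match(pattern, line)
        if m is not None:              # skip any line that doesn't match the pattern
            rows.append(m.groupdict())
df = pandas.DataFrame(rows)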

Alternatively, numpy has a handy shortcut for reading in an entire file and splitting it via a regular expression:

In [9]:
# Split the file into an array, one row per line, one column per field in the pattern
t = [('ip',str,30), ('client',str,10), ('userid',str,10), ('timestamp',str,80), 
     ('req',str,300), ('status',int), ('size',int), 
     ('ref',str,300), ('uagent',str,300)]
df = np.fromregex('data/webaccess_short.log', pattern, dtype=t)

# Make a dictionary out of the columns, according to the named fields in the pattern
df = pandas.DataFrame({v: df[n] for (v,_),n in zip(re.compile(pattern).groupindex.items(), df.dtype.names)})

df.sample(4)
Out[9]:
ip client user t req status size ref ua
362 207.46.13.85 - - 27/Aug/2017:08:26:01 +0000 GET /damon/recipe/breadbutterpud2 HTTP/1.1 200 1635 - Mozilla/5.0 (iPhone; CPU iPhone OS 7_0 like Ma...
224 216.244.66.250 - - 27/Aug/2017:08:02:49 +0000 GET /robots.txt HTTP/1.1 200 359 - Mozilla/5.0 (compatible; DotBot/1.1; http://ww...
142 86.183.91.158 - - 27/Aug/2017:07:50:23 +0000 GET /favicon.ico HTTP/1.1 404 503 - MobileSafari/602.1 CFNetwork/811.5.4 Darwin/16...
243 73.246.74.104 - - 27/Aug/2017:08:08:02 +0000 GET /irene/cross/12-jesus-dies-on-the-cross.jp... 200 77292 https://www.bing.com/ Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...

Reading json from a web data service¶

More and more forward-thinking companies and government services make data available by simple web requests. Here is an example: stop-and-search data from data.police.uk.

The first step is to import the Python module for making web requests. When I'm developing data code I like to build it up in small steps, which means lots of repeated requests, so I also use requests_cache, a Python module that caches responses. This means I don't hammer the service unnecessarily.

In [15]:
import requests, requests_cache
requests_cache.install_cache('data/stopsearch')

The API documentation tells us the URL for fetching a list of available data. Let's try it.

In [16]:
# Make the request. This should print out <Response [200]>, meaning successfully retrieved
AVAILABILITY_URL = 'https://data.police.uk/api/crimes-street-dates'
stations_resp = requests.get(AVAILABILITY_URL)
stations_resp
Out[16]:
<Response [200]>

Now let's look at the body of the response. It's likely to be very long, so we'll only print out the first 300 characters.

In [17]:
stations_resp.text[:300]
Out[17]:
'[{"date":"2023-09","stop-and-search":["avon-and-somerset","bedfordshire","btp","cambridgeshire","city-of-london","cleveland","cumbria","derbyshire","dorset","durham","essex","gloucestershire","hampshire","hertfordshire","kent","lancashire","leicestershire","lincolnshire","merseyside","norfolk","nort'

It looks like JSON, "JavaScript Object Notation", a common format for web data services.

It's easy to convert it into Python dictionaries and lists, with requests.get(...).json(). Now we can explore what it contains. (Alternatively, just read the web service documentation, if we trust it!)

In [18]:
x = requests.get(AVAILABILITY_URL).json()

print(type(x))      # x = [...]
print(type(x[0]))   # x = [{...}, ...]
print(x[0].keys())  # x = [{date, stop-and-search}, ...]
print(x[0])         # x = [{date, stop-and-search:[area,area,...]}, ...]
<class 'list'>
<class 'dict'>
dict_keys(['date', 'stop-and-search'])
{'date': '2023-09', 'stop-and-search': ['avon-and-somerset', 'bedfordshire', 'btp', 'cambridgeshire', 'city-of-london', 'cleveland', 'cumbria', 'derbyshire', 'dorset', 'durham', 'essex', 'gloucestershire', 'hampshire', 'hertfordshire', 'kent', 'lancashire', 'leicestershire', 'lincolnshire', 'merseyside', 'norfolk', 'north-wales', 'north-yorkshire', 'northamptonshire', 'northumbria', 'south-yorkshire', 'staffordshire', 'suffolk', 'surrey', 'sussex', 'thames-valley', 'west-mercia', 'west-midlands', 'west-yorkshire']}

Pandas has a pretty flexible command for converting nested JSON into nice sane dataframes, pandas.json_normalize.

In [19]:
availability = pandas.json_normalize(x, 'stop-and-search', 'date').rename(columns={0:'force'})
availability
Out[19]:
force date
0 avon-and-somerset 2023-09
1 bedfordshire 2023-09
2 btp 2023-09
3 cambridgeshire 2023-09
4 city-of-london 2023-09
... ... ...
1411 warwickshire 2020-10
1412 west-mercia 2020-10
1413 west-midlands 2020-10
1414 west-yorkshire 2020-10
1415 wiltshire 2020-10

1416 rows × 2 columns

The police API documentation tells us how to request records for a given police force and month, using a url of the form

https://data.police.uk/api/stops-force?force=avon-and-somerset&date=2017-01

Let's fetch them all. For each item we'll fetch the data, turn it into a dataframe (again using pandas.json_normalize), and then we'll bind everything together.

In [20]:
import urllib
BASE_URL = 'https://data.police.uk/api/'
STOPSDATA_URL = urllib.parse.urljoin(BASE_URL, 'stops-force')
availability['url'] = [STOPSDATA_URL + '?' + urllib.parse.urlencode({'force':f, 'date':d}) 
                       for f,d in zip(availability.force, availability.date)]

def get_dataframe(q):
    url,date,force = (q.url, q.date, q.force)
    print(url + " ...", end="\r")
    response = requests.get(url)
    x = response.json()
    df = pandas.json_normalize(x, sep='_')
    df.insert(0, 'month', date)
    df.insert(0, 'force', force)
    return df

# only fetch a little, for illustration purposes
df = [get_dataframe(r) for i,r in availability.iloc[:10].iterrows()]
stopsearch = pandas.concat(df, axis=0, ignore_index=True, sort=False)
https://data.police.uk/api/stops-force?force=durham&date=2023-09 ...
In [21]:
stopsearch.sample(3)
Out[21]:
force month age_range outcome involved_person self_defined_ethnicity gender legislation outcome_linked_to_object_of_search datetime ... officer_defined_ethnicity type operation_name object_of_search outcome_object_id outcome_object_name location_latitude location_street_id location_street_name location_longitude
720 bedfordshire 2023-09 18-24 A no further action disposal True White - English/Welsh/Scottish/Northern Irish/... Male Misuse of Drugs Act 1971 (section 23) None 2023-09-15T22:03:42+00:00 ... White Person search None Controlled drugs bu-no-further-action A no further action disposal NaN NaN NaN NaN
332 avon-and-somerset 2023-09 over 34 True Other ethnic group - Not stated Female Police and Criminal Evidence Act 1984 (section 1) None 2023-09-25T00:00:00+00:00 ... White Person search None Article for use in theft 51.456677 2278864.0 On or near Nightclub -2.592162
3959 durham 2023-09 25-34 A no further action disposal True White - English/Welsh/Scottish/Northern Irish/... Male Misuse of Drugs Act 1971 (section 23) False 2023-09-03T02:00:00+00:00 ... White Person search None Controlled drugs bu-no-further-action A no further action disposal 54.727183 2166006.0 On or near Parking Area -1.252132

3 rows × 23 columns

Scraping a website with xpath¶

There are fascinating stories to be discovered from public data, and sometimes you have to work to scrape it from web pages. Here's an account by a BBC data journalist. We'll work with a very simple example: extracting results of the Oxford / Cambridge boat race from the Wikipedia table.

I recommend using XPath queries from the lxml module. XPath is a powerful mini-language for extracting data from hierarchical documents, with wide support across many programming languages — think of it as regular expressions but for html rather than plain text. If you want to scrape websites then it's worth finding a tutorial and learning XPath. For this course, we'll just see how to use XPath in Python.

The first step is to install lxml, which is not included with Python.

!pip install lxml

Now we'll fetch the web page and parse the contents. Most web pages are badly-formatted html (sections not properly closed, etc.), and lxml.html.fromstring makes a reasonable attempt to make sense of it.

In [22]:
import requests
import lxml.html

BOATRACE_URL = 'https://en.wikipedia.org/wiki/List_of_The_Boat_Race_results'
resp = requests.get(BOATRACE_URL)
doc = lxml.html.fromstring(resp.content)

This gives us doc, the root <html> element, which we can inspect.

In [23]:
print(doc.tag)               # the type of element
print(len(doc))              # the number of children
print([n.tag for n in doc])  # tags of its children, <head> and <body>
print(doc.attrib)            # get the attributes, e.g. <html class="client-nojs" lang="en" dir="ltr">
print(doc.text, doc.tail)    # text directly inside this element, and text just after it
html
2
['head', 'body']
{'class': 'client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-zebra-design-disabled vector-feature-custom-font-size-clientpref-0 vector-feature-client-preferences-disabled vector-feature-typography-survey-disabled vector-toc-available', 'lang': 'en', 'dir': 'ltr'}

 None
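
Before picking out the results table from the real page, here's a feel for XPath query syntax on a toy document (this snippet is invented purely for illustration):

toy = lxml.html.fromstring('''
<html><body>
  <table>
    <tr><td><a href="/race1841">1841</a></td><td>Cambridge</td></tr>
    <tr><td><a href="/race1842">1842</a></td><td>Oxford</td></tr>
  </table>
</body></html>''')

print(toy.xpath('//table//tr'))        # all <tr> elements anywhere under a <table>
print(toy.xpath('//a/@href'))          # attribute values come back as strings
print(toy.xpath('//tr/td[2]/text()'))  # text of the second <td> in each row (XPath counts from 1)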

We want to pull out a particular element from the document, namely the table with boat race results, so we need to work out how to refer to it in XPath. The Chrome web browser has a handy tool to help with this. Go to the page you're interested in, and click on … | More tools | Developer Tools. Click on the element-selector button at the top left:

Go back to the web page, and click on a piece close to what you want to select; I clicked on the top left cell of the table. Then go back to the developer tools window and navigate to the exact element you want, which here is the table. Right-click it and choose Copy | Copy XPath. It gave me the XPath location `//*[@id="mw-content-text"]/div[1]/table[2]`. Now we can extract the data.
In [24]:
# Pick out the table element.
# (XPath queries return lists, but I only want one item, hence the [0].)
table = doc.xpath('//*[@id="mw-content-text"]/div[1]/table[2]')[0]

# Get a list of all rows i.e. <tr> elements inside the table.
# Print one, to check things look OK.
rows = table.xpath('.//tr')
print(lxml.etree.tostring(rows[5], encoding='unicode'))

# Extract the timestamp and winner columns.
# The timestamp is in the second child, in a <span> element with a "data-sort-value" attribute.
# The winner is in the third child.
df = {'t': [row[1].xpath('.//span[@data-sort-value]')[0].text for row in rows[1:]],
      'winner': [row[2].text for row in rows[1:]]}
df = pandas.DataFrame(df)
df.iloc[:5]
<tr>
<td><a href="/wiki/The_Boat_Race_1841" title="The Boat Race 1841">5</a></td>
<td><span data-sort-value="000000001841-04-14-0000" style="white-space:nowrap">14 April 1841</span> ‡</td>
<td style="background:#B7E1E4; color:#000;">Cambridge</td>
<td>32:03</td>
<td>22 lengths</td>
<td>1</td>
<td>4
</td></tr>

Out[24]:
t winner
0 10 June 1829 Oxford
1 17 June 1836 Cambridge
2 3 April 1839 Cambridge
3 15 April 1840 Cambridge
4 14 April 1841 Cambridge

You should consider the ethics of your web scraping. Here are some thoughts from Sophie Chou at the MIT Media Lab, and from the data journalist Näel Shiab.

Querying an SQL database¶

Once your data is in an SQL database, access is easy. Here's an example with PostgreSQL, whose queries are written in its own dialect of SQL.

Databases are usually secured, and you need various credentials to log in. It's good practice to store our code in a repository, but bad practice to store credentials there too. Instead, we can store credentials in a separate file that isn't checked in to the repository. I like to store credentials in a JSON file, something like this:

{"user": "SPQR", "password": "TOPSECRET", "host": "***", "dbname": "***"}

which is easy to load into Python as a dictionary. Then I can use the fields of this dictionary as arguments to psycopg2.connect, to establish the connection.

In [37]:
import psycopg2    # module for connecting to a Postgresql database
import json        # standard module for reading json files

creds = json.load(open('res/secret_creds.json'))
conn = psycopg2.connect(**creds)

We can run arbitrary SQL queries, and retrieve the results as a pandas dataframe. Pass in parameters with the %(name)s quoting mechanism, to keep ourselves safe from SQL injection attacks.

In [35]:
cmd = '''
SELECT *
FROM flood_stations AS s JOIN flood_measures AS m ON (m.station_uri = s.uri) 
WHERE river = %(river)s OR town = %(town)s
LIMIT 3
'''
pandas.read_sql(cmd, conn, params={'river': 'River Cam', 'town': 'Cambridge'})
Out[35]:
index uri label id catchment river town lat lng index measure_id uri station_uri qualifier parameter period unit valuetype low high
0 345 http://environment.data.gov.uk/flood-monitorin... Great Chesterford E21778 Cam and Ely Ouse (Including South Level) River Cam Great Chesterford 52.061730 0.194279 397 398 http://environment.data.gov.uk/flood-monitorin... http://environment.data.gov.uk/flood-monitorin... Stage Water Level 900.0 m instantaneous 0.109 0.333
1 800 http://environment.data.gov.uk/flood-monitorin... Weston Bampfylde 52113 Parrett, Brue and West Somerset Streams River Cam Weston Bampfylde 51.023159 -2.565568 918 919 http://environment.data.gov.uk/flood-monitorin... http://environment.data.gov.uk/flood-monitorin... Stage Water Level 900.0 m instantaneous 0.026 0.600
2 1272 http://environment.data.gov.uk/flood-monitorin... Cambridge Baits Bite E60101 Cam and Ely Ouse (Including South Level) River Cam Milton 52.236542 0.176925 1454 1455 http://environment.data.gov.uk/flood-monitorin... http://environment.data.gov.uk/flood-monitorin... Stage Water Level 900.0 mASD instantaneous 0.218 0.294
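
When we've finished querying, it's good practice to close the database connection:

conn.close()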