
# Extract trivia information from HTML pages downloaded from IMDb, using XPath queries from Python.

import lxml.html
import glob, os
import pickle

        
# Looks like a sample IMDb title id; nothing visible in this file reads it.
handle = 'tt0319262'

# Collected results: movie id -> record dict, filled in by process_html().
g_output_table = {}

# Running count of pages handled so far; process_html() increments it.
nn = 0


def process_html(movie_id, filename):
    """Parse one saved HTML page and record its title and trivia text.

    Args:
        movie_id: Key under which the record is stored in ``g_output_table``.
        filename: Path of the HTML file passed to ``lxml.html.parse``.

    Side effects:
        Increments the module-level counter ``nn``, prints a one-line
        progress summary, and sets ``g_output_table[movie_id]`` (replacing
        any record already stored under that key).
    """
    global nn
    tree = lxml.html.parse(filename)

    # Every query result is stored exactly as xpath() returns it.
    page_charset = tree.xpath('/html/head/meta/@charset')
    page_title = tree.xpath("/html/head/meta[@property='og:title']/@content")
    trivia_texts = tree.xpath(
        '/html//div[@class="col-xs-12 drop-panel-content"]/p//text()'
    )

    # Count the page only after parsing and querying have succeeded.
    nn += 1
    summary = f'{nn} {movie_id} {page_charset}  {page_title} n_tivs={len(trivia_texts)}'
    print(summary)

    g_output_table[movie_id] = {
        'movie_id': movie_id,
        'title': page_title,
        'trivia': trivia_texts,
    }


# Walk the entries of the working directory and hand every regular file
# whose name starts with "tt" to process_html().
directory = '.'
for filename in os.listdir(directory):
    fname = os.path.join(directory, filename)
    if not os.path.isfile(fname):
        continue  # skip sub-directories and other non-file entries
    if not filename.startswith("tt"):
        continue  # only names beginning with "tt" are processed
    # The file name itself is used as the movie id.
    movie_id = filename
    process_html(movie_id, fname)


# Store data (serialize): write the whole results table to a pickle file.
# The file object gets its own name so that the module-level `handle`
# string is not rebound to a (closed) file object once this block has run.
with open('imdb_trivia.pickle', 'wb') as pickle_file:
    pickle.dump(g_output_table, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

# eof
