#!/usr/bin/python
# 
# This script is used to parse the log file generated by the crawler.
# Currently, it maps each IP address to its geo-location. The file
# and its structure might be reorganized in the future when more
# features are needed.
#
# By Liang Wang @ Dept. Computer Science, University of Helsinki
# 2010.12.01
#


import os, sys
import pickle
import sqlite3 as sqlite

# Default path of the SQLite database file that holds the IP-range table.
IP_DB_FILE = "./ip.db"


class IPDB(object):
    """Look up geo-location records for IPv4 addresses in a SQLite database.

    The database must contain a table named ``iptable`` with an
    ``ip_start`` column.  Instances can be used as context managers so the
    connection is released deterministically::

        with IPDB() as db:
            info = db.ip2loc("1.2.3.4")
    """

    def __init__(self, db=IP_DB_FILE):
        """Open the SQLite database at path ``db`` and create a cursor."""
        self.conn = sqlite.connect(db)
        self.cur = self.conn.cursor()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Returns None so that any exception raised in the body propagates.
        self.close()

    def close(self):
        """Close the cursor and the underlying database connection."""
        self.cur.close()
        self.conn.close()

    def ip2loc(self, ip):
        """Return the location record for the dotted-quad string ``ip``.

        The row with the greatest ``ip_start`` not exceeding the address is
        selected.  Returns a dict built from that row, or ``None`` when no
        row qualifies.  Raises ``ValueError`` if ``ip`` is not a valid IPv4
        address (see ``ip2int``).

        NOTE(review): only the lower bound of the range is checked, so an
        address lying in a gap between ranges is attributed to the preceding
        range -- confirm whether the table has an upper-bound column that
        should also be tested.
        """
        i = self.ip2int(ip)
        self.cur.execute("select * from iptable where ip_start<=? "
                         "order by ip_start desc limit 1;", (i,))
        r = self.cur.fetchone()
        if r is None:
            return None
        # Columns are addressed by position because the query uses
        # "select *"; presumably column 0 is ip_start -- verify against the
        # table schema.  The misspelled keys "longtitude" and "metrocodde"
        # are kept as-is because callers may look them up by these names.
        return {"ip": ip, "country_code": r[1], "country_name": r[2],
                "region_code": r[3], "region_name": r[4], "city": r[5],
                "zipcode": r[6], "latitude": r[7], "longtitude": r[8],
                "metrocodde": r[9]}

    def ip2int(self, ip):
        """Convert a dotted-quad IPv4 string to its 32-bit integer value.

        Raises ``ValueError`` when ``ip`` does not consist of exactly four
        decimal octets, each in the range 0..255.
        """
        octets = ip.split(".")
        if len(octets) != 4:
            raise ValueError("invalid IPv4 address: %r" % (ip,))
        k = 0
        for octet in octets:
            n = int(octet)  # raises ValueError on non-numeric input
            if not 0 <= n <= 255:
                raise ValueError("invalid IPv4 address: %r" % (ip,))
            k = (k << 8) + n
        return k

    def debug(self):
        """Placeholder for ad-hoc test code; does nothing."""
        pass


class Parser(object):
    """Aggregate crawler nodes by geographic location.

    ``nodes`` arguments are dicts whose values are sequences of node
    records; each record is a dict with at least the keys ``"id"`` and
    ``"host"``.  How duplicate records are collapsed before counting is
    selected by the mode passed to the constructor or to ``set_enum``:

    * ``"-id"``       -- one node per record group (its first record)
    * ``"-idip"``     -- see ``enum_idip``
    * ``"-idipport"`` -- every record is a node
    """

    def __init__(self, dup="-id"):
        """Open the default IP database and select the enumeration mode."""
        self.ipdb = IPDB()
        self.set_enum(dup)

    def set_enum(self, m):
        """Select the enumerator used by the aggregation methods.

        Raises ``ValueError`` for an unrecognised mode instead of silently
        leaving ``enum_func`` unset.
        """
        enumerators = {
            "-id": self.enum_id,
            "-idip": self.enum_idip,
            "-idipport": self.enum_idipport,
        }
        if m not in enumerators:
            raise ValueError("unknown mode %r, expected one of: %s"
                             % (m, ", ".join(sorted(enumerators))))
        self.enum_func = enumerators[m]

    def citiesInCountry(self, nodes, country_code):
        """Count nodes per city among those located in ``country_code``.

        Returns ``(geo, err)``: ``geo`` maps city name to node count (the
        ``"unknown"`` entry is always present and starts at 0) and ``err``
        is the number of nodes whose lookup raised an exception.  Hosts that
        resolve to an empty city name are printed for inspection.
        """
        geo = {"unknown": 0}
        err = 0
        # Honour the mode chosen in set_enum rather than a fixed enumerator.
        for node in self.enum_func(nodes.values()):
            try:
                info = self.ipdb.ip2loc(node["host"])
                if info and info["country_code"] == country_code:
                    city = info["city"]
                    if len(city) == 0:
                        print(node["host"])
                    geo[city] = geo.get(city, 0) + 1
            except Exception as errMsg:
                # Best effort: report the failure, count it, and carry on.
                print(errMsg)
                err += 1
        return geo, err

    def geoDistribution(self, nodes, col="country_name"):
        """Count nodes per value of the location field ``col``.

        Returns ``(geo, err)``: ``geo`` maps each value of ``col`` to a node
        count, with nodes for which the database has no record counted under
        ``"unknown"``; ``err`` is the number of nodes whose lookup raised an
        exception.
        """
        geo = {"unknown": 0}
        err = 0
        # Honour the mode chosen in set_enum rather than a fixed enumerator.
        for node in self.enum_func(nodes.values()):
            try:
                info = self.ipdb.ip2loc(node["host"])
                if info:
                    geo[info[col]] = geo.get(info[col], 0) + 1
                else:
                    geo["unknown"] += 1
            except Exception as errMsg:
                # Best effort: report the failure, count it, and carry on.
                print(errMsg)
                err += 1
        return geo, err

    def enum_id(self, nodes):
        """Each (ID) maps to a distinct node.

        Yields the first record of every non-empty group in ``nodes``.
        """
        for n in nodes:
            if n:  # an empty group has no representative record
                yield n[0]

    def enum_idip(self, nodes):
        """Each (ID,ip) maps to a distinct node.

        Yields, for every group in ``nodes``, the first record seen for each
        distinct ``"id"`` value within that group.

        NOTE(review): de-duplication looks only at ``"id"``; the host is not
        part of the key despite the method name -- confirm this is intended.
        """
        for n in nodes:
            s = set()
            for i in n:
                if i["id"] not in s:
                    s.add(i["id"])
                    yield i

    def enum_idipport(self, nodes):
        """Each (ID,ip,port) maps to a distinct node.

        Yields every record of every group in ``nodes``.
        """
        for n in nodes:
            for i in n:
                yield i


def rawdata(f):
    """Load and return the pickled object stored in file ``f``, unprocessed.

    ``f`` is the path of the pickle file.  The file is opened in binary
    mode, which pickle data requires for binary protocols, and is closed
    before returning.

    WARNING: unpickling can execute arbitrary code; only load log files
    that come from a trusted source.
    """
    with open(f, "rb") as fh:
        return pickle.load(fh)


if __name__ == "__main__":
    # The command line has a fixed format: a duplicate-handling flag
    # followed by the path of the pickled log file.
    if len(sys.argv) < 3:
        print("Usage: %s -[id|idip|idipport] logfile" % sys.argv[0])
        sys.exit(1)
    nodes = rawdata(sys.argv[2])
    geo, err = Parser(sys.argv[1]).geoDistribution(nodes)
    # List locations from the highest node count to the lowest.  Each
    # print call takes a single pre-formatted string so the output is the
    # same whether print is a statement (Python 2) or a function (Python 3).
    for k, v in sorted(geo.items(), key=lambda x: -x[1]):
        print("%s : %s" % (k, v))
    print("%s errors." % err)
