#! /usr/bin/env python
###############################################################################
#                                                                             #
#   Copyright 2005 University of Cambridge Computer Laboratory.               #
#                                                                             #
#   This file is part of Nprobe.                                              #
#                                                                             #
#   Nprobe is free software; you can redistribute it and/or modify            #
#   it under the terms of the GNU General Public License as published by      #
#   the Free Software Foundation; either version 2 of the License, or         #
#   (at your option) any later version.                                       #
#                                                                             #
#   Nprobe is distributed in the hope that it will be useful,                 #
#   but WITHOUT ANY WARRANTY; without even the implied warranty of            #
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the             #
#   GNU General Public License for more details.                              #
#                                                                             #
#   You should have received a copy of the GNU General Public License         #
#   along with Nprobe; if not, write to the Free Software                     #
#   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA #
#                                                                             #
###############################################################################


##############################################################################
#
# Collect and collate Web Server and User Agent Data from HTTP Header Fields
#

import re
import sys
from sys import argv
import getopt

# Percentage of the total instance count that WebAgents.report_h() lists
# individually; once that share has been reported, the remaining recognised
# agents are only summarised.
REP_TARGET_PERCENT = 100

class WebAgents:

    def __init__(self):

        self.sad = {}
        self.uad = {}

        self.nservers_seen = []
        self.nuagents_seen = []
        
        self.unrec_servers = []
        self.unrec_uagents = []

        self.compile_servers()
        self.compile_uagents()

###############################################################################

    def compile_servers(self):

        self.smatches = []

        for r in [

            '^(?P<marque>Apache)'
            '('
            '(?P<vers>/\d+\.\d+\.\d+)?'
            '.+?'
            '(\s*\((?P<os1>.*?)\))?'
            '(\s*\((?P<os2>.*?)\))?'
            '.*'
            ')?'
            '(?P<premarque>)',

            '(?P<marque>Microsoft-IIS)/'
            '(?P<vers>\d+\.\d+)'
            '(?P<os1>)?'
            '(?P<os2>)?'
            '.*'
            '(?P<premarque>)',
            
            '(?P<marque>WindWeb|(((Allegro-Software|ZyXEL)-)?RomPager)|Boa'
            '|Netscape|thttpd|WebSTAR|(S|s)quid|Lotus-Domino'
            '|Caudium|AV|CERN|3Com|ARGUS)'
            '(-(Brew|Commerce|FastTrack))?'
            '(/(v|V)?(?P<vers>\d+(\.\d+)+)(pre\d)?(\.STABLE\d)?)?'
            '(?P<os1>)?'
            '(?P<os2>)?'
            '.*'
            '(?P<premarque>)',
            
            '(?P<marque>(Cougar|APN HTTPD))'
            '(\s(v|V)?(?P<vers>\d+(\.\d+)+))?'
            '(?P<os1>)?'
            '(?P<os2>)?'
            '.*'
            '(?P<premarque>)',
            
            '(?P<marque>(Agranat|Virata)-EmWeb)'
            '(/(?P<vers>R\d+(_\d+)+))?'
            '(?P<os1>)?'
            '(?P<os2>)?'
            '.*'
            '(?P<premarque>)',
            
            '(?P<marque>httpd|Rapidsite/Apa|DCLK|WebLogic)'
            '(-?(Creative|AdSvr|HttpSvr))?'
            '(?P<vers>)?'
            '(?P<os1>)?'
            '(?P<os2>)?'
            '.*'
            '(?P<premarque>)',

            '^(?P<premarque>(IBM_HTTP_(Server|SERVER))|(Stronghold))'
            '/\d(\.\d+)+'
            '\s+'
            '(?P<marque>Apache)'
            '/(?P<vers>\d(\.\d+)+(-dev)?)'
            '.*?'
            '.*?'
            '(\((?P<os1>Unix|Win32)\))|(?P<os2>(RedHat))'
            '.*?',

            '^(?P<premarque>NetCache|Oracle)'
            '.*?'
            '\('
            '(?P<marque>NetApp|Apache)'
            '/(?P<vers>\d(\.\d+)+(\.R\d(D\d)?)?)'
            '.*'
            '(?P<os2>)(?P<os1>)',

            '^(?P<premarque>Oracle)'
            '.*?'
            '(?P<marque>Apache)'
            '/(?P<vers>\d(\.\d+)*)'
            '\s+'
            '\((?P<os1>Win32|Unix)\)'
            '.*'
            '(?P<os2>)'
            

            
            ]:
            self.smatches.append(re.compile(r))
            

###############################################################################

    def compile_uagents(self):

        self.umatches = []

        WINDOWS = '((Windows|Win)\s*(((NT|XP)(\s+\d\.\d)?)|98|95|9x|32|XP|2000|ME|Me|CE|95/98/2000/NT))'
        OTHER_OS = 'Mac_PowerPC|PPC|Linux|Sun|OpenBSD|(OS X)|(IRIX\d*)|SunOS|FreeBSD|OSF1'

        OSS = '%s|%s' % (WINDOWS, OTHER_OS)
        

        #'%s(?#WINDOWS)|Mac_PowerPC'
        for r in [

            '.*?'
            '(?P<premarque>)'
            '(?P<marque>MSIE)'
            '\s+'
            '(?P<vers>\d(\.\d+)+[a-z]?);'
            '.+?'
            '(?P<os1>%s)'
            '.*'
            '(?P<os2>)' % WINDOWS,

            '.*?'
            '\('
            '.*?'
            '(?P<os1>Windows|Macintosh|X11)'
            ';.*?'
            '(?P<os2>%s)'
            '.*?'
            '\)'
            '.*?'
            '(?P<premarque>)'
            '(?P<marque>Netscape|(Gecko(?!/\d+ Netscape)))\d?/'
            '(?P<vers>(\d\.\d+(\.\d*)?)|(\d+))'
            '.*' % OSS,

            '.*?'
            '\('
            '.*?'
            '(?P<premarque>)'
            '(?P<marque>Slurp|grub)'
            '.*?'
            '\)'
            '.*'
            '(?P<vers>)(?P<os1>)(?P<os2>)',

            '(?P<premarque>)'
            '(?P<marque>Googlebot|Scooter|libwww-perl)/'
            '(?P<vers>\d+(\.\d+)*)'
            '.*'
            '(?P<os1>)(?P<os2>)',
            
            '(?P<premarque>)'
            '^(?P<marque>Opera)/'
            '(?P<vers>\d\.\d+)'
            '\s+'
            '\('
            '(?P<os1>%s)'
            '.*?'
            '\)'
            '.*'
            '(?P<os2>)' % OSS,

            '.*?'
            '\(.*?'
            '(?P<os1>%s)'
            '.*?\)'
            '\s+'
            '(?P<premarque>)'
            '(?P<marque>Opera)\s+(?P<vers>\d\.\d+)'
            '.*'
            '(?P<os2>)' % OSS,


            '^Mozilla/\d\.\d+.*?'
            '(?P<premarque>)(?P<marque>)(?P<vers>)'
            '\(.*?'
            '(?P<os1>(Macintosh|Mac|X11))?'
            '.*?'
            '(?P<os2>%s)'
            '.*?'
            '\)'
            '.*' % OSS
            
            ]:
            self.umatches.append(re.compile(r))
            

###############################################################################

    def add_h(self, shost, chost, sa, ua):


        ad = self.uad.setdefault(ua, {})
        n = ad.setdefault(chost, 0)
        ad[chost] = n+1

        ad = self.sad.setdefault(sa, {})
        n = ad.setdefault(shost, 0)
        ad[chost] = n+1

###############################################################################

    def aggregate_h(self):

        def get_os1(match):
            for os in [match.group('os2'), match.group('os1'), 'Not stated']:
                if os:
                    break
            return os

        def get_os2(match):
            os = ''
            spc = ''
            for o in [match.group('os1'), match.group('os2')]:
                if o:
                    os += spc + o
                    spc = ' '
            if not os:
                os = 'Not stated'
            return os

        def list_os(osd):
            oslist = [(n, o) for (o, n) in osd.iteritems()]
            oslist.sort()
            oslist.reverse()
            return oslist


        
        for var, d, matches, get_os in \
            [('servers', self.sad, self.smatches, get_os1),
             ('uagents', self.uad, self.umatches, get_os2)]:

            try:
                unknowns = d[None]
                n = len(unknowns)
                del d[None]
                sdict = {'Not known': [n, {'Not known':n}]}
                tot_inst = n
            except KeyError:
                sdict = {}
                tot_inst = 0

            olist = []
                
            for agent, hdict in d.items():
                nhosts = len(hdict)
                tot_inst += nhosts
                for s in matches:
                    match = s.match(agent)
                    if match:
                        break

                if match:
                    premarque = match.group('premarque')
                    marque = match.group('marque')
                    if premarque:
                        marque = premarque + '/' + marque
                    if not marque:
                        marque = 'Not stated'
                    vers = match.group('vers')
                    os = get_os(match)
                else:
                    marque = 'Unrecognised'
                    vers = 'Not known'
                    os = 'Not known'
                    olist.append((nhosts, agent))
                    #olist.append((agent, nhosts))

                ss = sdict.setdefault(marque, [0, {}])
                ss[0] += nhosts
                n = ss[1].setdefault(os, 0)
                ss[1][os] += nhosts

            sortlist = [(n, agent, list_os(os)) for (agent, (n, os)) in sdict.iteritems()]
            sortlist.sort()
            sortlist.reverse()
            setattr(self, 'n%s_seen' % (var), sortlist)
            setattr(self, '%s_tot_inst' % (var), tot_inst)

            olist.sort()
            olist.reverse()
            setattr(self, 'unrec_%s' % (var), olist)

###############################################################################

    def report_h(self, file=None, verbose=0):

        def write(s):
            if file:
                file.write('%s\n' % s)

        def report(s):
            print s
            write(s)

        def vreport(s):
            if verbose:
                print s
            write(s)

        def vvreport(s):
            if verbose > 1:
                print s
            write(s)


        for label, key in [('Servers', 'servers'), ('User Agents', 'uagents')]:
        #for label, key in [('User Agents', 'uagents')]:

            alist = getattr(self, 'n%s_seen' % (key))
            olist = getattr(self, 'unrec_%s' % (key))
            tot_inst = getattr(self, '%s_tot_inst' % (key))

            print
            report(label)
            report('%s\n' % ('='*len(label)))

            rep_target = (tot_inst*REP_TARGET_PERCENT)/100
            reported = 0
            rec_not_rep = 0
            max_not_rep = 0
            not_rep = []
            
            for n, agent, oslist in alist:
                if reported < rep_target:
                    report('%s %d (%.3f%%)' % (agent, n, (n*100.00)/tot_inst))
                    for n2, o in oslist:
                        vreport('\t%s %d (%.3f%%)' % (o, n2, (n2*100.0)/tot_inst))
                else:
                    not_rep.append((n, agent))
                    rec_not_rep += n
                    max_not_rep = max(max_not_rep, n)
                        
                reported += n

            if rec_not_rep:
                report('\n%s Recognised but not individually reported %d (%.3f%%)\n' % (label, rec_not_rep, (rec_not_rep*100.0)/tot_inst))
                for n, agent in not_rep:
                    vvreport('%s %d' % (agent, n))
                report('\tMaximum instance %d (%.3f%%)\n' % (max_not_rep, (max_not_rep*100.0)/tot_inst))


            max_unrec = 0
            max_unrec_agent = ''

            vvreport('\nUnrecognised %s' % label)
            vvreport('=============%s\n' % ('='*len(label)))
            for n, agent in olist:
                vvreport('%s %d' % (agent, n))
                if n > max_unrec:
                    max_unrec = n
                    max_unrec_agent = agent
            report('\nMaximum unrecognised instance %d (%.3f%%)\n\t\'%s\'\n' % (max_unrec, (max_unrec*100.0)/tot_inst, max_unrec_agent))

###############################################################################

    def save_h(self, dir):

        try:
            f = open(dir + '/agents_seen', 'w')
        except OsError, s:
            print 'np_WebAgents - can\'t create agents file'
            print str(s)
            sys.exit(1)

        for lab, d in [('User Agents', self.uad), ('Servers',  self.sad)]:
            l = []
            for a, hd in d.items():
                l.append((len(hd), a))
            l.sort()
            l.reverse()
            f.write('%s\n\n' % (lab))
            for n, a in l:
                f.write('%s - %d\n' % (a, n))
            f.write('\n\n')

###############################################################################

    def get_h(self, file):

        tm = re.compile('^(?P<type>User Agents|Servers)$')
        am = re.compile('^(?P<agent>.*?) - (?P<n>\d+)$')

        try:
            f = open(file, 'r')
        except IOError, s:
            print 'np_WebAgents - can\'t open agents file'
            print str(s)
            sys.exit(1)

        for l in f.readlines():

            match = am.match(l)
            if match:
                #continue
                #print match.group('agent'), match.group('n')
                hd = {}
                for n in range(int(match.group('n'))):
                    hd[n] = 1
                d[match.group('agent')] = hd
            else:
                match = tm.match(l)
                if match:
                    type = match.group('type')
                    if type == 'User Agents':
                        d = self.uad
                    elif type == 'Servers':
                        d = self.sad
                    else:
                        d = None


###############################################################################

def main():
    """Command-line entry point: np_WebAgents.py [-v ...] [-f outfile] agents_file

    -v          raise the report verbosity (may be repeated)
    -f outfile  also write the report to outfile

    Reads the agents file, aggregates it and prints the report.  Exits with
    status 1 on a bad option, an unopenable output file or a missing agents
    file argument.
    """

    verbose = 0
    out_file = None

    try:
        optlist, args = getopt.getopt(sys.argv[1:], 'vf:')
    except getopt.error as s:
        print('np_WebAgents.py Error')
        print(str(s))
        sys.exit(1)

    try:
        for opt, value in optlist:
            if opt == '-v':
                verbose += 1
            elif opt == '-f':
                try:
                    new_file = open(value, 'w')
                except (IOError, OSError) as s:
                    print('Error')
                    print(str(s))
                    sys.exit(1)
                # A repeated -f replaces the earlier output file.
                if out_file is not None:
                    out_file.close()
                out_file = new_file

        if not args:
            print('np_WebAgents.py Error - Agents file not specified')
            sys.exit(1)

        agents = WebAgents()
        agents.get_h(args[0])
        agents.aggregate_h()
        agents.report_h(file=out_file, verbose=verbose)
    finally:
        # Make sure the report is flushed and the file released on any exit.
        if out_file is not None:
            out_file.close()


        

###############################################################################

        


# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()
