#! /usr/bin/env python
###############################################################################
#                                                                             #
#   Copyright 2005 University of Cambridge Computer Laboratory.               #
#                                                                             #
#   This file is part of Nprobe.                                              #
#                                                                             #
#   Nprobe is free software; you can redistribute it and/or modify            #
#   it under the terms of the GNU General Public License as published by      #
#   the Free Software Foundation; either version 2 of the License, or         #
#   (at your option) any later version.                                       #
#                                                                             #
#   Nprobe is distributed in the hope that it will be useful,                 #
#   but WITHOUT ANY WARRANTY; without even the implied warranty of            #
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the             #
#   GNU General Public License for more details.                              #
#                                                                             #
#   You should have received a copy of the GNU General Public License         #
#   along with Nprobe; if not, write to the Free Software                     #
#   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA #
#                                                                             #
###############################################################################


##############################################################################
#
# Run checks on filetypes (MIME)
#
##############################################################################

from os.path import isfile, join
import sys

from nprobe import http_server_objtype_string
import fmagic

##############################################################################

# Default path of the magic database; FileTypes uses it as the default for
# its magic_file argument and passes it to fmagic.fmagic().
MAGIC_DEF = '/usr/share/magic'

##############################################################################

class FileTypes:
    """Check the MIME types claimed for HTTP objects against file magic.

    Every object is a file below ``objdir``.  Its claimed type (the string
    returned by ``http_server_objtype_string`` for the recorded type code)
    is compared with the MIME type reported through the ``fmagic`` module;
    matches and mismatches are tallied and summarised by ``report_diffs``.

    Callers invoke ``check_obtype(obtype, fnm)``, which is bound by the
    constructor to ``check_ob`` (whole-file check) or ``check_ob_iter``
    (check with progressively more bytes).

    State kept on the instance:
      obj_checked -- objects whose file was found and examined
      obj_ok      -- examined objects whose magic type equalled the claim
      objdiffs    -- {type: [mismatch count, {other type: count or id list}]}
      nofiles     -- objects whose file was missing
      nofilelist  -- names of missing files (only if report_diff_files)
      mlens       -- byte counts tried in iterative mode ([] otherwise)
      toks        -- matches per type: an int per type, or in iterative
                     mode a list indexed by the byte count that matched
      oks         -- iterative mode only: matches indexed by byte count
    """

    def __init__(self, magic_file=None, objdir='', corr_by_magic=0,
                 mlengths=None, verbose=0, report_diff_files=0):
        """Load the magic database and select the checking strategy.

        magic_file        -- path of the magic database; None selects
                             MAGIC_DEF.  The process exits with status 1
                             if the path is not an existing file.
        objdir            -- directory joined in front of every file name
        corr_by_magic     -- if true, mismatches are grouped by the magic
                             type instead of the claimed type
        mlengths          -- empty/None for whole-file checks, otherwise a
                             sequence (min, max, inc) of byte counts: with
                             inc == 0 the count doubles from min until it
                             reaches max, otherwise it steps by inc
        verbose           -- if true, print each mismatch and missing file
        report_diff_files -- if true, remember which files mismatched or
                             were missing so report_diffs can list them

        Raises ValueError if mlengths cannot yield any byte count, or
        would double a non-positive minimum forever.
        """
        # Resolved at call time so that None can be passed through by
        # wrappers that have no opinion about the database location.
        if magic_file is None:
            magic_file = MAGIC_DEF

        self.objdir = objdir
        self.corr_by_magic = corr_by_magic
        self.report_diff_files = report_diff_files
        self.verbose = verbose

        self.obj_checked = 0
        self.obj_ok = 0
        self.objdiffs = {}
        self.nofiles = 0
        self.nofilelist = []

        if not isfile(magic_file):
            print('Error - magic file %s does not exist' % (magic_file,))
            sys.exit(1)

        self.magic = fmagic.fmagic(fmagic.MAGIC_MIME, magic_file)

        if not mlengths:
            self.check_obtype = self.check_ob
            self.mlens = []
        else:
            self.check_obtype = self.check_ob_iter
            self.mlens = self._probe_lengths(mlengths)
            # Indexed directly by byte count, hence sized by the largest.
            self.oks = [0]*(self.mlens[-1] + 1)
        self.toks = {}

##############################################################################

    def _probe_lengths(self, mlengths):
        """Expand (min, max, inc) into the list of byte counts to try."""
        minb = mlengths[0]
        maxb = mlengths[1]
        inc = mlengths[2]

        if inc == 0:
            # Doubling a value <= 0 never reaches maxb.
            if minb <= 0 and minb < maxb:
                raise ValueError('mlengths minimum must be positive when '
                                 'doubling (got %r)' % (minb,))
            lens = [minb]
            while minb < maxb:
                minb *= 2
                lens.append(minb)
            return lens

        lens = list(range(minb, maxb + inc, inc))
        if not lens:
            raise ValueError('mlengths %r yields no byte counts'
                             % (tuple(mlengths[:3]),))
        return lens

##############################################################################

    def _bare_mimetype(self, raw):
        """Return raw stripped, cut at the first ';' and then the first ','."""
        return raw.strip().split(';')[0].split(',')[0]

##############################################################################

    def get_mimetype(self, objfnm):
        """Return the trimmed MIME type that magic reports for file objfnm."""
        return self._bare_mimetype(self.magic.magic_file(objfnm))

##############################################################################

    def _note_missing(self, fnm):
        """Account for an object whose file does not exist."""
        self.nofiles += 1
        if self.verbose:
            print('NO FILE %s' % (fnm,))
        if self.report_diff_files:
            self.nofilelist.append(fnm)

##############################################################################

    def check_ob_iter(self, obtype, fnm):
        """Check one object, trying each byte count in self.mlens in turn.

        obtype -- type code, turned into the claimed MIME string by
                  http_server_objtype_string
        fnm    -- file name relative to self.objdir

        The first byte count whose magic type equals the claim is recorded
        in self.oks and self.toks.  If none matches, the mismatch is saved
        with the magic type obtained from the last (final) byte count.
        """
        objfnm = join(self.objdir, fnm)
        if not isfile(objfnm):
            self._note_missing(fnm)
            return

        self.obj_checked += 1
        claimed = http_server_objtype_string(obtype)
        matched = 0

        # Only the descriptor is handed on, so open in binary mode.
        f = open(objfnm, 'rb')
        try:
            # NOTE(review): presumably buffers up to mlens[-1] bytes from
            # the descriptor for the magic_own_buffer calls -- confirm
            # against the fmagic bindings.
            self.magic.load_buffer(self.mlens[-1], f.fileno())
            for nbytes in self.mlens:
                mtype = self._bare_mimetype(
                    self.magic.magic_own_buffer(nbytes))
                if mtype == claimed:
                    matched = 1
                    break
        finally:
            # Close even if a magic call raises.
            f.close()

        if matched:
            self.obj_ok += 1
            self.oks[nbytes] += 1
            per_type = self.toks.setdefault(claimed,
                                            [0]*(self.mlens[-1] + 1))
            per_type[nbytes] += 1
        else:
            self.save_diff(claimed, mtype, fnm)

##############################################################################

    def check_ob(self, obtype, fnm):
        """Check one object using magic on the whole file.

        obtype -- type code, turned into the claimed MIME string by
                  http_server_objtype_string
        fnm    -- file name relative to self.objdir
        """
        objfnm = join(self.objdir, fnm)
        if not isfile(objfnm):
            self._note_missing(fnm)
            return

        self.obj_checked += 1
        claimed = http_server_objtype_string(obtype)
        mtype = self.get_mimetype(objfnm)
        if mtype == claimed:
            self.obj_ok += 1
            self.toks[claimed] = self.toks.get(claimed, 0) + 1
        else:
            self.save_diff(claimed, mtype, fnm)

##############################################################################

    def save_diff(self, type, ft, fnm):
        """Record one mismatch between a claimed type and a magic type.

        type -- claimed MIME type
        ft   -- MIME type identified by magic
        fnm  -- name of the object file

        Mismatches are grouped under the claimed type, or under the magic
        type when self.corr_by_magic is set.  With self.report_diff_files
        the file name must be two integers joined by a single dot (it is
        stored as an int pair); any other name raises ValueError.
        """
        if self.corr_by_magic:
            type, ft = ft, type
            s1 = 'by magic'
            s2 = 'claimed'
        else:
            s1 = 'claimed'
            s2 = 'by magic'

        if self.verbose:
            print('%10s %s %s - %s %s' % (fnm, s1, type, s2, ft))

        entry = self.objdiffs.setdefault(type, [0, {}])
        entry[0] += 1
        if self.report_diff_files:
            head, tail = fnm.split('.')
            entry[1].setdefault(ft, []).append((int(head), int(tail)))
        else:
            entry[1][ft] = entry[1].get(ft, 0) + 1

##############################################################################

    def report_diffs(self, file=None):
        """Print a summary of the checks, optionally copying it to file.

        file -- object with a write() method, or None.  Summary lines go
                to stdout and to file; the per-file listings and the
                by-type byte table go to file only when one is given,
                otherwise to stdout.

        Percentages whose denominator is zero (e.g. no objects checked)
        are reported as 0.000.
        """
        def report(s):
            # Always on stdout, mirrored to the file when there is one.
            print(s)
            if file:
                file.write(s + '\n')

        def report_b(s):
            # Bulk output: the file if given, otherwise stdout.
            if file:
                file.write(s + '\n')
            else:
                print(s)

        def percent(part, whole):
            if not whole:
                return 0.0
            return float(part*100)/whole

        if not self.corr_by_magic:
            corr_str = 'HTTP header claimed type'
            s1 = 'claimed'
            s2 = 'magic'
        else:
            corr_str = '\'file magic\''
            s2 = 'claimed'
            s1 = 'magic'

        # One (total, type, [(subtotal, other type, file ids)]) per type.
        difflist = []
        for otype, (tot, fts) in self.objdiffs.items():
            if self.report_diff_files:
                subs = [(len(fl), ft, fl) for ft, fl in fts.items()]
            else:
                subs = [(subtot, ft, []) for ft, subtot in fts.items()]
            difflist.append((tot, otype, subs))
        difflist.sort()

        print('')
        ndiffs = self.obj_checked - self.obj_ok
        report('HTTP claimed object types against \'magic\' identified types:-\n')
        report('%d Objects checked, %d (%.3f%%) differ, %d nofiles'
               % (self.obj_checked, ndiffs,
                  percent(ndiffs, self.obj_checked), self.nofiles))
        report('  (Differences correlated by %s)\n' % (corr_str))

        report('%35s%15s%11s%11s%16s\n'
               % ((s1 + ' type').center(35), 'Non-matches', 'Matches',
                  'Total', '%Non-matches'))
        for tot, otype, fts in difflist:
            # A type that never matched has no entry in self.toks.
            if self.mlens:
                totok = sum(self.toks.get(otype, ()))
            else:
                totok = self.toks.get(otype, 0)
            totall = tot + totok
            report('%s%15d%11d%11d%12.3f'
                   % (otype.center(35), tot, totok, totall,
                      percent(tot, totall)))

            fts.sort()

            # The label is shown on the first sub-line only.
            id_string = s2 + ' - '
            for subtot, ft, fl in fts:
                report('  %40s%10d' % (id_string + ft, subtot))
                id_string = ''
                if self.report_diff_files:
                    fl.sort()
                    for f in fl:
                        report_b('      %d.%d' % (f[0], f[1]))
            report('\n')

        report('\n')

        if self.report_diff_files:
            report_b('Nofiles:-')
            for f in self.nofilelist:
                report_b('  %s' % (f))

        report('\n')

        if self.mlens:
            report('No. bytes examined for successful match (all types):-\n')
            report('%10s%10s%10s' % ('bytes', 'matches', '% whole'))
            accum = 0
            for i in self.mlens:
                this = self.oks[i]
                accum += this
                report('%10d %10d %7.3f'
                       % (i, this, percent(accum, self.obj_checked)))
            report_b('\n')
            report_b('No. bytes examined for successful match (by type):-\n')
            report('%30s%10s%10s' % ('Type'.center(30), 'bytes', 'matches'))
            by_type = list(self.toks.items())
            by_type.sort()
            for otype, llist in by_type:
                typestr = otype
                for i in self.mlens:
                    if llist[i]:
                        report_b('%30s%10d%10d' % (typestr, i, llist[i]))
                        typestr = ''
        report('\n\n')

###############################################################################
###############################################################################

def main():
    """Script entry point: build a FileTypes with default arguments.

    The instance is not used further; nothing else is done here.
    """
    FileTypes()


        

###############################################################################
###############################################################################

        


# Run main() only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == '__main__':
    main()
