#!/usr/local/bin/perl5 
#
# Author: Alain Desilets
# Date: May 1995
#
# Purpose: 
#   Used for analysing a web page and determining wether or not it 
#   contains a compilation of related web resources.
#   are lists of WWW resources
#
#
# The reusable functions defined in this file are (all other functions 
# were not designed for external use):
#
# Function              Purpose
# --------              -------
# IdentifyLists:	Identifies parts of an HTML page which look
#			like a list of WWW resources.
# 
#
# -----------------------------------------------------------------------------
# HISTORY
# $Log: cutList.pl,v $
# Revision 2.1  1995/11/15  16:12:32  alain
# Creation of version 2: This version was formerly know as v0.3. It solves many bugs in the parsing of URLs, but for some reason that results in worse decision trees than with v0.2
#
# Revision 1.1  1995/11/15  15:35:42  alain
# Creation of version 1: This version was formerly known as v0.2. It has bugs related to the  parsing of URLs, but seems to result in better decision trees...
#
# Revision 0.1  1995/11/15  15:35:41  alain
# Initial deposit
#
# -----------------------------------------------------------------------------

push (@INC, '/perl5/lib');
push(@INC, "/home2/httpd/htdocs/IR/perl/");
push(@INC, "/software/perl/lib/");
require 'Long_DBM.pl';
require('/home2/httpd/htdocs/IR/perl/new.pl');
require ('classifyString.pl');
require('mysets.pl');
require('urlPatterns.pl');
require('wwwurl.pl');
require ('/home2/httpd/htdocs/IR/perl/Web_Agent/robot_get.pl');

#
# To perform regression testing on this library, uncomment
# next line
#
#SetTraceSwitchesForRegressionTest();


#
# Index for various fields pertaining to a segment statistics structure
#
$fieldNum = 0;
$SEG_TEXT = $fieldNum;
$fieldNum++;
$SEG_CHAR_COUNT = $fieldNum;
$fieldNum++;
$SEG_IS_REF = $fieldNum;
$fieldNum++;
$SEG_SRV_NAME = $fieldNum;
$fieldNum++;
1;



# sub IdentifyLists
#
# Abstract
#    Identifies which parts of an HTML page look like WWW resource lists
#
# Inputs
#    $fileName: name of HTML file
#
# Standard Output: Lists all the WWW resources files found
#

sub IdentifyLists {
   local ($fileName) = @_;

		# Split the page into URLreference and non-URLreference
		# segments
   @splitInpStr = SegmentHTMLPage ($fileName);

		# Calculate URL reference stats for whole document
   $lastIndex = $#splitInpStr;
   $ptBaseStats = URLReferenceStats (0, $lastIndex, @splitInpStr);
   if ($traceIdentifyLists eq "on") {
      print $nl, "-- IdentifyLists: Run on file ", $fileName, $nl;
      print $nl, "-- IdentifyLists: base statistics: RefRatio, SrvRatio, 
            RefTotal, CharsTotal = ", $nl,
            $$ptBaseStats{'RefRatio'}, " ", $$ptBaseStats{'SrvRatio'}, " ",
            $$ptBaseStats{'RefTotal'}, " ", $$ptBaseStats{'CharsTotal'}, $nl;
   };

		# Find all substrings that look lik a resource list
   $ptResourceLists = 
      FindResourceLists ($ptBaseStats, @splitInpStr);

   if ($traceIdentifyLists eq "on") {
      print "-- IdentifyLists: Here are the maximal resource lists", 
            "found", $nl;
      foreach $aSubs (@$ptResourceList) {
         $start = $$aSubs[0];
         $end = $$aSubs[1];
         PrintResourceList ($start, $end, @splitInpStr);
      };
   };

}



# sub SegmentHTMLPage 
# 
# Abstract
#    Reads an HTML page and segments it into hyperlink and non-hyperlink
#    sections.
#
# INputs
#    $pageID: a string identifying the HTML page
#    $idType: tells whether $pageID is a path to a file or the URL of a web
#             page. If this argument is not specified, it defaults to
#             'path'.
#    $cache:  cache for storing web page. (Optional) 

sub SegmentHTMLPage {

    local ($pageID, $idType, $cache) = @_;

    if (!defined($idType)) {$idType = 'path'};

    if ($idType eq 'path') {

	# Treat $pageID as the path to file, and read file into a string.
	
	if (! open(INPUTFILE, $pageID)) {
	    die $nl."ERROR: Sub SegmentHTMLPage could not open file ".$pageID.$nl; 
	};

	@inputArray = <INPUTFILE>;
	$inputString = join('', @inputArray);

    } 

    elsif ($idType eq 'url') {
	# Treat $pageID as the URL. Retrieve page into a single string.
	$inputString = RetrievePage($cache,$pageID,'','mozilla','*/*');

#      print "--- ParseHTMLPage: retrieved page\n$inputString\n--- End of page\n";	
    }



    
    # Split input string into URLreference and non-URLreference
    # segments
    $inpStrSmallCaps = $inputString;
    @splitInpStr = MySplit($hyperlinkPattern, $inpStrSmallCaps, 1);

    # Compute statistics on each segment
    @segmentedInpStr = ();
    for ($ii=0; $ii <= $#splitInpStr; $ii++) {
	$aSegment = $splitInpStr[$ii];
	${$segmentedInpStr[$ii]} [$SEG_CHAR_COUNT] = length($aSegment);
        ${$segmentedInpStr[$ii]} [$SEG_TEXT] = $splitInpStr[$ii];
#        if ($aSegment =~ $hyperlinkPattern) {
        if ($aSegment =~ /$hyperlinkPattern/i) {
            ${$segmentedInpStr[$ii]} [$SEG_IS_REF] = 1;
            $address = ParseHRefServer($aSegment);
            ${$segmentedInpStr[$ii]} [$SEG_SRV_NAME] = $address;   
        } else {
            ${$segmentedInpStr[$ii]} [$SEG_IS_REF] = 0;
            ${$segmentedInpStr[$ii]} [$SEG_SRV_NAME] = '';   
        }
    }

    $counter = 0;
    if ($traceSegmentHTMLPage eq "on") {
        print "-- SegmentHTMLPage: The segmented input string is:", $nl;
        foreach $aSegment (@segmentedInpStr) {
	    if ($$aSegment[$SEG_IS_REF]) {
	        $type = ' HREF ';
            } else {
	        $type = ' NON-HREF ';
	    }
	    print "-- SegmentHTMLPage:  START OF $type SEGMENT  ", $counter;
	    $aString = $$aSegment[$SEG_TEXT];
	    print $nl, $aString,
	    $nl, "-- SegmentHTMLPage:  END OF SEGMENT ", $counter, $nl;
	    $counter++;
        }
    }
    return @segmentedInpStr;
}


sub PrintSubstring {
   local ($start, $end, @splitString) = @_;

   for ($ii = $start; $ii <= $end; $ii++) {
      print ${$splitString[$ii]} [$SEG_TEXT];
   }
}

# sub FindResourceLists
#
# Abstract
#    Finds all substrings of the input string that look like
#    a resource list. It only returns the resource lists that
#    are not substrings of other resource lists.
#
sub FindResourceLists {
   local ($ptBaseStats, @splitInpStr) = @_;

   $ptResourceList = [];
   for ($start = 0; $start <= $#splitInpStr; $start++) {
      for ($end = $start + 1; $end <= $#splitInpStr; $end++) {

		# For each substring of the input string DO:
                #
         if ($traceFindResourceLists eq "on") {
            print "-- FindResourceLists: START SUBSTRING", $nl;
            for ($jj = $start; $jj <= $end; $jj++) {
               print ${$splitInpStr[$jj]} [$SEG_TEXT]
            };
            print $nl, "-- FindResourceLists: END SUBSTRING", $nl;
         };
		# Collect URL reference statistics for that substring
                #
         $ptRefStats = URLReferenceStats ($start, $end, @splitInpStr);

		# Compute the features for that substring and use them
		# to classify it
         $ptFeatures = ComputeFeatures (@splitInpStr);
         if (IsResourceList($ptFeatures)) {
                          # This substring looks like a list of WWW resources
                          # Update list of resource lists.
            $ptResourceList = 
                 UpdateResourceLists ($start, $end, $ptResourceList);
            if ($traceFindResourceLists eq "on") {
               print "-- FindResourceLists: Substring is a resource list", $nl;
            }
         };
         if ($traceFindResourceLists eq "on") {
            print "-- FindResourceLists: List of resource lists updated to:", 
                  $nl;
            foreach $aSubs (@$ptResourceList) {
               $start = $$aSubs[0];
               $end = $$aSubs[1];
               print "-- FindResourceLists: start, end = ", $start, " ", 
                     $end, $nl;
            };
         };
      }
   }


   if ($traceFindResourceLists eq "on") {
      print "-- FindResourceLists: Here are the maximal resource lists", 
            "found", $nl;
      foreach $aSubs (@$ptResourceList) {
         print "-- FindResourceLists: START RESOURCE LIST";
         $start = $$aSubs[0];
         $end = $$aSubs[1];
         for ($ii = $start; $ii <= $end; $ii++) {
            print ${$splitInpStr[$ii]} [$SEG_TEXT];
         };
         print "-- FindResourceLists: END RESOURCE LIST";
      }
   };
   $ptResourceList;
}


# sub URLReferenceStats {
#
# Abstract
#    Given specifications of a substring in the input string,
#    returns some stats on the relative importance of URL reference text
#    and non URL reference text in that substring.
#
# Inputs
#    $startInd   : Index in @splitInpStr of the start substring.
#    $endInd     : Index in @splitInpStr of end substring
#    @splitInpStr: Split input string. The array alternates URL reference
#                  substrings and non URL reference substrings.
#
#
# Outputs
#    URLReferenceStats: returns an associative array with following 
#                       keys
#
#                      'RefRatio': percentage of text between $startInd
#                                  and $endInd which consists of URl 
#                                  references
#                      'SrvRatio': number of distinct servers
#                                  between $startInd and $endInd
#                                  as a percentage of total
#                                  number of URL refs 
#                      'RefTotal': total number of references 
#                      'CharsTotal': total number of characrters in substring
#                      'Length': number of segments in the substring
#

sub URLReferenceStats {
   local ($startInd, $endInd, @splitInpStr) = @_;

   if ($traceURLReferenceStats eq "on") {
      print $nl, "-- URLReferenceStats: startInd, endInd = ", $startInd, 
            " ", $endInd, $nl;
  };

   $totalChars = 0;        # Total number of characters
   $refChars = 0;          # Total number of characters in URL references
   $numRefs = 0;           # Total number of URL references

                           # Set of distinct Servers found
   undef (%$ptServersFound); $$ptServersFound{'NumFound'} =  0;   

   $ind = $startInd;
   while ($ind <= $endInd) {
      $aSegment = $splitInpStr[$ind];
      if ($traceURLReferenceStats eq "on") {
         print "-- URLReferenceStats: Segment is:", $$aSegment[$SEG_TEXT], $nl;
      };
      if ($$aSegment[$SEG_IS_REF] == 1) {
                      # this is a URL reference
         if ($traceURLReferenceStats eq "on") {
             print "-- URLReferenceStats: Server name is:", $$aSegment[$SEG_SRV_NAME], $nl;
         };
         $refChars = $refChars + $$aSegment[$SEG_CHAR_COUNT];
         $ptServersFound 
            = UpdateServersSet ($$aSegment[$SEG_SRV_NAME], $ptServersFound);
         $numRefs++;
      }
      $totalChars = $totalChars + $$aSegment[$SEG_CHAR_COUNT];
      $ind++;
   };

   $numSrvFound = $$ptServersFound{'NumFound'};
   if ($traceURLReferenceStats eq "on") {
      print "-- URLReferenceStats: refChars, totalChars, ", 
            "numSrvFound, numRefs = ", $nl, "    ",
            $refChars, " ", $totalChars, " " , $numSrvFound, " ",
            $numRefs, $nl;
   };

   $refRatio = ($refChars / $totalChars) * 100;
   if ($numSrvFound == 0 && $numRefs == 0) {
      $srvRatio = 0
   } else {
      $srvRatio = ($numSrvFound / $numRefs) * 100
   }

   if ($traceURLReferenceStats eq "on") {
      print "-- URLReferenceStats: refRatio, srvRatio = ",
            $refRatio, " ", $srvRatio, $nl;
   };

   $$ptRefStats{'RefRatio'} = $refRatio;
   $$ptRefStats{'SrvRatio'} = $srvRatio;
   $$ptRefStats{'SrvTotal'} = $numSrvFound;
   $$ptRefStats{'SrvCharsRatio'} = ($numSrvFound / $totalChars) * 100;
   $$ptRefStats{'RefTotal'} = $numRefs;
   $$ptRefStats{'CharsTotal'} = $totalChars;
   $$ptRefStats{'Length'} = $endInd - $startInd + 1;
   return $ptRefStats;
}

# sub UpdateServersSet
#
# Abstract
#    Adds a server to the set of servers found (if it's
#    not already in it.
#
sub UpdateServersSet {
   local ($address, $ptServersFound) = @_;
   
		# add this address to the set
   $alreadyEncountered = $$ptServersFound{$address};
   if (! $alreadyEncountered) {
      $$ptServersFound{$address} = 1;
      $$ptServersFound{'NumFound'} = $$ptServersFound{'NumFound'} + 1; 
   }
   return $ptServersFound;
}


# sub UpdateResourceLists
#
# Adds a resource list substring  to the list of resource list 
# substrings.
#
# In the process, eliminate substrings which are dominated by (i.e.
# are substrings of) the substring to be added.
#
# Conversly, if the substring to be added is dominated by an other 
# substring, do not add it to the list.
#
sub UpdateResourceLists {
   local ($start, $end, $ptResourceList) = @_;

   $newSubs = [$start, $end];
   @resourceList = @$ptResourceList;
   $lastList = $#resourceList;
   for ($ii = 0; $ii <= $lastList; $ii++) {
      $aSubs = $$ptResourceList[$ii];
      $aSubsStart = $$aSubs[0];
      $aSubsEnd = $$aSubs[1];
      if (($aSubsStart <= $start) && ($aSubsEnd >= $end)) {
               # new substring is dominated by an existing one.
               # do not add it to the list
         return $ptResourceList;
      } elsif (($aSubsStart >= $start) && ($aSubsEnd <= $end)) {
               # new substring dominates an existing one.
               # Remove it from the list.
         splice (@$ptResourceList, $ii, 1);
      };
   };
               # new substring is not dominated by any substring
               # in the list. Add it to the list
  push (@$ptResourceList, $newSubs);
  $ptResourceList;
}


#
# Hansol's version (also the one used to generate benchmark)
#
sub ParseHRefServerOld {
   local ($aHRef) = @_;

               # Find the URL
   $aURL = $aHRef;
   @parsed = split(/$pattAddressPrefix/i, $aURL);
   $aURL = $parsed[1];
   @parsed = split(/$pattAddressSuffix/i, $aURL);
   $aURL = $parsed[0];

               # Then parse the URL
	       #
	       # We use the 'parse' function developed by Roy Fielding
	       # for most URLs, but use a special function 
	       # 'ParseExceptionURLs' for URLs that 'parse' cannot
	       # deal with. These are:
	       #     
	       #     'mailto:' URL (email address)
	       #     '#' URL (address of a label in the same html doc)
	       #
   ($address, $isException) = ParseExceptionURLs ($aURL);
   if ($isException == 0) {
      ($scheme, $address, $port, $path, $query, $frag) = parse($aURL);
      if ($scheme eq '' && $address eq '' && $port eq '' && $path eq '' && 
         $query eq '' && $frag) {
         print 'ERROR: Sub ParseHRefServer could not parse HREF ', $aHRef;
      };
   };
   return $address   
}


#
# my corrected version
#
sub ParseHRefServer {
   local ($aHRef) = @_;

               # Find the URL
   $aURL = ParseHRefUrl ($aHRef);

 	       # Then parse its server
   $address = ParseURLServer($aURL);
   return $address;
}

sub ParseHRefUrl {
   local ($aHref) = @_;

               # Find the URL
    $aHref =~ /$hyperlinkPattern/i;
    return $1
}

sub ParseURLServer {
   local ($aUrl) = @_;


    # Parse the server from a URL
    #
    # We use the 'parse' function developed by Roy Fielding
    # for most URLs, but use a special function 
    # 'ParseExceptionURLs' for URLs that 'parse' cannot
    # deal with. These are:
    #     
    #     'mailto:' URL (email address)
    #     '#' URL (address of a label in the same html doc)
    #

   ($address, $isException) = ParseExceptionURLs ($aURL);
   if ($isException == 0) {
      ($scheme, $address, $port, $path, $query, $frag) = parse($aURL);
      if ($scheme eq '' && $address eq '' && $port eq '' && $path eq '' && 
         $query eq '' && $frag) {
         print 'ERROR: Sub ParseURLServer could not parse HREF ', $aHRef;
      };
   };
   return $address   
}


# sub ParseExceptionURLs ($aURL)
#
# Abstract
#     This function extracts the server name of special URLs that 
#     are not parsed by the 'parse' function. These are:
#
#        'mailto:' URLs (email address)
#        '#' URLs (references to a label in the same html doc)
#
# Inputs
#     $aURL : URL to be parsed
#
# Outputs
#     returns array ($srv, $isException) where
#
#     $srv : name of the server ('' if the server is the same as that
#            of the document containing the URL)
#
#     $isException : = 1  iif the URL was a "special" one
#                      0  otherwise
#

sub ParseExceptionURLs {
   local ($aURL) = @_;

   $isException = 0;

   $mailtoPattern = ' *mailto *: */* *';
   $localPattern = ' *#';

   if ($aURL =~ $mailtoPattern) {

              # This is a 'mailto:' URL
      @tmp = split(/$mailtoPattern/, $aURL);
      @tmp = split(/\@/, $tmp[1]);
      $srv = $tmp[1];
      $isException = 1;
   } elsif ($aURL =~ $localPattern) {

              # This is a local '#' URL
      $srv = '';
      $isException = 1;
   };

   return ($srv, $isException)
   
}

sub PrintResourceList {
   local ($start, $end, $splitInpStr) = @_;

   $ptRefStats = URLReferenceStats ($start, $end, @splitInpStr);
   PrintURLRefStats ($ptRefStats); print $nl;
   print "==========  START RESOURCE LIST ==========", $nl;
   for ($ii = $start; $ii <= $end; $ii++) {
      print $splitInpStr[$ii];
   };
   print $nl,"========== END RESOURCE LIST ==========", $nl;
}


sub PrintURLRefStats {
   local ($ptRefStats) = @_;

   print "URL Reference Stats: RefRatio, SrvRatio, RefTotal, CharsTotal ",
         " SrvTotal, SrvCharsRatio = ", 
         $$ptRefStats{'RefRatio'}, " ", $$ptRefStats{'SrvRatio'}, " ", 
         $$ptRefStats{'RefTotal'}, " ", $$ptRefStats{'CharsTotal'}, " ",
         $$ptRefStats{'SrvTotal'}, " ", $$ptRefStats{'SrvCharsRatio'};
}


sub DumpURLRefStats {
   local ($ptRefStats) = @_;
   $sep = ', ';
   print $$ptRefStats{'RefRatio'}, $sep, $$ptRefStats{'SrvRatio'}, $sep, 
         $$ptRefStats{'RefTotal'}, $sep, $$ptRefStats{'CharsTotal'}, $sep,
         $$ptRefStats{'SrvTotal'}, $sep, $$ptRefStats{'SrvCharsRatio'}
}



sub RetrievePage {
    local ($cache, $url, $refer, $agent, $accept, $file) = @_;
    my ($patternError);

    $patternError = '\*\s*\d+\sError';
    $page = Retrieve_Pair($cache, $url);
    if( $page eq '' || (!defined($cache)) ) { 
	$page = robot_url_get( $url, $refer, $agent, $accept, $file);
        if (!  ($page =~ /$patternError/)) {

	    $page = $page.' ';
	    &Add_Pair($cache,$url, $page);
	}
    }
    return $page;
}


sub MySplit {
    local ($pattern, $inputString, $keepMatches) = @_;
    my (@splitResults);

    $remaining = $inputString;
    while ($remaining ne '') {
	if ($remaining =~ /$pattern/i) {
	    push(@splitResults, $`);
	    if ($keepMatches) {push(@splitResults, $&);}
	    $remaining = $';
	} else {
	    push(@splitResults, $remaining);
	    $remaining = '';
	}
    }
    return @splitResults;
}
