#!/usr/local/bin/perl5 
#
# Abstract
#   This script takes as input a source file, project file or directory and 
#   creates a Dragon Dictate vocabulary with sentences for all the special 
#   expressions it contains.
#  


#
# Useful constants
#
$COMMENTS_SINGLE_LINE_START = 0;
$COMMENTS_BRACKETED_START  = 1;
$COMMENTS_BRACKETED_END  = 2;
$COMMENTS_BRACKETED = 3;
$COMMENTS_SINGLE_LINE = 4;
$SYMBOLS = 5;

#
# Home directory of VoiceCode 
#
$voiceCodeHome = $ENV{'VCHOME'};

#
# This pattern will match any string in a greedy/non-greedy fashion
#
$anythingGreedy = '[\s\S]*';
$anythingNonGreedy = '[\s\S]*?';

#
# Read abbreviations and their expansions from file
#
%abbreviations = ReadAbbreviationsFile();

#
# Compile the symbols vocabulary
#
%symbols = CompileSymbols(@ARGV);

#
# Print vocabulary 
#
PrintVocabularyFile(%symbols);
1;



sub ReadAbbreviationsFile {
    $pattAbbreviation = "($anythingNonGreedy)\\s+($anythingNonGreedy)\\s*\$";

#
# Read the file
#
    $fileName = "$voiceCodeHome/Configuration/default.abr";
    if (!open(INPUTFILE, $fileName)) {
	die "Could not open file: $fileName\n";
    }
    while ($anAbbreviation = <INPUTFILE>) {
	$anAbbreviation  =~ /$pattAbbreviation/;
	$abbreviations {$1} = $2;
    }
    foreach $anAbbreviation  (keys(%abbreviations)) {
	$upCasedAbbreviation = Uppercase($anAbbreviation);
	$abbreviations{$upCasedAbbreviation} = $abbreviations{$anAbbreviation};
	$capitalizedAbbreviation = Capitalize($anAbbreviation);
	$abbreviations{$capitalizedAbbreviation} = $abbreviations{$anAbbreviation};
    }
    return %abbreviations;
}


sub CompileSymbols {
    local (@arguments) = @_;
    my (%symbols);

    #
    # Get arguments and options
    #
    # $fileName: path of the file for which we want to compile a vocabulary
    #
    # $language: programming language in which the file is written
    #
    # $caseAfterUppercasedTerm: 
    #
    #    used to set a preference in how to deal with an ambiguity in the 
    #    hungarian notation. 
    #
    #    When  an uppercase letter is preceded by an uppercase letter and followed 
    #    by a lower case letter, this could mean
    #    that the uppercase letter is the beginning of a new term 
    #    (e.g. the "N" in "aVARIABLEName"), or the end of a term in upper case 
    #    (e,g, the "E" in "aVARIABLEname").
    #
    #    In the first case, set $caseAfterUppercasedTerm='u' (this is the default)
    #    and  $caseAfterUppercasedTerm='l' for the second case.
    #
    ($fileName, $language, $caseAfterUppercasedTerm, $fileType) 
        = GetArgsOptions(@arguments);

    print "Compiling vocabulary for file: $fileName. \n   Options are -l$language -c$caseAfterUppercasedTerm -o$fileType\n";

    #
    # Set Patterns for the specific language in the file
    #
    @patterns = SetPatterns($language);
#    print "patterns are: "; foreach $aPattern (@patterns) {print "$aPattern\n";};

    #
    # Different processing for diferent file types 
    #
    if ($fileType eq 's') {
        #
        # compile vocabulary for a source file
        #
        %symbols = CompileSourceVocabulary($fileName);    

    } elsif ($fileType eq 'p') {
        #
        # Compile vocabulary for all the files listed in a project file
        #
        %symbols = CompileProjectVocabulary($fileName);

    } elsif ($fileType eq 'd') {
        #
        # compile vocabulary for all the file names under a directory. Do 
        # not go into subdirectories
        #
        %symbols = CompileDirectoryVocabulary($fileName);

    } elsif ($fileType eq 't') {
        #
        # compile vocabulary for all the file names in the directory tree
        # rooted at $fileName
        #
        %symbols = CompileTreeVocabulary($fileName);
    }
    return %symbols;
}


#
# Compile a vocabulary with words for each symbol encountered in a source file
#
sub CompileSourceVocabulary {
    local ($fileName) = @_;
    my (%symbols);
#
# Read the file
#
    if (!open(INPUTFILE, $fileName)) {
	die "Could not open file: $fileName\n";
    }
    @inputArray = <INPUTFILE>;
    $inputString = join('', @inputArray);
#
# Parse the symbols
#
    %symbols = ParseSymbols($inputString);
    return (%symbols);
}

#
# Compile vocabulary for all the files listed in a project file
#
sub CompileProjectVocabulary {
    local ($fileName) = @_;
    my (%symbols);

    #
    # Read the project file
    #
    if (!open(INPUTFILE, $fileName)) {
	die "Could not open file: $fileName\n";
    }
    @lines = <INPUTFILE>;
    $filesString = join('', @lines);
    @files = split(/\s+/, $filesString);

    foreach $aFile (@files) {

#
# If substitute environment variables in file name
#
	if ($aFile =~ /^\$($anythingNonGreedy)\//) {
	    $environmentValue = $ENV{$1};
	    if ($environmentValue) {
		$aFile = "$environmentValue/$'";
	    } else {
		die "Environment variable \$$1 not defined.";
	    }
	}
	#
	# Compile a vocabulary for this file
	#
	@arguments = ($aFile);
	%symbolsThisFile = CompileSymbols(@arguments);

	#
	# Add this vocabulary to the current one
	#
 	foreach $aKey (keys(%symbolsThisFile)) {
 	    $symbols{$aKey} = $symbolsThisFile{$aKey};
 	}
    }

    return %symbols;
}

#
# compile vocabulary for all the file names under a directory. Do 
# not go into subdirectories
#
sub CompileDirectoryVocabulary {
    local ($dir) = @_;

    chdir ($dir) || die "Error: CompileDirectoryVocabulary: Could not find directory $dir\n" ;

    opendir(DIR, '.') || die "Error: CompileDirectoryVocabulary: Could not open $dir";
    local (@fileNames) = readdir(DIR);
    closedir(DIR);

    #
    # Then create symbols for every children of this directory
    #
    foreach $aFile (@fileNames) {
	%newSymbols = ParseSymbols("$dir/$aFile");

	#
	# Add this vocabulary to the current one
	#
 	foreach $aKey (keys(%newSymbols)) {
 	    $symbols{$aKey} = $newSymbols{$aKey};
 	}
    }
    return %symbols;
}

#
# Create a vocabulary with words for each symbol contained in the name of 
# of file in the subtree rooted at $fileName
#
sub CompileTreeVocabulary {
    local ($fileName) = @_;
    my (%symbols);

    #
    # Create symbols for every descendants of this directory
    #
    LoopDir('CreatePathSymbols', $fileName);
    %symbols = %symbolsCreatePath;
    return (%symbols);
}


#
# Get the arguments and options
#
sub GetArgsOptions {
    local (@cmdLineArgs) = @_;
    
#
# First, get the file name
#
    $fileName = pop(@cmdLineArgs);

#
# Then, get the options. First, set default values
#
    ($language, $caseAfterUppercasedTerm, $fileType) = ('C', 'u', 's');


#
# First set automatic options based on the file name, then overide them
# with explicit options given at the command line.
#
    ($language, $caseAfterUppercasedTerm, $fileType) = GetAutoOptions($fileName);
    ($language, $caseAfterUppercasedTerm, $fileType) = GetOptions(@cmdLineArgs);

    return ($fileName, $language, $caseAfterUppercasedTerm, $fileType);
}

sub GetOptions {
    local (@commandLine) = @_;

#
# Read options from a list of command line arguments
#
    while ($commandLine[0] =~ /-([a-zA-Z])($anythingNonGreedy)$/) {
	if ($1  eq 'l') {
#	    if ($2 =~ /perl|C|lisp|unix/) {
		$language = $2;
#	    } else {
#		die "Invalid modifier for -l option: $2";
#	    }
	} elsif ($1 eq 'c') {
	    if (($2 eq 'u') || ($2 eq 'l')) {
		$caseAfterUppercasedTerm = $2;
	    } else {
		die "Invalid modifier for -c option: $2\n";
	    }
	} elsif ($1 eq 'o') {
	    if (($2 eq 's') || ($2 eq 'p') || ($2 eq 'd') || ($2 eq 't')) {
		$fileType = $2;
	    } else {
		die "Invalid modifier for -o option: $2\n";
	    }
	} else {
	    die "Invalid option: $1\n";
	}
	shift(@commandLine);
    }
    return ($language, $caseAfterUppercasedTerm, $fileType);
}


sub GetAutoOptions {
    local ($fileName) = @_;

#
# Read the automatic options for different file types
#
    @fileTypeOptions = ReadAutoOptions ();

#
# If the file is a directory set its default file type to 't'
#
     if (isDirectory($fileName))  {
 	$fileType = 't';
	$language = 'unix';
     }

#
# Get options for that file type
#
    for ($ii=0; $ii<$#fileTypeOptions; $ii = $ii + 2) {
	 if ($fileName =~ /$fileTypeOptions[$ii]/) {
	     @dummyCmdLineArgs = split(/ +/, $fileTypeOptions[++$ii]);
	     ($language, $caseAfterUppercasedTerm, $fileType) = GetOptions(@dummyCmdLineArgs);
         }
    }

    return ($language, $caseAfterUppercasedTerm, $fileType);
}


sub ReadAutoOptions {

    $pattAnAutoOption = "\\s*($anythingNonGreedy)\\s+'($anythingNonGreedy)'\$";
    #
    # Read the auto options file
    #
    $autoOptionsFileName = "$voiceCodeHome/Configuration/default.aut";
    if (!open(AUTO_OPTIONS_FILE, $autoOptionsFileName)) {
	die "Could not open file: $autoOptionsFileName\n";
    }


    while ($anAutoOption = <AUTO_OPTIONS_FILE>) {
	$anAutoOption =~ /$pattAnAutoOption/;
	$aPattern = $1;
	$theOptions = $2;
	$aPattern  =~ s/\*/$anythingGreedy/;
	$aPattern = "$aPattern\$";
	push (@fileTypeOptions, $aPattern, $theOptions);
    }

    return @fileTypeOptions;
}


sub SetPatterns {
    local ($language) = @_;

    #
    # Pattern for parsing a language definition entry
    #   
    $pattLanguageEntry = "\\s*(COMMENTS_SINGLE_LINE_START|COMMENTS_BRACKETED_START|COMMENTS_BRACKETED_END|SYMBOLS)\\s+(\\'$anythingGreedy\\')\\s*\$";

    
    $languagePatternFileName = "$voiceCodeHome/Configuration/$language.lng";

    #
    # Open the language definition file
    #
    if (!open(LANGUAGE_PATTERN_FILE, $languagePatternFileName)) {
	die "Could not open language definition file: $languagePatternFileName\n";
    }

   #
   # Read patterns from that file
   #
    undef (@patterns);
    while ($aLanguageEntry = <LANGUAGE_PATTERN_FILE>) {
	if ($aLanguageEntry =~ /$pattLanguageEntry/) {
	    $expression = "\$patterns[\$$1] = $2";
	    eval ($expression);
	} else {
	    die "Invalid entry in language definition file $languagePatternFileName: $aLanguageEntry\n";
	}
    }

    $patterns[$COMMENTS_SINGLE_LINE] = 	
	"$patterns[$COMMENTS_SINGLE_LINE_START]$anythingNonGreedy\\n";

    $patterns[$COMMENTS_BRACKETED] = "$patterns[$COMMENTS_BRACKETED_START]$anythingNonGreedy$patterns[$COMMENTS_BRACKETED_END]";

    return (@patterns);
}



sub ParseSymbols {
    local ($inputString) = @_;
    my (%symbols);

    #
    # Remove all comments from the file
    #
    $inputNoComments = RemoveComments($inputString);

    #
    # Find all symbols in the file
    #
    %symbols = FindSymbols($inputNoComments);
    return %symbols;
}


sub RemoveComments {
    local ($inputString) = @_;
    my ($noComments);

#
# Remove bracket style comments
#
    $noComments = $inputString;
    $noComments =~ s/$patterns[$COMMENTS_BRACKETED]//g;

#
# Remove single line style comments
#
#    print "\n\n-- pattCommentsSingleLine=$pattCommentsSingleLine\n";
    $noComments =~ s/$patterns[$COMMENTS_SINGLE_LINE]//g;

    return $noComments;
}


sub FindSymbols {
    local ($inputNoComments) = @_;
    my (%symbols);

#
# Are these symbols to be put in the 'Symbols' or the 'Files' word group
#
    if ($fileType eq 's') {
	$wordGroup = 'Symbols';
    } else {
	$wordGroup = 'Files';
    }

    while ($inputNoComments =~ /($patterns[$SYMBOLS])/) {
        $symbols{"$wordGroup $&"} = 1; 
	$inputNoComments = $';
    };

    return %symbols;
}



sub PrintVocabularyFile {
    local (%symbol) = @_;

    #
    # Open output file
    #
    $outputFileName = "\>$voiceCodeHome/Load/new.ddx";
    if (!open(OUTPUT_FILE, $outputFileName)) {
        die "Could not open file: $outputFileName for output\n";
    }


#
# Split words into file names symbols and source symbols
#
    @sortedKeys = sort(keys(%symbol));
    foreach $aKey (@sortedKeys) {
	if ($aKey =~ /Files ($anythingGreedy)/) {
	    push (@fileSymbols, $1);
	} elsif ($aKey =~ /Symbols ($anythingGreedy)/) {
	    push (@sourceSymbols, $1);
	}
    }

#
# Initialise output string
#
    $outputString = '';

# 
# Print the Files symbols vocabulary
#
    $outputString = PrintAVocabulary ($outputString, 'Files', @fileSymbols);

#
# Print the Source symbols vocabulary
#
    $outputString = PrintAVocabulary ($outputString, 'Symbols', @sourceSymbols);

#
# Substitute abbreviations by their expansions
#
    $outputString = SubstituteAbbreviations($outputString, @abbreviations);

     print OUTPUT_FILE $outputString;

#
# Close output file
#
    close(OUTPUT_FILE);
}


#
# Print a group of words in the vocabulary
#
sub PrintAVocabulary {
    local ($outputString, $groupName, @symbolList) = @_;

    $outputString = $outputString."switch-to-vocabulary /create /module VoiceCode-$groupName VoiceCode-$groupName \n";

    foreach $aSymbol (@symbolList) {
	$spokenSymbol = CreateSpokenSymbol($aSymbol);
#
# Print macro if it is non-null.
# Avoid creating duplicate macros in case two different symbols expand to 
# the same ponounceable macro.
#
	if (!($macrosPrinted{$spokenSymbol})) {
	    if ($spokenSymbol) {
		$outputString = $outputString."add-word  \"$spokenSymbol\" /keys \"$aSymbol\" /nsc\n";
		$macrosPrinted{$spokenSymbol} = 1;
	    }
	}
    }
    return $outputString;
}


sub SubstituteAbbreviations {
   local ($outputString, @abbreviations) = @_;

    $allAbbreviations = join('|', keys(%abbreviations));
    $substituteWhat = '(\[|\s+)('.$allAbbreviations.')(\]|\s+)';
    $substituteFor =  '$1$abbreviations{$2}$3';
    $substituteCommand = "\$outputString =~ s\/$substituteWhat\/$substituteFor\/gi";

    eval ($substituteCommand);
    return $outputString;
}

sub SubstituteAbbreviationsOld {
   local ($outputString, @abbreviations) = @_;

    for ($ii=0; $ii<$#abbreviations; $ii = $ii + 2) {
	$anAbbrev = $abbreviations[$ii];
 	$substitution = $abbreviations[$ii+1];
        $outputString =~ s/(\[|\s+)$anAbbrev(\]|\s+)/$1$substitution$2/gi;
     }

    return $outputString;
}



#
# Create a pronounceable  dragon dictate macro for this symbol
#

sub CreateSpokenSymbol {
    local ($aSymbol) = @_;
    my ($spokenWord, $first, $nextTerm);

    $spokenWord = $aSymbol;

#
# All non-alphanumeric characters are term separators
#
    $spokenWord =~ s/[\W_]/ /g;

#
# So are digits
#
    $spokenWord =~ s/(\d)/ $1 /g;

#
# Deal with hungarian notation.
#
 
#
# An uppercase letter preceded by a lowercase letter marks the beginnng of a 
# new term
#
    $spokenWord =~ s/([a-z]{1})([A-Z])/$1 $2/g;

#
# When  an uppercase letter is preceded by an uppercase letter and followed 
# by a lower case letter, this could mean
# that the uppercase letter is the beginning of a new term 
# (e.g. the "N" in "aVARIABLEName"), or the end of a term in upper case 
# (e,g, the "E" in "aVARIABLEname").
#
#
    if ($caseAfterUppercasedTerm eq 'l') {
	$spokenWord =~ s/([A-Z])([A-Z])([a-z]{1})/$1$2 $3/g;
    } else {
	$spokenWord =~ s/([A-Z])([A-Z])([a-z]{1})/$1 $2$3/g;
    }


#
# Remove consecutive blanks
#
    $spokenWord =~ s/ +/ /g;

#
# Remove trailing/leading blanks
    $spokenWord =~ s/^ +//g;
    $spokenWord =~ s/ +$//g;
#

#
# Enclose by brackets (unless word is null);
#
    if ($spokenWord) {
	$spokenWord = "[$spokenWord]";
    }
    return $spokenWord;
}



sub CopyVocabularyToHome {
    local ($fileName) = @_;
    $command = "cp $fileName.ddx \$VCHOME/new.ddx";
    `$command`;
}


#
# Recursively goes through the subtree rooted at $dir, executing command
# $command for each file it finds
#
sub LoopDir {
   local ($command, $dir) = @_;

   $oldDir = `pwd`;
   $oldDir = substr($oldDir, 0, $#oldDir);
   DoLoopDir($command, $dir);
   chdir ("$oldDir");
};


sub DoLoopDir {
   my ($command, $dir, $nlink) = @_;
   local ($dev, $ino, $mode, $subcount);


   chdir ($dir) || die "Error: DoLoopDir: Could not find directory $dir\n" ;

   ($dev, $ino, $mode, $nlink) = stat('.') unless $nlink;

   opendir(DIR, '.') || die "Can't open $dir";
   local (@filenames) = readdir(DIR);
   closedir(DIR);

   if ($nlink == 2) {
      for (@filenames) {
         next if $_ eq '.';
         next if $_ eq '..';
         $name = "$dir/$_";
         ProcessFile($name, $command);
      }
   } else {
      $subcount = $nlink - 2;
      for (@filenames) {
         next if $_ eq '.';
         next if $_ eq '..';
         $name = "$dir/$_";
         ProcessFile($name, $command);
         next if $subcount == 0;
        
         ($dev, $ino, $mode, $nlink) = lstat($_);
         next unless -d _;

         chdir $_ || die "Can't cd to $_";
         DoLoopDir($command, $name, $nlink);
         chdir '..';
         --$subcount;
      };
   }
}

#
# Processes files and ignores directories.
#
sub ProcessFile {

   local ($file, $command) = @_;

   ($dev, $ino, $mode, $ndir) = stat($file);

   #
   # Is this a file or a subdirectory?
   # 
   if ($ndir == 1) {
      if ($traceProcessFile eq 'on') {print "Processing file: $file\n";};
      $tmp = $command."\(\'".$file."\'\)";
      eval ($tmp);
   };
}



sub CreatePathSymbols {
    local ($file) = @_;

    %newSymbols = ParseSymbols($file);
    #
    # Add this vocabulary to the current one
    #
    foreach $aKey (keys(%newSymbols)) {
	$symbolsCreatePath{$aKey} = $newSymbols{$aKey};
    }
}

sub Capitalize {
    local ($aString) = @_;
    $firstLetter = substr($aString, 0, 1);
    $firstLetter =~ tr/[a-z]/[A-Z]/;
    $rest = substr($aString, 1, length($aString));
    $aString = $firstLetter.$rest;
    return $aString;
}

sub Uppercase {
    local ($aString) = @_;
    $aString =~ tr/[a-z]/[A-Z]/;
    return $aString;
}


sub SortedHashPrint {
    local (%hash) = @_;
    @theKeys = keys(%hash);
    @theKeys = sort(@theKeys);
    foreach $aKey (@theKeys) {
	print "$aKey: $hash{$aKey}\n";
    }
}

sub isDirectory {
    local ($fileName) = @_;
    opendir(DUMMY_HANDLE, $fileName)

}
