/*
 * Decompiled with CFR 0.152.
 */
package tsg.corpora;

import java.io.BufferedReader;
import java.io.File;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.Map;
import java.util.Scanner;
import settings.Parameters;
import tsg.ConvertFragmentsToCFGRules;
import tsg.TSNodeLabel;
import tsg.TSNodeLabelStructure;
import tsg.corpora.ConstCorpus;
import tsg.kernels.CommonSubtreesMUBFreqNew;
import tsg.parser.ConvertGrammarInBitParFormat;
import util.FileUtil;
import util.PrintProgress;
import util.Utility;

/*
 * This class specifies class file version 49.0 but uses Java 6 signatures.  Assumed Java 6.
 */
public class TUT09 {
    public static String corpusPath = String.valueOf(Parameters.corpusPath) + "Evalita09/Treebanks/Constituency/";
    public static File trainFile = new File(String.valueOf(corpusPath) + "TUTinPENN-train.readable.penn");
    public static File trainFileNoTraces = new File(String.valueOf(corpusPath) + "TUTinPENN-train.readable.notraces.penn");
    public static File trainFileNoTracesNoSemTags = new File(String.valueOf(corpusPath) + "TUTinPENN-train.readable.notraces.noSemTags.penn");
    public static File testFile = new File(String.valueOf(corpusPath) + "TUTinPENN-Evalita09-testset14-9-09.penn");
    public static String lexPosSeparationString = "^";
    public static char lexPosSeparationChar = lexPosSeparationString.charAt(0);
    static int nBest = 1000;
    static String bitparApp = "/home/fsangati/SOFTWARE/BitPar_Web/bitpar";
    static String bitparArgs = "-vp -b " + nBest + " -o -s TOP ";
    public static String[] changingAllowed = new String[]{"-RRB-", "''", "...", "-LRB-", ".", ",", ";", ":", "?"};
    static PrintWriter pwLog;
    static String viterbProbPrefix;
    static int viterbProbPrefixLength;

    static {
        viterbProbPrefix = "vitprob=";
        viterbProbPrefixLength = viterbProbPrefix.length();
    }

    /**
     * Loads {@link #trainFile} (UTF-8) and writes two derived corpora:
     * {@link #trainFileNoTraces} after {@code removeTraces("-NONE-")} and
     * {@code removeNumbersInLables()}, then {@link #trainFileNoTracesNoSemTags}
     * after an additional {@code removeSemanticTags()} on the same corpus object.
     */
    public static void cleanCorpus() {
        ConstCorpus corpus = new ConstCorpus(trainFile, "UTF-8");

        // Stage 1: trace and label-number clean-up.
        corpus.removeTraces("-NONE-");
        corpus.removeNumbersInLables();
        corpus.toFile_Complete(trainFileNoTraces, false);

        // Stage 2: applied on top of stage 1, so this output has both clean-ups.
        corpus.removeSemanticTags();
        corpus.toFile_Complete(trainFileNoTracesNoSemTags, false);
    }

    /**
     * Runs {@link CommonSubtreesMUBFreqNew} over the quote-fixed training
     * treebank in the {@code SemTagOn} results folder. Writes the three
     * learning-curve files, appends the fragment depth report to its file
     * (and echoes it on stdout), and finally dumps the extracted fragments.
     *
     * @throws Exception propagated from the treebank reader or the extractor
     */
    public static void buildKernelFragmetnFile() throws Exception {
        System.out.println("Max depth: " + CommonSubtreesMUBFreqNew.maxDepth);

        String dirPath = String.valueOf(Parameters.resultsPath) + "TSG/TSGkernels/TUT09/SemTagOn/";
        File treebankFile = new File(dirPath + "TUTinPENN-train.readable.notraces_quotesFixed.penn");
        File curveC0 = new File(dirPath + "learningFragments_C0_MUB_freq_all.txt");
        File curveC1 = new File(dirPath + "learningFragments_C1_MUB_freq_all.txt");
        File curveTot = new File(dirPath + "learningFragments_Tot_MUB_freq_all.txt");
        File depthReportFile = new File(dirPath + "fragmentDepthReport_MUB_freq_all.txt");
        File fragmentsFile = new File(dirPath + "fragments_MUB_freq_all.txt");

        ArrayList<TSNodeLabelStructure> trees = TSNodeLabelStructure.readTreebank(treebankFile, "UTF-8", 20000);
        System.out.println("Treebank size: " + trees.size());

        CommonSubtreesMUBFreqNew extractor = new CommonSubtreesMUBFreqNew(trees);
        extractor.extractFromTreebankAndLearningCurve(curveC0, curveC1, curveTot);

        String depthReport = extractor.reportFragmentDepth();
        FileUtil.append(depthReport, depthReportFile);
        System.out.println(depthReport);

        extractor.printFragmentsToFile(fragmentsFile);
    }

    /**
     * Rewrites {@code fragments_MUB_freq_all.txt} (lines of the form
     * {@code tree<TAB>freq}) as {@code fragments_MUB_freq_all_freqFirst_plusOne.txt}
     * with lines of the form {@code (freq + 1)<TAB>tree}, followed by one empty line.
     * Empty input lines are skipped.
     *
     * Both streams are closed even when a line cannot be parsed.
     *
     * @throws IllegalArgumentException if a non-empty line has no tab-separated second column
     * @throws NumberFormatException if the second column is not an integer
     */
    private static void addOneInKernelFragmentFileAndSwapColumns() {
        String dirPath = String.valueOf(Parameters.resultsPath) + "TSG/TSGkernels/TUT09/SemTagOn/";
        File fragmentFile = new File(dirPath + "fragments_MUB_freq_all.txt");
        File outputFile = new File(dirPath + "fragments_MUB_freq_all_freqFirst_plusOne.txt");
        Scanner scan = FileUtil.getScanner(fragmentFile, "UTF-8");
        try {
            PrintWriter pw = FileUtil.getPrintWriter(outputFile, "UTF-8");
            try {
                while (scan.hasNextLine()) {
                    String line = scan.nextLine();
                    if (line.equals("")) {
                        continue;
                    }
                    // Input column order is tree first, frequency second.
                    String[] treeFreq = line.split("\t");
                    if (treeFreq.length < 2) {
                        throw new IllegalArgumentException("Expected 'tree<TAB>freq' in " + fragmentFile + " but found: " + line);
                    }
                    String tree = treeFreq[0];
                    int newFreq = Integer.parseInt(treeFreq[1]) + 1;
                    pw.println(newFreq + "\t" + tree);
                }
                pw.println();
            } finally {
                pw.close();
            }
        } finally {
            scan.close();
        }
    }

    /**
     * Counts the CFG rule ({@code cfgRule()}) of every non-lexical node in the
     * training treebank and writes one {@code freq<TAB>rule-tree} line per
     * distinct rule to {@code fragments_CFG_freq_all.txt}.
     *
     * The local {@code semTag} switch selects both the input corpus
     * ({@link #trainFileNoTraces} vs {@link #trainFileNoTracesNoSemTags}) and
     * the output folder ({@code SemTagOn/} vs {@code SemTagOff/}).
     *
     * The output writer is closed even if building a rule tree fails.
     *
     * @throws Exception propagated from treebank loading or rule-tree construction
     */
    public static void extractCFGFileDepths() throws Exception {
        boolean semTag = true;
        File inputFile = semTag ? trainFileNoTraces : trainFileNoTracesNoSemTags;
        String basePath = String.valueOf(Parameters.resultsPath) + "TSG/TSGkernels/TUT09/";
        String outputFolder = basePath + (semTag ? "SemTagOn/" : "SemTagOff/");
        File outputFile = new File(outputFolder + "fragments_CFG_freq_all.txt");

        // rule string -> single-element counter array
        Hashtable<String, int[]> cfgRules = new Hashtable<String, int[]>();
        ArrayList<TSNodeLabel> treebank = TSNodeLabel.getTreebank(inputFile);
        for (TSNodeLabel t : treebank) {
            for (TSNodeLabel n : t.collectAllNodes()) {
                if (n.isLexical) {
                    continue;
                }
                Utility.increaseStringIntArray(cfgRules, n.cfgRule());
            }
        }

        PrintWriter pw = FileUtil.getPrintWriter(outputFile);
        try {
            for (Map.Entry<String, int[]> e : cfgRules.entrySet()) {
                int freq = e.getValue()[0];
                TSNodeLabel ruleTree = new TSNodeLabel(e.getKey(), false);
                pw.println(freq + "\t" + ruleTree.toString(false, true));
            }
        } finally {
            pw.close();
        }
    }

    /**
     * Enumerates the sub-trees of every training tree via
     * {@code allSubTrees(maxDepth, maxProle)}, buckets them by the
     * {@code maxDepth()} of the re-parsed fragment, and writes one
     * {@code freq<TAB>fragment} file per depth bucket in the {@code SemTagOn}
     * results folder.
     *
     * NOTE(review): a fragment whose depth falls outside {@code 1..maxDepth}
     * would index outside the bucket array; presumably {@code allSubTrees}
     * never returns one. Confirm.
     *
     * @throws Exception propagated from treebank loading or fragment parsing
     */
    public static void extractFragmentsFileDepths() throws Exception {
        final int maxDepth = 1;
        final int maxProle = 100;
        String dirPath = String.valueOf(Parameters.resultsPath) + "TSG/TSGkernels/TUT09/SemTagOn/";
        File inputFile = new File(dirPath + "TUTinPENN-train.readable.notraces_quotesFixed.penn");
        ArrayList<TSNodeLabel> treebank = TSNodeLabel.getTreebank(inputFile);

        // One frequency table per fragment depth; slot d-1 holds fragments of depth d.
        @SuppressWarnings("unchecked")
        Hashtable<String, int[]>[] tablesByDepth = new Hashtable[maxDepth];
        for (int slot = 0; slot < maxDepth; ++slot) {
            tablesByDepth[slot] = new Hashtable<String, int[]>();
        }

        PrintProgress.start("Extractrig fragments up to depth " + maxDepth + " max braching " + maxProle + ": ");
        for (TSNodeLabel tree : treebank) {
            PrintProgress.next();
            for (String fragment : tree.allSubTrees(maxDepth, maxProle)) {
                int depth = new TSNodeLabel(fragment, false).maxDepth();
                Utility.increaseStringIntArray(tablesByDepth[depth - 1], fragment);
            }
        }
        PrintProgress.end();

        for (int slot = 0; slot < maxDepth; ++slot) {
            int depth = slot + 1;
            File outputFile = new File(dirPath + "fragments_Depth_" + depth + "_maxProle_" + maxProle + "_freq_all.txt");
            PrintWriter pw = FileUtil.getPrintWriter(outputFile);
            for (Map.Entry<String, int[]> entry : tablesByDepth[slot].entrySet()) {
                int freq = entry.getValue()[0];
                TSNodeLabel fragmentTree = new TSNodeLabel(entry.getKey(), false);
                pw.println(freq + "\t" + fragmentTree.toString(false, true));
            }
            pw.close();
        }
    }

    /**
     * Produces a POS-normalised copy of the quote-fixed test set plus a log of
     * every decision taken.
     *
     * For each {@code pos^word} token of the test set:
     * <ul>
     *   <li>{@code (} and {@code )} are rewritten as {@code -LRB-} / {@code -RRB-};</li>
     *   <li>if the word was never seen in training, or was seen with this POS,
     *       the POS is kept;</li>
     *   <li>if the word was seen with exactly one (different) POS, that POS
     *       replaces the test POS when the word is listed in
     *       {@link #changingAllowed} or the test POS never occurs in training;</li>
     *   <li>otherwise the POS is kept.</li>
     * </ul>
     * Output lines have the form {@code index word (pos)}, with an empty line
     * after each sentence.
     *
     * @throws Exception propagated from treebank loading
     */
    public static void prepareTest() throws Exception {
        String dirPath = String.valueOf(Parameters.resultsPath) + "TSG/TSGkernels/TUT09/";
        File trainingTreebankFile = new File(dirPath + "TUTinPENN-train.readable.notraces_quotesFixed.penn");
        File rawTestFile = new File(dirPath + "TUTinPENN-Evalita09-testset14-9-09_quotesFixed.penn");
        File fixedTestFile = new File(dirPath + "TUTinPENN-Evalita09-testset14-9-09_quotesFixed_posFixed.penn");
        File fixLogFile = new File(dirPath + "TUTinPENN-Evalita09-testset14-9-09_quotesFixed_posFixed.log");

        ArrayList<TSNodeLabel> trainingTrees = TSNodeLabel.getTreebank(trainingTreebankFile, "UTF-8", 10000);
        Hashtable<String, HashSet<String>> posSeenForWord = TSNodeLabel.getLexPosTableFromTreebank(trainingTrees);
        HashSet<String> trainingPosSet = TSNodeLabel.getPosSetFromTreebank(trainingTrees);
        ArrayList<String[]> sentences = TUT09.getTestSentencesPosWord(rawTestFile);

        PrintWriter out = FileUtil.getPrintWriter(fixedTestFile, "UTF-8");
        HashSet<String> overridableWords = new HashSet<String>(Arrays.asList(changingAllowed));
        PrintWriter fixLog = FileUtil.getPrintWriter(fixLogFile, "UTF-8");
        fixLog.println("Changing allowed:\n" + overridableWords.toString() + "\n\n");

        int sentenceNumber = 0;
        for (String[] sentence : sentences) {
            ++sentenceNumber;
            fixLog.println("Sentence: " + sentenceNumber);

            int tokenNumber = 0;
            for (String posWord : sentence) {
                ++tokenNumber;
                int separatorAt = posWord.indexOf(lexPosSeparationChar);
                String pos = posWord.substring(0, separatorAt);
                String word = posWord.substring(separatorAt + 1);

                if (word.equals("(")) {
                    fixLog.println("Fixed open bracket '(' to '-LRB-'");
                    word = "-LRB-";
                } else if (word.equals(")")) {
                    fixLog.println("Fixed closed bracket ')' to '-RRB-'");
                    word = "-RRB-";
                }

                // The test POS is kept unless the single-alternative rule below fires.
                String chosenPos = pos;
                HashSet<String> seenPos = posSeenForWord.get(word);
                if (seenPos == null) {
                    fixLog.println("Unknown word: " + word + " (" + pos + ")");
                    if (!trainingPosSet.contains(pos)) {
                        fixLog.println("Unknown pos: " + pos);
                    }
                } else if (!seenPos.contains(pos)) {
                    boolean posSeenInTraining = trainingPosSet.contains(pos);
                    fixLog.print("Known word '" + word + "' with unseen pos: '" + pos + "' ");
                    if (!posSeenInTraining) {
                        fixLog.print("(Unknown pos) ");
                    }
                    if (seenPos.size() != 1) {
                        fixLog.println("No change, multiple choice: " + seenPos);
                    } else if (overridableWords.contains(word) || !posSeenInTraining) {
                        chosenPos = seenPos.iterator().next();
                        fixLog.println("Change to " + chosenPos);
                    } else {
                        fixLog.println("No change, word not allowed, probably unknown.");
                    }
                }

                out.println(tokenNumber + " " + word + " (" + chosenPos + ")");
            }

            fixLog.println();
            out.println();
        }
        out.close();
        fixLog.close();
    }

    /**
     * Echoes {@code line} on stdout and, when a parse log is open, appends it
     * to {@link #pwLog}.
     *
     * The log writer is only assigned by the buildAndParse* methods; guarding
     * against {@code null} keeps console reporting usable (instead of failing
     * with a NullPointerException) when a helper is invoked before a log has
     * been opened.
     *
     * @param line text to report
     */
    private static void reportLine(String line) {
        System.out.println(line);
        PrintWriter log = pwLog;
        if (log != null) {
            log.println(line);
        }
    }

    /**
     * Parses the POS-fixed test set ({@code SemTagOff}) with a single BitPar
     * grammar/lexicon pair shared by all sentences.
     *
     * Steps: de-duplicate {@code ALL_FRAGMENTS.txt}, convert the fragments to
     * CFG rules, build one grammar and lexicon covering every {@code pos^word}
     * token of the test set, then for each sentence write its word file, run
     * BitPar in a per-sentence folder and post-process the n-best output,
     * appending the selected parse to {@code BITPAR_OUTPUT_POSTPROCESSED_BEST.txt}.
     *
     * The parse log and the final output file are closed in {@code finally}
     * blocks so that whatever was reported before a failure is not lost.
     *
     * @throws Exception propagated from fragment conversion, grammar building or post-processing
     */
    public static void buildAndParseUniqueGrammar() throws Exception {
        // NOTE: bitparArgs was assembled when the class was initialised, so this
        // assignment does not change the "-b" value actually passed to BitPar.
        nBest = 1000;
        String parentPath = String.valueOf(Parameters.resultsPath) + "TSG/TSGkernels/TUT09/SemTagOff/";
        String ouputPath = parentPath + "MediumParse1000Unique/";
        File testFile = new File(parentPath + "TUTinPENN-Evalita09-testset14-9-09_quotesFixed_posFixed.penn");
        File fragmentsFile = new File(ouputPath + "ALL_FRAGMENTS.txt");
        File fragmentsUniqueFile = new File(ouputPath + "ALL_FRAGMENTS_UNIQUE.txt");
        File probGrammarFile = new File(ouputPath + "CFG_FREQ_GRAMMAR.txt");
        File cfgRuleFragmentMappingFile = new File(ouputPath + "CFG_RULE_FRAGMENT_MAPPING.txt");
        File ouputBitParPosProcessedFile = new File(ouputPath + "BITPAR_OUTPUT_POSTPROCESSED_BEST.txt");
        File grammarFile = new File(ouputPath + "GRAMMAR.txt");
        File lexiconFile = new File(ouputPath + "LEXICON.txt");
        File parseLog = new File(ouputPath + "parse.log");
        File ambiguousFragmentsLogFile = new File(ouputPath + "log_ambiguousFragmetnsCFG.txt");

        pwLog = FileUtil.getPrintWriter(parseLog, "UTF-8");
        try {
            TUT09.makeFragmentsUniqueFile(fragmentsFile, fragmentsUniqueFile);
            ConvertFragmentsToCFGRules converter = new ConvertFragmentsToCFGRules(fragmentsUniqueFile, ambiguousFragmentsLogFile);
            converter.printGrammarFile(probGrammarFile);
            converter.printRuleBestFragmentMappingFile(cfgRuleFragmentMappingFile);
            Hashtable<String, TSNodeLabel> ruleFragmentTable = converter.getRuleBestFragmentMappingTable();

            HashSet<String> testSentencesPosWordSet = TUT09.getTestSentencesPosWordSet(testFile);
            TUT09.reportLine("Preparing unique grammar for all sentences.");
            long startTime = System.currentTimeMillis();
            ConvertGrammarInBitParFormat.probGrammarForAllSentences(probGrammarFile, testSentencesPosWordSet, grammarFile, lexiconFile, lexPosSeparationChar);
            float tookSec = (float)(System.currentTimeMillis() - startTime) / 1000.0f;
            TUT09.reportLine("Took " + tookSec);

            String bitParOuputPath = ouputPath + "bitparOutput/";
            new File(bitParOuputPath).mkdir();
            ArrayList<String[]> testSentencesPosWord = TUT09.getTestSentencesPosWord(testFile);
            // Width used to zero-pad sentence numbers in folder and file names.
            int maxCounterLength = Integer.toString(testSentencesPosWord.size()).length();

            PrintWriter pwFinal = FileUtil.getPrintWriter(ouputBitParPosProcessedFile, "UTF-8");
            try {
                int sentenceIndex = 1;
                for (String[] sentencePosWord : testSentencesPosWord) {
                    startTime = System.currentTimeMillis();
                    TUT09.reportLine("Parsing sentence " + sentenceIndex);
                    String sentenceIndexPad = Utility.padZero(maxCounterLength, sentenceIndex);
                    String sentenceFolder = bitParOuputPath + "Sentence" + sentenceIndexPad + "/";
                    new File(sentenceFolder).mkdir();
                    File testSenteceBitParFormat = new File(sentenceFolder + "sentence_" + sentenceIndexPad + "_words.txt");
                    File outputBitParSentenceFile = new File(sentenceFolder + "sentence_" + sentenceIndexPad + "_bitParOut.txt");
                    TUT09.createTestSentenceFile(sentencePosWord, testSenteceBitParFormat);
                    TUT09.runBitPar(grammarFile, lexiconFile, testSenteceBitParFormat, outputBitParSentenceFile, sentenceFolder);
                    TUT09.posprocessNbest(outputBitParSentenceFile, ruleFragmentTable, pwFinal);
                    tookSec = (float)(System.currentTimeMillis() - startTime) / 1000.0f;
                    TUT09.reportLine("Finished parsing. Took " + tookSec + " sec.");
                    TUT09.reportLine("");
                    ++sentenceIndex;
                }
            } finally {
                pwFinal.close();
            }
        } finally {
            pwLog.close();
        }
    }

    /**
     * Parses the POS-fixed test set ({@code SemTagOn}) building a dedicated
     * BitPar grammar and lexicon for every sentence.
     *
     * Steps: de-duplicate {@code ALL_FRAGMENTS.txt}, convert the fragments to
     * CFG rules, then for each sentence create its folder, write the word
     * file, derive {@code GRAMMAR.txt} / {@code LEXICON.txt} for that sentence,
     * run BitPar and post-process the n-best output, appending the selected
     * parse to {@code BITPAR_OUTPUT_POSTPROCESSED_BEST.txt}.
     *
     * The parse log and the final output file are closed in {@code finally}
     * blocks so that whatever was reported before a failure is not lost.
     *
     * @throws Exception propagated from fragment conversion, grammar building or post-processing
     */
    public static void buildAndParse() throws Exception {
        String parentPath = String.valueOf(Parameters.resultsPath) + "TSG/TSGkernels/TUT09/SemTagOn/";
        String ouputPath = parentPath + "EasyParse1000L1/";
        File testFile = new File(parentPath + "TUTinPENN-Evalita09-testset14-9-09_quotesFixed_posFixed.penn");
        File fragmentsFile = new File(ouputPath + "ALL_FRAGMENTS.txt");
        File fragmentsUniqueFile = new File(ouputPath + "ALL_FRAGMENTS_UNIQUE.txt");
        File probGrammarFile = new File(ouputPath + "CFG_FREQ_GRAMMAR.txt");
        File cfgRuleFragmentMappingFile = new File(ouputPath + "CFG_RULE_FRAGMENT_MAPPING.txt");
        File ouputBitParPosProcessedFile = new File(ouputPath + "BITPAR_OUTPUT_POSTPROCESSED_BEST.txt");
        File parseLog = new File(ouputPath + "parse.log");
        File ambiguousFragmentsLogFile = new File(ouputPath + "log_ambiguousFragmetnsCFG.txt");

        pwLog = FileUtil.getPrintWriter(parseLog, "UTF-8");
        try {
            TUT09.makeFragmentsUniqueFile(fragmentsFile, fragmentsUniqueFile);
            ConvertFragmentsToCFGRules converter = new ConvertFragmentsToCFGRules(fragmentsUniqueFile, ambiguousFragmentsLogFile);
            converter.printGrammarFile(probGrammarFile);
            converter.printRuleBestFragmentMappingFile(cfgRuleFragmentMappingFile);
            Hashtable<String, TSNodeLabel> ruleFragmentTable = converter.getRuleBestFragmentMappingTable();

            ArrayList<String[]> testSentencesPosWord = TUT09.getTestSentencesPosWord(testFile);
            String bitParOuputPath = ouputPath + "bitparOutput/";
            new File(bitParOuputPath).mkdir();
            int sentenceIndex = 0;
            // Width used to zero-pad sentence numbers in folder and file names.
            int maxCounterLength = Integer.toString(testSentencesPosWord.size()).length();

            PrintWriter pwFinal = FileUtil.getPrintWriter(ouputBitParPosProcessedFile, "UTF-8");
            try {
                for (String[] sentencePosWord : testSentencesPosWord) {
                    TUT09.reportLine("Preparing grammar for sentence: " + ++sentenceIndex);
                    long startTime = System.currentTimeMillis();
                    String sentenceIndexPad = Utility.padZero(maxCounterLength, sentenceIndex);
                    String sentenceFolder = bitParOuputPath + "Sentence" + sentenceIndexPad + "/";
                    new File(sentenceFolder).mkdir();
                    File testSenteceBitParFormat = new File(sentenceFolder + "sentence_" + sentenceIndexPad + "_words.txt");
                    TUT09.createTestSentenceFile(sentencePosWord, testSenteceBitParFormat);
                    File sentenceGrammarFile = new File(sentenceFolder + "GRAMMAR.txt");
                    File sentenceLexiconFile = new File(sentenceFolder + "LEXICON.txt");
                    ConvertGrammarInBitParFormat.probGrammarForOneSentenceNew(probGrammarFile, sentencePosWord, sentenceGrammarFile, sentenceLexiconFile, lexPosSeparationChar);
                    float tookSec = (float)(System.currentTimeMillis() - startTime) / 1000.0f;
                    TUT09.reportLine("Took " + tookSec);

                    startTime = System.currentTimeMillis();
                    TUT09.reportLine("Parsing sentence...");
                    File outputBitParSentenceFile = new File(sentenceFolder + "sentence_" + sentenceIndexPad + "_bitParOut.txt");
                    TUT09.runBitPar(sentenceGrammarFile, sentenceLexiconFile, testSenteceBitParFormat, outputBitParSentenceFile, sentenceFolder);
                    TUT09.posprocessNbest(outputBitParSentenceFile, ruleFragmentTable, pwFinal);
                    tookSec = (float)(System.currentTimeMillis() - startTime) / 1000.0f;
                    TUT09.reportLine("Finished parsing. Took " + tookSec + " sec.");
                    TUT09.reportLine("");
                }
            } finally {
                pwFinal.close();
            }
        } finally {
            pwLog.close();
        }
    }

    /**
     * Writes the tokens of one sentence to {@code testSentencesPreprocessed},
     * one token per line, followed by an extra {@code "\n"} and the writer's
     * own line terminator.
     *
     * @param flatWordArrayPosTerminal tokens of the sentence, in order
     * @param testSentencesPreprocessed file to create
     */
    private static void createTestSentenceFile(String[] flatWordArrayPosTerminal, File testSentencesPreprocessed) {
        String onePerLine = String.valueOf(Utility.joinStringArrayToString(flatWordArrayPosTerminal, "\n"));
        PrintWriter sentenceWriter = FileUtil.getPrintWriter(testSentencesPreprocessed);
        sentenceWriter.println(onePerLine + "\n");
        sentenceWriter.close();
    }

    /**
     * Launches BitPar on one sentence file and forwards everything the process
     * prints to {@code reportLine}.
     *
     * The command is passed to {@link ProcessBuilder} as an argument list, so
     * file paths containing whitespace reach BitPar as single arguments
     * ({@code Runtime.exec(String)} would split them). stderr is merged into
     * stdout: draining the two pipes one after the other can block forever
     * when the process fills the pipe that is not being read. The method waits
     * for the process to end and reports a non-zero exit status.
     *
     * Failures to start or read the process are printed and swallowed, so the
     * caller continues with the next sentence.
     *
     * @param grammarFile BitPar grammar file
     * @param lexiconFile BitPar lexicon file
     * @param testSentencesWordsFile input sentence file
     * @param outputBitParSentenceFile file BitPar writes its parses to
     * @param workingDir directory the process is started in
     */
    private static void runBitPar(File grammarFile, File lexiconFile, File testSentencesWordsFile, File outputBitParSentenceFile, String workingDir) {
        ArrayList<String> command = new ArrayList<String>();
        command.add(bitparApp);
        // bitparArgs is a whitespace-separated option string; split it into real arguments.
        for (String option : bitparArgs.split("\\s+")) {
            if (option.length() > 0) {
                command.add(option);
            }
        }
        command.add(grammarFile.getPath());
        command.add(lexiconFile.getPath());
        command.add(testSentencesWordsFile.getPath());
        command.add(outputBitParSentenceFile.getPath());

        BufferedReader output = null;
        try {
            ProcessBuilder builder = new ProcessBuilder(command);
            builder.directory(new File(workingDir));
            builder.redirectErrorStream(true);
            Process p = builder.start();

            output = new BufferedReader(new InputStreamReader(p.getInputStream()));
            String line;
            while ((line = output.readLine()) != null) {
                TUT09.reportLine(line);
            }

            int exitCode = p.waitFor();
            if (exitCode != 0) {
                TUT09.reportLine("BitPar exited with status " + exitCode);
            }
        }
        catch (InterruptedException err) {
            // Preserve the interrupt for whoever is running this thread.
            Thread.currentThread().interrupt();
            err.printStackTrace();
        }
        catch (Exception err) {
            err.printStackTrace();
        }
        finally {
            if (output != null) {
                try {
                    output.close();
                }
                catch (Exception ignored) {
                    // Nothing useful can be done if closing the pipe fails.
                }
            }
        }
    }

    /**
     * Post-processes the n-best list BitPar wrote for one sentence.
     *
     * Lines starting with {@link #viterbProbPrefix} set the probability that
     * applies to the parse lines that follow and are copied verbatim to the
     * {@code *_posprocessed.txt} file next to the input. Every other non-empty
     * line has its backslashes removed, is read as a tree, has its lexicon
     * fixed and its CFG rules replaced by fragments; the converted string is
     * written out and its probability accumulated per distinct string. A
     * {@code "No parse for: "} line is replaced by the placeholder tree
     * {@code (FIX FIX)}. Finally the key selected by {@code Utility.getMaxKey}
     * is appended to {@code pwFinal}.
     *
     * Both the scanner and the per-sentence writer are closed even when a
     * parse line cannot be converted.
     *
     * @param outputBitParSentenceFile BitPar output for one sentence
     * @param ruleFragmentTable mapping from CFG rule to the fragment it stands for
     * @param pwFinal writer collecting the selected parse of every sentence; not closed here
     * @throws Exception propagated from tree construction or rule replacement
     */
    private static void posprocessNbest(File outputBitParSentenceFile, Hashtable<String, TSNodeLabel> ruleFragmentTable, PrintWriter pwFinal) throws Exception {
        File outputBitParSentencePosProcessedFile = new File(outputBitParSentenceFile.getParent() + "/" + FileUtil.getFileNameWithoutExtensions(outputBitParSentenceFile) + "_posprocessed.txt");
        Scanner scan = FileUtil.getScanner(outputBitParSentenceFile, "UTF-8");
        try {
            PrintWriter pw = FileUtil.getPrintWriter(outputBitParSentencePosProcessedFile, "UTF-8");
            try {
                // Probability of the most recent "vitprob=" line; -1 until one is seen.
                double prob = -1.0;
                Hashtable<String, double[]> parseProbTable = new Hashtable<String, double[]>();
                if (!scan.hasNextLine()) {
                    TUT09.reportLine("EMPTY FILE!!!!");
                }
                while (scan.hasNextLine()) {
                    String line = scan.nextLine();
                    if (line.equals("")) {
                        continue;
                    }
                    if (line.startsWith(viterbProbPrefix)) {
                        prob = Double.parseDouble(line.substring(viterbProbPrefixLength));
                        pw.println(line);
                        continue;
                    }
                    // Regex "\\\\" matches a single literal backslash.
                    line = line.replaceAll("\\\\", "");
                    String fragmentConvertedString;
                    if (line.startsWith("No parse for: ")) {
                        TSNodeLabel placeholder = new TSNodeLabel("(FIX FIX)");
                        TUT09.reportLine("NO PARSE SENTENCE!!");
                        fragmentConvertedString = placeholder.toString();
                    } else {
                        TSNodeLabel tree = new TSNodeLabel(line);
                        TUT09.fixLexiconInTree(tree);
                        fragmentConvertedString = tree.replaceRulesWithFragments(ruleFragmentTable).toString();
                    }
                    pw.println(fragmentConvertedString);
                    Utility.increaseStringDoubleArray(parseProbTable, fragmentConvertedString, prob);
                }
                pwFinal.println(Utility.getMaxKey(parseProbTable));
            } finally {
                pw.close();
            }
        } finally {
            scan.close();
        }
    }

    /**
     * Normalises the lexical leaves of a parsed tree in place.
     *
     * A leaf whose label equals its parent's label is dropped by turning the
     * parent itself into a lexical node. Any other leaf is relabelled with the
     * text following the first {@link #lexPosSeparationChar} in its label (the
     * whole label when the separator is absent).
     *
     * @param tree tree to modify
     */
    private static void fixLexiconInTree(TSNodeLabel tree) {
        for (TSNodeLabel leaf : tree.collectLexicalItems()) {
            String leafLabel = leaf.label();
            if (leafLabel.equals(leaf.parent.label())) {
                leaf.parent.daughters = null;
                leaf.parent.isLexical = true;
            } else {
                int wordStart = leafLabel.indexOf(lexPosSeparationChar) + 1;
                leaf.relabel(leafLabel.substring(wordStart));
            }
        }
    }

    /**
     * Collapses duplicate fragments of a {@code freq<TAB>tree} file, keeping
     * for each tree the highest frequency found, and writes the result in the
     * same format to {@code fragmentsUniqueFile}, followed by one empty line.
     * Empty input lines are skipped.
     *
     * The input scanner (previously never closed) and the output writer are
     * both released in {@code finally} blocks.
     *
     * @param fragmentsFile input file, possibly containing repeated trees
     * @param fragmentsUniqueFile output file with one line per distinct tree
     */
    private static void makeFragmentsUniqueFile(File fragmentsFile, File fragmentsUniqueFile) {
        // tree -> single-element array holding the largest frequency seen so far
        Hashtable<String, int[]> fragFreq = new Hashtable<String, int[]>();
        Scanner scan = FileUtil.getScanner(fragmentsFile, "UTF-8");
        try {
            while (scan.hasNextLine()) {
                String line = scan.nextLine();
                if (line.equals("")) {
                    continue;
                }
                String[] freqTree = line.split("\t");
                int freq = Integer.parseInt(freqTree[0]);
                String tree = freqTree[1];
                int[] storedFreq = fragFreq.get(tree);
                if (storedFreq == null) {
                    fragFreq.put(tree, new int[]{freq});
                } else if (storedFreq[0] < freq) {
                    storedFreq[0] = freq;
                }
            }
        } finally {
            scan.close();
        }

        PrintWriter pw = FileUtil.getPrintWriter(fragmentsUniqueFile, "UTF-8");
        try {
            for (Map.Entry<String, int[]> e : fragFreq.entrySet()) {
                pw.println(e.getValue()[0] + "\t" + e.getKey());
            }
            pw.println();
        } finally {
            pw.close();
        }
    }

    /**
     * Reads a test file whose token lines look like {@code index word (pos)}
     * and returns one array of {@code pos^word} strings per sentence.
     *
     * A line is a token line when it starts with a digit and has at least one
     * more character; any other line ends the current sentence. The sentence
     * still being collected when the file ends is returned as well, so a file
     * without a trailing separator line no longer loses its last sentence.
     * The scanner is closed before returning.
     *
     * @param testFile file to read (UTF-8)
     * @return the sentences in file order
     */
    private static ArrayList<String[]> getTestSentencesPosWord(File testFile) {
        ArrayList<String[]> result = new ArrayList<String[]>();
        ArrayList<String> sentencePosWords = new ArrayList<String>();
        Scanner scan = FileUtil.getScanner(testFile, "UTF-8");
        try {
            while (scan.hasNextLine()) {
                String line = scan.nextLine();
                if (line.equals("") || !line.matches("^\\d+.+")) {
                    // Separator line: close the sentence collected so far, if any.
                    if (!sentencePosWords.isEmpty()) {
                        result.add(sentencePosWords.toArray(new String[sentencePosWords.size()]));
                        sentencePosWords.clear();
                    }
                    continue;
                }
                String[] lineSplit = line.split("\\s+");
                String word = lineSplit[1];
                String pos = lineSplit[2];
                // Drop the first and last character, i.e. the brackets around the POS.
                pos = pos.substring(1, pos.length() - 1);
                sentencePosWords.add(pos + lexPosSeparationString + word);
            }
        } finally {
            scan.close();
        }
        // Flush the last sentence when the file does not end with a separator line.
        if (!sentencePosWords.isEmpty()) {
            result.add(sentencePosWords.toArray(new String[sentencePosWords.size()]));
        }
        return result;
    }

    /**
     * Collects the distinct {@code pos^word} tokens of a test file whose token
     * lines look like {@code index word (pos)}. Lines that do not start with a
     * digit followed by at least one more character are ignored.
     *
     * The scanner (previously never closed) is released before returning.
     *
     * @param testFile file to read (UTF-8)
     * @return the set of {@code pos^word} strings found in the file
     */
    private static HashSet<String> getTestSentencesPosWordSet(File testFile) {
        HashSet<String> result = new HashSet<String>();
        Scanner scan = FileUtil.getScanner(testFile, "UTF-8");
        try {
            while (scan.hasNextLine()) {
                String line = scan.nextLine();
                if (line.equals("") || !line.matches("^\\d+.+")) {
                    continue;
                }
                String[] lineSplit = line.split("\\s+");
                String word = lineSplit[1];
                String pos = lineSplit[2];
                // Drop the first and last character, i.e. the brackets around the POS.
                pos = pos.substring(1, pos.length() - 1);
                result.add(pos + lexPosSeparationString + word);
            }
        } finally {
            scan.close();
        }
        return result;
    }

    /**
     * Counts how often each {@code parentLabel_leafLabel} pair occurs among
     * the lexical items of {@link #trainFile} and writes one
     * {@code count pair} line per distinct pair to {@code lexiconPosFreq.txt}
     * in {@link #corpusPath}.
     *
     * @throws Exception propagated from treebank loading
     */
    public static void getLexStatistics() throws Exception {
        File outputFile = new File(corpusPath + "lexiconPosFreq.txt");
        ArrayList<TSNodeLabel> trees = TSNodeLabel.getTreebank(trainFile);

        // "parent_leaf" -> single-element counter array
        Hashtable<String, int[]> pairCounts = new Hashtable<String, int[]>();
        for (TSNodeLabel tree : trees) {
            for (TSNodeLabel leaf : tree.collectLexicalItems()) {
                String pair = leaf.parent.label() + "_" + leaf.label();
                Utility.increaseStringIntArray(pairCounts, pair);
            }
        }

        PrintWriter writer = FileUtil.getPrintWriter(outputFile);
        for (Map.Entry<String, int[]> entry : pairCounts.entrySet()) {
            writer.println(entry.getValue()[0] + " " + entry.getKey());
        }
        writer.close();
    }

    /**
     * Prints, for at most the first 100 fragments of a hard-coded fragment
     * file, the fragment followed by a tab and the value returned by
     * {@code TSNodeLabel.countFragmentInTreebank} against
     * {@link #trainFileNoTracesNoSemTags}.
     *
     * @throws Exception propagated from treebank loading
     */
    public static void getFragmentFreq() throws Exception {
        ArrayList<TSNodeLabel> treebank = TSNodeLabel.getTreebank(trainFileNoTracesNoSemTags);
        File fragmentFile = new File("/Users/fsangati/Documents/UNIVERSITY/UVA/Papers/Evalita09/constPaper/Fragments/fragments_MUB_freq_all_sorted_VMA_\u00e8.txt");
        ArrayList<TSNodeLabel> fragments = TSNodeLabel.getTreebank(fragmentFile, false);

        int limit = Math.min(100, fragments.size());
        for (int k = 0; k < limit; ++k) {
            TSNodeLabel fragment = fragments.get(k);
            int occurrences = TSNodeLabel.countFragmentInTreebank(treebank, fragment);
            System.out.println(fragment + "\t" + occurrences);
        }
    }

    /**
     * Entry point; currently runs only {@link #getFragmentFreq()}.
     *
     * @param args ignored
     * @throws Exception propagated from the invoked task
     */
    public static void main(String[] args) throws Exception {
        getFragmentFreq();
    }
}

