/* Statistical labeller - revised version.

   07-12-92	Created
   24-12-92	Restructured
   01-04-93	Change from using FSMs to parser
   14-04-93	FSMs reinstated
   05-04-93	Tag inference option added
   27-01-95     Unknown word handler added

   Copyright (C) David Elworthy 1995

   Usage:
	label corpus options
*/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <locale.h>
#include "common.h"
#include "diction.h"
#include "trans.h"
#include "label.h"
#include "map.h"
#include "low.h"
#include "unkcommon.h"
#include "unkdiction.h"
#ifdef Analyse
#include "analyse.h"
#endif
#ifdef Use_Parser
#include "parser.h"
#endif
#ifdef Use_FSM
#include "fsm.h"
#endif

/* Default dictionary size: set_up_options falls back to this value on a
   training run when no positive size was supplied with option 'A'. */
#define DICTLEN (60000)

/* Initialisation codes (input option 'I').
   Each code is a distinct bit, so several may be combined in the single
   integer read by option 'I'; set_up_options tests them with bitwise AND.
   NOTE(review): the d_/t_ prefixes presumably stand for dictionary and
   transitions - confirm against the code that consumes the value. */
#define Init_d_ntag	(1)
#define	Init_d_tagmax	(2)
#define	Init_t_tagmax	(4)
#define	Init_d_1	(8)
#define	Init_t_1	(16)
#define Init_d_s	(32)

/* File name and structure for unknown word features.
   This is a pointer with static storage, so it starts out NULL; it is
   allocated on demand in set_up_options by the unknown word options. */
Features  features;

/* Use tag groups when compiling output statistics. */
BOOL use_tag_groups;

/* Get a word structure (defined outside this file). */
extern BOOL fetch_word(FILE *fp, Dict *dict, Dict *skip_dict, Word word);

/*-----------------------------------------------------------------------------
    check_names

    Verify that neither the dictionary nor the transitions file name has
    been filled in yet. Either name may be passed as NULL to leave it out
    of the check. A message naming 'kind' is written to stderr for each
    name that is already set.

    Returns TRUE when both names are still free, FALSE otherwise.
-----------------------------------------------------------------------------*/

static BOOL check_names(char *dict, char *tran, char *kind)
{
    /* A name counts as already given when it is a non-empty string */
    int dict_given = (dict != NULL && dict[0] != 0);
    int tran_given = (tran != NULL && tran[0] != 0);

    if (dict_given)
	fprintf(stderr, "%s dictionary specified more than once\n", kind);

    if (tran_given)
	fprintf(stderr, "%s transitions specified more than once\n", kind);

    if (dict_given || tran_given)
	return FALSE;

    return TRUE;
}

/*-----------------------------------------------------------------------------
    init_trans

    Fill the trans and pi arrays of a transitions structure (gamma is left
    alone) by handing them, together with the score 's' and the 'mul'
    flag, to set_trans and set_pi.
-----------------------------------------------------------------------------*/

static void init_trans(Trans *trans, Score s, BOOL mul)
{
    /* Read the size once; both arrays are set up with the same size */
    int n = trans->size;

    set_trans(trans->trans, s, n, mul);
    set_pi(trans->pi, s, n, mul);
}

/*-----------------------------------------------------------------------------
    set_up_options

    Set all the option flags, file names and numerical parameters.
-----------------------------------------------------------------------------*/

static void set_up_options(int argc, char *argv[],
		int *iterations, int *initialise, int *dict_size,
		char *dict, char *tran, char *odict, char *otran,
		char *out, char *map, char *skip, char *reduce,
		char *fsm, char *grammar, char *infer, char *ukw,
	        char *ofeatures, char *obadword, char *unktaggroup )
{
    char opt;
    int  arg = 2, i = 0;
    char root[MAXFN];
    BOOL error = FALSE;

    /* The following string defines the known option letters. ':' indicates
       that an argument is needed. */
#ifdef BT
    char *opt_string = "aA:bB:c:C:d:D:fFi:I:lm:M:nNo:O:pr:R:St:T:uVwx:XzZ";
#else
    char *opt_string =
	"aA:bB:c:C:d:D:e:EfFgGh:H:i:I:j:J:k:K:lL:m:M:nNo:O:pPq:Qr:R:s:St:T:uUVwx:XYzZ";
#endif

    *iterations = 1;
    *initialise = 0;
    *dict_size  = 0;
    SetOutOpt(out_word);
    SetInOpt(tagged);
    dict[0] = tran[0] = odict[0] = otran[0] = out[0] = map[0] =
    fsm[0] = grammar[0] = infer[0] = skip[0] = reduce[0] = 
    ofeatures[0] = obadword[0] = 0;

    /* Analyse the options */
    while ((opt = get_option(argc, argv, opt_string, &arg, &i)) != 0)
    {
	switch (opt)
	{
	    case 'a': SetOption(anchored); break;
	    case 'A':
		if (!get_opt_int(argv, dict_size, &arg, &i, opt))
		    *dict_size = 0;
		break;
	    case 'b': SetOption(num_stabilise); break;
	    case 'B':
		SetOption(reestimate);
		if (!get_opt_int(argv, iterations, &arg, &i, opt))
		    *iterations = 1;
		break;
	    case 'c':
	    {
		Score threshold;

		if (get_opt_double(argv, &threshold, &arg, &i, opt))
		{
		    SetOption(use_threshold);
		    set_output_threshold(threshold);
		}
		break;
	    }
	    case 'C':
		if (!get_opt_int(argv, &options.in, &arg, &i, opt))
		    options.in = tagged;
		break;
	    case 'd':
		if (check_names(dict, NULL, "Input"))
		    get_opt_string(argv, dict, MAXFN, &arg, &i, opt);
		break;
	    case 'D':
		if (check_names(odict, NULL, "Output"))
		    get_opt_string(argv, odict, MAXFN, &arg, &i, opt);
		break;
	    case 'e':
		/* Have to get string so arg and i are straight */
		get_opt_string(argv, fsm, MAXFN, &arg, &i, opt);
#ifdef Use_FSM
		SetOption(use_fsm);
#else
		fprintf(stderr, "Option 'e' ignored (no FSMs)\n");
#endif
		break;
	    case 'E':
#ifdef Use_FSM
		SetOption(fsm_trace);
#else
		fprintf(stderr, "Option 'E' ignored (no FSMs)\n");
#endif
		break;
	    case 'f': SetOption(most_freq); break;
	    case 'F': SetOption(fb_tagging); break;
	    case 'g': SetOption(good_turing); break;
	    case 'G': SetOption(good_turing_lex); break;
	    case 'h':
		if (!get_opt_int(argv, &(features->maxunkwords), &arg, &i, opt))
		    features->maxunkwords = MAXUNKWORDS;
		break;
	    case 'H':
	        SetOption(unknown_morph);
	        if ( features == NULL )
		  {
		    /* Set up and initialize the features structure */
		    Allocate(features, sizeof(FeatureSt), "feature structure - unk words");
		  }
		if (!get_opt_int(argv, &(features->maxsuffix), &arg, &i, opt))
		    features->maxsuffix = MinSuffixLen;
		break;
	    case 'i':
		get_opt_string(argv, infer, MAXFN, &arg, &i, opt);
		break;
	    case 'I':
		if (!get_opt_int(argv, initialise, &arg, &i, opt))
		    *initialise = 0;
		break;
	    case 'j':
		if (unktaggroup[0] != 0)
		    fprintf(stderr,
			"Unknown word tag group file name specified more than once\n");
		else
		    get_opt_string(argv, unktaggroup, MAXFN, &arg, &i, opt);
		break;
	    case 'J':
		if (check_names(ofeatures, NULL, "Output"))
		    get_opt_string(argv, ofeatures, MAXFN, &arg, &i, opt);
		break;
	    case 'k':
	        SetOption(unknown_morph);
	        if ( features == NULL )
		  {
		    /* Set up and initialize the features structure */
		    Allocate(features, sizeof(FeatureSt), "feature structure - unk words");
		  }
		if (!get_opt_int(argv, &(features->maxprefcut), &arg, &i, opt))
		    features->maxprefcut = MinPrefixLen;
		break;
	    case 'K':
	        SetOption(unknown_morph);
	        if ( features == NULL )
		  {
		    /* Set up and initialize the features structure */
		    Allocate(features, sizeof(FeatureSt), "feature structure - unk words");
		  }
		if (!get_opt_int(argv, &(features->maxsuffcut), &arg, &i, opt))
		    features->maxsuffcut = MinSuffixLen;
	        break;
	    case 'l': SetOption(training); break;
	    case 'L':
	    {
		Score threshold;
		if (get_opt_double(argv, &threshold, &arg, &i, opt))
		{
		    SetOption(reest_threshold);
		    set_re_est_threshold(threshold);
		}
		break;
	    }
	    case 'm':
		if (map[0] != 0)
		    fprintf(stderr,
			"Map file name specified more than once\n");
		else
		    get_opt_string(argv, map, MAXFN, &arg, &i, opt);
		break;
	    case 'M':
		if (reduce[0] != 0)
		    fprintf(stderr,
			"Reduced tag set file specified more than once\n");
		else
		{
		    get_opt_string(argv, reduce, MAXFN, &arg, &i, opt);
		    SetOption(reduced_tags);
		}
		break;
	    case 'n': SetOption(any_digit); break;
	    case 'N': SetOption(parsed_number); break;
	    case 'o':
		if (out[0] != 0)
		    fprintf(stderr,
			"Output file name specified more than once\n");
		else
		    get_opt_string(argv, out, MAXFN, &arg, &i, opt);
		break;
	    case 'O':
		if (!get_opt_int(argv, &options.out, &arg, &i, opt))
		    options.out = out_word;
		if (!no_output) SetOutOpt(out_word);
		break;
	    case 'p': SetOption(product); break;
	    case 'P':
#ifdef Phrasal
		SetOption(anchor_bracket);
#else
		fprintf(stderr, "Option 'P' ignored (not phrasal)\n");
#endif
		break;
	    case 'q':
		get_opt_string(argv, grammar, MAXFN, &arg, &i, opt);
#ifdef Use_Parser
		SetOption(use_parser);
#else
		fprintf(stderr, "Option 'q' ignored (no parser)\n");
#endif
		break;
	    case 'Q':
#ifdef Use_Parser
		SetOption(parser_trace);
#else
		fprintf(stderr, "Option 'Q' ignored (no parser)\n");
#endif
		break;
	    case 'r':
		if (check_names(dict, tran, "Input") &&
			get_opt_string(argv, root, MAXFN, &arg, &i, opt))
		    make_names(root, dict, tran, MAXFN);
		break;
	    case 'R':
		if (check_names(odict, otran, "Output") &&
			get_opt_string(argv, root, MAXFN, &arg, &i, opt))
		    make_names(root, odict, otran, MAXFN);
		break;
	    case 's':
	      if (check_names(obadword, NULL, "Output"))
		get_opt_string(argv, obadword, MAXFN, &arg, &i, opt);
	      break;
	    case 'S': SetOption(report_stats); break;
	    case 't':
		if (check_names(NULL, tran, "Input"))
		    get_opt_string(argv, tran, MAXFN, &arg, &i, opt);
		break;
	    case 'T':
		if (check_names(NULL, otran, "Output"))
		    get_opt_string(argv, otran, MAXFN, &arg, &i, opt);
		break;
	    case 'u': SetOption(report_unknown); break;
	    case 'U':
		if (get_opt_string(argv, ukw, MAXFN, &arg, &i, opt))
		  {
		    SetOption(unknown_rules);
		    if ( features == NULL )
		      {
			/* Set up and initialize the features structure */
			Allocate(features, sizeof(FeatureSt), "feature structure - unk words");
		      }
		  }
	        break;
	    case 'V': SetOption(Viterbi); break;
	    case 'w': SetOption(use_wordlist); break;
	    case 'x':
		if (check_names(NULL, skip, "Skip list"))
		{
		    get_opt_string(argv, skip, MAXFN, &arg, &i, opt);
		    SetOption(skip_list);
		}
		break;
	    case 'X': SetOption(special); break;
	    case 'Y': SetOption(unkdebug); break;
	    case 'z': SetOption(verbose); break;
	    case 'Z': SetOption(debug); break;
	}
    }

    /* Set up default for mapping */
    if (map[0] == 0) strcpy(map, "tags.map");

    /* Fiddle iterations if training */
    if (Option(training))
    {
	if (*iterations < 1)	*iterations = 1;
	if (Option(Viterbi))    *iterations = 2;
    	if (Option(fb_tagging) && !Option(reestimate))	*iterations = 2;
	if (*dict_size <= 0)	*dict_size = DICTLEN;
    }

    /* If there were no other tagging options, set f-b */
    if (!Option(most_freq) && !Option(Viterbi))
	SetOption(fb_tagging);

    /* Verification of options */
    /* Check numbers */
    if (Option(any_digit) && Option(parsed_number))
    {
	fprintf(stderr, "Both number options specified\n");
	error = TRUE;
    }

    /* Check input files */
    if (Option(training) && Option(most_freq))
    {
	fprintf(stderr,
	"Training must be a separate run from 'most frequent' tagging\n");
	error = TRUE;
    }

    if (Option(training) && Option(unknown_rules))
    {
	fprintf(stderr,
     "Training must be a separate run from tagging with unknown word rules\n");
	error = TRUE;
    }

    if (!Option(training) && (Option(good_turing) || Option(good_turing_lex)))
    {
	fprintf(stderr,	"Good-Turing adjustment only applies when training\n");
	ClearOption( good_turing );
	ClearOption( good_turing_lex );
	/* Just a warning! */
    }

    if (dict[0] == 0 && (!Option(training) || Option(use_wordlist)))
    {
	fprintf(stderr, "Must specify input dictionary (unless training)\n");
	error = TRUE;
    }

    if (tran[0] == 0 && !Option(training) &&
	!(*initialise & Init_t_1 & Init_t_tagmax 
	& (Init_d_ntag | Init_d_tagmax | Init_d_s)))
    {
	fprintf(stderr,
	"Must specify either transitions file or initialisation option\n");
	error = TRUE;
    }
    if (Option(training) && !Option(use_wordlist) &&
		(dict[0] != 0 || tran[0] != 0))
    {
	fprintf(stderr,
	"Dictionary/transitions input file ignored for training run\n");
	dict[0] = tran[0] = 0;
    }
    if (Option(use_wordlist) && dict[0] == 0)
    {
	fprintf(stderr, "Dictionary must be specified for wordlist\n");
	error = TRUE;
    }

    if (dict[0] == 0 && infer[0] != 0)
    {
	fprintf(stderr,
	"Tag inference may only be specified with an input dictionary\n");
	error = TRUE;
    }

    /* Check corpus */
    if (Option(training) && !Option(use_wordlist) && InOpt(untagged_input))
    {
	fprintf(stderr, "Training requires a tagged corpus\n");
	error = TRUE;
    }

    /* Check output files and options */
    if (Option(Viterbi) && (OutOpt(all_tags) || OutOpt(out_scores)))
    {
	fprintf(stderr, "Viterbi run: output options ignored\n");
	ClearOutOpt(all_tags | out_scores);
    }
    if (InOpt(untagged_input))
    {
	ClearOutOpt(err_only);
        ClearOption(report_stats);
        ClearOutOpt(prob_dist);
#ifdef Analyse
	ClearOutOpt(analyse);
#endif
    }
    if (Option(training) && (*iterations == 1) &&
		odict[0] == 0 && otran[0] == 0)
    {
	fprintf(stderr, "Warning: training run with no output files\n");
    }

    /* Check tagging options */
    if (Option(Viterbi) && (*iterations != (Option(training) ? 2 : 1)))
    {
	fprintf(stderr, "Iterations parameter ignored for Viterbi run\n");
	*iterations = 1;
    }
    if (*iterations < 1)
    {
	fprintf(stderr, "Number of iterations must be 1 or more\n");
	error = TRUE;
    }
    if (Option(training) && (*iterations == 1) && !no_output)
    {
	fprintf(stderr,
		"Warning: 'No output' set since training and 1 iteration\n");
	options.out = no_out_opt;
	ClearOption(report_stats);
	ClearOutOpt(prob_dist);
    }
    if (Option(most_freq) && (Option(Viterbi) || Option(reestimate)))
    {
	fprintf(stderr, "'Most frequent option set: ignoring others\n");
	ClearOption(Viterbi);
	ClearOption(reestimate);
    }

    if (Option(reest_threshold) && !Option(reestimate))
    {
	fprintf(stderr, "Re-estimation threshold ignored\n");
	ClearOption(reest_threshold);
    }

    if (OutOpt(prob_dist) &&
	((Option(training) && *iterations == 1) || Option(most_freq) ||
	Option(Viterbi)))
    {
	fprintf(stderr, "Probability distribution option ignored\n");
	ClearOutOpt(prob_dist);
    }    

    if (Option(use_threshold) &&
	((Option(training) && *iterations == 1) || Option(most_freq) ||
	Option(Viterbi)))
    {
	fprintf(stderr, "Thresholding distribution option ignored\n");
	ClearOption(use_threshold);
    }    

    /* Check initialisation options */
    if (Option(product) & !Option(training))
    {
	fprintf(stderr,
		"Product option ignored except when training\n");
	ClearOption(product);
    }

    /* Check we made it through unscathed */
    if (error) get_out();
}

/*
==============================================================================
Feature memory deallocation functions.
*/

/*-----------------------------------------------------------------------------
    free_transform_list

    Free a transform list's memory.

    tagtrans	address of the list head; may be NULL, and the list itself
		may be empty. On return *tagtrans is NULL.

    Every node of the chain is freed. The successor is read before the
    node is released, so no freed node is ever looked at again and no
    node is skipped.
-----------------------------------------------------------------------------*/

void free_transform_list( TagTrans *tagtrans )
{
  TagTrans this_tagtrans, next_tagtrans;

  if ( tagtrans != NULL )
    {
      for ( this_tagtrans = *tagtrans; this_tagtrans != NULL; this_tagtrans = next_tagtrans )
	{
	  /* Remember the successor while the node is still valid */
	  next_tagtrans = this_tagtrans->next;
	  free( this_tagtrans );
	}

      *tagtrans = NULL;
    }
}

/*-----------------------------------------------------------------------------
    free_integrated_transform_list

    Free an integrated transform list's memory.

    tagtrans	address of the list head; may be NULL, and the list itself
		may be empty. On return *tagtrans is NULL.

    For every node the source tag and transform tag lists are released
    with free_tagscore_list before the node itself is freed. The successor
    is read before the node is released, so no freed node is ever looked
    at again and no node is skipped.
-----------------------------------------------------------------------------*/

void free_integrated_transform_list( TagTrans *tagtrans )
{
  TagTrans this_tagtrans, next_tagtrans;

  if ( tagtrans != NULL )
    {
      for ( this_tagtrans = *tagtrans; this_tagtrans != NULL; this_tagtrans = next_tagtrans )
	{
	  /* Remember the successor while the node is still valid */
	  next_tagtrans = this_tagtrans->next;

	  free_tagscore_list( &(this_tagtrans->source_tag) );
	  free_tagscore_list( &(this_tagtrans->transform_tags) );
	  free( this_tagtrans );
	}

      *tagtrans = NULL;
    }
}

/*-----------------------------------------------------------------------------
    free_taglist

    Free a tag list's memory.

    taglist	list whose storage is released; may be NULL. The TagList
		structure itself is not freed.

    For each of the maxsize entries that has a tag text, the text and any
    tag group are freed. The entry array and the key array are then freed
    and their pointers in the structure are cleared so that the caller is
    not left holding dangling pointers.
-----------------------------------------------------------------------------*/

void free_taglist( TagList *taglist )
{
  int    i;
  TagTag s;

  if ( taglist != NULL )
    {
      /* Free the tag list allocations */
      s  = taglist->s;
      if ( s != NULL )
	{
	  for ( i = 0 ; i < taglist->maxsize ; i++, s++ )
	    {
	      if ( s->tagtext != NULL )
		{
		  free( s->tagtext );
		  if ( s->group != NULL )
		    {
		      free_tagscore_list( &(s->group) );
		    }
		}
	    }
	}

      free( taglist->s );
      free( taglist->key );

      /* Clear the members; assigning NULL to the parameter had no effect */
      taglist->s   = NULL;
      taglist->key = NULL;
    }
}

/*-----------------------------------------------------------------------------
    free_afflist

    Free an affix list's memory.

    afflist	list whose storage is released; may be NULL. The AffList
		structure itself is not freed.

    For each of the maxsize entries the affix text, the vanilla list and
    the integrated tag score list are freed when present. The entry array
    and the key array are then freed and their pointers in the structure
    are cleared so that the caller is not left holding dangling pointers.
-----------------------------------------------------------------------------*/

void free_afflist( AffList *afflist )
{
  int    i;
  TagAff s;

  if ( afflist != NULL )
    {
      /* Free the affix list allocations */
      s  = afflist->s;
      if ( s != NULL )
	{
	  for ( i = 0 ; i < afflist->maxsize ; i++, s++ )
	    {
	      if ( s->affix != NULL )
		{
		  free( s->affix );
		}

	      if ( s->vanilla_tagscore_list != NULL )
		{
		  free_integrated_transform_list( &(s->vanilla_tagscore_list) );
		}

	      if ( s->integrated_tagscore_list != NULL )
		{
		  free_tagscore_list( &(s->integrated_tagscore_list) );
		}
	    }
	}

      free( afflist->s );
      free( afflist->key );

      /* Clear the members; assigning NULL to the parameter had no effect */
      afflist->s   = NULL;
      afflist->key = NULL;
    }
}

/*-----------------------------------------------------------------------------
    free_cutlist

    Free a cut list's memory.

    cutlist	list whose storage is released; may be NULL. The CutList
		structure itself is not freed.

    For each of the maxsize entries the cut text, the transform list, the
    integrated transform list and the special tags are freed when present.
    The entry array and the key array are then freed and their pointers in
    the structure are cleared so that the caller is not left holding
    dangling pointers.
-----------------------------------------------------------------------------*/

void free_cutlist( CutList *cutlist )
{
  int    i;
  TagCut s;

  if ( cutlist != NULL )
    {
      /* Free the cut list allocations */
      s  = cutlist->s;
      if ( s != NULL )
	{
	  for ( i = 0 ; i < cutlist->maxsize ; i++, s++ )
	    {
	      if ( s->cut != NULL )
		{
		  free( s->cut );
		}

	      if ( s->transform_list != NULL )
		{
		  free_transform_list( &(s->transform_list) );
		}

	      if ( s->integrated_transform_list != NULL )
		{
		  free_integrated_transform_list( &(s->integrated_transform_list) );
		}

	      if ( s->special_tags != NULL )
		{
		  free_tagscore_list( &(s->special_tags) );
		}
	    }
	}

      free( cutlist->s );
      free( cutlist->key );

      /* Clear the members; assigning NULL to the parameter had no effect */
      cutlist->s   = NULL;
      cutlist->key = NULL;
    }
}

/*-----------------------------------------------------------------------------
    free_wordlist

    Free a word list's memory.

    start	address of the list head; may be NULL, and the list itself
		may be empty. On return *start is NULL.

    Only the chain nodes are freed; the dictionary words they refer to are
    left alone. The successor is read before the node is released, so no
    freed node is ever looked at again and no node is skipped.
-----------------------------------------------------------------------------*/

void free_wordlist( IndexWord *start )
{
  IndexWord this_word, next_word;

  if ( start != NULL )
    {
      for ( this_word = *start; this_word != NULL; this_word = next_word )
	{
	  /* Remember the successor while the node is still valid */
	  next_word = this_word->next;
	  free( this_word );
	}

      *start = NULL;
    }
}

/*-----------------------------------------------------------------------------
    free_indexlist

    Free an index list's memory.

    indexlist	list whose storage is released; may be NULL. The IndexList
		structure itself is not freed.

    For each of the maxsize entries the prefix text and the word chain are
    freed when present. The entry array and the key array are then freed
    and their pointers in the structure are cleared so that the caller is
    not left holding dangling pointers.
-----------------------------------------------------------------------------*/

void free_indexlist( IndexList *indexlist )
{
  int    i;
  Index  s;

  if ( indexlist != NULL )
    {
      /* Free the index list allocations */
      s  = indexlist->s;
      if ( s != NULL )
	{
	  for ( i = 0 ; i < indexlist->maxsize ; i++, s++ )
	    {
	      if ( s->prefix != NULL )
		{
		  free( s->prefix );
		}

	      if ( s->wordlist_start != NULL )
		{
		  free_wordlist( &(s->wordlist_start) );
		}
	    }
	}

      free( indexlist->s );
      free( indexlist->key );

      /* Clear the members; assigning NULL to the parameter had no effect */
      indexlist->s   = NULL;
      indexlist->key = NULL;
    }
}

/*-----------------------------------------------------------------------------
    free_features

    Free the feature structure memory.

    features	address of the feature structure pointer; nothing is done
		when either it or the pointer it holds is NULL. On return
		*features is NULL.

    All affix, cut, dictionary, index and tag group lists held in the
    structure are released first, then the structure itself.
-----------------------------------------------------------------------------*/

void free_features( Features *features )
{
  if ( features != NULL && *features != NULL )
    {
      free_afflist( &((*features)->sufflist ));
      free_afflist( &((*features)->variable_sufflist ));
      free_afflist( &((*features)->separator_sufflist ));
      free_afflist( &((*features)->variable_separator_sufflist ));


      free_cutlist( &((*features)->cut_list) );
      free_cutlist( &((*features)->container_cut_list) );
      free_cutlist( &((*features)->replacement_cut_list) );
      free_cutlist( &((*features)->special_cut_list) );

      free_cutlist( &((*features)->sep_cut_list) );
      free_cutlist( &((*features)->sep_container_cut_list) );
      free_cutlist( &((*features)->sep_replacement_cut_list) );
      free_cutlist( &((*features)->sep_special_cut_list) );

      free_dict( &((*features)->featdict) );
      free_dict( &((*features)->unigramdict) );
      free_dict( &((*features)->unkdict) );
      free_dict( &((*features)->capdict) );
      free_dict( &((*features)->sepdict) );
      free_dict( &((*features)->unkstatdict) );

      free_indexlist( &((*features)->indexlist) );
      free_indexlist( &((*features)->enclosure_indexlist) );
      free_indexlist( &((*features)->partialcap_indexlist) );
      free_indexlist( &((*features)->separator_indexlist) );

      free_taglist( &((*features)->tag_group_list) );

      /* Release the structure itself, not the caller's pointer variable */
      free( *features );

      *features = NULL;
    }
}

/*
==============================================================================
Feature print functions.
*/

/*-----------------------------------------------------------------------------
    print_tags

    Write a chain of tag/score pairs to a file on a single line.

    ofile	output stream
    start	first element of the chain; nothing at all is written when
		this is NULL
    heading	optional text written in square brackets before the pairs;
		NULL for none
-----------------------------------------------------------------------------*/

void print_tags( FILE *ofile, TagScore start, char *heading )
{
  TagScore ts;

  /* An empty chain produces no output, not even the newline */
  if ( start == NULL )
    return;

  if ( heading != NULL )
    fprintf(ofile, "[%s] ", heading);

  ts = start;
  while ( ts != NULL )
    {
      fprintf(ofile, "%s %.6g ", unmap_tag(ts->tag), ts->score);
      ts = ts->next;
    }

  fprintf(ofile, "\n");
}

/*-----------------------------------------------------------------------------
    print_taglist

    Write the tag text and score of each tag list entry to a file.

    The first 'size' elements of the key array are visited in order;
    entries without a tag text are passed over. No newline is written.
-----------------------------------------------------------------------------*/

void print_taglist( FILE *ofile, TagList start )
{
  int n;

  for ( n = 0; n < start.size; n++ )
    {
      TagTag entry = start.key[n];

      if ( entry->tagtext == NULL )
	continue;

      fprintf(ofile, "%s %g ", entry->tagtext, entry->score );
    }
}

/*-----------------------------------------------------------------------------
    print_affixes

    Write each affix with its total score and integrated tag scores to a
    file.

    The first 'size' elements of the key array are visited in order;
    entries with no integrated tag score list are passed over.
-----------------------------------------------------------------------------*/

void print_affixes( FILE *ofile, AffList start )
{
  int n;

  for ( n = 0; n < start.size; n++ )
    {
      TagAff entry = start.key[n];

      if ( entry->integrated_tagscore_list == NULL )
	continue;

      fprintf(ofile, "%s(%g) ", entry->affix, entry->total_score);
      print_tags( ofile, entry->integrated_tagscore_list, NULL );
    }
}

/*-----------------------------------------------------------------------------
    print_transforms

    Write a chain of transforms to a file.

    For every node that has transform tags, the source tag and total score
    are written as an indented heading, followed by the transform tags and
    a blank line. Nodes without transform tags are passed over.
-----------------------------------------------------------------------------*/

void print_transforms( FILE *ofile, TagTrans start )
{
  TagTrans node = start;

  while ( node != NULL )
    {
      if ( node->transform_tags != NULL )
	{
	  fprintf( ofile, "    %s(%g): ", unmap_tag(node->source_tag->tag), node->total_score );
	  print_tags( ofile, node->transform_tags, NULL );
	  fprintf( ofile, "\n" );
	}

      node = node->next;
    }
}

/*-----------------------------------------------------------------------------
    print_special_cuts

    Write each special cut with its total score and special tags to a
    file.

    The first 'size' elements of the key array are visited in order;
    entries with no special tags are passed over.
-----------------------------------------------------------------------------*/

void print_special_cuts( FILE *ofile, CutList start )
{
  int n;

  for ( n = 0; n < start.size; n++ )
    {
      TagCut entry = start.key[n];

      if ( entry->special_tags == NULL )
	continue;

      fprintf(ofile, "%s(%g):\n", entry->cut, entry->special_total_score);
      print_tags( ofile, entry->special_tags, NULL );
      fprintf( ofile, "\n" );
    }
}

/*-----------------------------------------------------------------------------
    print_cuts

    Write each cut and its integrated transforms to a file.

    The first 'size' elements of the key array are visited in order;
    entries with no integrated transform list are passed over.
-----------------------------------------------------------------------------*/

void print_cuts( FILE *ofile, CutList start )
{
  int n;

  for ( n = 0; n < start.size; n++ )
    {
      TagCut entry = start.key[n];

      if ( entry->integrated_transform_list == NULL )
	continue;

      fprintf(ofile, "%s:\n", entry->cut);
      print_transforms( ofile, entry->integrated_transform_list );
      fprintf( ofile, "\n" );
    }
}

/*-----------------------------------------------------------------------------
    print_words

    Write the text of every word in a chain to a file, separated by
    spaces and finished with a newline. The newline is written even when
    the chain is empty.
-----------------------------------------------------------------------------*/

void print_words( FILE *ofile, IndexWord start )
{
  IndexWord node = start;

  while ( node != NULL )
    {
      fprintf( ofile, "%s ", node->word->text );
      node = node->next;
    }

  fprintf( ofile, "\n" );
}

/*-----------------------------------------------------------------------------
    print_indexes

    Write each index prefix, its word count and its words to a file.

    The first 'size' elements of the key array are visited in order;
    entries with an empty word chain are passed over.
-----------------------------------------------------------------------------*/

void print_indexes( FILE *ofile, IndexList start )
{
  int n;

  for ( n = 0; n < start.size; n++ )
    {
      Index entry = start.key[n];

      if ( entry->wordlist_start == NULL )
	continue;

      fprintf(ofile, "%s(%d):\n", entry->prefix, entry->wordnum);
      print_words( ofile, entry->wordlist_start );
      fprintf( ofile, "\n" );
    }
}

/*
==============================================================================
Feature affix gathering functions.
*/

/*----------------------------------------------------------------------------
    add_affix

    Extract an affix from a word and record a tag score list against it.

    The affix is taken from text+testpos, using get_variable_suffix when
    mode is VARIABLE_SUFFIX and get_affix (with suffixlen) otherwise. When
    set_modifier yields a non-empty modifier for the capital flags, the
    affix is looked up with the modifier appended. The tag score list is
    then added to the entry's vanilla list via add_transforms.

    Returns the result of the affix extraction; when that is false nothing
    is added. Exits through error_exit1 if no entry could be obtained.
----------------------------------------------------------------------------*/

BOOL add_affix( BOOL true_capital, BOOL pseudo_capital, BOOL mixed_capital, int mode, AffList *afflist, int suffixlen, uchar *text, int testpos, TagScore tagscore )
{
  uchar   modifier[MAX_MODIFIER_LEN];
  uchar   *affix = NULL;
  TagAff  entry;
  BOOL    got_affix;

  if ( mode != VARIABLE_SUFFIX )
    {
      got_affix = get_affix( mode, (text+testpos), &affix, suffixlen, NULL );
    }
  else
    {
      got_affix = get_variable_suffix( (text+testpos), &affix );
    }

  /* No affix: nothing to record */
  if ( !got_affix )
    return got_affix;

  set_modifier( modifier, true_capital, pseudo_capital, mixed_capital );

  if ( modifier[0] == '\0' )
    {
      /* Add the affix to the affix list (or find its hash) */
      entry = find_affix( afflist, affix );
    }
  else
    {
      /* Add modifier to the affix: ! indicates a capital, # indicates a pseudo capital. */
      uchar *with_modifier = add_chars( affix, modifier );

      entry = find_affix( afflist, with_modifier );
      free( with_modifier );
    }

  if ( entry == NULL )
    error_exit1("Out of memory creating affix: %s\n", affix);

  /* Add the vanilla tagscore list to the affix record */
  add_transforms( &(entry->vanilla_tagscore_list), NULL, tagscore );

  free( affix );

  return got_affix;
}

/*
==============================================================================
Feature index gathering functions.
*/

/*----------------------------------------------------------------------------
    add_word_indexes

    Append a new node for a dictionary word to an index word chain.

    word	dictionary word the node refers to
    start	head of the chain; set to the new node when the chain is
		empty (that is, when *end is NULL)
    end		tail of the chain; always updated to the new node

    The node's length is the string length of the word text from the
    position that is_initial stores in its second argument.
----------------------------------------------------------------------------*/

void add_word_indexes( DictWord word, IndexWord *start, IndexWord *end )
{
  IndexWord node;
  int       offset;

  /* Create new entry */
  Allocate(node, sizeof(IndexWordSt)*sizeof(uchar), "add word indexes chain");
  node->word = word;
  is_initial( word->text, &offset );
  node->length = (int)strlen((char *)(word->text+offset));
  node->next = NULL;

  /* Link it in: as the head of an empty chain, else after the tail */
  if ( *end != NULL )
    {
      (*end)->next = node;
    }
  else
    {
      *start = node;
    }

  *end = node;
}

/*----------------------------------------------------------------------------
    add_index

    Record a dictionary word under its prefix in an index list.

    The prefix is obtained from word->text+testpos with get_affix in
    PREFIX mode, using minlen. When downcase_prefix is set the lookup key
    is the downcased prefix. The entry's word count is incremented and the
    word is appended to the entry's word chain.

    Returns the result of the prefix extraction; when that is false
    nothing is added. Exits through error_exit1 if no entry could be
    obtained.
----------------------------------------------------------------------------*/

BOOL add_index( BOOL downcase_prefix, DictWord word, int testpos, int minlen, IndexList *indexlist )
{
  Index   entry;
  uchar   *prefix;
  BOOL    got_prefix = TRUE;

  got_prefix = get_affix( PREFIX, (word->text+testpos), &prefix, minlen, NULL );

  /* No prefix: nothing to record */
  if ( !got_prefix )
    return got_prefix;

  /* Add the index to the index list (or find its hash) */
  if ( !downcase_prefix )
    {
      entry = find_index( indexlist, prefix );
    }
  else
    {
      uchar *lowered = downcase( prefix );

      entry = find_index( indexlist, lowered );
      free( lowered );
    }

  if ( entry == NULL )
    error_exit1("Out of memory creating index prefix: %s\n", prefix);

  (entry->wordnum)++;

  /* Add the word indexes to the index record */
  add_word_indexes( word, &(entry->wordlist_start), &(entry->wordlist_end) );

  free( prefix );

  return got_prefix;
}

/*
==============================================================================
Cut gathering functions.
*/

/*-----------------------------------------------------------------------------
    make_special_cuts

    Perform cuts on the specified word for the special affix mode.

    get_special_cut is tried for every value from MinTestLen upwards for
    which at least MinTestLen characters of text+testpos remain. Whenever
    it yields a suffix word, that suffix is added to the special cut list
    with the tags of 'd'. When it yields a prefix word as well, the prefix
    text preceded by "-" is also added, with the suffix word's tags as
    source.
-----------------------------------------------------------------------------*/

void make_special_cuts( BOOL true_capital, BOOL pseudo_capital, BOOL mixed_capital, DictWord d, Dict *dict, uchar *text, int testpos, CutList *special_cut_list )
{
  DictWord prefix_dictword = NULL, suffix_dictword = NULL;
  int      textlen = (int)strlen((char *)(text+testpos));
  int      k;

  for ( k = MinTestLen; (textlen-k >= MinTestLen); k++ )
    {
      get_special_cut( dict, k, text, testpos, &prefix_dictword, &suffix_dictword );

      /* Without a suffix word there is neither a partial nor a compound word */
      if ( suffix_dictword == NULL )
	continue;

      /* Add the suffix to the special suffix cut list */
      add_cut( true_capital, pseudo_capital, mixed_capital, suffix_dictword->text, special_cut_list, NULL, d->unktag );

      if ( prefix_dictword != NULL )
	{
	  /* Compound word: add the prefix to the prefix cut list */
	  uchar *dashed_prefix = add_chars( "-", prefix_dictword->text );

	  add_cut( true_capital, NO_CAPITAL, mixed_capital, dashed_prefix, special_cut_list, suffix_dictword->unktag, d->unktag );

	  free( dashed_prefix );
	}
    }
}

/*-----------------------------------------------------------------------------
    make_cuts

    Perform cuts on the specified word.

    First a prefix cut is requested from get_cut, bounded by MinPrefixLen
    and the maxprefcut value of the global feature structure. If it comes
    with a dictionary word, the cut - and the smart cut derived from it by
    get_smart_cut, if any - is added to the cut list with the tags of 'd'
    as source and those of the dictionary word as target.

    Then, only when is_allalpha accepts text+testpos, root cuts are
    requested from get_root_cut with the same count n = 1, 2, ... passed
    for both of its count arguments, for as long as removing 2n characters
    leaves at least MinRootLen.
-----------------------------------------------------------------------------*/

void make_cuts( BOOL true_capital, BOOL pseudo_capital, BOOL mixed_capital, DictWord d, Dict *dict, uchar *text, int testpos, IndexList indexlist, CutList *cut_list )
{
  DictWord  found_word = NULL;
  uchar     *prefix_cut = NULL, *smart_cut = NULL;
  int       n, textlen;

/* Perform prefix cuts. */
  prefix_cut = get_cut( PREFIX, indexlist, dict, text, testpos, MinPrefixLen, features->maxprefcut, &found_word );
  if ( prefix_cut != NULL )
    {
      if ( found_word != NULL )
	{
	  add_cut( true_capital, pseudo_capital, mixed_capital, prefix_cut, cut_list, d->unktag, found_word->unktag );

	  smart_cut = get_smart_cut( found_word->text, prefix_cut );
	  if ( smart_cut != NULL )
	    {
	      add_cut( true_capital, pseudo_capital, mixed_capital, smart_cut, cut_list, d->unktag, found_word->unktag );

	      free( smart_cut );
	    }
	}

      free( prefix_cut );
    }

  /* Root cuts are only attempted when is_allalpha accepts the text */
  if ( !is_allalpha( (text+testpos) ) )
    return;

  /* Perform root cuts. */
  textlen = (int)strlen((char *)(text+testpos));

  for ( n = 1; (textlen-(n+n) >= MinRootLen); n++ )
    {
      uchar *root_cut = get_root_cut( dict, n, n, text, testpos, &found_word );

      if ( root_cut == NULL )
	continue;

      if ( found_word != NULL )
	{
	  add_cut( true_capital, pseudo_capital, mixed_capital, root_cut, cut_list, d->unktag, found_word->unktag );
	}

      free( root_cut );
    }
}

/*-----------------------------------------------------------------------------
    process_cut

    Compare a scanned index word with a base index word and record the cut,
    if any, that relates them.

    Both texts are replaced by their compressed form when compress_word
    supplies one, and both are compared from the position reported by
    is_initial.

    Suffix cut: the scanned text is longer than the base text by between 1
    and features->maxsuffcut characters and contains the base text.  The
    trailing characters are added to container_cut_list and cut_list, and a
    smart variant (get_smart_cut) is added to cut_list when available.

    Replacement cut: only tried when no suffix cut was found and
    replacement_cut_list is not NULL.  The two texts are scanned from offset
    MinTestLen to the first differing character; the two remaining endings
    are combined with add_chars in both orders and added to
    replacement_cut_list.

    Returns TRUE when either kind of cut was identified, FALSE otherwise.
-----------------------------------------------------------------------------*/

BOOL process_cut( BOOL true_capital, BOOL pseudo_capital, BOOL mixed_capital, IndexWord scan_word, IndexWord base_word, CutList *cut_list, CutList *container_cut_list, CutList *replacement_cut_list )
{
  BOOL  cut_found = FALSE;
  int   i, cutlen;
  int   scan_testpos, base_testpos;
  int   scan_bodylen, base_bodylen;
  uchar *cut = NULL, *reverse_cut = NULL;
  uchar *scan_text = (scan_word->word)->text, *base_text = (base_word->word)->text;
  uchar *scan_body, *base_body;
  uchar *comptext1, *comptext2;

  /* Work on compressed copies where compress_word provides them. */
  comptext1 = compress_word( scan_text );
  if ( comptext1 != NULL )
    {
      scan_text = comptext1;
    }

  comptext2 = compress_word( base_text );
  if ( comptext2 != NULL )
    {
      base_text = comptext2;
    }

  is_initial( base_text, &base_testpos );
  is_initial( scan_text, &scan_testpos );

  scan_body    = scan_text + scan_testpos;
  base_body    = base_text + base_testpos;
  scan_bodylen = (int)strlen( (char *)scan_body );
  base_bodylen = (int)strlen( (char *)base_body );

  cutlen = scan_bodylen - base_bodylen;
  if ( (cutlen > 0) && (cutlen <= features->maxsuffcut) )
    {
      cut_found = (strstr( (char *)scan_body, (char *)base_body ) != NULL);
    }

  if ( cut_found )
    {
      /* Get the trailing cutlen characters of the scanned word. */
      if ( get_affix( SUFFIX, scan_body, &cut, cutlen, NULL ) )
        {
          uchar *smart_cut = NULL;

          /* Add cut to the container cut list. */
          add_cut( true_capital, pseudo_capital, mixed_capital, cut, container_cut_list, (base_word->word)->unktag, (scan_word->word)->unktag );

          /* Add cut to the cut list. */
          add_cut( true_capital, pseudo_capital, mixed_capital, cut, cut_list, (scan_word->word)->unktag, (base_word->word)->unktag );

          if ( (smart_cut = get_smart_cut( base_body, cut )) != NULL )
            {
              /* Add the smart cut to the cut list as well. */
              add_cut( true_capital, pseudo_capital, mixed_capital, smart_cut, cut_list, (scan_word->word)->unktag, (base_word->word)->unktag );

              free( smart_cut );
            }

          free( cut );
        }
    }
  else if ( replacement_cut_list != NULL )
    {
      BOOL  comparison_valid = TRUE;
      uchar *base_suffix, *scan_suffix;
      int   scanlen, baselen;

      i = MinTestLen;

      if ( (scan_bodylen < MinTestLen) || (base_bodylen < MinTestLen) )
        {
          /* Offset MinTestLen lies beyond the terminator of at least one
             string, so there is nothing that can be compared safely. */
          comparison_valid = FALSE;
        }
      else
        {
          /* Advance to the first differing character.  Reaching the
             terminator while the characters still match means the two
             tails are identical, which is not a replacement. */
          for ( ; scan_body[i] == base_body[i]; i++ )
            {
              if ( scan_body[i] == '\0' )
                {
                  comparison_valid = FALSE;
                  break;
                }
            }
        }

      scanlen = (scan_word->length-i);
      baselen = (base_word->length-i);

      if ( (scanlen <= 0) || (baselen <= 0) )
        {
          comparison_valid = FALSE;
        }

      if ( (scanlen <= features->maxsuffcut) && (baselen <= features->maxsuffcut) && comparison_valid && (i >= MinCutLen) )
        {
          cut_found = TRUE;

          if ( get_affix( SUFFIX, base_body, &base_suffix, baselen, NULL ) )
            {
              if ( get_affix( SUFFIX, scan_body, &scan_suffix, scanlen, NULL ) )
                {
                  /* Scanned ending combined with base ending ... */
                  cut = add_chars( scan_suffix, base_suffix );
                  add_cut( true_capital, pseudo_capital, mixed_capital, cut, replacement_cut_list, (base_word->word)->unktag, (scan_word->word)->unktag );
                  free( cut );

                  /* ... and the reverse combination. */
                  reverse_cut = add_chars( base_suffix, scan_suffix );
                  add_cut( true_capital, pseudo_capital, mixed_capital, reverse_cut, replacement_cut_list, (scan_word->word)->unktag, (base_word->word)->unktag );
                  free( reverse_cut );

                  free( scan_suffix );
                }

              free( base_suffix );
            }
        }
    }

  /* Release the compressed copies; free(NULL) is a no-op. */
  free( comptext1 );
  free( comptext2 );

  return cut_found;
}

/*-----------------------------------------------------------------------------
    compare_words

    Run process_cut (with no capitalisation flags) for base_word against
    each word of word_list in turn.  The walk ends at the end of the list,
    at the word currently held in *stop_word, or at the first word for which
    process_cut finds no cut; in the last case that word is stored in
    *stop_word.
-----------------------------------------------------------------------------*/

void compare_words( IndexWord base_word, IndexWord word_list, IndexWord *stop_word, CutList *cut_list, CutList *container_cut_list, CutList *replacement_cut_list )
{
  IndexWord candidate = word_list;

  while ( (candidate != NULL) && (candidate != *stop_word) )
    {
      if ( !process_cut( NO_CAPITAL, NO_CAPITAL, NO_CAPITAL, candidate, base_word, cut_list, container_cut_list, replacement_cut_list ) )
        {
          /* Remember where the comparison stopped yielding cuts. */
          *stop_word = candidate;
          return;
        }

      candidate = candidate->next;
    }
}

/*-----------------------------------------------------------------------------
    compare_capital_words

    Compare base_word with each word of word_list, taking capitalisation
    into account, and gather cuts through process_cut.

    A word counts as capital when its tag chain contains the tag stored in
    features->max_capital; if no such tag has been recorded (max_capital is
    NULL) no word is treated as capital.  A scanned word is skipped when it
    differs from the base word only by the sentence-initial marker, or when
    exactly one of the two words is capital.

    The walk ends at the end of the list or at *stop_word.  When process_cut
    finds no cut, the walk also ends if neither word carries an initial
    marker (the scanned word is then stored in *stop_word), or if only the
    scanned word lacks the marker and it sorts after the base text.
-----------------------------------------------------------------------------*/

void compare_capital_words( IndexWord base_word, IndexWord word_list, IndexWord *stop_word, CutList *cut_list, CutList *container_cut_list, CutList *replacement_cut_list )
{
  BOOL      base_capital = FALSE, base_mixed_capital = FALSE;
  BOOL      capital_tag_known = (features->max_capital != NULL);
  IndexWord scan_word;
  int       base_testpos;
  uchar     *base_text = (base_word->word)->text;
  uchar     *base_comptext = compress_word( base_text );

  if ( base_comptext != NULL )
    {
      base_text = base_comptext;
    }

  is_initial( base_text, &base_testpos );

  /* max_capital may be NULL, so it must not be dereferenced blindly. */
  if ( capital_tag_known )
    {
      base_capital = (search_chain( (base_word->word)->tag, features->max_capital->tag ) != 0);
    }

  if ( !base_capital )
    {
      base_mixed_capital = (contains_capitals( (base_text+base_testpos) ) && !contains_numbers( (base_text+base_testpos) ) && (strpbrk( (char *)(base_text+base_testpos), SpecialChars ) != NULL));
    }

  for ( scan_word = word_list; (scan_word != NULL) && (scan_word != *stop_word); scan_word = scan_word->next )
    {
      BOOL  skip = FALSE, finished = FALSE;
      BOOL  scan_capital = FALSE, scan_mixed_capital = FALSE;
      int   scan_testpos;
      uchar *scan_text = (scan_word->word)->text;
      uchar *scan_comptext = compress_word( scan_text );

      if ( scan_comptext != NULL )
        {
          scan_text = scan_comptext;
        }

      is_initial( scan_text, &scan_testpos );

      if ( capital_tag_known )
        {
          scan_capital = (search_chain( (scan_word->word)->tag, features->max_capital->tag ) != 0);
        }

      if ( !(scan_capital || base_mixed_capital) )
        {
          scan_mixed_capital = (contains_capitals( (scan_text+scan_testpos) ) && !contains_numbers( (scan_text+scan_testpos) ) && (strpbrk( (char *)(scan_text+scan_testpos), SpecialChars ) != NULL));
        }

      /* Same word, once with and once without the initial marker. */
      if ( (base_testpos && !scan_testpos) || (!base_testpos && scan_testpos) )
        {
          skip = (strcmp( (char *)(scan_text+scan_testpos), (char *)(base_text+base_testpos) ) == 0);
        }

      /* Capital and non-capital words are never compared. */
      if ( !skip )
        {
          skip = ( (scan_capital && !base_capital) || (!scan_capital && base_capital) );
        }

      if ( !skip )
        {
          if ( !process_cut( base_capital, NO_CAPITAL, (base_mixed_capital || scan_mixed_capital), scan_word, base_word, cut_list, container_cut_list, replacement_cut_list ) )
            {
              if ( !scan_testpos && !base_testpos )
                {
                  *stop_word = scan_word;
                  finished = TRUE;
                }
              else if ( !scan_testpos )
                {
                  if ( (strcmp( (char *)scan_text, (char *)(base_text+base_testpos) ) > 0) )
                    {
                      finished = TRUE;
                    }
                }
            }
        }

      /* The compressed copy belongs to this iteration only; release it
         before moving on so that no buffer is lost when the next word is
         compressed.  free(NULL) is a no-op. */
      free( scan_comptext );

      if ( finished )
        {
          break;
        }
    }

  free( base_comptext );
}

/*-----------------------------------------------------------------------------
    make_other_cuts

    Gather cuts by comparing the words that share an index entry.

    For every entry of indexlist holding more than one word, each word in
    the entry's word list is compared with the words that follow it.  The
    first MinTestLen characters of the entry's prefix decide which comparison
    is used: compare_capital_words when they form a capitalised or all-caps
    string, compare_words otherwise.
-----------------------------------------------------------------------------*/

void make_other_cuts( IndexList indexlist, CutList *cut_list, CutList *container_cut_list, CutList *replacement_cut_list )
{
  int slot;

  for ( slot = 0; slot < indexlist.size; slot++ )
    {
      Index     entry = indexlist.key[slot];
      IndexWord base, stop_word = NULL;
      BOOL      capital_list;
      uchar     *leading = string_dup( entry->prefix );

      /* Classify the entry from a truncated copy of its prefix. */
      leading[MinTestLen] = '\0';
      capital_list = ( is_capital( leading ) || is_allcaps( leading ) ) ? TRUE : FALSE;
      free( leading );

      /* A single word has nothing to be compared with. */
      if ( (entry->wordnum <= 1) || (entry->wordlist_start == NULL) )
        {
          continue;
        }

      for ( base = entry->wordlist_start; base != NULL; base = base->next )
        {
          /* Once the walk reaches the recorded stop word, clear it. */
          if ( base == stop_word )
            {
              stop_word = NULL;
            }

          if ( capital_list )
            {
              compare_capital_words( base, base->next, &stop_word, cut_list, container_cut_list, replacement_cut_list );
            }
          else
            {
              compare_words( base, base->next, &stop_word, cut_list, container_cut_list, replacement_cut_list );
            }
        }
    }
}

/*
==============================================================================
Feature analysis functions.
*/

/*-----------------------------------------------------------------------------
    log_feature

    Record the tags in tagscore under the name featname in the feature
    dictionary (features->featdict), but only when feat_exists is true.
    skip_closed is handed on to add_unkword unchanged.
-----------------------------------------------------------------------------*/

void log_feature( BOOL skip_closed, BOOL feat_exists, char *featname, TagScore tagscore )
{
  /* Nothing to log when the feature is absent. */
  if ( !feat_exists )
    {
      return;
    }

  add_unkword( &(features->featdict), featname, skip_closed, tagscore, NULL );
}

/*-----------------------------------------------------------------------------
    read_taggroup

    Read the tag groups in the unknown tag groups file.
-----------------------------------------------------------------------------*/

void read_taggroup( FILE *unktaggroupfile )
{
  TagTag       s;
  static       uchar *term = " \t\n";
  TagScore     next_tag, addtag, tagscore = NULL;

  /* Set up tag group list */
  InitList( features->tag_group_list );

  create_taglist( &(features->tag_group_list), MaxTagNum );
  clear_taglist( &(features->tag_group_list) );

  Allocate(addtag, sizeof(TagScoreSt), "addtag: read_taggroup");

  /* Process the unknown word tag groups file */
  while (!feof(unktaggroupfile))
    {
      uchar buffer[MaxGLine], *token;

      /* Get a line */
      if (fgets(buffer, MaxGLine, unktaggroupfile) == NULL) break;

      /* Get the tag */
      token = strtok(buffer, term);

      free_tagscore_list( &tagscore );
      
      do
	{
	  addtag->tag = map_tag(token);
	  addtag->score = 1;
	  add_chain_tag( ADD, &tagscore, addtag, "read_taggroup1" );
	  if ((token = strtok(NULL, term)) == NULL) break;
	} while (forever);

      for ( next_tag = tagscore; next_tag != NULL; next_tag = next_tag->next )
	{
	  /* Add the tag to the tag list (or find its hash) */
	  s = find_tagtag( &(features->tag_group_list), unmap_tag(next_tag->tag) );
	  if ( s == NULL )
	    error_exit1("Out of memory creating tag: %s\n", unmap_tag(next_tag->tag));
	  s->score = 1;
	  add_chain_tags( ADD, &(s->group), NULL, tagscore, NO_SKIP_CLOSED_TAGS, NULL, "read_taggroup2" );
	}      
    }

  free( addtag );
  free_tagscore_list( &tagscore );
}

/*-----------------------------------------------------------------------------
    unknown_word_handling_initialization

    Reset the unigram totals and set up two empty dictionaries in the
    feature structure: unkdict (sized MAXUNKWORDS) and unigramdict (sized
    tags_max).
-----------------------------------------------------------------------------*/

void unknown_word_handling_initialization( void )
{
    /* Running tag totals start from zero. */
    features->unigram_open_total = 0;
    features->unigramtotal       = 0;

    /* Dictionary of unknown words that have already been processed. */
    InitDict((features->unkdict));
    create_dict( &(features->unkdict), MAXUNKWORDS );
    clear_dict( &(features->unkdict) );

    /* Dictionary of unigram tag probabilities. */
    InitDict((features->unigramdict));
    create_dict( &(features->unigramdict), tags_max );
    clear_dict( &(features->unigramdict) );
}

/*-----------------------------------------------------------------------------
    initialize_features

    Reset the word counters in the feature structure and set up the empty
    feature dictionaries and the three index lists (main, enclosure and
    partial capital).  Each index list is created with the size of dict.
-----------------------------------------------------------------------------*/

void initialize_features( Dict *dict )
{
    int index_capacity = dict->size;

    features->badwordfile_open = FALSE;

    /* All word counters start from zero. */
    features->separator_container_cut_wordnum = 0;
    features->separator_cut_wordnum           = 0;
    features->separator_aff_wordnum           = 0;
    features->separator_wordnum               = 0;
    features->container_cut_wordnum           = 0;
    features->cut_wordnum                     = 0;
    features->aff_wordnum                     = 0;
    features->cap_wordnum                     = 0;
    features->all_wordnum                     = 0;

    /* Dictionary of word features. */
    InitDict((features->featdict));
    create_dict( &(features->featdict), MAXFEATURES );
    clear_dict( &(features->featdict) );

    /* Dictionary of unknown word feature statistics. */
    InitDict((features->unkstatdict));
    create_dict( &(features->unkstatdict), MAXFEATURES );
    clear_dict( &(features->unkstatdict) );

    /* Main index list. */
    InitList( features->indexlist );
    create_indexlist( &(features->indexlist), index_capacity );
    clear_indexlist( &(features->indexlist) );

    /* Enclosure index list. */
    InitList( features->enclosure_indexlist );
    create_indexlist( &(features->enclosure_indexlist), index_capacity );
    clear_indexlist( &(features->enclosure_indexlist) );

    /* Partial capital index list. */
    InitList( features->partialcap_indexlist );
    create_indexlist( &(features->partialcap_indexlist), index_capacity );
    clear_indexlist( &(features->partialcap_indexlist) );
}

/*-----------------------------------------------------------------------------
    initialize_other_features

    Set up the remaining containers of the feature structure: the separator
    and capital dictionaries, the separator index list, the four affix lists
    and the eight cut lists.  Their sizes are derived from the word counters
    and from maxsuffcut held in the feature structure.
-----------------------------------------------------------------------------*/

void initialize_other_features( void )
{
    /* Sizes, using the same arithmetic for each container as before. */
    int cutlist_multiplier = (int)(((float)features->maxsuffcut + 0.5) / 2) + 1;
    int sep_index_size     = (int)(0.25 * (float)(features->separator_wordnum));
    int affix_size         = (int)(1.0 * (float)(features->aff_wordnum));
    int sep_affix_size     = (int)(1.0 * (float)(features->separator_aff_wordnum));
    int scaled_cut_size    = (cutlist_multiplier * features->cut_wordnum);
    int whole_cut_size     = (int)(1.0 * (float)(features->cut_wordnum));
    int half_cut_size      = (int)(0.5 * (float)(features->cut_wordnum));
    int whole_sep_cut_size = (int)(1.0 * (float)(features->separator_cut_wordnum));
    int half_sep_cut_size  = (int)(0.5 * (float)(features->separator_cut_wordnum));

    /* Separator word endings. */
    InitDict((features->sepdict));
    create_dict( &(features->sepdict), (features->separator_wordnum) );
    clear_dict( &(features->sepdict) );

    /* Pure forms of capital words. */
    InitDict((features->capdict));
    create_dict( &(features->capdict), (features->cap_wordnum) );
    clear_dict( &(features->capdict) );

    /* Separator index list. */
    InitList( features->separator_indexlist );
    create_indexlist( &(features->separator_indexlist), sep_index_size );
    clear_indexlist( &(features->separator_indexlist) );

    /* Affix lists. */
    InitList( features->sufflist );
    create_afflist( &(features->sufflist), affix_size );
    clear_afflist( &(features->sufflist) );

    InitList( features->variable_sufflist );
    create_afflist( &(features->variable_sufflist), affix_size );
    clear_afflist( &(features->variable_sufflist) );

    InitList( features->separator_sufflist );
    create_afflist( &(features->separator_sufflist), sep_affix_size );
    clear_afflist( &(features->separator_sufflist) );

    InitList( features->variable_separator_sufflist );
    create_afflist( &(features->variable_separator_sufflist), sep_affix_size );
    clear_afflist( &(features->variable_separator_sufflist) );

    /* Cut lists for ordinary words. */
    InitList( features->cut_list );
    create_cutlist( &(features->cut_list), scaled_cut_size );
    clear_cutlist( &(features->cut_list) );

    InitList( features->container_cut_list );
    create_cutlist( &(features->container_cut_list), whole_cut_size );
    clear_cutlist( &(features->container_cut_list) );

    InitList( features->replacement_cut_list );
    create_cutlist( &(features->replacement_cut_list), scaled_cut_size );
    clear_cutlist( &(features->replacement_cut_list) );

    InitList( features->special_cut_list );
    create_cutlist( &(features->special_cut_list), half_cut_size );
    clear_cutlist( &(features->special_cut_list) );

    /* Cut lists for words containing a separator. */
    InitList( features->sep_cut_list );
    create_cutlist( &(features->sep_cut_list), whole_sep_cut_size );
    clear_cutlist( &(features->sep_cut_list) );

    InitList( features->sep_container_cut_list );
    create_cutlist( &(features->sep_container_cut_list), whole_sep_cut_size );
    clear_cutlist( &(features->sep_container_cut_list) );

    InitList( features->sep_replacement_cut_list );
    create_cutlist( &(features->sep_replacement_cut_list), whole_sep_cut_size );
    clear_cutlist( &(features->sep_replacement_cut_list) );

    InitList( features->sep_special_cut_list );
    create_cutlist( &(features->sep_special_cut_list), half_sep_cut_size );
    clear_cutlist( &(features->sep_special_cut_list) );
}

/*-----------------------------------------------------------------------------
    write_features

    Write the gathered features to features->ofeaturesfile: the feature
    dictionary, the unigram frequencies with their total, and then one
    bracketed section each for the index lists, the cut lists and the affix
    lists.
-----------------------------------------------------------------------------*/

void write_features( void )
{
    FILE *out = features->ofeaturesfile;
    int  n;

    /* Collected features, one entry per line */
    for ( n = 0 ; n < (features->featdict).size ; n++ )
      {
        DictWord entry = (features->featdict).key[n];

        print_tags( out, entry->tag, entry->text );
        fputs( "\n", out );
      }

    /* Unigram frequencies */
    fputs( "[unigram frequencies]\n", out );
    fprintf( out, "Tag Total: %g\n\n", features->unigramtotal );
    for ( n = 0 ; n < (features->unigramdict).size ; n++ )
      {
        DictWord entry = (features->unigramdict).key[n];

        print_tags( out, entry->tag, entry->text );
        fputs( "\n", out );
      }

    /* Index lists */
    fputs( "[indexes]\n", out );
    print_indexes( out, features->indexlist );
    fputs( "\n\n", out );

    fputs( "[enclosure indexes]\n", out );
    print_indexes( out, features->enclosure_indexlist );
    fputs( "\n\n", out );

    fputs( "[partial capital indexes]\n", out );
    print_indexes( out, features->partialcap_indexlist );
    fputs( "\n\n", out );

    fputs( "[separator indexes]\n", out );
    print_indexes( out, features->separator_indexlist );
    fputs( "\n\n", out );

    /* Cut lists for ordinary words */
    fputs( "[cuts]\n", out );
    print_cuts( out, features->cut_list );
    fputs( "\n\n", out );

    fputs( "[container cuts]\n", out );
    print_cuts( out, features->container_cut_list );
    fputs( "\n\n", out );

    fputs( "[replacement cuts]\n", out );
    print_cuts( out, features->replacement_cut_list );
    fputs( "\n\n", out );

    fputs( "[special cuts]\n", out );
    print_special_cuts( out, features->special_cut_list );
    fputs( "\n\n", out );

    /* Cut lists for words containing a separator */
    fputs( "[separator cuts]\n", out );
    print_cuts( out, features->sep_cut_list );
    fputs( "\n\n", out );

    fputs( "[separator container cuts]\n", out );
    print_cuts( out, features->sep_container_cut_list );
    fputs( "\n\n", out );

    fputs( "[separator replacement cuts]\n", out );
    print_cuts( out, features->sep_replacement_cut_list );
    fputs( "\n\n", out );

    fputs( "[separator special cuts]\n", out );
    print_special_cuts( out, features->sep_special_cut_list );
    fputs( "\n\n", out );

    /* Affix lists */
    fputs( "[affixes]\n", out );
    print_affixes( out, features->sufflist );
    fputs( "\n\n", out );

    fputs( "[variable suffixes]\n", out );
    print_affixes( out, features->variable_sufflist );
    fputs( "\n\n", out );

    fputs( "[separator suffixes]\n", out );
    print_affixes( out, features->separator_sufflist );
    fputs( "\n\n", out );

    fputs( "[variable separator suffixes]\n", out );
    print_affixes( out, features->variable_separator_sufflist );
    fputs( "\n\n", out );
}

/*-----------------------------------------------------------------------------
    check_pure_word

    Store the "pure" form of a word containing capitals.

    When the text of d from testpos contains a capital, the filtered tags of
    d are recorded in the capital dictionary (features->capdict) under the
    downcased text.  If needs_compression is set and compress_word yields a
    compressed form of the downcased text, the tags are recorded under that
    form as well.

    NOTE(review): the return value is FALSE on every path because nothing
    ever sets it to TRUE.  It is left that way so that callers see no change;
    confirm whether TRUE was meant to signal that a pure form was stored.
-----------------------------------------------------------------------------*/

BOOL check_pure_word( BOOL needs_compression, DictWord d, uchar *vanilla_text, uchar *down_vanilla_text, int testpos )
{
  BOOL success = FALSE;

  if ( contains_capitals( (vanilla_text+testpos) ) )
    {
      /* Store pure word form(s) in capital dictionary */
      TagScore captags = filter_tags( d->tag );

      add_unkword( &(features->capdict), (down_vanilla_text+testpos), SKIP_CLOSED_TAGS, captags, NULL );

      if ( needs_compression )
        {
          uchar *down_comptext = compress_word( (down_vanilla_text+testpos) );

          /* compress_word can return NULL; only a real compressed form is
             stored. */
          if ( down_comptext != NULL )
            {
              add_unkword( &(features->capdict), down_comptext, SKIP_CLOSED_TAGS, captags, NULL );
              free( down_comptext );
            }
        }

      free_tagscore_list( &captags );
    }

  return success;
}

/*-----------------------------------------------------------------------------
    gather_unigram_freqs

    First run through the lexicon.  For every tag of every word the score is
    added to features->unigramtotal (and to features->unigram_open_total when
    is_closed is false for the tag), and the tag is recorded in
    features->unigramdict under its name.  The unigram dictionary is sorted
    at the end.
-----------------------------------------------------------------------------*/

void gather_unigram_freqs( Dict *dict )
{
  int entry;

  for ( entry = 0 ; entry < dict->size ; entry++ )
    {
      TagScore source = (dict->key[entry])->tag;

      while ( source != NULL )
        {
          TagScore single;

          /* Detached one-element copy of the current tag. */
          Allocate(single, sizeof(TagScoreSt), "tagscore: gather_unigram_freqs");
          single->next  = NULL;
          single->score = source->score;
          single->tag   = source->tag;

          /* Totals over all tags and over the open tags only. */
          features->unigramtotal += single->score;
          if ( !is_closed( source->tag ) )
            {
              features->unigram_open_total += single->score;
            }

          add_unkword( &(features->unigramdict), unmap_tag(source->tag), NO_SKIP_CLOSED_TAGS, single, NULL );

          free( single );

          source = source->next;
        }
    }

  sort_dict( &(features->unigramdict) );
}

/*-----------------------------------------------------------------------------
    gather_initial_freqs

    Second run through the lexicon to gather initial feature frequencies.

    For every dictionary word this
      - snapshots its tags into d->unktag,
      - adds it to the main, partial capital or enclosure index list,
      - counts words for the later sizing of affix, cut, separator and
        capital containers,
      - logs the separator, capitalisation and number-format features that
        apply to it.

    Afterwards the feature dictionaries and index lists are sorted, the
    highest scoring "true capital" tag is stored in features->max_capital
    (NULL when there is none), and the filtered "not capital" tags are
    logged as "not capital - filtered".
-----------------------------------------------------------------------------*/

void gather_initial_freqs( Dict *dict )
{
  DictWord *k = dict->key, feature_entry = NULL, feature_entry1 = NULL;
  BOOL     initial, capital, allcaps, true_capital, valid_length = FALSE;
  int      i, testpos, textlen;
  uchar    *text, *vanilla_text, *body;

  /* Work through the dictionary to get feature frequencies */
  for ( i = 0 ; i < dict->size ; i++, k++ )
    {
      DictWord d = *k;
      BOOL     has_numbers = FALSE, all_numbers = FALSE, has_alpha = FALSE, needs_compression = FALSE, has_separator = FALSE;
      char     *compressable_ptr = strpbrk( (char *)d->text, CompressableChars );
      char     *sepptr = strpbrk( (char *)d->text, SeparatorChars );
      uchar    *repeating_chars = NULL, *new_suffix_word = NULL, *new_prefix_word = NULL;
      uchar    *end_text = NULL;
      int      sepcharnum = 0;

      vanilla_text = d->text;
      text = d->text;

      /* Check for sentence-initial marker */
      /* testpos is starting position in text for comparison. */
      initial = is_initial( vanilla_text, &testpos );
      body = vanilla_text + testpos;
      textlen = (int)strlen((char *)body);
      valid_length = (textlen >= MinTestLen);
      has_numbers = contains_numbers( vanilla_text );
      has_alpha = contains_alpha( vanilla_text );
      all_numbers = is_allnums( body );

      /* end_text is what follows the last occurrence of the first
         separator character found in the word. */
      if ( sepptr != NULL )
        {
          end_text = (uchar *)strrchr( (char *)vanilla_text, *sepptr );
        }

      if ( end_text != NULL )
        {
          end_text++;
        }

      if ( initial )
        {
          features->initials_exist = TRUE;
        }

      /* Check if there is a separator in the text. */
      if ( (has_alpha || has_numbers) && (sepptr != NULL) )
        {
          has_separator = TRUE;
          sepcharnum = contains_this_char( vanilla_text, *sepptr );
        }

      /* Take snapshot of dictionary tags with unadjusted scores */
      add_chain_tags( ADD, &(d->unktag), NULL, d->tag, NO_SKIP_CLOSED_TAGS, NULL, "gather_initial_freqs" );

      /* Singletons */
      log_feature( SKIP_CLOSED_TAGS, (d->ntag == 1 && (d->tag)->score == 1), "singletons", d->tag );

      /* Add word to the index. */
      if ( !has_numbers && has_alpha )
        {
          if ( add_index( NO_DOWNCASE, d, testpos, MinTestLen, &(features->indexlist) ) )
            {
              (features->all_wordnum)++;

              if ( textlen >= MinCutLen )
                {
                  (features->cut_wordnum)++;
                }

              if ( textlen >= MinAffWord )
                {
                  (features->aff_wordnum)++;
                }

              if (  (strchr( MixedCapitalChars, body[0] ) != NULL) && contains_capitals( body ) )
                {
                  add_index( DOWNCASE, d, testpos, MinTestLen, &(features->partialcap_indexlist) );
                }
            }
        }
      else if ( is_enclosure( body ) )
        {
          add_index( NO_DOWNCASE, d, testpos, MinTestLen, &(features->enclosure_indexlist) );
        }

      /* Count all alpha end words for words with separators. */
      if ( (end_text != NULL) && is_allalpha( end_text ) )
        {
          int end_len = (int)strlen( (char *)end_text );

          (features->separator_wordnum)++;

          if ( end_len >= MinCutLen )
            {
              (features->separator_cut_wordnum)++;
            }

          if ( end_len >= MinAffWord )
            {
              (features->separator_aff_wordnum)++;
            }
        }

      /* Check if a separator character is present */
      if ( has_separator )
        {
          /* A separator character is present */
          if ( make_affix_words( vanilla_text, testpos, &new_prefix_word, &new_suffix_word ) )
            {
              BOOL pref_capital, suff_capital, pref_allcaps, suff_allcaps, pref_allnum, suff_allnum, suff_lower, goodsufflen;

              /* Alphanumeric features */
              goodsufflen  = (int)strlen((char *)new_suffix_word) >= MinTestLen;

              pref_allnum  = is_allnums( new_prefix_word );
              suff_allnum  = is_allnums( new_suffix_word );

              log_feature( NO_SKIP_CLOSED_TAGS, (pref_allnum && (!suff_allnum && goodsufflen)), "number-string", d->tag );

              log_feature( NO_SKIP_CLOSED_TAGS, (pref_allnum && suff_allnum), "number-number", d->tag );

              /* The capitalisation patterns are only logged for words with
                 exactly one separator character. */
              if ( sepcharnum == 1 )
                {
                  pref_capital = is_capital( new_prefix_word );
                  suff_capital = is_capital( new_suffix_word );

                  pref_allcaps = is_allcaps( new_prefix_word );
                  suff_allcaps = is_allcaps( new_suffix_word );

                  suff_lower   = (is_allalpha( new_suffix_word ) && !is_allcaps( new_suffix_word ) && !is_capital( new_suffix_word ));

                  log_feature( NO_SKIP_CLOSED_TAGS, (pref_capital && suff_capital), "capital-capital", d->tag );

                  log_feature( NO_SKIP_CLOSED_TAGS, (pref_allcaps && suff_allcaps), "allcaps-allcaps", d->tag );

                  log_feature( NO_SKIP_CLOSED_TAGS, (pref_allcaps && suff_lower), "allcaps-lowercase", d->tag );

                  log_feature( NO_SKIP_CLOSED_TAGS, (pref_allcaps && suff_allnum), "allcaps-numbers", d->tag );
                }
            }
        }

      /* Capitalization */
      if ( valid_length )
        {
          if ( (has_alpha || has_numbers) && (compressable_ptr != NULL) )
            {
              /* Text may need compression.  The result goes through a
                 temporary so that text keeps pointing at the original word
                 when compress_word returns NULL. */
              uchar *comptext = compress_word( vanilla_text );

              if ( comptext != NULL )
                {
                  needs_compression = TRUE;
                  text = comptext;
                }
            }

          /* Count mixed words containing capitals. */
          if ( contains_capitals( body ) )
            {
              (features->cap_wordnum)++;

              if ( needs_compression )
                {
                  (features->cap_wordnum)++;
                }
            }
        }

      capital         = is_capital( (text+testpos) );
      allcaps         = is_allcaps( (text+testpos) );
      true_capital    = (capital && !initial);

      log_feature( SKIP_CLOSED_TAGS, true_capital, "true capital", d->tag );
      log_feature( SKIP_CLOSED_TAGS, (!(capital || allcaps) && is_allalpha( (text+testpos) )), "not capital", d->tag );

      /* Number and format features; only the first that applies is logged. */
      if ( all_numbers )
        {
          /* All numeric characters */
          log_feature( NO_SKIP_CLOSED_TAGS, TRUE, "contains numbers only", d->tag );
        }
      else if ( (repeating_chars = contains_repeating_consecutives( body )) != NULL )
        {
          /* Repeating consecutive characters */
          log_feature( NO_SKIP_CLOSED_TAGS, TRUE, (char *)repeating_chars, d->tag );
          free ( repeating_chars );
        }
      else if ( check_time_format( body ) )
        {
          /* Time format */
          log_feature( NO_SKIP_CLOSED_TAGS, TRUE, "time format", d->tag );
        }
      else if ( check_currency_format( body, 2, 2 ) )
        {
          /* Currency format */
          log_feature( NO_SKIP_CLOSED_TAGS, TRUE, "currency format", d->tag );
        }
      else if ( check_ordinal( body, OrdinalSuffix ) )
        {
          /* Ordinal number */
          log_feature( NO_SKIP_CLOSED_TAGS, TRUE, "ordinal number", d->tag );
        }
      else if ( check_cardinal( body ) )
        {
          /* Cardinal number */
          log_feature( NO_SKIP_CLOSED_TAGS, TRUE, "cardinal number", d->tag );
        }

      /* Release the per-word work strings; free(NULL) is a no-op. */
      free( new_suffix_word );
      free( new_prefix_word );

      if ( needs_compression )
        {
          free( text );
        }
    } /* End of working through the dictionary */

  /* Sort the feature dictionaries */
  sort_dict( &(features->featdict) );
  sort_dict( &(features->unigramdict) );

  /* Sort the index lists */
  sort_indexlist( &(features->indexlist) );
  sort_indexlist( &(features->enclosure_indexlist) );
  sort_indexlist( &(features->partialcap_indexlist) );

  /* Save the capital tag which has the maximum score to determine if a word is a "true" capital. */
  features->max_capital = NULL;
  feature_entry = search_unkdict( &(features->featdict), "true capital" );
  if ( feature_entry != NULL )
    {
      features->max_capital = get_max_tag( feature_entry->tag );
      if ( features->max_capital != NULL )
        {
          features->max_capital->next = NULL;
        }
    }

  /* Filter the non-capital tags to set the initial tag hypotheses for lowercase unknown words. */
  feature_entry1 = search_unkdict( &(features->featdict), "not capital" );
  if ( feature_entry1 != NULL )
    {
      TagScore noncaptags;

      noncaptags = filter_tags( feature_entry1->tag );
      log_feature( SKIP_CLOSED_TAGS, TRUE, "not capital - filtered", noncaptags );

      free_tagscore_list( &noncaptags );
    }
}

/*-----------------------------------------------------------------------------
    gather_other_freqs

    Using the initial feature frequencies, go through the lexicon again to get
    other feature frequencies.

    For every entry of 'dict' this:
      - logs capitalisation and word-length features;
      - adds the end word of a separated word to features->sepdict;
      - logs compression-character and special-character features;
      - adds suffixes to the affix lists;
      - makes the ordinary, special and separator cuts.
    After the scan the feature dictionaries are sorted, the separator index
    is built from features->sepdict, and the affix and cut lists are sorted.

    dict    The lexicon to scan. Results are accumulated in the global
            'features' structure; nothing is returned.
-----------------------------------------------------------------------------*/

void gather_other_freqs( Dict *dict )
{
    BOOL     initial, capital, allcaps, initial_capital, true_capital, valid_length = FALSE;
    DictWord *k = dict->key;
    int      i, testpos, textlen;
    uchar    *comptext = NULL, *text, *vanilla_text;

    /* Work through the dictionary to get frequencies for selected features */
    for ( i = 0 ; i < dict->size ; i++, k++ )
      {
        DictWord d = *k;
        BOOL     all_alpha, all_numbers, has_numbers = contains_numbers( d->text ), has_alpha = contains_alpha( d->text ), needs_compression = FALSE, has_separator = FALSE, pseudo_capital = FALSE, has_special_char = FALSE;
        BOOL     use_separator_suffix = FALSE, separator_suffix_capital = FALSE;
        BOOL     mixed_capital = FALSE;
        char     *compressable_ptr = strpbrk( (char *)d->text, CompressableChars );
        char     *sepptr = strpbrk( (char *)d->text, SeparatorChars );
        char     *specptr;
        uchar    *new_suffix_word = NULL, *new_prefix_word = NULL, *down_vanilla_text = NULL, *down_text = NULL;
        uchar    *end_text = NULL;
        int      sepcharnum = 0;

        vanilla_text = d->text;
        text = d->text;

        /* Check for sentence-initial marker */
        /* testpos is starting position in text for comparison. */
        initial = is_initial( vanilla_text, &testpos );
        textlen = (int)strlen((char *)(vanilla_text+testpos));
        valid_length = (textlen >= MinTestLen);
        all_alpha = is_allalpha( (vanilla_text+testpos) );
        all_numbers = is_allnums( (vanilla_text+testpos) );
        specptr = strpbrk( (char *)(vanilla_text+testpos), SpecialChars );
        has_special_char = (specptr != NULL);

        /* Locate the text following the last occurrence of the first
           separator character found in the word. */
        if ( sepptr != NULL )
          {
            end_text = (uchar *)strrchr( (char *)vanilla_text, *sepptr );
          }

        if ( end_text != NULL )
          {
            /* Step over the separator itself. */
            end_text++;
          }

        /* Get a compressed version of the text. */
        if ( (valid_length && (has_alpha || has_numbers) && (compressable_ptr != NULL)) )
          {
            needs_compression = ((comptext = compress_word( vanilla_text )) != NULL);
          }

        if ( needs_compression )
          {
            text = comptext;
          }

        /* Check if there is a separator in the text. */
        if ( (has_alpha || has_numbers) && (sepptr != NULL) )
          {
            has_separator = TRUE;
            sepcharnum = contains_this_char( vanilla_text, *sepptr );
          }

        /* Get a downcased version of the text. */
        down_vanilla_text = downcase( vanilla_text );
        down_text = downcase( text );

        /* Capitalization */
        /* See if word is a "true" capital, i.e., if it contains the max capital tag.
           features->max_capital stays NULL when the lexicon produced no
           "true capital" feature, so it must be checked before use. */
        true_capital = (features->max_capital != NULL)
                       && (search_chain( d->tag, features->max_capital->tag ) != 0);
        allcaps         = is_allcaps( (text+testpos) ) && true_capital;
        capital         = is_capital( (text+testpos) );
        initial_capital = (initial && capital) && true_capital;
        pseudo_capital = ((capital && !initial) && !true_capital);
        mixed_capital = (contains_capitals( (text+testpos) ) && !has_numbers && (strpbrk( (char *)(vanilla_text+testpos), MixedCapitalChars ) != NULL));

        if ( valid_length )
          {
            log_feature( SKIP_CLOSED_TAGS, (initial_capital), "initial capital", d->tag );
            log_feature( SKIP_CLOSED_TAGS, (allcaps), "all capitals", d->tag );
            log_feature( SKIP_CLOSED_TAGS, (initial && allcaps), "initial all capitals", d->tag );

            check_pure_word( needs_compression, d, vanilla_text, down_vanilla_text, testpos );
          }

        if ( all_alpha )
          {
            /* All alpha characters. */
            /* Length features: only words of 1..MinTestLen letters get one,
               so the name buffer is only allocated for those. */
            int m = (int)strlen( (char *)(text+testpos) );

            if ( (m >= 1) && (m <= MinTestLen) )
              {
                uchar *msg = NULL;

                Allocate(msg, MAXFEATNAME*sizeof(uchar), "msg: gather_other_freqs");

                sprintf( (char *)msg, "%d letters (capital)", m );
                log_feature( SKIP_CLOSED_TAGS, (true_capital), msg, d->tag );
                sprintf( (char *)msg, "%d letters (mixed capital)", m );
                log_feature( SKIP_CLOSED_TAGS, (mixed_capital), msg, d->tag );
                sprintf( (char *)msg, "%d letters (initial capital)", m );
                log_feature( SKIP_CLOSED_TAGS, (initial_capital), msg, d->tag );
                sprintf( (char *)msg, "%d letters (all capital)", m );
                log_feature( SKIP_CLOSED_TAGS, (allcaps), msg, d->tag );
                sprintf( (char *)msg, "%d letters", m );
                log_feature( NO_SKIP_CLOSED_TAGS, (!(true_capital || initial_capital) && !allcaps), msg, d->tag );

                free( msg );
              }
          }
        else if ( has_separator )
          {
            /* A separator character is present */
            if ( make_affix_words( vanilla_text, testpos, &new_prefix_word, &new_suffix_word ) )
              {
                /* Separator Suffixes */
                if ( end_text != NULL )
                  {
                    BOOL suff_alpha   = is_allalpha( end_text );
                    BOOL goodsepsufflen = ((int)strlen((char *)end_text) >= MinAffWord);

                    if ( suff_alpha && goodsepsufflen )
                      {
                        use_separator_suffix = TRUE;
                        separator_suffix_capital = is_capital( end_text ) || is_allcaps( end_text );
                      }
                  }
              }
          }

        /* Add end words of words with separators to separator dictionary. */
        if ( end_text != NULL )
          {
            if ( is_allalpha( end_text ) )
              {
                TagScore septags = NULL;

                septags = filter_tags( d->tag );
                add_unkword( &(features->sepdict), end_text, NO_SKIP_CLOSED_TAGS, septags, NULL );
                free_tagscore_list( &septags );
              }
          }

        if ( valid_length && has_alpha && !all_alpha) /* Alphanumeric & special char mixture */
          {
            uchar  *featname;

            if ( needs_compression )
              {
                /* Compression character features.
                   sepptr comes from SeparatorChars whereas needs_compression
                   depends on CompressableChars, so sepptr may be NULL here;
                   there is no separator character to name the feature after
                   in that case. */
                if ( sepptr != NULL )
                  {
                    uchar      *sepchar, *msg = NULL;
                    const char *count_note  = (sepcharnum > 1) ? " (>1)" : "";
                    const char *number_note = has_numbers ? " [contains numbers]" : "";

                    /* One-character string holding the separator. */
                    sepchar = string_dup( sepptr );
                    sepchar[1] = '\0';

                    Allocate(msg, MAXFEATNAME*sizeof(uchar), "msg: gather_other_freqs");

                    if ( allcaps )
                      {
                        featname = add_chars( sepchar, "all capitals " );
                      }
                    else if ( true_capital )
                      {
                        featname = add_chars( sepchar, "true capital " );
                      }
                    else if ( mixed_capital )
                      {
                        featname = add_chars( sepchar, "mixed capital " );
                      }
                    else if ( features->initials_exist && initial_capital )
                      {
                        featname = add_chars( sepchar, "initial capital " );
                      }
                    else
                      {
                        featname = add_chars( sepchar, "plain " );
                      }

                    /* Always writes msg, so it can never be logged
                       uninitialised. */
                    sprintf( (char *)msg, "%s%s%s", featname, count_note, number_note );

                    log_feature( NO_SKIP_CLOSED_TAGS, TRUE, msg, d->tag );

                    free( featname );
                    free( sepchar );
                    free( msg );
                  }
              }
            else if ( !has_numbers )
              {
                BOOL  char_found = FALSE;
                char  *specgroup = SpecialChars;
                int   j;
                uchar *specchar = NULL;

                /* Special character features. */
                /* Use the first character of SpecialChars present in the word. */
                for ( j = 0; (specgroup[j] != '\0') && !char_found; j++ )
                  {
                    if ( contains_this_char( (vanilla_text+testpos), specgroup[j] ) )
                      {
                        specchar = string_dup( &(specgroup[j]) );
                        specchar[1] = '\0';

                        char_found = TRUE;
                      }
                  }

                if ( char_found )
                  {
                    if ( contains_allcaps( (vanilla_text+testpos) ) )
                      {
                        featname = add_chars( specchar, "contains (allcaps) " );
                      }
                    else if ( true_capital || mixed_capital )
                      {
                        featname = add_chars( specchar, "contains (capital) " );
                      }
                    else
                      {
                        featname = add_chars( specchar, "contains " );
                      }

                    log_feature( NO_SKIP_CLOSED_TAGS, TRUE, featname, d->tag );

                    free( featname );
                    free( specchar );
                  }
              }
          }

        /* Suffixes */
        if ( strlen((char *)(text+testpos)) >= MinAffWord )
          {
            uchar *local_text = text;

            if ( use_separator_suffix )
              {
                add_affix( separator_suffix_capital, NO_CAPITAL, NO_CAPITAL, VARIABLE_SUFFIX, &(features->variable_separator_sufflist), 0, new_suffix_word, NO_TESTPOS, d->unktag );

                add_affix( separator_suffix_capital, NO_CAPITAL, NO_CAPITAL, SUFFIX, &(features->separator_sufflist), features->maxsuffix, new_suffix_word, NO_TESTPOS, d->unktag );
              }
            else if ( !has_numbers && !has_separator )
              {
                if ( !true_capital )
                  {
                    local_text = down_text;
                  }

                add_affix( true_capital, NO_CAPITAL, mixed_capital, VARIABLE_SUFFIX, &(features->variable_sufflist), 0, local_text, testpos, d->unktag );

                add_affix( true_capital, NO_CAPITAL, mixed_capital, SUFFIX, &(features->sufflist), features->maxsuffix, local_text, testpos, d->unktag );
              }
          }

        /* Cuts */
        if ( !has_numbers && ((int)strlen((char *)(text+testpos)) >= MinCutLen) )
          {
            uchar *local_text = text;

            if ( !true_capital )
              {
                local_text = down_text;
              }

            make_cuts( true_capital, NO_CAPITAL, mixed_capital, d, dict, local_text, testpos, features->indexlist, &(features->cut_list) );

            /* Perform special cuts. */
            make_special_cuts( true_capital, pseudo_capital, mixed_capital, d, dict, local_text, testpos, &(features->special_cut_list) );
          }

        /* Separator Cuts */
        /* The '\0' test stops the increment below from walking past the end
           of the string when the separator is the last character. */
        if ( has_separator && ((features->sepdict).size > 0) && (end_text != NULL) && (*end_text != '\0') )
          {
            /* NOTE(review): end_text was already advanced past the separator
               above, so this skips the first character of the end word.
               Kept as in the original - confirm that this is intended. */
            end_text++;

            if ( is_allalpha( end_text ) && (int)strlen((char *)end_text) >= MinCutLen )
              {
                uchar *local_text;

                if ( true_capital )
                  {
                    local_text = string_dup( end_text );
                  }
                else
                  {
                    local_text = downcase( end_text );
                  }

                make_cuts( true_capital, NO_CAPITAL, NO_CAPITAL, d, &(features->sepdict), local_text, NO_TESTPOS, features->separator_indexlist, &(features->sep_cut_list) );

                /* Perform separator special cuts. */
                make_special_cuts( true_capital, NO_CAPITAL, NO_CAPITAL, d, dict, local_text, NO_TESTPOS, &(features->sep_special_cut_list) );

                free( local_text );
              }
          }

        /* Release the per-word working copies. */
        if ( new_suffix_word != NULL )
          {
            free( new_suffix_word );
          }

        if ( new_prefix_word != NULL )
          {
            free( new_prefix_word );
          }

        if ( needs_compression )
          {
            free( comptext );
          }

        free( down_vanilla_text );
        free( down_text );

      } /* End of working through the dictionary */

    /* Sort the dictionaries */
    sort_dict( &(features->featdict) );
    sort_dict( &(features->capdict) );
    sort_dict( &(features->sepdict) );

    /* Make an index of the separator dictionary, if it has any entries. */
    if ( (features->sepdict).size > 0 )
      {
        DictWord  *sep_key = (features->sepdict).key;

        /* Work through the separator dictionary */
        for ( i = 0 ; i < (features->sepdict).size ; i++, sep_key++ )
          {
            DictWord sep_word = *sep_key;

            if ( is_allalpha( sep_word->text ) )
              {
                /* Add word to the separator index. */
                add_index( NO_DOWNCASE, sep_word, NO_TESTPOS, MinTestLen, &(features->separator_indexlist) );
              }
          }

        sort_indexlist( &(features->separator_indexlist) );
      }

    /* Sort the affix lists */
    sort_afflist( &(features->sufflist) );
    sort_afflist( &(features->variable_sufflist) );
    sort_afflist( &(features->separator_sufflist) );
    sort_afflist( &(features->variable_separator_sufflist) );

    /* Sort special cut lists */
    sort_cutlist( &(features->special_cut_list) );
    sort_cutlist( &(features->sep_special_cut_list) );

    /* Other Cuts */
    /* Perform replacement and container cuts. */
    make_other_cuts( features->indexlist, &(features->cut_list), &(features->container_cut_list), &(features->replacement_cut_list) );

    if ( (features->sepdict).size > 0 )
      {
        make_other_cuts( features->separator_indexlist, &(features->sep_cut_list), &(features->sep_container_cut_list), &(features->sep_replacement_cut_list) );
      }

    /* Sort the cut lists */
    sort_cutlist( &(features->cut_list) );
    sort_cutlist( &(features->container_cut_list) );
    sort_cutlist( &(features->replacement_cut_list) );

    sort_cutlist( &(features->sep_cut_list) );
    sort_cutlist( &(features->sep_container_cut_list) );
    sort_cutlist( &(features->sep_replacement_cut_list) );
}

/*-----------------------------------------------------------------------------
    analyze_features

    Drive the two-pass analysis of the lexicon whose results are used in
    generating tag probabilities for unknown words.

    dict           The lexicon to analyse.
    ofeaturesname  Name of the features output file; opened for writing
                   unless it is the empty string.
    obadwordname   Name of the bad words output file; opened for writing
                   (and flagged as open) unless it is the empty string.
-----------------------------------------------------------------------------*/

void analyze_features( Dict *dict, char *ofeaturesname, char *obadwordname )
{
    /* Set up the feature structure before anything is recorded in it. */
    initialize_features( dict );

    /* Open whichever of the optional output files were named. */
    if ( *ofeaturesname != '\0' )
      {
        features->ofeaturesfile = open_file( ofeaturesname, "w" );
      }

    if ( *obadwordname != '\0' )
      {
        features->obadwordfile = open_file( obadwordname, "w" );
        features->badwordfile_open = TRUE;
      }

    /* Pass 1: gather the initial feature frequencies from the lexicon. */
    gather_initial_freqs( dict );

    /* Prepare the cut list features used by the second pass. */
    initialize_other_features();

    /* Pass 2: gather the remaining feature frequencies, which build on the
       initial ones. */
    gather_other_freqs( dict );

    /* Finally sort the unknown word dictionary. */
    sort_dict( &(features->unkdict) );
}

/*-----------------------------------------------------------------------------
    main

    Entry point.

    Usage: label corpus options

    Reads the mappings, optional rule files, dictionary and transitions
    selected by the options, optionally analyses the lexicon for unknown
    word handling, then tags the corpus for the requested number of
    iterations and writes any requested output, dictionary, transition and
    features files.

    argv[1] names the corpus; a name starting with '-' selects stdin.
    Returns 0 on normal completion.
-----------------------------------------------------------------------------*/

int main(int argc, char *argv[])
{
    int      iterations, initialise, iter, dict_size;
    options_st saved_options;
    char     dictname[MAXFN], tranname[MAXFN], odictname[MAXFN], otranname[MAXFN], outname[MAXFN], mapname[MAXFN], skipname[MAXFN], reducename[MAXFN], infername[MAXFN];
    char     fsmname[MAXFN], grammarname[MAXFN], ukwname[MAXFN];
    char     ofeaturesname[MAXFN], obadwordname[MAXFN], unktaggroupname[MAXFN];
    Dict     dict, skip_dict;
    Trans    trans, newtrans;
    FILE     *corpfile;
    /* Both start as NULL: outfile is passed on even when no output was
       requested, and unktaggroupfile is only opened for some options. */
    FILE     *unktaggroupfile = NULL;
    FILE     *outfile = NULL, *odictfile, *otranfile;
    BOOL     any_output;

    /* Select ISO 8859-1 character classification. The name previously read
       "iso_8858_1", which is not a character set, so the call could never
       select the intended locale. Locale names are platform specific; if
       this one is not installed the call fails and the "C" locale stays in
       effect, as before. */
    setlocale(LC_CTYPE, "iso_8859_1");
#ifdef SpecialMalloc
    /* Force fast allocation */
    set_small_allocation(100);
#endif

    /* Clear data structures */
    InitDict(dict)
    InitDict(skip_dict)
    InitTrans(trans)
    InitTrans(newtrans)
    odictfile = otranfile = NULL;

    /* Verify command line */
    if (argc <= 2)
        error_exit("Usage: label corpus options\n");

    /* Form options */
    InitOptions;

    set_up_options(argc, argv, &iterations, &initialise, &dict_size,
        dictname, tranname, odictname, otranname, outname, mapname,
        skipname, reducename, fsmname, grammarname, infername, ukwname,
        ofeaturesname, obadwordname, unktaggroupname);
    any_output = !no_output || Option(report_stats) || OutOpt(prob_dist);

    /* Read mappings */
    if (Option(verbose)) printf("Read mappings\n");
    read_mapping(mapname);

    /* Read tag reduction mappings */
    if (Option(reduced_tags))
    {
        if (Option(verbose)) printf("Read reduced tag set\n");
        read_reduce_mapping(reducename);
    }

#ifdef Use_Parser
    /* Read parse rules */
    if (Option(use_parser))
    {
        if (Option(verbose)) printf("Read parse rules\n");
        parser_read_named(grammarname);
    }
#endif
#ifdef Use_FSM
    /* Read FSM definitions */
    if (Option(use_fsm))
    {
        if (Option(verbose)) printf("Read FSMs\n");
        fsm_read_named(fsmname);
    }
#endif

    /* Read skip list */
    if (Option(skip_list))
    {
        if (Option(verbose)) printf("Read skip list\n");
        read_named_dict(skipname, &skip_dict, -1);
    }

    /* Read unknown word rules */
    if (Option(unknown_rules))
    {
        if (Option(verbose)) printf("Read unknown word rules\n");
        read_unknown_rules(ukwname);
    }

    /* Set up dictionary */
    if (dictname[0] == 0)
    {
        create_dict(&dict, dict_size);
        clear_dict(&dict);
    }
    else
    {
        if (Option(verbose)) printf("Read dictionary\n");
        read_named_dict(dictname, &dict, -1);
        if (infername[0] != 0)
        {
            if (Option(verbose)) printf("Read inference rules\n");
            infer_tags(infername, &dict);

        }
    }

    /* Open input corpus, '-' for stdin */
    if ( argv[1][0] == '-' ) corpfile = stdin;
    else corpfile = open_file(argv[1], "r");

    /* Set up transitions */
    if (tranname[0] == 0)
    {
        create_trans(&trans, tags_all);
        clear_trans_all(&trans);
    }
    else
    {
      if (Option(verbose)) printf("Read transitions\n");
      read_named_trans(tranname, &trans);

      /* Analyze selected features of lexicon to generate tag probabilities for unknown words. */
      use_tag_groups = FALSE;
      if ( Option(unknown_morph) || Option(unknown_rules))
        {
          /* Initialize feature values */

          Allocate(features->featuretags, sizeof(FeatureTagSt), "features->featuretags: main");
          features->featuretags->next_open_slot = 0;

          features->gamma = trans.gamma;

          /* Fall back to the defaults for any limit that is still zero. */
          if ( features->maxsuffix == 0 )
            features->maxsuffix = MinSuffixLen;
          if ( features->maxunkwords == 0 )
            features->maxunkwords = MAXUNKWORDS;
          if ( features->maxprefcut == 0 )
            features->maxprefcut = MinPrefixLen;
          if ( features->maxsuffcut == 0 )
            features->maxsuffcut = MinSuffixLen;

          unknown_word_handling_initialization();
          gather_unigram_freqs( &dict );
        }

      if ( Option(unknown_morph) )
        {
          if (unktaggroupname[0] != 0)
            {
              /* Read unknown word tag group file. */
              unktaggroupfile = open_file(unktaggroupname, "r");
              use_tag_groups = TRUE;

              read_taggroup( unktaggroupfile );
            }

          analyze_features( &dict, ofeaturesname, obadwordname );
        }
    }

    set_special_words(&dict, features );

    /* Create space for re-estimation or training */
    if (Option(reestimate) || Option(training))
    {
        newtrans.gamma = trans.gamma; /* Share arrays */
        create_trans(&newtrans, tags_all);
    }

    /* Check output files can be opened */
    if (any_output)
        outfile = (outname[0] == 0) ? stdout : open_file(outname, "w");
    if (odictname[0] != 0)
        odictfile = open_file(odictname, "w");
    if (otranname[0] != 0)
        otranfile = open_file(otranname, "w");

    /* Set up anchor word */
    set_anchor(&dict);

    /* -------- The main tagging loop -------- */
    /* Clear output options except for last iteration */
    saved_options = options;
    if (iterations > 1) options.out = no_out_opt;

    for (iter = 1 ; iter <= iterations ; iter++)
    {
        if (Option(verbose))
                printf("\nIteration %d%s", iter,
                        Option(training) ? " (training)\n" : "\n");
        reset_corpus(corpfile);

#if (0)
        /* Report current space usage */
        space_report(stdout);
#endif

        /* If training, clear other tagging options */
        if (Option(training))
        {
            ClearOption(Viterbi);
            ClearOption(fb_tagging);
            ClearOption(most_freq);
            ClearOption(num_stabilise);
            if ( Option(good_turing) )
                init_good_turing_trans(&trans);
            else
                clear_trans_all(&newtrans);
        }

        /* On the first iteration, apply initialisation codes (if appropriate);
           otherwise do gamma adjustments. */
        if (!Option(training) && !Option(most_freq))
        {
            if (iter == 1)
            {
                /* Dict initialisation */
                Score mul, div;

                mul = (initialise & Init_d_1)    ? 1 : 0;
                div = (initialise & Init_d_ntag) ? 0 :
                      (initialise & Init_d_tagmax) ? tags_all :
                      (initialise & Init_d_s) ? -1 : 1;

                if (initialise & (Init_d_ntag | Init_d_tagmax | Init_d_s))
                {
                    set_up_scores(&dict, mul, div);
                }
                else
                {   /* Use gamma as divisor, maybe set to 1 first */
                    if (initialise & Init_d_1)
                        set_up_scores(&dict, 1, 1);
                    adjust_dict(&dict, trans.gamma, FALSE);
                }

                /* Trans initialisation */
                if (initialise & Init_t_tagmax)
                {
                    init_trans(&trans, 1.0/tags_all,
                                (initialise & Init_t_1) == 0);
                }
                else
                {
                    if (initialise & Init_t_1)
                        init_trans(&trans, 1.0, FALSE);
                    adjust_trans(&trans, NULL);
                }

                initialise = 0;

            }
            else /* Do normal gamma adjustment */
            {
                adjust_dict(&dict, trans.gamma, TRUE);
                adjust_trans(&trans, &newtrans);

            }
        }

        /* Clear re-estimation parameters */
        if (Option(reestimate)) clear_trans_all(&newtrans);

        /* Now actually label the corpus */
        init_statistics();
        tag_corpus(corpfile, outfile, &dict, &skip_dict, &trans, &newtrans);

        /* Give results, except on iter 1 of training */
        if (!Option(training))
        {
            if (Option(report_stats))
            {
                fprintf(outfile, "\n\nIteration %d\n", iter);
                report_results(outfile);
                fflush(outfile);
            }
        }

        /* If we were training, sort the dictionary */
        if (Option(training))
        {
            if (Option(verbose)) printf("\nSorting dictionary\n");
            sort_dict(&dict);

            /* If there are more iterations after the training ... */
            if (iterations > 1)
            {
                /* Restore options */
                options = saved_options;

                /* Mark that we are no longer training */
                ClearOption(training);
                saved_options = options;
                options.out   = no_out_opt;
            }
        }

        /* Restore the saved options at the end of the penultimate pass, so
           that the last pass runs with the real output options. */
        if (iter == iterations - 1)
            options = saved_options;
    }

#if (0)
    /* Report current space usage */
    space_report(stdout);
#endif

    /* Call the results output routines */
#ifdef Analyse
    if (OutOpt(analyse))
        print_analysis(outfile);
#endif
    if (any_output && outfile != stdout)
        fclose(outfile);
    fclose(corpfile);

    /* The tag group file was only needed while it was being read. */
    if (unktaggroupfile != NULL)
        fclose(unktaggroupfile);

    /* Write new arrays */
    if (odictfile != NULL)
    {
        if (Option(verbose))
            printf("Writing dictionary (%d entries)\n", dict.size);
        write_dict(odictfile, &dict, Option(training) || Option(reestimate));
        fclose(odictfile);
    }
    if (otranfile != NULL)
    {
        if (Option(verbose))
            printf("Writing transitions\n");
        if (Option(training) || Option(reestimate))
            write_trans(otranfile, &newtrans);
        else
            write_trans(otranfile, &trans);
        fclose(otranfile);
    }

    /* If a features structure was created, write out the features file and then free its memory. */
    if ( Option(unknown_morph) )
      {
        if (ofeaturesname[0] != 0)
          {
            /* Write out features file */
            write_features();

            /* Close the features file */
            fclose(features->ofeaturesfile);
          }

        if (obadwordname[0] != 0)
          {
            /* Close the bad words file */
            fclose(features->obadwordfile);
          }
      }

    if ( Option(unknown_morph) || Option(unknown_rules) )
      {
        free_features( &features );
      }

    free_dict( &dict );

    return 0;
}
