/* Program to build phrase structure rules from the lancpars corpus.
   Uses the common I/O routines at the low level

   Usage:
    lanc corpus [map]

   Allows corpora with a single tag only; scores ignored.
*/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "common.h"
#include "map.h"
#include "low.h"

/* Size of the word buffers handed to corpus_getword() */
#define MaxBuff (1000)

/*============================================================================
  Phrase data structures
============================================================================*/

/* Phrases are recorded non-recursively, as a list of tags. Where such tags
   are also phrasal, their expansion will appear as a separate phrase.
*/
typedef struct tags_st TagsSt, *Tags;	/* Singly linked list of tags */
struct tags_st
{
    Tags next;		/* Next element of the list, NULL at the end */
    Tag  tag;		/* Tag held by this element */
};

/* One rule "tag -> body". All rules are chained from phrase_head. */
typedef struct phrase_st PhraseSt, *Phrase;
struct phrase_st
{
    Phrase  next;		/* Next phrase known */
    Tag     tag;		/* Tag of the phrase */
    Tags    body;		/* Definition of it */
    int     found, total;	/* found: times this body was read as a
				   bracketed phrase (see match_phrase);
				   total: times the tag sequence was matched
				   while training (see train_rules) */
    Score   prob;		/* Score of the rule, filled in by
				   set_phrase_scores as found / total */
};
static Phrase phrase_head = NULL;	/* Head of the chain of known phrases */

/*============================================================================
  Freeing
============================================================================*/

/*-----------------------------------------------------------------------------
    free_phrase

    Release a single phrase: every element of its body list first, then the
    phrase structure itself. The phrase is dereferenced unconditionally, so
    it must not be NULL. The chain link (next) is not followed.
-----------------------------------------------------------------------------*/

static void free_phrase(Phrase p)
{
    Tags elem = p->body;

    while (elem != NULL)
    {
	Tags following = elem->next;	/* Read the link before the node goes */

	free(elem);
	elem = following;
    }

    free(p);
}

/*-----------------------------------------------------------------------------
    free_phrases

    Free every phrase on the global chain and leave the chain empty, so that
    phrase_head never refers to released memory afterwards.
-----------------------------------------------------------------------------*/

static void free_phrases(void)
{
    Phrase p, next;

    for (p = phrase_head ; p != NULL ; p = next)
    {
	next = p->next;		/* Saved first: free_phrase releases p */
	free_phrase(p);
    }

    /* Every node is gone; reset the head rather than leave it dangling */
    phrase_head = NULL;
}

/*============================================================================
  Printing
============================================================================*/

/*-----------------------------------------------------------------------------
    print_phrases

    Print all phrases in the chain, in chain order.

    Two listings are produced for each phrase:
     - on "out", one line "prob tag -> body tags..." (define GiveTotal by
       hand below to include the found/total counts as well);
     - on stderr, a header "tag-N", where N is a counter that keeps running
       across phrases and calls, then one line per body element carrying a
       hexadecimal state number, the element's tag and either the next state
       or, for the last element, "tag_prob"; the listing is closed by "end"
       and a blank line. A tag whose text is ";" is written as "\;".
       NOTE(review): presumably the stderr listing feeds another tool --
       confirm before changing its layout.
-----------------------------------------------------------------------------*/

static void print_phrases(FILE *out)
{
    static int   mc = 1;		/* Number given to the next phrase */
    static uchar *semicolon = "\\;";	/* Replacement text for a ";" tag */
    Phrase rule = phrase_head;

    while (rule != NULL)
    {
	Tags elem;
	int  state = 1;			/* Restarts at 1 for every phrase */

	/* Rule listing on the requested stream */
#undef GiveTotal
#ifdef GiveTotal
	fprintf(out, "%g (%d,%d) %s ->", rule->prob, rule->found, rule->total,
		unmap_tag(rule->tag));
#else
	fprintf(out, "%g %s ->", rule->prob, unmap_tag(rule->tag));
#endif
	elem = rule->body;
	while (elem != NULL)
	{
	    fprintf(out, " %s", unmap_tag(elem->tag));
	    elem = elem->next;
	}
	fprintf(out, "\n");

	/* State listing on stderr */
	fprintf(stderr, "%s-%d", unmap_tag(rule->tag), mc++);

	for (elem = rule->body ; elem != NULL ; elem = elem->next)
	{
	    uchar *tag_text = unmap_tag(elem->tag);
	    if (strcmp(tag_text, ";") == 0)
		tag_text = semicolon;

	    if (elem->next == NULL)
	    {
		/* Last element: the line names the phrase and its score */
		fprintf(stderr, "\t%x\t%s\t_ %s_%g ;\n",
			state, tag_text, unmap_tag(rule->tag), rule->prob);
	    }
	    else
	    {
		/* Inner element: the line leads on to the following state */
		fprintf(stderr, "\t%x\t%s\t_ %x ;\n",
			state, tag_text, state+1);
	    }

	    state += 1;
	}
	fprintf(stderr, "end\n\n");

	rule = rule->next;
    }
}

/*-----------------------------------------------------------------------------
    print_phrases

    Set scores on all phrases in the chain
-----------------------------------------------------------------------------*/

static void set_phrase_scores(void)
{
    Phrase p;

    for (p = phrase_head ; p != NULL ; p = p->next)
    {
#if (1)
#if (0)
	p->prob = ((Score)p->found) / (p->total - p->found);
#else
	p->prob = ((Score)p->found) / p->total;
#endif
#else
	p->prob = 1.0;
#endif
    }
}

/*============================================================================
  Phrase construction
============================================================================*/

/*-----------------------------------------------------------------------------
    match_phrase

    Look for a known phrase whose body is the same tag sequence as that of
    the given phrase. If there is one, the given phrase is freed and the
    "found" count of the known phrase goes up by one. Otherwise the given
    phrase is put at the head of the chain with a "found" count of one.
    The caller must not use the phrase afterwards, since it may be gone.

    NOTE(review): only the bodies are compared; the tags of the two phrases
    are not. Confirm that merging equal bodies under different phrase tags
    is intended.
-----------------------------------------------------------------------------*/

static void match_phrase(Phrase phrase)
{
    Phrase known = phrase_head;

    while (known != NULL)
    {
	Tags fresh = phrase->body;
	Tags old   = known->body;

	/* Step along both bodies for as long as they agree */
	while (fresh != NULL && old != NULL && fresh->tag == old->tag)
	{
	    fresh = fresh->next;
	    old   = old->next;
	}

	/* Both ran out together: same sequence, so count it on the known
	   phrase and drop the new one */
	if (fresh == NULL && old == NULL)
	{
	    free_phrase(phrase);
	    known->found += 1;
	    return;
	}

	known = known->next;
    }

    /* Nothing matched: the phrase becomes the new head of the chain */
    phrase->found = 1;
    phrase->next  = phrase_head;
    phrase_head   = phrase;
}

/*-----------------------------------------------------------------------------
    create_phrase

    Allocate a phrase carrying the given tag, with an empty body, zero
    counts and a zero score. The phrase is returned unlinked: putting it on
    the global chain is left to match_phrase.
-----------------------------------------------------------------------------*/

static Phrase create_phrase(Tag tag)
{
    Phrase fresh;

    Allocate(fresh, sizeof(PhraseSt), "phrase");

    fresh->tag   = tag;
    fresh->body  = NULL;
    fresh->next  = NULL;
    fresh->found = 0;
    fresh->total = 0;
    fresh->prob  = 0;

    return fresh;
}

/*-----------------------------------------------------------------------------
    make_phrase

    Read the tokens of a phrase whose opening bracket (with tag phrase_tag)
    has just been consumed, and build a structure for it. Each ordinary
    token contributes its tag to the body; a nested opening bracket is read
    recursively as a phrase of its own and contributes just its phrase tag.
    At the matching closing bracket the phrase is handed to match_phrase.

    in          Corpus being read.
    phrase_tag  Tag taken from the opening bracket.

    Gives FALSE at eof, otherwise the (true) value of the end test at the
    point the closing bracket was read. A closing bracket whose tag differs
    from phrase_tag is reported on stderr and get_out() is called.

    NOTE(review): when eof arrives before the closing bracket, the partly
    built phrase is neither linked into the chain nor released.
-----------------------------------------------------------------------------*/

static BOOL make_phrase(FILE *in, Tag phrase_tag)
{
    Phrase phrase;
    BOOL   not_end;
    Tags   *tag_at;

    /* Create a top level structure */
    phrase = create_phrase(phrase_tag);
    tag_at = &(phrase->body);	/* Link that receives the next element */

    /* Read tokens until we find the end */
    do
    {
	uchar word[MaxBuff];
	Tag   tag;
	Score score;
	int   ntags;

	/* Get a token */
	ntags = corpus_getword(in, word, MaxBuff, 1, &tag, &score);
	not_end = (ntags != -1);

	if (not_end)
	{
	    /* See what sort of token it was */
	    if (word[0] == LancPhraseStart)
	    {
		if (word[1] != 0)
		{
		    tag = map_tag(word+1);

		    /* Recurse on phrase */
		    not_end = make_phrase(in, tag);

		    /* Fake up a tag for it */
		    ntags = 1;
		}
	    }
	    else
	    {
		size_t len = strlen((const char *)word);

		/* A closing bracket has at least one character of tag in
		   front of it. Testing the length first also stops an empty
		   token from being indexed at -1. */
		if (len > 1 && word[len-1] == LancPhraseEnd)
		{
		    word[len-1] = 0;	/* Leave just the tag text */

		    /* Check end tag matches */
		    if (phrase_tag != map_tag(word))
		    {
			fprintf(stderr,
			"Phrase start '%s' does not match phrase end '%s'\n",
				unmap_tag(phrase_tag), word);
			get_out();
		    }
		    else
		    {
			match_phrase(phrase);
			return not_end;
		    }
		}
	    }

	    /* Create space for the tag */
	    if (ntags > 0)
	    {
		Tags new_tag;

		/* Size of the list node itself: asking for sizeof(Tags)
		   would reserve only a pointer's worth of memory and the
		   stores below would run past the end of it. */
		Allocate(new_tag, sizeof(*new_tag), "new tag");
		*tag_at = new_tag;
		new_tag->tag  = tag;
		new_tag->next = NULL;
		tag_at        = &(new_tag->next);
	    }
	    /* else ignore */
	}
    }
    while (not_end);

    return FALSE; /* EOF */
}

/*-----------------------------------------------------------------------------
    scan_corpus

    Read a corpus to its end and collect the phrases in it. Tokens outside
    any bracket are skipped; each opening bracket that carries a tag starts
    a call of make_phrase, which consumes the whole phrase. Reading stops
    when either the corpus or make_phrase reports the end of the input.
-----------------------------------------------------------------------------*/

static void scan_corpus(FILE *in)
{
    for (;;)
    {
	uchar  word[MaxBuff];
	Tag    tag;
	Score  score;

	/* Read a word */
	if (corpus_getword(in, word, MaxBuff, 1, &tag, &score) == -1)
	    break;			/* End of corpus */

	/* Only an opening bracket followed by a tag is of interest */
	if (word[0] == LancPhraseStart && word[1] != 0)
	{
	    if (!make_phrase(in, map_tag(word+1)))
		break;			/* Corpus ended inside the phrase */
	}
    }
}

/*============================================================================
  Training
============================================================================*/

/* In training, we read the corpus and look for sequences of tags which match
a rule. Where there is a match, the total count for the rule is incremented.

The main data structure for training is a list of active rules, containing
pointers to tag lists to be matched, and a link back to the corresponding
phrase. The matching does not require the phrase tag to be matched: we are
just interested in the total number of occurrences of the sequence of tags.

The major complication involves embedded rules. The approach we take is that
when we see an open phrase bracket, we split the rule into two, one of which
continues on the basic tags and the other waits for the closing bracket and
then continues. There can be more than one thing waiting for the same end of
rule marker, so we represent the waiting list as a list of rule lists.
*/

/* Data structures: list of active rules */
typedef struct active_st ActiveSt, *Active;
struct active_st
{
    Active next;	/* Next rule */
    Tags   match;	/* Body element the rule expects next. advance_rule
			   frees a rule once its last element has matched,
			   so this is not NULL in a live rule. */
    Phrase phrase;	/* Pointer back to the phrase being matched */
};

/* Waiting rules list: one entry per open bracket that some rule is waiting
   on, kept as a stack through prev */
typedef struct wait_st WaitSt, *Wait;
struct wait_st
{
    Wait   prev;	/* Entry that was current before this one; made
			   current again at the closing bracket */
    Active rules;	/* Rules held back until the closing bracket */
};

/*-----------------------------------------------------------------------------
    advance_rule

    Called after the element a rule was expecting has matched. If more of
    the body remains, the rule is moved on to its next element and returned.
    If that was the last element, the "total" count of the rule's phrase
    goes up by one, the rule is freed and NULL is returned.
-----------------------------------------------------------------------------*/

static Active advance_rule(Active rule)
{
    Tags following = rule->match->next;

    if (following != NULL)
    {
	/* More of the body still to match */
	rule->match = following;
	return rule;
    }

    /* Body used up: record the occurrence and drop the rule */
    rule->phrase->total += 1;
    free(rule);
    return NULL;
}

/*-----------------------------------------------------------------------------
    make_rule

    Allocate an active rule for the given phrase that expects "match" next.
    The new rule is not linked to any list (its next field is NULL).
-----------------------------------------------------------------------------*/

static Active make_rule(Tags match, Phrase phrase)
{
    Active rule;

    Allocate(rule, sizeof(ActiveSt), "active rule");

    rule->phrase = phrase;
    rule->match  = match;
    rule->next   = NULL;

    return rule;
}

/*-----------------------------------------------------------------------------
    make_wait

    Make a rule for phrase p expecting "match" and put it in front of the
    list w. Returns the new head of that list.
-----------------------------------------------------------------------------*/

static Active make_wait(Active w, Tags match, Phrase p)
{
    Active head = make_rule(match, p);

    head->next = w;
    return head;
}

/* Push rule r onto the front of the list whose head is the lvalue "list".
   Both arguments are evaluated twice, so pass expressions without side
   effects. The expansion stays a plain braced block (not do/while) because
   the call sites are written without a trailing semicolon, one of them
   directly in front of an "else". */
#define Transfer(r, list) {(r)->next = (list); (list) = (r);}

/*-----------------------------------------------------------------------------
    train_rules

    Train the rules on a corpus.
-----------------------------------------------------------------------------*/

static void train_rules(FILE *in)
{
    BOOL not_end;
    Active rules   = NULL;	/* List of active rules */
    Wait   waiting = NULL;	/* List of waiting rule lists */

    do
    {
	uchar  word[MaxBuff];
	Tag    tag;
	Score  score;
	int    ntags;
	BOOL   phrase_start = FALSE;

	/* Read a word */
	ntags = corpus_getword(in, word, MaxBuff, 1, &tag, &score);
	not_end = (ntags != -1);

	if (not_end)
	{
	    /* Check for phrasal token */
	    if (word[0] == LancPhraseStart)
	    {
		if (word[1] != 0)
		{
		    /* Hack up a tag */
		    tag = map_tag(word+1);
		    ntags = 1;
		    phrase_start = TRUE;
		}
	    }
	    else
	    {
		int l = strlen(word)-1;

		if (word[l] == LancPhraseEnd && l != 0)
		{
		    Wait wait;
		    Tag  t;

		    word[l] = 0;
		    t = map_tag(word);

		    /* Continue waiting rules; wait can be NULL for rules
			matching at top level. */
		    wait = waiting;
		    if (wait != NULL)
		    {
			Active w, next;

			/* Continue each waiting rule */
			for (w = wait->rules ; w != NULL ; w = next)
			{
			    next = w->next;

			    if (w->match->tag != t)
			    {
				fprintf(stderr,
			      "End of rule consistency check failed at '%s'\n",
				word);
				get_out();
			    }

			    /* Advance the waiting rule */
			    w = advance_rule(w);
			    if (w != NULL)	/* Append to head */
			    {
				w->next = rules;
				rules   = w;
			    }
			}

			/* Restore to previous waiting list */
			waiting = waiting->prev;
			free(wait);
		    }
		}
	    }

	    if (ntags > 0)
	    {
		/* Advance any existing rules, transferring rules from "rules"
		   to "new_list" */
		Active r, new_list = NULL, next;
		Active new_wait = NULL;
		Phrase p;

		/* Advance any existing rules */
		for (r = rules ; r != NULL ; r = next)
		{
		    Tags match = r->match;
		    next = r->next;

		    /* Check if rule matches */
		    if (match->tag == tag)
		    {
			if (phrase_start)
			{
			    /* Copy the rule and place it in waiting */
			    new_wait = make_wait(new_wait, match, r->phrase);
			}
			else
			{
			    /* Advance the rule */
			    if ((r = advance_rule(r)) != NULL)
				Transfer(r, new_list)
			}
		    }
		    else
		    {
			/* Kill rule unless phrase start */
			if (phrase_start)
			    Transfer(r, new_list)
			else
			    free(r);
		    }
		}

		/* Start any new rules */
		for (p = phrase_head ; p != NULL ; p = p->next)
		{
		    Tags body = p->body;

		    if (body->tag == tag)
		    {
			/* Create rule unless it has already ended */
			if (body->next == NULL && !phrase_start)
			{
			    p->total += 1;
			}
			else			/* Create the rule */
			{
			    /* If a phrase, create a rule on waiting list */
			    if (phrase_start)
			    {
				new_wait = make_wait(new_wait, body, p);
			    }
			    else
			    {
				Active new;
				new = make_rule(body->next, p);
				new->next = new_list;
				new_list  = new;
			    }
			}
		    }
		}

		/* Create new waiting list */
		if (new_wait != NULL)
		{
		    Wait wait;
		    Allocate(wait, sizeof(WaitSt), "wait list");
		    wait->prev = waiting;
		    wait->rules = new_wait;
		    waiting = wait;
		}

		/* Copy new list to old one */
		rules = new_list;
	    }
	}
    } while (not_end);
}

/*============================================================================
  Top level
============================================================================*/

/* Option words. Nothing in this file refers to them by name; presumably
   they are the storage behind the InitOptions / SetInOpt macros used in
   main -- TODO confirm against the common headers. */
int option;
int input_option ;
int output_option;

/*----------------------------------------------------------------------------
    main

    Usage: lanc corpus [map]

    Reads the tag mapping (from "tags.map" unless a map file is named),
    then passes over the corpus twice: once to collect the phrases and once
    to count how often their tag sequences occur. The scored phrases are
    printed on stdout. Returns 0.

    NOTE(review): both argument checks rely on error_exit not returning --
    confirm.
----------------------------------------------------------------------------*/

int main(int argc, char *argv[])
{
    char       mapname[MAXFN];
    const char *map_arg;
    FILE       *corpus;

    InitOptions;
    SetInOpt(lancpars);
    if (argc < 2)
	error_exit("Usage: lanc corpus [map]\n");

    /* Get mappings. The name comes from the command line, so make sure it
       fits before copying it into the fixed-size buffer. */
    map_arg = (argc < 3) ? "tags.map" : argv[2];
    if (strlen(map_arg) >= sizeof(mapname))
	error_exit("Map file name is too long\n");
    strcpy(mapname, map_arg);
    read_mapping(mapname);

    /* Open corpus */
    corpus = open_file(argv[1], "r");

    /* Find the rules */
    reset_corpus(corpus);
    scan_corpus(corpus);

    /* Train the rules */
    reset_corpus(corpus);
    train_rules(corpus);

    /* Set scores on phrases */
    set_phrase_scores();

    /* Print the phrases */
    print_phrases(stdout);

    /* Free memory */
    free_phrases();

    /* Close files */
    fclose(corpus);

    return 0;
}
