/* Statistical labeller: Parser.

   31-03-93	Created
   07-04-93	Allow disjunctions of tags
   14-04-93	Labeller interface revised
   22-04-93	Integrate chart with main data structures

   Copyright (C) David Elworthy 1995

   Principal external functions:
	parser_read, parser_read_named
	parser_advance
	parser_dump_edge
	parser_init_hyp, parser_free_hyp
	parser_may_tag

   Syntax: each line of the parser file is
   [<score>] <term> -> <dtag>*
   where <dtag> is either a single tag or a list of tags separated by '_'
   characters.
*/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include "common.h"
#include "list.h"
#include "map.h"

#include "diction.h"

#include "label.h"
#include "stack.h"
#include "phrase.h"
#include "parser.h"

/*
==============================================================================
Rule reading
*/

/* Main data structure for rules. The rules are held in a list indicating the
constituents of the rule. The constituents are also held in a list. In both
cases, the representation is as a tree structure, so that we can common up
initial sequences of rules, leading to a more compact representation. The
output symbol is recorded at the point the rule ends (this is marginally more
convenient for use in the parser itself) as well as in the rule itself.
*/

/* RuleSt/Rule: an entry in the rules list. ConstitSt/Constit: a node in the
   tree of rule bodies. */
typedef struct rule_st RuleSt, *Rule;
typedef struct constit_st ConstitSt, *Constit;

/* One entry in the rules list. find_rule looks entries up by 'term', so every
   rule read for the same output symbol hangs its body off the same entry. */
struct rule_st
{
    Rule    next;		/* Next rule in the chain */
    Tag     term;		/* Output symbol of the rule */
    Constit body;		/* Body of the rule: list of possible first symbols */
};

/* One node in the tree of rule bodies. Alternatives at the same position in a
   rule are chained through 'next'; 'cont' heads the list of symbols that may
   follow this one. 'term' and 'prob' are only assigned by parser_read when it
   sets 'may_end'; add_constit does not initialise them. */
struct constit_st
{
    Constit next;	/* Next symbol that could be recognised here */
    Tag     sym;	/* Possible symbols to recognise */
    BOOL    may_end;	/* True if we may end with this symbol */
    Tag     term;	/* Output symbol of the rule */
    Score   prob;	/* Probability if we do end here */
    Constit cont;	/* Things that may continue the rule */
};

/* Rules list, sorted by term. Starts empty and is extended by find_rule as
   parser_read meets new output symbols.
   NOTE(review): the ordering is whatever list_search_and_add maintains with
   rule_cmp; presumably sorted, but that is not visible in this file. */
static Rule rule_head = NULL;

/*-----------------------------------------------------------------------------
    find_rule

    Find a rule, and add a rule to the rules list if not found.
-----------------------------------------------------------------------------*/

/* Comparison function handed to list_search_and_add by find_rule. Both keys
   are read as Tag values. Returns a negative value, zero or a positive value
   when the first tag is less than, equal to or greater than the second.

   The tags are compared explicitly rather than subtracted: a difference
   converted to int can wrap or be truncated (for instance if Tag is unsigned
   or wider than int), which would yield the wrong sign. */
static int rule_cmp(Key *c1, Key *c2)
{
    Tag lhs = *(Tag *)c1;
    Tag rhs = *(Tag *)c2;

    return (lhs > rhs) - (lhs < rhs);
}

/* Look up the rule for output symbol 'term' in the list at '*head' by way of
   list_search_and_add. When the search reports no match, the entry it hands
   back is filled in with the symbol and an empty body. Returns the entry in
   either case. */
static Rule find_rule(Tag term, Rule *head)
{
    BOOL found;
    Rule rule = list_search_and_add((void **)head, (Key *)&term, rule_cmp,
				    sizeof(RuleSt), "rule", &found);

    if (found)
	return rule;

    /* Entry was not there before: give it its symbol and no body yet */
    rule->term = term;
    rule->body = NULL;
    return rule;
}

/*-----------------------------------------------------------------------------
    add_constit

    Find the constituent for 'tag' in the list whose head is '*sym'. If one
    exists it is returned unchanged. Otherwise a new constituent is allocated,
    pushed onto the front of the list (so '*sym' is updated) and returned. A
    new constituent cannot end a rule and has no continuations; its 'term' and
    'prob' fields are left unset.
-----------------------------------------------------------------------------*/

static Constit add_constit(Tag tag, Constit *sym)
{
    Constit node = *sym;

    /* Reuse an existing entry for this tag if the list has one */
    while (node != NULL)
    {
	if (node->sym == tag)
	    return node;
	node = node->next;
    }

    /* Nothing suitable: make a new entry and put it at the head */
    Allocate(node, sizeof(ConstitSt), "constituent");
    node->sym     = tag;
    node->may_end = FALSE;
    node->cont    = NULL;
    node->next    = *sym;
    *sym          = node;
    return node;
}

/*-----------------------------------------------------------------------------
    parser_read

    Read rules from a given file, merging them with an existing list
    (rule_head). Each non-empty line is split on spaces, tabs and newlines and
    must have the form
	[<score>] <term> -> <dtag>*
    The score is optional: the first token is taken as the score only if it
    scans with score_format, otherwise the score defaults to 1.0 and that
    token is the term. (A term that itself scans as a number is therefore
    read as a score.)

    Malformed lines produce a warning on stderr giving the line number and
    are skipped. A line too long for the buffer is reported and get_out() is
    called.
    NOTE(review): get_out presumably does not return; if it does, reading
    carries on with the remainder of the long line.
-----------------------------------------------------------------------------*/

#define MaxLine (255)

void parser_read(FILE *in)
{
    uchar buffer[MaxLine];
    int   line_count = 0;
    uchar *term = " \t\n";

    while (fgets(buffer, MaxLine, in) != NULL)
    {
	int      len = strlen(buffer);
	uchar   *token;
	Score    prob = 1.0;
	Constit *c, node;
	Rule     r;

	line_count += 1;

	/* A bare newline (or nothing at all) is an empty line: skip it */
	if (len == 0 || len == 1)
	    continue;

	/* The line only overflows if fgets filled the buffer without reaching
	   the end of the line. A line that fits exactly still ends in its
	   newline and must be accepted. */
	if (len == MaxLine-1 && buffer[len-1] != '\n')
	{
	    fprintf(stderr, "Line overflows buffer:\n%s\n", buffer);
	    get_out();
	    continue;
	}

	/* Get the optional score. Only step past the first token when it
	   really was a score; otherwise it is the non-terminal itself. */
	token = strtok(buffer, term);
	if (token != NULL)
	{
	    if (sscanf(token, score_format, &prob) == 1)
		token = strtok(NULL, term);
	    else
		prob = 1.0;
	}

	/* Get non-terminal */
	if (token == NULL)
	{
	    fprintf(stderr, "Warning: terminal missing at line %d\n",
			line_count);
	    continue;
	}

	/* Find or create a rule; constituents are added from its body */
	r = find_rule(map_tag(token), &rule_head);
	c = &(r->body);

	/* Get arrow */
	token = strtok(NULL, term);
	if (token == NULL || strcmp(token, "->") != 0)
	{
	    fprintf(stderr, "Warning: '->' missing at line %d\n",
			line_count);
	    continue;
	}

	/* There must be at least one symbol on the right hand side */
	token = strtok(NULL, term);
	if (token == NULL)
	{
	    fprintf(stderr, "Warning: empty rule at line %d\n",
			line_count);
	    continue;
	}

	/* Thread each symbol into the tree. The next token is fetched before
	   deciding whether this symbol continues or ends the rule. */
	while (token != NULL)
	{
	    node  = add_constit(map_tag(token), c);
	    token = strtok(NULL, term);

	    if (token != NULL)
		c = &(node->cont);
	    else
	    {
		/* Last symbol: record the result of the rule here */
		node->may_end = TRUE;
		node->term    = r->term;
		node->prob    = prob;
	    }
	}
    }
}

/*---------------------------------------------------------------------------
    parser_read_named

    Read parse rules from the file called 'name': the file is opened for
    reading with open_file, passed to parser_read and then closed.
----------------------------------------------------------------------------*/

void parser_read_named(char *name)
{
    FILE *rules = open_file(name, "r");

    parser_read(rules);
    fclose(rules);
}

/*
==============================================================================
Parser.

The parser is implemented as a bottom up chart parser, in which the edges are
recorded as hypotheses on the stack.
*/

/* Data structure for an active edge: consists of a pointer into a rule, and a
pointer back to the previous state of the edge in the form of a hypothesis.
Since rules are commoned up, an active edge may be continued in more than one
way. The structure is allocated by parser_init_hyp and released by
parser_free_hyp.
*/

struct actedge_st
{
    Constit rule;	/* Current state: constituents that may come next */
    SHyp    prev;	/* Previous state; NULL for the first symbol of a rule */
};

/*---------------------------------------------------------------------------
    dump_active

    Print to 'out' the tags of the chain of active edges ending at 'hyp',
    earliest first, each followed by a space. The chain is followed through
    the 'prev' pointer of each edge. Does nothing if 'hyp' is NULL.
----------------------------------------------------------------------------*/

static void dump_active(FILE *out, Hyp hyp)
{
    SHyp prev;

    if (hyp == NULL)
	return;

    /* The first edge of a rule has no previous state (prev is NULL), so the
       pointer must be tested before it is followed. */
    prev = hyp->p.active.rule->prev;
    if (prev != NULL)
	dump_active(out, prev->hyp);

    fprintf(out, "%s ", unmap_tag(hyp->tag));
}

/*---------------------------------------------------------------------------
    dump_rules

    Print every rule continuation reachable from 'c'. The alternatives at one
    level are written between braces and separated by " | "; an alternative
    that may end a rule is followed by its output symbol and probability in
    square brackets, and then by its own continuations, printed recursively.
    An empty list prints nothing.
----------------------------------------------------------------------------*/

static void dump_rules(FILE *out, Constit c)
{
    Constit alt;

    if (c == NULL)
	return;

    fprintf(out, "{");
    for (alt = c ; alt != NULL ; alt = alt->next)
    {
	fprintf(out, "%s", unmap_tag(alt->sym));

	if (alt->may_end)
	    fprintf(out, " [%s:%g]", unmap_tag(alt->term), alt->prob);

	/* A NULL continuation list prints nothing, so no test is needed */
	dump_rules(out, alt->cont);

	if (alt->next != NULL)
	    fprintf(out, " | ");
    }
    fprintf(out, "}");
}

/*---------------------------------------------------------------------------
    parser_dump_edge

    Print a hypothesis representing an active edge: the ids of its start and
    end lexemes, the symbols matched so far, a "*" marking the current
    position, and then all the ways in which the rule may continue.
----------------------------------------------------------------------------*/

void parser_dump_edge(FILE *out, Hyp hyp)
{
    Constit remaining;

    /* Span of the edge */
    fprintf(out, "(%d,%d) ", hyp->lex_start->id, hyp->lex_end->id);

    /* What has been matched so far, then the position marker */
    dump_active(out, hyp);
    fprintf(out, "* ");

    /* Rules are commoned up, so there may be several continuations */
    remaining = hyp->p.active.rule->rule;
    dump_rules(out, remaining);
}

/*---------------------------------------------------------------------------
    parser_init_hyp

    Initialise an active hyp: allocate its active edge record, with no rule
    position and no previous state, and attach it to 'hyp'.
----------------------------------------------------------------------------*/

void parser_init_hyp(Hyp hyp)
{
    ActEdge fresh;

    Allocate(fresh, sizeof(ActEdgeSt), "active edge");
    fresh->prev = NULL;
    fresh->rule = NULL;

    hyp->p.active.rule = fresh;
}

/*---------------------------------------------------------------------------
    parser_free_hyp

    Free an active hyp: releases the active edge record attached to 'hyp'.
    The hypothesis itself is not freed and its pointer to the record is left
    as it was.
----------------------------------------------------------------------------*/

void parser_free_hyp(Hyp hyp)
{
    ActEdge edge = hyp->p.active.rule;

    free(edge);
}

/*---------------------------------------------------------------------------
    build_phrase

    Build a phrase for a completed rule. 'c' is the constituent at which the
    rule ends (it supplies the output symbol and probability) and 'last' is
    the hypothesis that matched it. 'old' is NULL when 'c' is the first symbol
    of the rule; otherwise it is the active edge from which 'c' was taken, and
    the earlier edges are reached through its 'prev' chain.

    A phrasal hypothesis is created and wrapped in a link added to 'queue'.
    One subsumed node is built for 'last' and one for the hypothesis matched
    by each active edge in the chain, working from the end of the phrase back
    to its start. The phrase takes its start from the earliest of these, and
    is linked to the base via link_to_base. It is not attached to the start
    and end lists of any node here. Returns the updated queue.
----------------------------------------------------------------------------*/

static Link build_phrase(Constit c, SHyp last, SHyp old, Link queue)
{
    Hyp  phrase;
    SHyp wrapper;
    SHyp first = last;	/* Leftmost thing subsumed so far */
    SHyp edge;
    Node cur;

    /* The phrasal hypothesis and its shyp; the start is filled in below */
    phrase  = create_hyp(c->term, c->prob, PhraseHyp,
				NULL, last->hyp->lex_end);
    wrapper = create_shyp(phrase, NULL, last->end);

    /* Put it on the queue */
    queue = create_link(queue, TRUE, NULL, wrapper, NULL);

    /* The final subsumed hypothesis closes the phrase */
    cur = build_subsumed_node(NULL, last->hyp);
    phrase->p.phrase.end = cur;
    cur->succ = NULL;

    /* Walk back through the active edges, one node per matched hypothesis */
    for (edge = old ; edge != NULL ; edge = edge->hyp->p.active.rule->prev)
    {
	cur   = build_subsumed_node(cur, edge->hyp->p.active.hyp->hyp);
	first = edge;
    }

    /* 'first' and 'cur' now describe the start of the phrase */
    phrase->lex_start      = first->hyp->lex_start;
    phrase->p.phrase.start = cur;
    cur->pred      = NULL;
    wrapper->start = first->start;
    link_to_base(phrase, wrapper->start);

    return queue;
}


/*---------------------------------------------------------------------------
    advance_on_constit

    Try to advance with the hypothesis 'shyp' over the alternatives listed at
    'c'. 'old' is NULL when starting a rule, and otherwise is the active edge
    that 'c' came from. For each alternative whose symbol equals the tag of
    'shyp': if a rule may end there a phrase is built, and if the rule may
    continue a new active edge is created that spans the same lexemes and
    nodes as 'shyp', points at the continuations, and remembers 'old' and
    'shyp'. Everything created is added to 'queue', which is returned.
----------------------------------------------------------------------------*/

static Link advance_on_constit(SHyp shyp, Constit c, SHyp old, Link queue)
{
    Hyp     matched = shyp->hyp;
    Tag     sym     = TagOf(shyp);
    Constit alt;

    for (alt = c ; alt != NULL ; alt = alt->next)
    {
	Hyp  active;
	SHyp act_s;

	/* Only alternatives with the right symbol are of interest */
	if (alt->sym != sym)
	    continue;

	/* A rule may be completed at this point */
	if (alt->may_end)
	    queue = build_phrase(alt, shyp, old, queue);

	/* Nothing further to do unless the rule can carry on */
	if (alt->cont == NULL)
	    continue;

	/* Make the active edge for the continuation */
	active = create_hyp(sym, 0.0, ActiveHyp,
				matched->lex_start, matched->lex_end);
	act_s  = create_shyp(active, shyp->start, shyp->end);
	link_to_base(active, shyp->start);

	active->p.active.rule->rule = alt->cont;
	active->p.active.rule->prev = old;
	active->p.active.hyp        = shyp;

	/* Queue it */
	queue = create_link(queue, TRUE, NULL, act_s, NULL);
    }

    return queue;
}

/*---------------------------------------------------------------------------
    start_rules

    Try to begin every rule in the list 'rules' with the hypothesis 'hyp', by
    matching it against the first symbols of each rule body with no previous
    edge. Results are added to 'queue', which is returned.
----------------------------------------------------------------------------*/

static Link start_rules(SHyp hyp, Rule rules, Link queue)
{
    Rule r;

    for (r = rules ; r != NULL ; r = r->next)
	queue = advance_on_constit(hyp, r->body, NULL, queue);

    return queue;
}

/*---------------------------------------------------------------------------
    advance_on_edge

    Attempt to advance the parse on the existing active edge 'old' using the
    hypothesis 'hyp': the continuations recorded in the edge are tried, with
    the edge itself given as the previous state. Returns the updated queue.
----------------------------------------------------------------------------*/

static Link advance_on_edge(SHyp hyp, SHyp old, Link queue)
{
    Constit continuations = old->hyp->p.active.rule->rule;

    return advance_on_constit(hyp, continuations, old, queue);
}

/*---------------------------------------------------------------------------
    advance_active_edges

    Attempt to advance any active edges given a hypothesis. The candidate
    edges are those on the 'end' list of the node preceding the start node of
    'hyp'; entries on that list which are not active edges are ignored.
    Returns an updated queue.

    If the start node has no predecessor there is nothing to advance and the
    queue is returned unchanged.
    NOTE(review): whether a node can really have a NULL 'pred' is not visible
    in this file; the test is defensive.
----------------------------------------------------------------------------*/

static Link advance_active_edges(SHyp hyp, Link queue)
{
    Link edge_list;

    /* Without a preceding node there can be no edges to continue */
    if (hyp->start->pred == NULL)
	return queue;

    for (edge_list = hyp->start->pred->end ; edge_list ;
	 edge_list = edge_list->next)
    {
	/* Only look at active edges */
	SHyp edge = edge_list->u.shyp;

	if (edge->hyp->type == ActiveHyp)
	    queue = advance_on_edge(hyp, edge, queue);
    }

    return queue;
}

/*---------------------------------------------------------------------------
    parser_advance

    Implements the top level of the chart parser, given a node. The list of
    hypotheses starting at the node is taken as an initial queue for the
    algorithm, which proceeds by repeatedly taking the head off the queue,
    starting any new rules and advancing any existing one in the context
    provided by it, resulting in a new queue. Not all members of the queue
    will span the same thing: for lexical hypotheses in the queue, we can find
    the preceding context from the predecessor of the node. A new list is
    formed from lexical edges when they have been processed. Completed phrases
    are pasted into the lists for the appropriate nodes. Active edges are also
    added with the appropriate span.

    If trans is non-NULL, phrases are tagged once they have been created.

    node   Node whose 'start' list supplies the initial queue. The list is
	   detached from the node; each entry is re-inserted on the 'start'
	   list of its own start node once it has been processed.
    trans  Passed to tag_phrase for each phrase; NULL suppresses tagging.
    new    Passed to tag_phrase unchanged; not otherwise used here.
----------------------------------------------------------------------------*/

void parser_advance(Node node, Trans *trans, Trans *new)
{
    /* Create the initial queue */
    Link queue = node->start;

    /* Detach the queue from the node */
    node->start = NULL;

    /* Process the queue. Phrases and active edges created while handling one
       entry are added to the queue by start_rules and advance_active_edges,
       so they are processed by later iterations of this loop. */
    while (queue != NULL)
    {
	/* Take the head off the queue */
	Link head = queue;
	SHyp shyp = head->u.shyp;
	Hyp  hyp  = shyp->hyp;
	queue = queue->next;
	head->next = NULL;	/* head is now a free-standing link */

	/* Do nothing with active edges */
	if (hyp->type != ActiveHyp)
	{
	    /* Start any rules we can with the edge */
	    queue = start_rules(shyp, rule_head, queue);

	    /* Advance any active edges we can */
	    queue = advance_active_edges(shyp, queue);
	}

	/* Insert head into its final resting place */
	/* All hyps connect to their start node */
	insert_link(&(shyp->start->start), head);

	/* Phrases and active edges also go on the 'end' list of their end
	   node. The result of create_link is not used in these two cases; it
	   is given the address of the list head as its last argument.
	   NOTE(review): presumably create_link updates the list through that
	   pointer - confirm against its definition. Any hypothesis type not
	   listed in the switch gets no further treatment. */
	switch (hyp->type)
	{
	    case LexHyp:	/* No more to do */
		break;
	    case PhraseHyp:
	    {
		/* Attach across the right places. We already have one link
		   created, but must make a second one for the end link */
		Link *end = &(shyp->end->end);
		create_link(*end, TRUE, NULL, shyp, end);

		/* Now tag the phrase if asked to */
		if (trans != NULL) tag_phrase(hyp, trans, new);
		break;
	    }
	    case ActiveHyp:
	    {
		/* Attach to the node. Need to create one link */
		Link *end = &(shyp->end->end);
		create_link(*end, TRUE, NULL, shyp, end);
		break;
	    }
	}
    }
}


/*
==============================================================================
Parser interface to labeller.
*/

/*---------------------------------------------------------------------------
    parser_any_active

    Reports whether any of the hypotheses visited by forHypothesesBack for
    the given node is an active edge: TRUE if one is found, FALSE if not.
    (Previously the result was the other way round, contrary to the name.)
----------------------------------------------------------------------------*/

static BOOL parser_any_active(Node n)
{
    Link link;
    SHyp hyp;

    forHypothesesBack(n, hyp, link)
	if (TypeOf(hyp) == ActiveHyp) return TRUE;
    return FALSE;
}

/*---------------------------------------------------------------------------
    parser_may_tag

    Tells the tagger if labelling is permitted. Returns TRUE only when no
    active edge is found at the node of 'top', and FALSE while any remain.
----------------------------------------------------------------------------*/

BOOL parser_may_tag(Lexeme top)
{
    /* Spelt out with TRUE/FALSE rather than '!' so the result is always one
       of the project's own BOOL values. */
    return parser_any_active(top->node) ? FALSE : TRUE;
}
