/* Program to take lancpars format corpus and remove all phrase tags except
   those from a specified set, given by the first argument on the command line.
   Use the standard map stuff for getting the tags list.

   Principle:
   Ignore anything occurring outside a phrase
   Phrases are bracketed [tag ... tag]
   Phrasal tags are terminated with space or '['
   Words have tags following underscore.
*/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include "common.h"
#include "map.h"

#define MaxBuff (255)
#define PhraseStart '['
#define PhraseEnd   ']'
#define TagSep      '_'

/*-----------------------------------------------------------------------------
    skip_white

    Skip white space, returning non white character or EOF
-----------------------------------------------------------------------------*/

static int skip_white(FILE *in)
{
    /* Consume characters from `in` until one is not white space (per
       isspace) or the stream is exhausted; that character, or EOF, is the
       result. The returned character has been consumed from the stream. */
    int c;

    do
    {
        c = fgetc(in);
    } while (c != EOF && isspace(c));

    return c;
}

/*-----------------------------------------------------------------------------
    get_token

    Fetch a token, defined as [tag, tag], word or word_tag. Sets *tag to point
    at the character after _ if present, else NULL. Returns FALSE at EOF; the
    token may still be non-empty in that case.
-----------------------------------------------------------------------------*/

static BOOL get_token(FILE *in, uchar *buffer, int max, uchar **tag)
{
    /* Reads one token from `in` into `buffer` (capacity `max` bytes,
       including the terminating NUL).

       A token is one of:
         [tag      - phrase start; ended by white space or a following '['
         tag]      - phrase end; the ']' is kept as the last character
         word      - plain word, ended by white space or a following '['
         word_tag  - as word; *tag is set to the character after the '_'

       *tag is NULL when the token contains no '_'.
       Returns TRUE when a token was terminated normally, FALSE when end of
       file was reached; in the FALSE case `buffer` may still hold a final,
       non-empty token that the caller should process.
       Malformed input and over-long tokens are reported on stderr followed
       by get_out() (declared elsewhere; presumably it does not return). */
    int ch = skip_white(in);
    int i = 0;
    BOOL start = FALSE;

    buffer[0] = 0;
    *tag  = NULL;

    for (;;)
    {
        /* Test for EOF BEFORE storing the character: EOF is not data and
           must neither end up in the token nor count towards the overflow
           limit. */
        if (ch == EOF)
        {
            buffer[i] = 0;
            return FALSE;
        }

        buffer[i++] = (uchar)ch;
        if (i == max)
        {
            buffer[i-1] = 0;
            fprintf(stderr, "Buffer overflow at '%s'\n", buffer);
            get_out();
        }

        if (ch == PhraseStart)
        {
            if (i == 1)
            {
                /* '[' as the first character: this is a phrase-start token */
                start = TRUE;
            }
            else
            {
                /* '[' directly after another token: finish the current
                   token and leave the '[' for the next call */
                buffer[--i] = 0;
                ungetc(ch, in);
                return TRUE;
            }
        }
        else if (ch == PhraseEnd)
        {
            /* ']' may only close a bare tag, not "[tag" or "word_tag" */
            if (start || *tag != NULL)
            {
                buffer[i] = 0;
                fprintf(stderr, "Unexpected phrase end in '%s'\n", buffer);
                get_out();
            }
            else
            {
                buffer[i] = 0;
                return TRUE;
            }
        }
        else if (ch == TagSep)
        {
            /* Only one '_' per token, and none inside a "[tag" */
            if (start || *tag != NULL)
            {
                buffer[i] = 0;
                fprintf(stderr, "Unexpected tag separator in '%s'\n", buffer);
                get_out();
            }
            else
            {
                *tag = buffer+i;
            }
        }
        else if (isspace(ch))
        {
            /* White space ends the token and is not part of it */
            buffer[--i] = 0;
            return TRUE;
        }

        ch = fgetc(in);
    }

    return TRUE; /* Dummy to shut the compiler up */
}

/*-----------------------------------------------------------------------------
    output_token

    Output the token, with line breaking.
-----------------------------------------------------------------------------*/

#define MaxLine (79)

static void output_token(FILE *out, uchar *token, BOOL force_line_break)
{
    /* Writes `token` to `out`, preceded by a newline when `force_line_break`
       is set or when the token would not fit on the current line, and by a
       single space otherwise.

       line_length is the number of columns already written on the current
       output line. It persists across calls, so the function tracks a single
       output stream. The separating space is included in both the fit test
       and the running count; without it lines could exceed MaxLine columns.
       The very first token is still preceded by a separator, as before. */
    static int line_length = 0;
    int len = (int)strlen((const char *)token);

    if (force_line_break || line_length + 1 + len > MaxLine)
    {
        fprintf(out, "\n%s", token);
        line_length = len;
    }
    else
    {
        fprintf(out, " %s", token);
        line_length += len + 1;   /* token plus its leading space */
    }
}

/*-----------------------------------------------------------------------------
    scan_corpus

    Read a corpus and build phrases, which we then dump.
-----------------------------------------------------------------------------*/

static void scan_corpus(FILE *in, FILE *out)
{
    /* Copies the corpus on `in` to `out` one token at a time. Phrase-start
       ("[tag") and phrase-end ("tag]") tokens are written only when
       map_tag_quiet() recognises the tag (result other than NOTAG); every
       other non-empty token is written unchanged. Accepted phrase starts
       always begin a new output line. A final newline closes the output. */
    BOOL more = TRUE;

    while (more)
    {
        uchar token[MaxBuff], *tag;
        int len;

        /* The token fetched together with the end-of-file indication is
           still processed below before the loop ends. */
        more = get_token(in, token, MaxBuff, &tag);
        len = (int)strlen((const char *)token);
        if (len == 0)
        {
            continue;
        }

        if (token[0] == PhraseStart)
        {
            /* Look the tag up without its leading '[' */
            if (map_tag_quiet(token+1) != NOTAG)
            {
                output_token(out, token, TRUE);
            }
        }
        else if (token[len-1] == PhraseEnd)
        {
            uchar *last = &token[len-1];

            /* Strip the ']' for the lookup; put it back only if we keep it */
            *last = 0;
            if (map_tag_quiet(token) != NOTAG)
            {
                *last = PhraseEnd;
                output_token(out, token, FALSE);
            }
        }
        else
        {
            output_token(out, token, FALSE);
        }
    }

    fprintf(out, "\n");
}

/*-----------------------------------------------------------------------------
    main
-----------------------------------------------------------------------------*/

int main(int argc, char *argv[])
{
    InitOptions;

    /* The tag file argument is mandatory */
    if (argc < 2)
    {
	fprintf(stderr, "Usage: dephrase tag-file\n");
	exit(1);
    }

    /* Read the mapping named by the first argument, then filter standard
       input to standard output */
    read_mapping(argv[1]);
    scan_corpus(stdin, stdout);

    return 0;
}
