(*  Title: 	Lexicon
    Author: 	Tobias Nipkow
*)

signature LEXICON0 =
sig
  val is_identifier: string -> bool
  val scan_varname: string list -> indexname * string list
  val string_of_vname: indexname -> string
end;

signature LEXICON =
sig
  type Lexicon
  datatype Token = Token of int
		 | IdentSy of string
		 | VarSy of string * int
		 | TFreeSy of string
		 | TVarSy of string * int
		 | end_token;
  val mk_lexicon: string list -> Lexicon
  val tokenize: Lexicon -> string -> Token list
  val token_to_string: Lexicon -> Token -> string
  val matching_tokens: Token * Token -> bool
  val valued_token: Token -> bool
  val predef_term: string -> Token
  type 'b TokenMap
  val mkTokenMap: ('b * Token list) list -> 'b TokenMap
  val applyTokenMap: 'b TokenMap * Token -> 'b list
  include LEXICON0
end;

functor LexiconFun(Extension:EXTENSION) : LEXICON =
struct

datatype Token = Token of int
	       | IdentSy of string
	       | VarSy of string * int
	       | TFreeSy of string
	       | TVarSy of string * int
	       | end_token;
val no_token = ~1;
val first_token = 0;

datatype State = State of string * int * (State list);
type DFA = State list;
type TokenTab = string list;
datatype Lexicon = Lexicon of {DFA:DFA, KeyWds:int list, TokenTab:TokenTab};

fun mk_lexicon(sl:string list) : Lexicon =
    let fun part (c,sll) = let val (sll1,sll2) =
                                   partition ((apl(c,op=)) o hd) sll;
                           in (map tl sll1, sll2) end;

        fun mk([]:string list list,_:int):DFA = [] |
            mk([c]::sll,i) = let val (cl,ncl) = part(c,sll)
                             in State(c,i,mk(cl,i+1))::
                                mk(ncl,i+length(cl)+1) end |
            mk((c::sl)::sll,i) = let val (cl,ncl) = part(c,sll)
                                 in State(c,no_token,mk(sl::cl,i))::
                                    mk(ncl,i+length(cl)+1) end;

        fun is_id(c::cs) = is_letter(c) andalso forall is_quasi_letter cs;
        fun mk_kw([],_) = [] |
            mk_kw(s::sl,i) = (if is_id(explode s) then [i] else [])
                             @ mk_kw(sl,i+1);

        val ds = distinct(sort(op<)sl);
    in Lexicon {DFA = mk(map explode ds,first_token),
                KeyWds = mk_kw(ds,first_token), TokenTab = ds} end;

fun next_state(dfa:DFA,c:string): State option =
    let fun next [] = None |
            next ((st as State(c',_,_))::stl) =
                 if c=c' then Some(st) else next stl
    in next dfa end;

exception ID of string * string list;

val eof_id = "End of input - identifier expected.\n";

fun is_qld c = is_quasi_letter c orelse is_digit c;

fun is_identifier s = (case explode s of
	  [] => false
	| c::cs => is_letter(c) andalso forall is_qld cs);

(*A string of letters, digits, or ' _ *)
fun xscan_ident exn =
let fun scan []  =  raise exn(eof_id,[])
      | scan(c::cs) =  
	if  is_letter c
	then let val (ds,tail) = take_prefix is_qld cs
	     in  (implode(c::ds), tail)  end
	else raise exn("Identifier expected: ", c::cs)
in scan end;

(*Scan the offset of a Var, if present; otherwise ~1 *)
fun scan_offset cs = case cs of
    ("."::[]) => (~1,cs)
  | ("."::(ds as c::cs')) => if is_digit c then scan_int ds else (~1,cs)
  | _ => (~1,cs);

fun split_varname s =
    let val (rpost,rpref) = take_prefix is_digit (rev(explode s))
        val (i,_) = scan_int(rev rpost)
    in (implode(rev rpref), i) end;

fun xscan_varname exn cs : (string*int) * string list =
let val (a, ds) = xscan_ident exn cs;
    val (i, es) = scan_offset ds
in if i = ~1 then (split_varname a, es) else ((a,i), es) end;

fun scan_varname s = xscan_varname ID s
	handle ID(err,cs) => error(err^(implode cs));

fun isin_sorted_list (i:int) =
    let fun isin(j::l) = if i<j then false else
                         if i=j then true else isin l |
            isin([]) = false
    in isin end;

fun tokenize (Lexicon{DFA=dfa,KeyWds=kw,...}) (s:string) : Token list =
let exception LEX_ERR;
    exception FAIL of string * string list;
    val lexerr = "Lexical error: ";

    fun tokenize1 (_:DFA,[]:string list) : (Token * (string list)) =
	raise LEX_ERR
      | tokenize1(dfa,c::sl) =
	case next_state(dfa,c) of
	  None => raise LEX_ERR
	| Some(State(_,t,dfa')) =>
	  if t=no_token then tokenize1(dfa',sl)
	  else (tokenize1(dfa',sl) handle LEX_ERR =>
		if isin_sorted_list t kw andalso not(null sl) andalso
		   is_quasi_letter(hd sl)
		then raise LEX_ERR else (Token(t),sl));

    fun token(cs) = tokenize1(dfa,cs) handle LEX_ERR =>	raise FAIL(lexerr,cs);

    fun id([]) = raise FAIL(eof_id,[])
      | id(cs as c::cs') =
	if is_letter(c)
	then let val (id,cs'') = xscan_ident FAIL cs
	     in (IdentSy(id), cs'') end
	else
	if c = "?"
	then case cs' of
		"'"::xs => let val ((a,i),ys) = xscan_varname FAIL xs
			   in (TVarSy("'"^a,i), ys) end
	     | _ => let val ((a,i),ys) = xscan_varname FAIL cs'
		    in (VarSy(a,i), ys) end
	else
	if c = "'"
	then let val (a,cs'') = xscan_ident FAIL cs'
	     in (TFreeSy("'" ^ a), cs'') end
	else raise FAIL(lexerr,cs);

    fun tknize([],ts) = rev(ts)
      | tknize(cs as c::cs',ts) =
	if is_blank(c) then tknize(cs',ts) else
	let val (t,cs'') =
		if c="?" then id(cs) handle FAIL _ => token(cs)
		else (token(cs) handle FAIL _ => id(cs))
	in tknize(cs'',t::ts) end

in tknize(explode s,[]) handle FAIL(s,cs) => error(s^(implode cs)) end;

fun string_of_vname (a,idx) = "?" ^ (if is_digit(hd(rev(explode a)))
	then a ^ "." ^ string_of_int idx
	else if idx = 0 then a else a ^ string_of_int idx);

fun token_to_string (Lexicon{TokenTab=tt,...}) (Token(i):Token) : string =
      nth_elem(i,tt)
  | token_to_string _ (IdentSy(s)) = s
  | token_to_string _ (VarSy v) = string_of_vname v
  | token_to_string _ (TFreeSy a) = a
  | token_to_string _ (TVarSy v) = string_of_vname v
  | token_to_string _ end_token = "\n";

fun matching_tokens(Token(i),Token(j)) = (i=j) |
    matching_tokens(IdentSy(_),IdentSy(_)) = true |
    matching_tokens(VarSy _,VarSy _) = true |
    matching_tokens(TFreeSy _,TFreeSy _) = true |
    matching_tokens(TVarSy _,TVarSy _) = true |
    matching_tokens(end_token,end_token) = true |
    matching_tokens(_,_) = false;

fun valued_token(end_token) = false
  | valued_token(Token _) = false
  | valued_token(IdentSy _) = true
  | valued_token(VarSy _) = true
  | valued_token(TFreeSy _) = true
  | valued_token(TVarSy _) = true;

fun predef_term name =
      if name = Extension.ID then IdentSy(name) else
      if name = Extension.VAR then VarSy(name,0) else
      if name = Extension.TFREE then TFreeSy(name) else
      if name = Extension.TVAR then TVarSy(name,0) else
      end_token;

type 'b TokenMap = (int * 'b list) list * 'b list;

fun int_of_token(Token(tk)) = tk |
    int_of_token(IdentSy _) = first_token - 1 |
    int_of_token(VarSy _) = first_token - 2 |
    int_of_token(TFreeSy _) = first_token - 3 |
    int_of_token(TVarSy _) = first_token - 4 |
    int_of_token(end_token) = first_token - 5;

fun mkTokenMap(atll) =
    let val aill = map (fn(a,tl)=>(a,map int_of_token tl)) atll;
        val dom = sort op< (distinct(flat(map snd aill)));
        val mt = map fst (filter (null o snd) atll);
        fun mktm(i) =
            let fun add(l,(a,il)) = if i mem il then a::l else l
            in (i,foldl add ([],aill)) end;
    in (map mktm dom,mt) end;

fun find_al (i:int) =
    let fun find((j,al)::l) = if i<j then [] else
                              if i=j then al else find l |
            find [] = [];
    in find end;
fun applyTokenMap((l,mt),tk) = mt@(find_al (int_of_token tk) l);

end;
