(*  Title: 	parse
    Author: 	Lawrence C Paulson, Cambridge University Computer Laboratory
    Copyright   1986  University of Cambridge
*)

(*Standard parsing and lexical analysis.
  Implement prefix operators with precedence?  (example: negation)
*)


(*lexical errors such as ill-formed identifiers*)
exception lexerr : string * string list;


fun is_letter ch =
  (ord"A" <= ord ch)  andalso  (ord ch <= ord"Z")   orelse
  (ord"a" <= ord ch)  andalso  (ord ch <= ord"z");

fun is_digit ch =
  (ord"0" <= ord ch)  andalso  (ord ch <= ord"9");

(*letter or _ or prime (') *)
fun is_quasi_letter "_" = true
  | is_quasi_letter "'" = true
  | is_quasi_letter ch  = is_letter ch;


(*A string of letters or ' _ but no digits!
  Identifiers may not contain digits because trailing digits indicate
    an offset to a variable or param name*)
val scan_ident : string list -> string * string list = fn
    []  =>  raise lexerr with ("Identifier expected", [])
  | chs as (ch::chs') =>  
      if  is_letter ch  
      then
        let val (ics,tail) = take_prefix is_quasi_letter chs'
	in  (implode(ch::ics), tail)  end
      else  raise lexerr with ("Identifier expected", chs);

(*scan a numeral of the given radix, normally 10*)
fun scan_radixint (radix: int, zerochar: string, chs) : int * string list =
  let val zero = ord zerochar;
      val limit = chr (zero+radix);
      fun scan (all as (num,count,chs)) = case chs of
      	    ch::chs' =>
	      if  zerochar<=ch  andalso  ch<limit 
	      then scan(radix*num + ord ch - zero, count+1, chs')
	      else all
	  | [] => all
  in case scan (0,0,chs) of
      (_,0,_) => raise lexerr with  (*no digits found*)
         	("Radix "  ^  string_of_int radix  ^  " number expected", chs)
    | (num,_,chs') => (num,chs')  
  end;

(*scan a decimal numeral*)
fun scan_decimal chs : int * string list = scan_radixint(10,"0",chs);

(*indexname ::=   identifier  |  identifier number*)
fun scan_offset chs = case chs of
    (ch::_) => if is_digit ch then  scan_decimal chs  else  (0,chs)
  | []      => (0,chs);


(*special characters allowed in keywords*)
val is_special : string -> bool = fn
     "!" => true  |  "#" => true
  |  "$" => true  |  "%" => true
  |  "&" => true  |  "*" => true
  |  "+" => true  |  "-" => true
  |  "<" => true  |  ">" => true
  |  "/" => true  |  "\\"=> true
  |  "^" => true  |  ":" => true
  |  "." => true  |  ";" => true
  |  "~" => true  |  "@" => true
  |  "=" => true  |  "|" => true
  |  "{" => true  |  "}" => true
  |  "`" => true  |  _  => false;


(*white space: blanks, tabs, newlines*)
val is_blank : string -> bool = fn
     " " => true  |  "\t" => true  |  "\n" => true  |  _ => false;


(*Infix operators with (left precedence, right precedence)
  Also constants, parameters, boundvars, variables, keywords*)
datatype lexsymbol =
    ConstSy  of arity
  | InfixSy  of arity * int * int
  | IdentSy  of int * arity
  | VarSy   of int * arity
  | DelimSy 
  | OtherSy; 


(*a string paired with the symbol scanned from it*)
type lexeme = string * lexsymbol;


(*Lexical analysis maps the characters, a string list, to a lexeme list.
  Symbol tables are used during lexing in order to simplify the parser. *)
fun get_lexemes (lextab, chs: string list)  : lexeme list = 
  let fun idfix s = 
  	(s,   case slookup(lextab,s) of
		  None => OtherSy  |  Some ld => ld);
      (*symbol could be Const, Infix, Keyword, Ident, Other*)
      fun scan_atom chs : lexeme * string list =
	let val (s,chs2) = scan_ident chs
	in  case slookup(lextab,s) of
	      None    => ((s,OtherSy), chs2)
	    | Some (IdentSy(_,ary)) => 
		let val (idx, chs3) = scan_offset chs2
		in  ((s, IdentSy(idx,ary)), chs3)  end
	    | Some ld =>  ((s,ld), chs2)
	end;
      fun getlex (lxs, chs) = case chs of
	   [] => rev lxs
          (*lambda symbol*)
	| "%"::"("::chs' => getlex(("%(",DelimSy) :: lxs, chs')
	| "?"::chs' => (*variables*)
	   let val (s, chs2) = scan_ident chs';
	       val (idx, chs3) = scan_offset chs2
	   in  case slookup(lextab, s) of
	          Some (IdentSy(0,ary)) => 
		      getlex ((s, VarSy(idx,ary)) :: lxs, chs3)  
		| _ => raise lexerr with ("Unknown variable", chs)
 	   end
	  (*standard delimeters*)
	| "["::chs' => getlex(("[",DelimSy) ::lxs, chs')
	| "]"::chs' => getlex(("]",DelimSy) ::lxs, chs')
	| "("::chs' => getlex(("(",DelimSy) ::lxs, chs')
	| ")"::chs' => getlex((")",DelimSy) ::lxs, chs')
	| ","::chs' => getlex((",",DelimSy)  ::lxs, chs')
	| ch ::_  => 
	   if is_letter ch		(*alphanumeric identifier/infix*)
	   then let val (lx,chs') = scan_atom chs in getlex (lx::lxs, chs')
		end
	   else if is_special ch	(*symbolic identifier/infix*)
	   then let val (scs,chs2) = take_prefix is_special chs
		in  getlex (idfix(implode scs) :: lxs, chs2)  end
	   else if is_digit ch		(*number*)
	   then let val (digs,chs2) = take_prefix is_digit chs
		in  getlex (idfix(implode digs) :: lxs, chs2)  end
	   else  if is_blank ch  
	   then  getlex (lxs, strip_prefix is_blank chs) 
	   else  raise lexerr with ("Illegal character", chs)
  in  getlex ([],chs)  end;


fun lex_string (lextab, s) : lexeme list = get_lexemes (lextab, explode s);

(*Get a constant symbol, possibly infix, from a lextab*)
fun get_const lextab s : term =
  case slookup(lextab, s) of
      Some (ConstSy ary) => Const(s,ary)
    | Some (InfixSy (ary,_,_)) => Const(s,ary)
    | _ => raise term_error with ("get_const: "^s, []);


(*syntax errors*)
exception parserr : string * lexeme list;

(*sequential parsing, accumulating results*)
infix thenp;
fun (parsefn1 thenp parsefn2) lxs =
  let val (x1,lxs1) = parsefn1 lxs;
      val (x2,lxs2) = parsefn2 lxs1
  in  ( (x1,x2), lxs2)  end;

(*parse a phrase then skip the given keyword*)
infix 7 thenkey;
fun (parsefn thenkey s) lxs =
  let val (x,lxs1) = parsefn lxs
  in  case lxs1 of
      (s',DelimSy) :: lxs2 =>
	  if s=s' then (x,lxs2)
	  else raise parserr with ("Symbol "^s^" expected", lxs1)
      | _ =>   raise parserr with ("Symbol "^s^" expected", lxs1)
  end;


(*Parse a phrase repeatedly (0 or more times), using parse function,
  so long as the first symbol is the delimeter key
  Returns the list of results  *)
fun parepeat (key: string, parse) lxs : ('a list * lexeme list) = case lxs of
      ((key',DelimSy) :: lxs') =>    
	  if key=key' then
	    let val (x,lxs2) = parse lxs';
		val (xs, lxs3) = parepeat (key, parse) lxs2
	    in  (x::xs,lxs3)  end
	  else ([], lxs)
    | _ => ([], lxs);


(*parses    <phrase> key ... key <phrase>    *)
fun parepin (key,parse) lxs =
  let val (x,lxs') = parse lxs;
      val (xs, lxs2) = parepeat (key, parse) lxs'
  in  (x::xs,lxs2)  end;


(*check that no lexical symbols remain*)
fun parse_end (x, []) = x
  | parse_end (_,lxs) = raise parserr with ("Leftover text", lxs);


(*Parse a bound predeclared variable: check next symbol.
  The "index" part of a bound variable is converted to a string.*)
fun parse_bound ((s, IdentSy(idx,ary)) :: lxs) =
      ((string_of_xname(s,idx),ary), lxs)
  | parse_bound lxs = raise parserr with("Variable name expected", lxs);


(*Search for s among the bound variables in bnids.
  If not there, it must be a parameter.*)
fun check_bound (s: string, idx: int, ary, bnids: string list) : term =
  let val name = string_of_xname(s,idx);
      fun chkid (_, []) = Param((s,idx), ary)
      |   chkid (bno, bnid::bnids) =
	     if bnid=name then  Bound bno  else  chkid(bno+1,bnids)
  in  chkid(0,bnids)  end;



(*Functional for writing parsers
  prec is the precedence of the operator to the left;
    parsing stops at an operator with lower (= weaker) precedence
    so that a*b+c is parsed as (a*b)+c. *)
fun genparse parsetm (prec,boundids) : lexeme list -> term * lexeme list =
  let fun parse_atom lxs0 = case lxs0 of
	  (s, ConstSy ary) :: lxs => (Const(s,ary), lxs)
	| (s, IdentSy(idx,ary)) :: lxs =>
	     (check_bound (s,idx,ary,boundids), lxs)
	| (s, VarSy(idx,ary)) :: lxs => (Var((s,idx), ary) , lxs)
	| ("(",DelimSy) :: (s, InfixSy(ary,_,_)) :: (")",DelimSy) :: lxs =>
	    (Const(s,ary), lxs)
	| ("(",DelimSy) :: lxs => (recparse thenkey ")") lxs
	| lxs => parsetm boundids lxs
      and recparse lxs = genparse parsetm (0,boundids) lxs
      and parse_combtail (tm,lxs) = case lxs of
	  ("(",DelimSy)::lxs' =>
	    let val (args, lxs2) = 
		    (parepin (",", recparse) thenkey ")") lxs'
	    in  parse_combtail (list_comb(tm, args), lxs2)  end
	| (s, InfixSy(ary,lp,rp)) :: lxs' =>
	    if lp<prec then  (tm,lxs)
	    else
	      let val (tm2,lxs2) = genparse parsetm (rp,boundids) lxs'
	      in  parse_combtail (Const(s,ary) $ tm $ tm2, lxs2)
	      end
	| _ => (tm,lxs)
  in fn
      ("%(",DelimSy) :: lxs =>
	let val (bns,lxs') = 
	 	(parepin (",", parse_bound) thenkey ")") lxs;
	    val boundids' = rev (map fst bns);
	    val (body, lxs2) = genparse parsetm (0, boundids'@boundids) lxs'
	in  case is_sdistinct boundids' of
	      [] => (list_abs (bns, body), lxs2)
	    | s::_ => raise parserr with
	    		 ("Repeated bound variable: "^s, lxs)
	end
    | lxs =>  parse_combtail (parse_atom lxs)
  end;


(*conversion from lexemes to variable list*)
fun uvars_of_lexemes [] = [] : term list
  | uvars_of_lexemes ( (s, VarSy(idx,ary)) :: lxs) =
      Var((s,idx), ary) :: uvars_of_lexemes lxs
  | uvars_of_lexemes lxs = 
      raise parserr with ("Only scheme variables allowed", lxs);


(*Dependency declaration ::= identifier ">" var,...,var *)
fun read_deps (lextab : lexsymbol symbol_table) s : indexname * term list = 
  case  lex_string (lextab,s) of
    (pname,IdentSy(idx,_)) :: (">",_) :: lxs =>
        ((pname,0), uvars_of_lexemes lxs)
  | lxs =>  raise parserr with ("Illegal dependency declaration", lxs);

