;;; version of learner that uses bracketing information.

;;; Parse a sustr.trees1 style file, and do usual, except 
;;; operating over n-best parses (as defined in terms of
;;; tree similarity between candidates parses and bench parse).

;;; also can continue with lots of unannotated corpora.

;;; definition of a model.
(defstruct model prior likelihood rules parse) 

(defvar *bench-parse* nil) ;;; holds current bench parse
(defvar *bracketing* t) ;;; true to use bracketing information.
(defvar *random-selection* nil) ;;; true to randomly select a best matching tree
(defvar *right-branching* nil) ;;; true to pick parse with most right headed rules


(defun bracket-run (bracketing)

;;; bracketing is sustr.trees1 format file.
;;; train learner on brackting info.  

  (with-open-file 
   (in bracketing :direction :input)
   (loop
    (let ((sentence (read in nil 'eof nil))
	  (tree (read in nil 'eof nil)))

      (if (equal sentence 'eof)
	  (progn
	    (return t))
	(progn	  
	  (setf *bench-parse* tree)
	  (setf *previous-sentence sentence)	  
	  (invoke-parser sentence nil)))))))

		  
(defun calc-posterior (raw-parses)
  
;;; we've now produced n-best parses (using JAC routine), so
;;; calculate posterior of each parse and pick best parse.
;;; From best parse, add rules to table to update likelihood prob
;;; and prior.

;;; uses Eirik's treematcher

  (let ((models nil)
	(filter-parses nil)
	(pairs nil)
	(parses (rank-parses raw-parses)) ;;; here
	(r-table (make-hash-table :test #'equal)))
   
    (format t "Originally ~A Ranked ~A ~%" (length raw-parses)
	    (length parses))
    (dolist (p parses)	    
	    (let ((model (make-model :rules 
				     (get-nonterms-from-tree p)
				     :parse p)))
	      (push model models)))
	    
    (dolist (rr models pairs)
	    (let* ((likelihood-rules (model-rules rr))
		   (rules (add-counts-lite  likelihood-rules)))
	      (let ((likelihood 0.0)
		    (temp nil)
		    (post 0.0)
		    (prior 0.0))
		(setf likelihood (rule-likelihood rules))
		(dolist (p *parse-queue*)		      
		      (setf likelihood (+
					likelihood 
					(rule-likelihood p))))		
		(when (and *parameter-prior* *structure-prior*)
		      (setf prior (* 
				   (prior-weight *total-parsed*)
				   (+ prior (encode-dictionary 
					     *parse-queue*
					     rules)))))
		(format t "Prior: ~A Likelihood: ~A ~%" prior likelihood)
		;posterior, prior weighted
		(setf (model-prior rr) prior)
		(setf (model-likelihood rr) likelihood)
		(push rr pairs)	      
		(minus-counts-lite  likelihood-rules))))))


(defun get-learnt-rules (likerules)
  (let ((res nil))
    (dolist (r likerules res)
	    (let ((rule (fetch-rule-entry (car r))))
	      (when rule
		    (push rule res))))))


(defun add-counts-lite (likelihood-rules)

;;; updates  counts of just learnt rules, inc daughter freqs

;;; returns all rules (including given ones)

  (let ((learnt nil))
    (dolist (r-d likelihood-rules learnt)
	    (setf *freq-total* (+ 1 *freq-total*))
	    (let* ((r (car r-d))
		   (d (cadr r-d))
		   (entry2 (fetch-rule-entry r))
		   (dfreqs (rule-daughters entry2)))

	      (when (null entry2) (error "no rule ~A" r))

	      (push entry2 learnt)
	      (when (rule-body entry2) 
		    ;(push entry2 learnt)
		    (setf (svref *rule-denom* (rule-mother entry2))
			  (1+ (svref *rule-denom* (rule-mother entry2))))    
		    
		    (let ((temp nil))
		      (dolist (dd dfreqs) 
			      (when (null d) (return))
			      (push (add-daughter (car d) dd) temp)
			      (pop d))

		      (when d
		;;;okay, kleene op has added extra stuff
			    (dolist (dd d) 
				    (push (add-daughter dd nil) temp)))
		      (setf (rule-daughters entry2) 
			    (reverse temp)))
		    (setf (rule-freq entry2) (+ 1 (rule-freq entry2))))))))


(defun rank-parses (raw-parses)
;;; out of the possible derivations, select some subset of the
;;; n-equal best.  The default is to let model selection 
;;; decide.  

  (let ((res nil))
    (dolist (p raw-parses)
	    (push (get-rule-labelling-from-parse-tree (cdr p))
		  res))

    ;;; if we don't have a treebank parse, fake it.
    (unless *bracketing*
	    (setf *bench-parse*
		  (cons 'x *previous-sentence)))

    (let ((res2
	   (find-most-similar-trees res *bench-parse*)))

      (cond
       ((not *bracketing*) res) ;;; ie if not using brackting, return all.
      (*random-selection*
       (list (nth (random (length res2)) res2)))
      (*right-branching*
       (list (pick-right-branching res2)))
      (t res2)))))

(defun pick-right-branching (parses)
  (if (null parses)
      (return-from pick-right-branching nil))

  (let ((count 0) (parse (car parses)))
    (dolist (p (cdr parses) parse)
	    (let ((count-temp (count-right p)))
		  (when (> count-temp count)
			(setf count count-temp)
			(setf parse p))))))

(defun count-right (tree)
  ;;; calc percentage of learnt rules that are right branching
  (let ((rules (get-l-rules tree)) (l-count 0) (r-count 0))
    (dolist (r rules (/ r-count (+ r-count l-count))) 
	    (let ((head (subseq r 0 (- (position #\: r) 1)))
		  (d1 (subseq r (+ 1 (position #\[ r))
			      (- (position #\- r) 1))))
	      (if (string= head d1)
		  (setf l-count (+ 1 l-count))
		(setf r-count (+ 1 r-count)))))))
		  
  
(defun get-l-rules (tree)
  (cond
   ((null tree) nil)
   ((stringp (car tree))
    (let ((res2 (fetch-rule-entry (car tree))))
      (if (and res2 (rule-body res2))
	  (cons (car tree)
		(get-l-rules (cdr tree)))
	(get-l-rules (cdr tree)))))
   ((atom (car tree)) (get-l-rules (cdr tree)))
   (t
    (let ((res2 (get-l-rules (car tree)))
	  (res3 (get-l-rules (cdr tree))))
      (append res2 res3)))))
	 
(defun corpus-tree-constituents (tree)
      (extract-constituents
       tree 0 #'(lambda (x) (atom x))))

;;; Parse/corpus tree matching
;;; ==========================
;;; The default function for parse tree matching is based on maximising the
;;; value of simi = s1 + s2 - 3*s3 where
;;;
;;;   s1 = number of shared constituents
;;;   s2 = number of shared constituents with matching labels
;;;   s3 = number of crossing constituents (in corpus relative to parse)
;;;
;;; Constituents must include 2+ words.  Identical constituents count as one.
;;; Labels match when the max alphanumeric prefixes are equal ignoring case.

(defun find-most-similar-trees (trees match)
  "Find the parse tree or trees most similar to the corpus tree"
  (let ((match-cons (corpus-tree-constituents match)) (simi nil))
    (dolist (tree trees)
      (let ((tree-cons (parser-tree-constituents tree)))
        (push (similarity-measure tree-cons match-cons) simi)))
    (let ((max (reduce #'max simi)))
      (mapcan #'(lambda (x tr) (when (>= x max) (list tr)))
              (nreverse simi) trees))))

(defun similarity-measure (tree-cons match-cons)
  (let ((s1 0) (s2 0) (s3 0))
    (dolist (x match-cons (+ s1 s2 (* -3 s3)))
      (let ((b1 nil) (b2 nil) (b3 nil))
        (dolist (y tree-cons)
          (unless b2
            (when (and (= (car x) (car y)) (= (cadr x) (cadr y)))
              (unless b1 (incf s1) (setq b1 t))
              (when (member-if
                     #'(lambda (a)
                         (member-if
                          #'(lambda (b) (matching-labels a b))
                          (cddr y)))
                     (cddr x))
                (incf s2) (setq b2 t))))
          (unless b3
            (when (or (< (car x) (car y) (cadr x) (cadr y))
                      (> (car x) (car y) (cadr x) (cadr y)))
              (incf s3) (setq b3 t))))))))

(defvar *parser-gaps nil)

(defun parser-tree-constituents (tree)
  (extract-constituents
   tree 0 #'(lambda (x) (and (atom x) (not (member x *parser-gaps))))))

;(defun corpus-tree-constituents (tree)
 ; (extract-constituents
  ; tree 0 #'(lambda (x) (and (listp x) (eq (car x) :word)))))

(defun extract-constituents (tree start word-p)
  "Get all constituents of two or more words in tree"
  ;; Two values: ((v1 v2 label... )... ), end-position
  (cond
   ((funcall word-p tree) (values nil (1+ start)))
   ((atom tree) (values nil start))
   (t (let ((consts nil) (end start))
        (dolist (dau (if (atom (car tree)) (cdr tree) tree))
          (multiple-value-bind (x y) (extract-constituents dau end word-p)
            (setq consts (nconc consts x) end y)))
        (when (> (- end start) 1)
          (let ((q (find-if
                    #'(lambda (s)
                        (and (eql (car s) start) (eql (cadr s) end)))
                    consts)))
            (unless q (push (setq q (list start end)) consts))
            (when (and (atom (car tree)) (not (member (car tree) (cddr q))))
              (push (car tree) (cddr q)))))
        (values consts end)))))

(defun matching-labels (a b)
  "Do these labels match (for tree comparison purposes)?"
  ;; Called from above, a is from the corpus, b is from the parser.
  ;; Requires non-empty identical alphanumeric prefixes ignoring case.
  (let ((i (string-not-equal (string a) (string b))))
    (or (not i)
        (and (>= i (or (position-if-not #'alphanumericp (string a)) 0) 1)
             (>= i (or (position-if-not #'alphanumericp (string b)) 0) 1)))))








