;;; LR1 - VPARSE-2.LSP
;;;
;;; 2-level deep rule probabilities variant, i.e. pcfg augmented with
;;; daughter-rule probabilities
;;;
;;; Replaces files ttrain.lsp, tparse.lsp, dparse.lsp, refines stuff in
;;; ask-stats.lsp
;;;
;;; (lr1-tree-train ":<train>.trees1" ":<train>.1")
;;; Train from already existing parse trees. Read definitions and trees
;;; from input files, and write out probs. trees1 file will match
;;; grammar exactly (since comes from runautotrain)
;;;
;;; (lr1-history-probs ":<train>.1" ":<train>.probs")
;;; Just copy existing probs file
;;;
;;; (lr1-read-history-probs ":<train>.probs")
;;; Read in rule probs
;;;
;;; (lr1-parse words)
;;; Call chart parser but with special unpacking

#|
(lr1-tree-train "internal:tag:test.trees1" "internal:tag:test.1")
(lr1-history-probs "internal:tag:test.1" "internal:tag:test.probs")
(lr1-read-history-probs "internal:tag:test.probs")
(lr1-parse 
   '(|the_AT| |statement_NN2| |may_VM| |be_VB0| |highly_RR| |prejudicial_JJ| |to_II|
 |my_APP$| |client_NN1| |,_,| |Bellows_NP1| |tell_VVD| |the_AT| |court_NNJ1|)
)

(make-parse-rule-tree t)
(setq *lr1-parse t)
(setq *tagged-words t)
(lr1-read-history-probs "internal:tag:sustrtsg11.probs-2")
(setq +level-two-probs+ t)
(setq +level-one-probs+ t)

p
the_AT statement_NN2 may_VM be_VB0 highly_RR prejudicial_JJ to_II my_APP$ client_NN1 ,_, Bellows_NP1 tell_VVD the_AT court_NNJ1

|#


(defun lr1-tree-train (tree-file raw-hist-out)
   (let ((freqs nil))
      (with-open-file (tree-str tree-file :direction :input)
         (loop
            (let ((sentence (cons nil (read tree-str nil t))) tree)
               (when (atom (cdr sentence))
                  (format t "~&----------------~%Finished parsing file ~A~%"
                     tree-file)
                  (return))
               (setq tree (read tree-str))
               (when (integerp tree)
                  ;; this is an output file from lr1-parse-analysis-trees - 
                  ;; format: sentence, number of parses, 3 trees
                  (setq tree (read tree-str))
                  (read tree-str) (read tree-str))
               ;; (format t "~&~{~A~^ ~}~%~%" (cdr sentence)) (finish-output)
               (setq freqs (compute-tree-probs tree freqs)))))
      (write-tree-probs freqs raw-hist-out)))


(defun compute-tree-probs (tree freqs)
  ;; tree is a normal s-expression, e.g.
  ;; ("T/txt-sc1/---"
  ;;  ("T/leta_s"
  ;;   ("S/adv_s/+" ("AP/a1" ("A1/a" |nevertheless_RR|)) |,_,|
  (let*
     ((rule (intern (car tree)))
      (entry (getf freqs rule))
      (n 0))
     (unless entry
        (setq entry (cons nil nil))
        (setf (getf freqs rule) entry))
     (setf (car entry) (1+ (or (car entry) 0)))
     (dolist (d (cdr tree))
        (incf n)
        (let*
           ((dr ; cons = rule, atom = word - return nil
               (if (consp d) (intern (car d)) nil))
            (de (assoc n (cdr entry))))
           (unless de
              (setq de (cons n nil))
              (setf (cdr entry)
                 (merge 'list (list de) (cdr entry) #'< :key #'car)))
           (setf (getf (cdr de) dr) (1+ (or (getf (cdr de) dr) 0)))
           (when (consp d)
              (setq freqs (compute-tree-probs d freqs)))))
     freqs))


(defun write-tree-probs (freqs out)
   ;; get nt symbol for each rule mother
   (tree-probs-collect-mother-nts)
   ;; normalise rule freqs wrt lr1 backbone category of mother
   (let ((ms nil) (total-n 0))
      ;; ms <- (mother-cat1 ((rule1 . data1) ...) ...)
      (do*
          ((tail freqs (cddr tail)))
          ((null tail))
          (let ((rule (car tail)) (data (cadr tail)))
             (push (cons rule data)
               (getf ms (tree-probs-mother-nt rule)))))
      ;; should really hold 1 back for unseens within each mother category,
      ;; and distribute evenly amongst all relevant rules for any unseen
      ;; mother cats ***
      (do*
          ((tail ms (cddr tail)))
          ((null tail))
          (let* ((rule-n-pairs (cadr tail))
                 (int-n (apply #'+ (mapcar #'cadr rule-n-pairs)))
                 (den (log int-n 10)))
             (incf total-n int-n)
             (dolist (pair rule-n-pairs)
                (setf (cadr pair) (- (log (cadr pair) 10) den))
                (dolist (d-pair (cddr pair))
                   (let ((d-total 0))
                      (do ((d-tail (cdr d-pair) (cddr d-tail)))
                         ((null d-tail))
                         (incf (cadr d-tail)) ; add-1
                         (incf d-total (cadr d-tail)))
                      (incf d-total) ; hold one back
                      (do ((d-tail (cdr d-pair) (cddr d-tail))
                           (d-den (log d-total 10)))
                         ((null d-tail)
                            (setf (getf (cdr d-pair) '$) (- (log 1 10) d-den)))
                         (let ((rule (car d-tail)) (n (cadr d-tail)))
                            (setf (cadr d-tail) (- (log (cadr d-tail) 10) d-den))))
                      )))))
      (let ((*print-level* nil) (*print-length* nil) (*print-pretty* nil))
         (with-open-file
            (str out :direction :output :if-exists :supersede
               :if-does-not-exist :create)
            ;; one extra observation for unseen
            (format str "(~A" (- (log 1 10) (log (1+ total-n) 10))) ; hold one back
            (do* ((tail ms (cddr tail)))
                 ((null tail))
               (dolist (pair (cadr tail)) (print pair str)))
            (format str "~%)")))))


(defvar *tree-probs-mother-nts* nil)

(defun tree-probs-collect-mother-nts nil
   (let ((res (mapcar #'car g-tracerules)))
      (dotimes (n (length g-gramtree))
         (setq res
            (tree-probs-traverse (cdr (svref g-gramtree n)) res)))
      (setq *tree-probs-mother-nts*
         (remove-duplicates res :key #'cadr :test #'equal))))

(defun tree-probs-traverse (tree res)
   (if tree
      (tree-probs-traverse (cddr tree)
         (if (consp (car tree))
            (nconc (mapcar #'car tree) res) ; rule mothers
            (tree-probs-traverse (cadr tree) res)))
      res))

(defun tree-probs-mother-nt (rulename)
   (let ((entry
           (find (string rulename) *tree-probs-mother-nts* :key #'cadr :test #'equal)))
      (if entry
         (intern (princ-to-string (svref (car entry) 0)))
         (error (format nil "Did not find rule ~A in ~A" rulename '*tree-probs-mother-nts*)))))


;;;
                    
(defun lr1-history-probs (in out)
   (with-open-file
      (in-str in :direction :input)
      (with-open-file
         (out-str out :direction :output :if-exists :supersede
            :if-does-not-exist :create)
         (loop
            (let ((char (read-char in-str nil nil)))
               (unless char (return))
               (write-char char out-str))))))


;;;

(defvar *production-probs* nil)

(defun lr1-read-history-probs (action-prob-in)
   (with-open-file (in-str action-prob-in :direction :input)
      (setq *production-probs* (read in-str))
      t))


;;;

(defparameter +level-one-probs+ t)
(defparameter +level-two-probs+ t)

(defparameter +n-best-retained+ 3)

(defun lr1-parse (words)
   ;; redefinition
   (g-parse words (top-category-indexes)))


(defun g-unpack (tree vt)
   ;; redefinition
   (multiple-value-bind (vts-and-trees scores)
       (v-unpack tree vt (cons (log 1.0 10) 0))
      ;; rank them
      (let*
           ((sorted
              (sort
                 (mapcar #'(lambda (vt-t s) (cons vt-t (/ (car s) (cdr s))))
                    vts-and-trees scores)
                 #'> :key #'cdr))
            (pruned (ldiff sorted (nthcdr +n-best-retained+ sorted))))
         (print (mapcar #'cdr pruned))
         (mapcar #'car pruned))))


(defun v-unpack (tree vt score)
   (g-perform-stack-check)
   (let
      ((tail (cdar tree)) (unpacked nil) (unpacked-scores nil) mother)
      (loop
         (when (or (atom tail) (atom (car tail)))
            (setq mother (cons (caar tree) tail)) (return))
         (let ((packed (cdar tail)))
            (multiple-value-bind (success new-rvt new-nvt)
               (g-unify
                  (caar tree) (caar (cddr packed)) vt (cadr packed))
               (declare (ignore new-rvt))
               (when success
                  (multiple-value-bind (unp unp-s)
                      (v-unpack (cddr packed) new-nvt score)
                     (dolist (u unp)
                        (push
                           ;; (car packed) is the nvt of packed-into node at
                           ;; the time of packing
                           (cons (nconc (ldiff vt (car packed)) (car u))
                              (cdr u))
                           unpacked))
                       (dolist (u unp-s) (push u unpacked-scores))))))
         (setq tail (cdr tail)))
      (cond
         ((and (cdr tree) (atom (cadr tree)))
            ;; a word node
            (values (cons (cons vt (cons mother (cdr tree))) unpacked)
               (cons score unpacked-scores)))
         (t
            (let*
                ((daughters-alts (list (list vt)))
                 (entry
                    (assoc (intern (cadr mother)) (cdr *production-probs*)
                       :test #'eq))
                 (score
                    (if +level-one-probs+
                       (cons
                          (+ (car score) (or (cadr entry) (car *production-probs*)))
                          (1+ (cdr score)))
                       score))
                 (daughters-alts-scores (list score))
                 (d-entries (reverse (cddr entry)))) ; !!! inefficient
               (dolist (d (cdr tree))
                  (multiple-value-setq (daughters-alts daughters-alts-scores)
                     (v-unpack-daughter
                        d daughters-alts daughters-alts-scores
                        (if d-entries (cdr (pop d-entries)) nil))))
               (v-unpack-prune
                  (append unpacked
                     (if (cdr daughters-alts)
                        (mapcar
                           #'(lambda (alt)
                              (list* (car alt) mother (reverse (cdr alt))))
                           daughters-alts)
                        (list (cons vt tree))))
                  (append unpacked-scores daughters-alts-scores)))))))
                   

(defun v-unpack-daughter (d daughters-alts daughters-alts-scores d-entries)
   (let ((unpacked nil) (unpacked-scores nil))
      (dolist (alt daughters-alts)
         (multiple-value-bind (unp unp-s)
             (v-unpack d (car alt) (pop daughters-alts-scores))
            (dolist (up unp)
               (let*
                  ((name ; nb packing may still be present until rempack
                     (if (atom (cadr (cdr up)))
                        nil ; a word
                        (car (last (cdar (cdr up))))))
                   (n
                      (if (and d-entries +level-two-probs+)
                         (or (getf d-entries (if name (intern name) nil))
                            (getf d-entries '$))
                         (log 1.0 10)))
                   (s (pop unp-s)))
                  (push (list* (car up) (cdr up) (cdr alt)) unpacked)
                  (push (if +level-two-probs+ (cons (+ n (car s)) (1+ (cdr s))) s)
                     unpacked-scores)))))
      (values unpacked unpacked-scores)))


(defun v-unpack-prune (trees scores)
   ;; (values trees scores)
   ;; put in buckets depending on ntrans, then sort and prune each bucket
   (let ((buckets nil) (pruned nil))
      (dolist (tree trees)
         (let* ((score (pop scores))
                (entry (assoc (cdr score) buckets)))
            (unless entry
               (setq entry (cons (cdr score) nil))
               (push entry buckets))
            (push (cons tree score) (cdr entry))))
      (dolist (bucket buckets)
         (let ((sorted (sort (cdr bucket) #'> :key #'cadr)))
            (setq pruned
               (nconc (ldiff sorted (nthcdr +n-best-retained+ sorted)) pruned))))
      (values (mapcar #'car pruned) (mapcar #'cdr pruned))))


;;; End of file
