
;;; Scoring metric for ANLT DCG rules.  


;;; no treatment of re-entrant features (other than they are all counted
;; as same) yet.

(defvar *feats (if (and (boundp '*features) *features)
		   (remove 'H *features)
		 nil))

(defvar *table* nil) ;;; hold (static) prob. distributions of feature values.

(defvar *rule-info* 
  (make-hash-table :test #'equal)) ;;; holds freq info about
                                   ;;;rules used in previous parses

(defvar *rule-denom* nil) ;;; used to calc rule probs
(defvar *freq-total* 0) ;;; sum of above

(defvar *n-best* 0.75) ;;; percentage of rules invented to consider

(defvar *exception-threshold* 50) ;;; threshold to reject rules.
(defvar *max-candidates* 150)  ;;; max number of rules for some sentence

(defun encode-integer (n)

;;; Rissannen coding scheme for ints

;; log 10; is this right?

  (let* ((a (log n 10)))
    (if (> a 0.0)
	(let ((b (log a 10)))
	  (if (> b  0.0)
	      (+ a (log b 10))
	    a))
      a)))
	 
(defvar *encode-3* nil) ;;; no point doing this again and again
(defvar *encode-4* nil) ;;; no point doing this again and again

(setf *encode-3* (encode-integer 3))
(setf *encode-4* (encode-integer 4))

(defun encode-learnt-rule (edge)
  ;;; given an inactive edge, work out the prior
  ;;; encodes all cats and number of cats.

  (let ((mother (g-chart-edge-res edge))
	(daughters (reverse (g-chart-edge-found edge)))
	(tot 0)
	(index 0))
    
    (dolist (cat daughters)
	    (setf index (+ 1 index))
	    (setf tot (+ tot (encode-cat (car cat) index))))	     
    (+ tot (if (= 3 tot) *encode-4* *encode-3*)
       (encode-cat mother 0))))

(defun shannon-complexity (p)
;;; note, to base 10
  (* -1 (log p 10)))

(defun encode-cat (cat index)

;;; ignores any bindings (ie just encodes cats as if they were
;;; a rule)

  (let ((nice-cat (map-parser-cat-to-cat cat))
	(len 0))

    (dolist (f-v nice-cat len)
	    (setf len (+ len (shannon-complexity 
			      (get-val-prob-given-cat (car f-v)
						       (cdr f-v)
						       index
						       *table*)))))))
(defun n-best (edges)

;;; find n-best rules.

;;; Rank edges in terms of scfg parameter and domain 
;;; constraints.  Take first n best edges.

;;; when evaluating, return all edges created.

  (when *evaluate* (return-from n-best edges))

  (let ((scored nil)
	(n 0)
	(res nil)
	(len (length edges)))
    

    (dolist (e edges)
	    (let ((res (exceptions e)))
	      (when (< res *exception-threshold*)
		(push (cons (+ (* -1 res)
			       0;(single-rule-score 
				    ;(cadr (g-chart-edge-res e)))
			       )
			    e)
		      scored))))
    
    (when (null scored) (return-from n-best nil))

    (setf scored (remove-duplicate-edges scored))

    (let ((ll (length scored)))
      (if (> ll  *max-candidates*)
      ;;; too many candidates ...
	  (progn
	    (setf scored (sort scored #'> :key #'car))    
	    (setf n (round (* *n-best* (length scored)))))
	(setf n ll)))
    
    (dotimes (x n res)
	     (when (> x *max-candidates*) (return-from n-best res))
	     (when (null scored) (return-from n-best res))
	     (push (cdar scored) res)	     
	     (add-rule-to-table  (cdar scored) *rules*)
	     (pop scored))))

(defun create-rule-freq-table nil
  
;;; create table to hold freq info about rules

;;; also initialises denom table
  (psm-init)
  (format t "Creating rule freq table ~%")
  
  (setf *freq-total* 0)
  (clrhash *rule-info*)
  
  (dolist (r *id-rules)
	  (let ((raw-rules (get-rules-given-name r)))
	    (setf *grammar-size* (+ *grammar-size* (length raw-rules)))
	      (dolist (r raw-rules)
		      (setf (gethash 
			     (let ((name (second (car r))))
			       (unless (stringp name)
				       (error "name not found"))
			       name)
			     *rule-info*)
			    (make-rule 
			     :static-length (encode-given-rule 
					     r)
			     :backbone (create-given-backbone
					r)
			     :daughters (let ((temp nil))
					  (dolist (y (cdr r)) temp)
					  (push nil temp))
			    :mother (svref 
				     (caar r) 0) ;;;cheat
			    :freq 1)))))
  
  (setf *rule-denom* (make-array (+ 1 (length *index-category-table))
				 :initial-element 1))
  
  (dolist (r *id-rules)
	  (let ((raw-rules (get-rules-given-name r)))
	    (dolist (r raw-rules)
		    (let ((mother (caar r)))
		      (setf (aref *rule-denom* (svref   mother 0))
			    (+ 1 (aref 
				  *rule-denom* (svref mother 0))))))))
  
  (dotimes (x (length *rule-denom*))
	   (setf *freq-total* (+ *freq-total* (aref *rule-denom* x))))
  
;;;  this adds one for each distinct cat 
  (setf *freq-total* (+ *freq-total* (length *index-category-table))))


(defun encode-given-rule (rule)

;;; note, does not encode exceptions
;;; (a manually written rule does not violate backaground theory, right?
  (let ((len 0)
	(counter 0))
    (dolist (cat rule (+ (encode-integer (length rule)) len))
	    (setf len (+ (encode-cat cat counter)))
	    (setf counter (+ 1 counter)))))
	      
(defun tree-likelihood (rr)
  (let ((entry (fetch-rule-entry rr)))
    (/ 
     (if (and entry (> (rule-freq entry)
		       0))
	 (rule-freq entry)
       1) ;;; unseens
     
     (let ((res 
	    (svref *rule-denom*
		   (rule-mother
		    entry))))
       (if (= 0 res)
	   ;;; unseen lhs
	   1
	 res)))))

(defvar *structure-prior* t)
	
(defvar *dict-length* nil)

(defun encode-dictionary (q rules)
  
;;; work out how long the dictionary, corresponding to 
;;; encodes entire grammar

  
  (let ((distinct nil) 
	(seen nil)
	(len 0))

 ;   (unless *structure-prior* (return-from encode-dictionary 0.0))
    
    ;;; rules in current parse that are unseen
    (dolist (r rules)
	    (let ((name (rule-name r)))
	      (when (and (rule-body r) ;;; learnt rule
			 (null (gethash name *rule-info*)) ;;; new rule
			 (not (member name seen :test #'string=))) ;;; seen once
		    (push name seen)
		    (when *parameter-prior* 
			  (setf len (+ len (parameter-prior r))))
		    (when
		     *structure-prior*
		     (setf len (+ len (rule-static-length 
				       r)))))))
    ;;; rest in grammar
    (maphash #'(lambda (name rule)
		       (when (and name rule)
			     (when *parameter-prior* 
				   (setf len (+ len (parameter-prior rule))))
			     (when
			      *structure-prior*
			      (setf len (+ len (rule-static-length 
						rule))))))
			     
	     *rule-info*)

    (if (= 0 len) 0 (* -1.0 len))))	      					

(defun pre-train (fp)

;;; give sensible initial values to likelihood and compression probs
  
;;; assumes grammar already read in.

  ;(reset-tables)
  ;
  (format t "Pre training ... ~%")

  (with-open-file (in fp :direction :input)
		  (loop
		   (let ((sentence (read in nil 'eof nil))
			 (tree (read in nil 'eof nil)))
		     (cond
		      ((equal sentence 'eof)
		       (return))
		      (t
		       (format t ".")
		       (add-counts 
			(keep-in-coverage
			 (get-nonterms-from-tree tree)))))))))



(defun pre-train-learnt-rules (fp)
  ;;; assign freqs to psm of learnt rules (not seen in pre-training)

  (with-open-file (in fp :direction :input)
		  (let ((names (read in nil 'eof nil)))
		    (dolist (n names)
			    (let ((ent (fetch-rule-entry (format nil "~A"
								 (car n)))))
			      (setf (rule-freq ent) (cdr n))
			      (setf (svref *rule-denom* (rule-mother ent))
				    (+ (- (cdr n) 1) ;;; already have count one
				       (svref *rule-denom* 
					      (rule-mother ent)))))))))

(defun get-rules-from-tree (tree)
  (cond
   ((null tree) nil)
   ((listp (car tree))
    (append (get-rules-from-tree (car tree))
	    (get-rules-from-tree (cdr tree))))
   ((stringp (car tree))
    (cons (car tree) (get-rules-from-tree (cdr tree))))
   (t
    (get-rules-from-tree (cdr tree)))))

(defun get-nonterms-from-tree (tree)
;;; version that also extracts daughter info
  (cond
   ((null tree) nil)
   ((listp (car tree))
    (append (get-nonterms-from-tree (car tree))
	    (get-nonterms-from-tree (cdr tree))))
   ((stringp (car tree))
    (let ((mother (car tree))
	  (dau nil))
      (dolist (d (cdr tree))
	      (cond
	       ((atom d)
		(push "word" dau))
	       ((stringp (car d))
		(push (car d) dau))))

      (cons (list mother (reverse dau)) (get-nonterms-from-tree (cdr tree)))))
   (t
    (get-nonterms-from-tree (cdr tree)))))


(defun single-rule-score (name)

;;; if we've just invented some rule, try to give it
;;; a reasonable score: likelihood + unigram compression.

  (let ((entry (fetch-rule-entry name)))
    (cond
     (entry
      (+ (if *likelihood* (log (tree-likelihood name) 
			       10)
	   0.0)
	 (if *parameter-prior*
	     0.0 ;(log (/ (rule-freq entry) 
		  ;   *freq-total*) 10)
	   0.0)))
     ((> *freq-total* 0)
      (+ -100.0 ;;; ie a small prob if totally new rule.
	 (if *parameter-prior* (log (/ 1 *freq-total*) 10) 0.0)))
     (t
      0.0))))
      
