;;; ngram-based compression of some sequence of events.

(defvar *bigrams* (make-hash-table :test #'equal))
(defvar *trigrams* (make-hash-table :test #'equal))

(defvar *[* 0) ;counts for lossless compression of trees
(defvar *]* 0)

(defun add-bigram (bigram)
  (setf *freq-total* (1+ *freq-total*))
  (let* ((a (intern (car bigram)))
	 (b (intern (cdr bigram)))
	 (res (gethash a *bigrams*)))
    (if res
	(dolist (r res (setf (gethash a *bigrams*)
			     (push (cons b 1) res)))
		(when (equal (car r) b)
		      (setf (cdr r) (1+ (cdr r)))
		      (return-from add-bigram t)))
      
      (setf (gethash a *bigrams*) (list (cons b 1))))))

(defun remove-bigram (bigram)
  (setf *freq-total* (- *freq-total* 1))
  (let* ((a (intern (car bigram)) )
	 (b (intern (cdr bigram)))
	 (res2 nil)
	 (res (gethash a *bigrams*)))

    (dolist (r res nil)
	    (when (equal (car r) b)
		  (setf (cdr r) (- (cdr r) 1))
		  (return-from remove-bigram t)))))

(defun get-bigram-freq (a b1)
  (let ((res (gethash (intern a) *bigrams*))
	(b (intern b1)))
    (dolist (r res 0)
	    (when (equal (car r) b) (return-from get-bigram-freq 
						   (cdr r))))))

(defun get-trigram-freq (a b1 c1)
  (let ((res (gethash (intern a) *trigrams*))
	(b (intern b1))
	(c (intern c1)))
    (dolist (r res 0)
	    (when (and (equal (car r) b) (equal (second r) c))
		       (return-from get-trigram-freq 
						   (third r))))))
(defun events-to-bigrams (events)

  (if (null events)
      nil
    (let ((bigrams (list (cons "start" (car events))
			 (cons (car (last events)) "end")))
	  (prev (car events)))

      (when (string= (car events) "[") (setf *[* (1+ *[*)))
      (when (string= (car events) "]") (setf *]* (1+ *]*)))

      (dolist (e (cdr events))

	      (when (string= e "[") (setf *[* (1+ *[*)))
	      (when (string= e "]") (setf *]* (1+ *]*)))

	      (push (cons prev e) bigrams)

	      (setf prev e))
      (dolist (b bigrams)
	      (add-bigram b)))))

(defun remove-events-to-bigrams (events)

  (if (null events)
      nil
    (let ((bigrams (list (cons "start" (car events))
			 (cons (car (last events)) "end")))
	  (prev (car events)))

      (when (string= (car events) "[") (setf *[* (- *[* 1)))
      (when (string= (car events) "]") (setf *]* (- *]* 1)))

      (dolist (b bigrams)
	      (remove-bigram b))
      (dolist (e (cdr events))
	      (when (string= e "[") (setf *[* (- *[* 1)))
	      (when (string= e "]") (setf *]* (- *]* 1)))
	      (remove-bigram (cons prev e))
	      (setf prev e)))))

(defun dump-bigrams nil
  (maphash #'(lambda (key val)
	       (when key
		     (dolist (v val)
			     (format t 
				     "~A ~A ~A ~%" key (car v) (cdr v)))))
	   *bigrams*))

(defun dump-trigrams nil
  (maphash #'(lambda (key val)
	       (when key
		     (dolist (v val)
			     (format t 
				     "~A ~A ~A ~A ~%" key (car v) (second v)
				     (third v)))))
	   *trigrams*))

(defun remove-terms (tree)
;;; I'm sure I've done this before ...
  (cond
   ((null tree) nil)
   ((listp (car tree))
    (cons (remove-terms (car tree))
	  (remove-terms (cdr tree))))
   ((stringp (car tree))
    (cons (car tree) (remove-terms (cdr tree))))
   (t
    (remove-terms (cdr tree)))))

(defun tree-to-list (tree)

;;; rewrite a tree as a list of rules.  Open paren [, close paren ].

  (cond
   ((null tree) nil)
   (t
    (let ((tr1 (remove-terms tree)))
      (cons "[" (tree-to-list1 tr1))))))

(defun tree-to-list1 (tree)
  (cond
   ((null tree)
    (list "]"))
   ((listp (car tree))
    (cons "[" (append (tree-to-list1 (car tree))
		      (tree-to-list1 (cdr tree)))))
   (t
    (cons (car tree) (tree-to-list1 (cdr tree))))))
    
    
(defun add-trigram (trigram)

  (let* ((a (intern (car trigram)))
	 (b (intern (second trigram)))
	 (c (intern (third trigram)))
	 (res (gethash a *trigrams*)))
    (if res
	(dolist (gram res (setf (gethash a *trigrams*)
				(push (list b c 1) res)))
		(when (and (equal b (car gram))
			   (equal c (second gram)))
		      (setf (third gram) (1+ (third gram)))
		      (return-from add-trigram t)))
      (setf (gethash a *trigrams*)
	    (list (list b c 1))))))

(defun remove-trigram (trigram)

  (let* ((a (intern (car trigram)))
	 (b (intern (second trigram)))
	 (c (intern (third trigram)))
	 (res2 nil)
	 (res (gethash a *trigrams*)))

    (dolist (r res nil)
	    (when (and (equal (car r) b) (equal (second r) c))
		  (setf (third r) (- (third r) 1))
		  (return-from remove-trigram t)))))

(defun prune-ngrams (n)

  (maphash #'(lambda (k v)
	       (when (and k v)
		     (if (= 0 (rule-freq v))
			 (remhash k *rule-info*)
			; (setf (gethash k *rule-info*) nil)
		       (setf (rule-daughters v) (prune-daughters
						 (rule-daughters v))))))
	   *rule-info*)

  (maphash #'(lambda (key val)
	       (when key
		     (let ((res nil))
		       (dolist (v val)
			       (unless (>= n (cdr v))
				       (push v res)))
		       
		       (setf (gethash key *bigrams*) res))))
	   *bigrams*)

  (maphash #'(lambda (key val)
	       (when key
		     (let ((res nil))
		       (dolist (v val)
			       (unless (>= n (third v))
				       (push v res)))
		       
		       (setf (gethash key *trigrams*) res))))
	   *trigrams*))

(defun events-to-trigrams (events)

  (if (null events)
      nil
    (progn
      (push "start" events)
      (push "start" events)
      (let ((prev1 (car events))
	    (prev2 (second events)))
	(dolist (e (cddr events))
		(let ((trigram (list prev1 prev2 e)))
		  (add-trigram trigram))
		(setf prev1 prev2)
		(setf prev2 e))
	(add-trigram (list (car (last (butlast events)))
			   (car (last events)) "end"))
	(add-trigram (list (car (last events)) "end" "end"))))))

(defun remove-events-to-trigrams (events)

  (if (null events)
      nil
    (progn
      (push "start" events)
      (push "start" events)
      (let ((prev1 (car events))
	    (prev2 (second events)))
	(dolist (e (cddr events))
		(let ((trigram (list prev1 prev2 e)))
		  (remove-trigram trigram))
		(setf prev1 prev2)
		(setf prev2 e))
	(remove-trigram 
	 (list (car (butlast events)) (car (last events)) "end"))
	(remove-trigram 
	 (list (car (last events)) "end" "end"))))))

(defun tree-prior (rule rules index)

;;;  ngram-based compression

  (let ((bigram-f (get-bigram-freq (rule-index-to-rule index rules) rule))
	(trigram-f (get-trigram-freq (rule-index-to-rule (- index 1)
						       rules) 	    
				     (rule-index-to-rule index rules) rule))
	(unigram-f
	 (let ((entry
		   (fetch-rule-entry rule)))
	      (if (and entry 
		       (> (rule-freq entry)
			  0))
		  (rule-freq entry)
		(cond
		 ((and (string= rule "[") (> *[* 0))
		  *[*)
		 ((and (string= rule "]") (> *]* 0))
		  *]*)
		 (t
		  1)))))) ;;; unseens
;(format t "~A ~A ~A ~%" trigram-f bigram-f unigram-f)
      (cond
       ((and (> trigram-f 0) (> bigram-f 0))
	(+ (* 0.6 (/ trigram-f bigram-f))
	   (* 0.3 (/ bigram-f unigram-f))
	   (* 0.1 (/ unigram-f *freq-total*))))
       ((> bigram-f 0)
	(+ (* 0.9 (/ bigram-f unigram-f))
	   (* 0.1 (/ unigram-f *freq-total*))))
       (t	
	(/ unigram-f *freq-total*)))))




