;;; create treebank compatible with some grammar.  The idea is to
;;; take WSJ, parse sentences, and out of the parses produced, pick
;;; the 'best' ones.  We then record that in a file and uses these
;;; trees to train the prob. chart parser.  Since we won't have any
;;; stats to start with (and we won't be able to unpack all the parses)
;;; we do this in two stages: firstly generating lots of trees and
;;; picking the best; then, we go over these trees and build a 
;;; parse selection model.  We then continue, but this time only unpack
;;; a few trees.


(defvar *size* 10000000)

(defvar *number* 5) ;;; emit this number of parses per sentence.
                      ;;; if there are less, we duplicate.  

;(load "parseval.lisp")

;;; for some reason there seems to be a bug when using
;;; morph system, so workaround
;;; 

(defun d-checkfeaturepair (featurepair) t)

(defun multi-create-training-set (base out n)
  (dolist (ext n)
	  (create-training-set (format nil "~A-~A"
				       base ext)
			       (format nil "~A~A"
				       out ext)
			       "/tmp/fred")))

(defun create-training-set (in-fp out-fp1 out-fp)

;;; in-fp will be a set of sentences and associated trees.
;;; out-fp1 will be initial treebank (poss unreliable)
;;; out-fp will be final treebank (should be more reliable)

  (setf *generate-edges* nil)
  (setf *parse-length* 2)
  ;(reset-tables)
  (let ((parsed 0) (total 0))
    (with-open-file 
     (in in-fp :direction :input)
   (with-open-file
    (out out-fp1 :direction :output)
    ;;; build initial training material.
    (setf +n-best-retained+ 5)    
    
    (loop
     (setf total (+ 1 total))
     (when (= parsed *size*) (return t))
     (let ((sentence (read in nil 'eof nil))
	   (tree (read in nil 'eof nil)))
       (if (equal sentence 'eof)
	   (progn
	     (format t "~A ~A ~%" total parsed)
	     (return  t))
	 (progn
	   (if (< (length sentence) 20)	       
	       (progn
		 (format t "~A ~A ~%" total parsed)
		 (setf *tagged-sentence sentence) 
		 (setf *tagged-sentence-pos 0)
		 (setf *previous-sentence sentence)
		 (invoke-parser sentence nil))
	     (setf *current-parse-trees nil))
	   
	   (if *current-parse-trees
	       (progn 
		 (setf parsed (+ 1 parsed))
		 (let ((trees nil))
		   (dolist (p *current-parse-trees)
			   (push (get-rule-labelling-from-parse-tree
				  (cdr p))
				 trees))
		   (emit-parse out sentence trees tree)))))))))
   
   (pre-train out-fp1)
   (setf +n-best-retained+ 5)
   (with-open-file 
    (out out-fp :direction :output)
    (loop
     (setf total (+ 1 total))
     (let ((sentence (read in nil 'eof nil))
	   (tree (read in nil 'eof nil)))
       (if (equal sentence 'eof)
	   (progn
	     (format t "~A ~A ~%" total parsed)
	     (return-from create-training-set  t))
	 (progn
	   (setf parsed (+ 1 parsed))
	   (if (< (length sentence) 20)	       
	       (invoke-parser sentence nil)
	     (setf *current-parse-trees nil))
	   (format t "~A ~A ~%" total parsed)
	   (invoke-parser sentence nil)
	   (if *current-parse-trees
	       (progn 
		 (setf parsed (+ 1 parsed))
		 (let ((trees nil))
		   (dolist (p *current-parse-trees)
			   (push (get-rule-labelling-from-parse-tree
				  (cdr p))
				 trees))
		   (emit-parse out sentence trees tree))))))))))))
	   


			       
(defun emit-parse (fp sentence gde-parses bench-parse )

;;; best parse defined using Eirik's  tree scoring routines.


  (let* ((best (n-first *number* (find-most-similar-trees gde-parses bench-parse)))
	 (mult (round (/ *number* (length best)))))
    (format t " ~A best " (length best))
    (dolist (p best)
	    (dotimes (x mult)
		     (format fp "~S ~% ~S " sentence p)))))
    
(defun corpus-tree-constituents (tree)
  (extract-constituents
   tree 0 #'(lambda (x) (atom x))))

(defun n-first (n l)
  (let ((res nil) (selected 0))
	(dotimes (x n res)
		 (if l
		     (push (pop l) res)
		   (return-from n-first res)))))
	


(defun split-treebank (trees split n)

;;; split file of tree-sentence pairs ino files of n length.

  (let ((x 1) (out nil) (ext 0))
	(setf out (open (format nil "~A-~A" split ext) :direction :output))
	(with-open-file (in trees :direction :input)
			(loop
			 (let ((sent (read in nil 'eof nil))
			       (tree (read in nil 'eof nil)))
			   
			   (cond
			    ((eq sent 'eof) 
			     (close out)
			     (return t))
			     ((= 0 (mod x n))
			      (close out)
			      (setf ext (+ 1 ext))
			      (setf out (open 
					 (format nil "~A-~A" split ext)
					 :direction :output))))
			    
			    (setf x (+ 1 x))
			    (format out "~S~%~S~%~%"
				    sent tree))))))
			 



  


