;;; various useful routines for dealing with wsj (in .trees1 format)

(defun split-into-train-and-test (in-fp train-fp test-fp number
                                  &optional (length-limit 31))
  "Randomly split the records in file IN-FP between TRAIN-FP and TEST-FP.

IN-FP is read as a sequence of records, each made of two consecutive
Lisp forms: a sentence (a list of tokens) followed by its tree.

Only records whose sentence has more than one token and fewer than
LENGTH-LIMIT tokens (exclusive bound, default 31) are considered; all
other records are dropped.  Each considered record goes to TEST-FP with
probability 2/20, until NUMBER records have been selected; every other
considered record goes to TRAIN-FP.  So NUMBER caps the size of the
test set, not of the training set.

Each record is written as the sentence, a newline, the tree, and a
terminating newline, so that successive records do not run together.
A trailing sentence with no tree after it is treated as end of input.

Both output files are opened with the default :if-exists behavior of
OPEN.  A summary line is printed to standard output.  Returns T."
  (with-open-file (in in-fp :direction :input)
    (with-open-file (train train-fp :direction :output)
      (with-open-file (test test-fp :direction :output)
        (let ((selected 0)   ; records written to the test file
              (total 0))     ; records that passed the length filter
          (loop
            (let ((sentence (read in nil 'eof nil))
                  (tree (read in nil 'eof nil)))
              ;; Stop at end of file; a sentence without its tree means
              ;; the input is truncated, so do not emit a bogus record.
              (when (or (equal sentence 'eof) (equal tree 'eof))
                (format t "Selected ~A out of ~A ~%" selected total)
                (return t))
              (when (< 1 (length sentence) length-limit)
                (incf total)
                (cond
                  ;; RANDOM is drawn for every considered record, even
                  ;; once the test quota has been reached.
                  ((and (> 2 (random 20)) (< selected number))
                   (format test "~S ~% ~S~%" sentence tree)
                   (incf selected))
                  (t
                   (format train "~S ~% ~S~%" sentence tree)))))))))))

		  
		       
(defun trees-to-fparse (in-fp out-fp &optional (words nil))
  "Write the sentences of the treebank file IN-FP to OUT-FP, one per line.

IN-FP is read as a sequence of records, each made of two consecutive
Lisp forms: a sentence (a list of tokens) followed by its tree.  The
tree is read only to step past it.

Every token is printed followed by a space, and each sentence line is
finished with a space, a period, a space and a newline.  When WORDS is
true, each token is cut at its first underscore so that only the part
before it is printed; a token with no underscore is printed whole.
When WORDS is false, tokens are printed unchanged.

OUT-FP is opened with the default :if-exists behavior of OPEN.
Returns T."
  (with-open-file (in in-fp :direction :input)
    (with-open-file (out out-fp :direction :output)
      (loop
        (let ((sentence (read in nil 'eof nil)))
          ;; Consume the tree that follows the sentence; it is not used.
          (read in nil 'eof nil)
          (when (equal sentence 'eof)
            (return t))
          (dolist (w sentence)
            (if words
                ;; Print the stripped name directly instead of interning
                ;; a new symbol per word.  Non-symbol tokens (e.g. a bare
                ;; number) are converted with PRINC-TO-STRING.
                (let ((name (if (symbolp w)
                                (symbol-name w)
                                (princ-to-string w))))
                  ;; POSITION returns NIL when there is no underscore,
                  ;; which makes SUBSEQ keep the whole name.
                  (format out "~A " (subseq name 0 (position #\_ name))))
                (format out "~A " w)))
          (format out " . ~%"))))))

		  
(defun trees-to-fparse-with-sym (in-fp out-fp sym &optional (words nil))
  "Write the sentences of the treebank file IN-FP to OUT-FP, one per
line, with bracket tokens around selected constituents.

IN-FP is read as a sequence of records, each made of two consecutive
Lisp forms: a sentence followed by its tree.  The sentence form is used
only to detect end of file; the printed tokens come from calling
EXTRACT-BRACKETED-STRING on a one-element list holding the tree,
together with SYM.  For example, with NP selected, the subtree
(NP the cat) is emitted as [ the cat ].

SYM is passed unchanged to EXTRACT-BRACKETED-STRING, which decides
which node labels get brackets.

Every token is printed followed by a space, and each sentence line is
finished with a space, a period, a space and a newline.  When WORDS is
true, each token is cut at its first underscore so that only the part
before it is printed; tokens with no underscore, including the bracket
tokens, are printed whole.

OUT-FP is opened with the default :if-exists behavior of OPEN.
Returns T."
  (with-open-file (in in-fp :direction :input)
    (with-open-file (out out-fp :direction :output)
      (loop
        (let ((sentence (read in nil 'eof nil))
              (tree (read in nil 'eof nil)))
          (when (equal sentence 'eof)
            (return t))
          (dolist (w (extract-bracketed-string (list tree) sym))
            (if words
                ;; Print the stripped name directly instead of interning
                ;; a new symbol per word.  Non-symbol tokens (e.g. a bare
                ;; number) are converted with PRINC-TO-STRING.
                (let ((name (if (symbolp w)
                                (symbol-name w)
                                (princ-to-string w))))
                  ;; POSITION returns NIL when there is no underscore,
                  ;; which makes SUBSEQ keep the whole name.
                  (format out "~A " (subseq name 0 (position #\_ name))))
                (format out "~A " w)))
          (format out " . ~%"))))))



(defun extract-bracketed-string (tree sym)
  "Flatten TREE, a list of subtrees, into a flat list of its leaf tokens.

Each element of TREE that is itself a list is taken to be a node of the
form (LABEL child ...): its label is dropped and its children are
flattened in place.  When the label is selected by SYM, the flattened
children are additionally wrapped between the symbols [ and ].  Any
other element is kept as a leaf token.

SYM is either a list of labels to bracket or a single label.  An empty
list brackets nothing.

Returns NIL for an empty TREE; a non-list TREE is returned unchanged."
  (let ((bracket-labels (if (listp sym) sym (list sym))))
    (labels ((flatten (nodes)
               (cond
                 ((null nodes) nil)
                 ((atom nodes) nodes)
                 ;; A list element is a labelled node: recurse into its
                 ;; children (everything after the label).
                 ((listp (car nodes))
                  (let ((inner (flatten (cdar nodes)))
                        (tail (flatten (cdr nodes))))
                    (if (member (caar nodes) bracket-labels)
                        (append (list '[) inner (list ']) tail)
                        (append inner tail))))
                 ;; A leaf token: keep it and continue with the siblings.
                 (t
                  (cons (car nodes) (flatten (cdr nodes)))))))
      (flatten tree))))