Module Owl_nlp.Corpus

module Corpus: Owl_nlp_corpus

(** Corpus handle.

    Every field is declared [mutable], so a value of this type can be updated
    in place. The per-field notes below are inferred from field names and
    types only — TODO confirm each against the implementation. *)
type t = {
   mutable uri : string;
   (** Presumably the base path/URI the corpus files are derived from
       (see [get_bin_uri], [get_tok_uri], [get_vocab_uri]) — confirm. *)
   mutable bin_ofs : int array;
   (** Presumably offsets into the "bin" corpus file — confirm. *)
   mutable tok_ofs : int array;
   (** Presumably offsets into the "tok" (tokenised) corpus file — confirm. *)
   mutable bin_fh : Pervasives.in_channel option;
   (** Optional input channel; [None] presumably means "not opened". *)
   mutable tok_fh : Pervasives.in_channel option;
   (** Optional input channel; [None] presumably means "not opened". *)
   mutable vocab : Owl_nlp_vocabulary.t option;
   (** Optional vocabulary attached to the corpus. *)
   mutable minlen : int;
   (** Presumably a minimum document length threshold (cf. [?minlen] of
       [build]) — confirm unit and exact meaning. *)
   mutable docid : int array;
   (** Document identifiers (cf. [?docid] of [build], [get_docid]);
       how they map to documents is not visible here — confirm. *)
}
(** [_close_if_open fh] takes an optional input channel and returns [unit].
    Judging by the name, it closes the channel when [fh] is [Some _] and does
    nothing for [None] — confirm. The leading underscore suggests an
    internal helper. *)
val _close_if_open : Pervasives.in_channel option -> unit

(** [_open_if_exists path] takes a string and returns an optional input
    channel. Presumably [Some ch] when a file exists at [path] and [None]
    otherwise — confirm. The leading underscore suggests an internal
    helper. *)
val _open_if_exists : string -> Pervasives.in_channel option

(** [cleanup t] takes a corpus and returns [unit]. Presumably releases the
    channels held in [t.bin_fh] and [t.tok_fh] — confirm. *)
val cleanup : t -> unit
(** [create uri bin_ofs tok_ofs bin_fh tok_fh vocab minlen docid] builds a
    corpus value.

    The argument names above are an assumption: the eight argument types
    line up one-for-one with the fields of [t] in declaration order, so the
    arguments are presumably stored into those fields in that order —
    confirm against the implementation. All arguments are positional and
    several share a type ([int array], [in_channel option]), so take care
    with ordering at call sites. *)
val create : string ->
int array ->
int array ->
Pervasives.in_channel option ->
Pervasives.in_channel option ->
Owl_nlp_vocabulary.t option -> int -> int array -> t
(** [get_uri t] returns a string; presumably the [uri] field of [t]. *)
val get_uri : t -> string

(** [get_bin_uri t] returns a string; presumably the location of the "bin"
    corpus file derived from [t.uri] — confirm the naming scheme. *)
val get_bin_uri : t -> string

(** [get_bin_fh t] returns an input channel (not an option), whereas the
    [bin_fh] field is an option.
    NOTE(review): the [None] case must therefore either open a channel or
    raise — confirm which, and document the exception if any. *)
val get_bin_fh : t -> Pervasives.in_channel

(** [get_tok_uri t] returns a string; presumably the location of the "tok"
    corpus file derived from [t.uri] — confirm the naming scheme. *)
val get_tok_uri : t -> string

(** [get_tok_fh t] returns an input channel (not an option), whereas the
    [tok_fh] field is an option.
    NOTE(review): the [None] case must therefore either open a channel or
    raise — confirm which, and document the exception if any. *)
val get_tok_fh : t -> Pervasives.in_channel

(** [get_vocab_uri t] returns a string; presumably the location of the
    vocabulary file derived from [t.uri] — confirm the naming scheme. *)
val get_vocab_uri : t -> string

(** [get_vocab t] returns a vocabulary (not an option), whereas the [vocab]
    field is an option.
    NOTE(review): behaviour when the field is [None] is not visible here —
    confirm whether this raises. *)
val get_vocab : t -> Owl_nlp_vocabulary.t

(** [get_docid t] returns an [int array]; presumably the [docid] field of
    [t]. Whether the array is shared or copied is not visible here. *)
val get_docid : t -> int array
(** [length t] returns an [int]; presumably the number of documents in the
    corpus — confirm. *)
val length : t -> int

(** [next t] returns a string; presumably the next document read
    sequentially from the corpus. Behaviour at end of corpus is not visible
    here — confirm (likely an exception such as [End_of_file]). *)
val next : t -> string

(** [next_tok t] returns an [int array]; presumably the tokenised form of
    the next document. End-of-corpus behaviour is not visible here. *)
val next_tok : t -> int array

(** [iteri f t] applies [f] over the corpus and returns [unit]; [f]
    receives an [int] (presumably the document index) and a second value.
    NOTE(review): the callback's second argument and result are fully
    polymorphic (['a], ['b]) rather than [string] / [unit]; this looks like
    an inferred signature — confirm the intended types. *)
val iteri : (int -> 'a -> 'b) -> t -> unit

(** [iteri_tok f t] is presumably the tokenised counterpart of [iteri].
    NOTE(review): same polymorphic callback type as [iteri] — confirm that
    the second argument is meant to be [int array]. *)
val iteri_tok : (int -> 'a -> 'b) -> t -> unit

(** [mapi f t] applies [f] over the corpus and collects the results into a
    ['b array]. [f] receives an [int] (presumably the document index) and a
    second value whose type is left polymorphic — confirm it is meant to be
    [string]. *)
val mapi : (int -> 'a -> 'b) -> t -> 'b array

(** [mapi_tok f t] is presumably the tokenised counterpart of [mapi];
    the second callback argument is left polymorphic — confirm it is meant
    to be [int array]. *)
val mapi_tok : (int -> 'a -> 'b) -> t -> 'b array

(** [get t i] returns a string; presumably the document at index [i].
    Behaviour for an out-of-range [i] is not visible here — confirm. *)
val get : t -> int -> string

(** [get_tok t i] returns an [int array]; presumably the tokenised document
    at index [i]. Out-of-range behaviour is not visible here. *)
val get_tok : t -> int -> int array

(** [reset_iterators t] returns [unit]; presumably rewinds the sequential
    read position used by [next] / [next_tok] — confirm. *)
val reset_iterators : t -> unit

(** [next_batch ?size t] returns a [string array]; presumably up to [size]
    consecutive documents. The default for [size] and the behaviour when
    fewer documents remain are not visible here — confirm. *)
val next_batch : ?size:int -> t -> string array

(** [next_batch_tok ?size t] returns an [int array array]; presumably the
    tokenised counterpart of [next_batch]. Default [size] not visible. *)
val next_batch_tok : ?size:int -> t -> int array array
(** [tokenise t s] maps a string to an [int array]; presumably the indices
    of the words of [s] in the vocabulary attached to [t]. How unknown
    words and a missing vocabulary are handled is not visible here. *)
val tokenise : t -> string -> int array

(** [build ?docid ?stopwords ?lo ?hi ?vocab ?minlen path] constructs a
    corpus from a string argument (presumably a file path — confirm).

    All optional arguments have defaults that are not visible in this
    signature. Inferred roles, to be confirmed:
    - [docid]: document identifiers (cf. the [docid] field of [t]);
    - [stopwords]: a hash table keyed by [string]; the value type is
      polymorphic, so presumably only key membership is used;
    - [lo], [hi]: float bounds, presumably frequency cut-offs applied when
      deriving the vocabulary;
    - [vocab]: a pre-built vocabulary to use instead of deriving one;
    - [minlen]: cf. the [minlen] field of [t]. *)
val build : ?docid:int array ->
?stopwords:(string, 'a) Hashtbl.t ->
?lo:float ->
?hi:float ->
?vocab:Owl_nlp_vocabulary.t -> ?minlen:int -> string -> t
(** [unique a b] takes two strings and returns an [int array].
    Presumably [a] and [b] are input and output file paths and the result
    identifies the retained (or removed) entries — the exact meaning of
    both the arguments and the returned indices must be confirmed against
    the implementation. *)
val unique : string -> string -> int array

(** [simple_process s] maps a string to a string; presumably a basic text
    clean-up step. The transformation applied is not visible here. *)
val simple_process : string -> string

(** [preprocess f a b] takes a [string -> bytes] function and two strings,
    and returns [unit], so any result is produced by side effect.
    Presumably [a] and [b] are input and output file paths and [f] is
    applied to each piece of input — confirm.
    NOTE(review): [f] must return [bytes], while [simple_process] returns
    [string]; check how the two are meant to be combined. *)
val preprocess : (string -> bytes) -> string -> string -> unit

(** [reduce_model t] returns a corpus; presumably a lighter version of [t]
    with some fields dropped or emptied. Which fields, and whether [t]
    itself is modified, is not visible here — confirm. *)
val reduce_model : t -> t
(** [save t path] returns [unit]; presumably serialises [t] to the file
    named by the string argument. The on-disk format is not visible here. *)
val save : t -> string -> unit

(** [load path] returns a corpus; presumably the inverse of [save].
    Failure behaviour (missing or malformed file) is not visible here. *)
val load : string -> t

(** [save_txt t path] returns [unit]; presumably writes a plain-text
    rendering of the corpus to the file named by the string argument. *)
val save_txt : t -> string -> unit

(** [to_string t] returns a string describing [t]; the content and layout
    of that string are not visible here. *)
val to_string : t -> string

(** [print t] returns [unit]; presumably prints the result of
    [to_string t] to standard output — confirm. *)
val print : t -> unit