(* -=-- ---------------------------------------------------- --=- *
 *                                                                *
 * Acute URI module                                               *
 *                                                                *
 * Version: $Id: uri.ml,v 1.505 2004/12/22 12:23:32 zappa Exp $
 *                                                                *
*** Copyright 2002-2004 The Acute Team

  Allen-Williams, Mair
  Bishop, Steven
  Fairbairn, Matthew
  Habouzit, Pierre [*]
  Leifer, James [*]
  Sewell, Peter
  Sjberg, Vilhelm
  Steinruecken, Christian
  Vafeiadis, Viktor
  Wansbrough, Keith
  Zappa Nardelli, Francesco [*]
  Institut National de Recherche en Informatique et en Automatique (INRIA)

  Contributions of authors marked [*] are copyright INRIA.

All rights reserved.

This file is distributed under the terms of the GNU Lesser General
Public License, with the special exception on linking described in
file NEW-LICENSE.

***
 * -=-- ---------------------------------------------------- --=- *)

open Util

exception BadURI of string;;

type uri = { (* all stored unencoded, e.g., %3d appears as = *)
             scheme    : string;         (* "http" *)
             authority : string option;  (* // "www.cl.cam.ac.uk" *)
             path      : string option;  (* "/~kw217/index.html" *)
             query     : string option;  (* ? "foo" *)
           };;

let scheme_re_str = "[a-z][-a-z0-9+.]*";;
let authority_re_str = "\\(\\([-a-zA-Z0-9_.!~*'();:&=+$,]\\|%[0-9a-fA-F][0-9a-fA-F]\\)*@\\)?\\(\\(\\([a-zA-Z0-9]\\|[a-zA-Z0-9][-a-zA-Z0-9]*[a-zA-Z0-9]\\)\\.\\)*\\([a-zA-Z]\\|[a-zA-Z][-a-zA-Z]*[a-zA-Z]\\)\\.?\\|[0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+\\)\\(:[0-9]*\\)?";;
let path_re_str = "\\([-a-zA-Z0-9_.!~*'():@&=+$,]\\|%[0-9a-fA-F][0-9a-fA-F]\\)*\\(;\\([-a-zA-Z0-9_.!~*'():@&=+$,]\\|%[0-9a-fA-F][0-9a-fA-F]\\)*\\)*\\(/\\([-a-zA-Z0-9_.!~*'():@&=+$,]\\|%[0-9a-fA-F][0-9a-fA-F]\\)*\\(;\\([-a-zA-Z0-9_.!~*'():@&=+$,]\\|%[0-9a-fA-F][0-9a-fA-F]\\)*\\)*\\)*";;
let query_re_str = "\\([-a-zA-Z0-9_.!~*'()]\\|%[0-9a-fA-F][0-9a-fA-F]\\)*";;

let url_re =
  Str.regexp ("\\(" ^ scheme_re_str ^ "\\):" ^
              "//\\(" ^ authority_re_str ^ "\\)?" ^
              "\\(/" ^ path_re_str ^ "\\)?" ^
              "\\(\\?\\(" ^ query_re_str ^ "\\)\\)?" ^
              "$");;

let (n_scheme,n_authority,n_path,n_query) = (1,2,10,19);;
    (* ("http","www.cl.cam.ac.uk","/~kw217/index.html","foo") *)

let esc_re = Str.regexp "%[0-9a-fA-F][0-9a-fA-F]";;

let unescape : string -> string =
  fun s ->
    let f s =
      let s' = Str.matched_string s in
      let h c = let n = int_of_char c - 48 in
                if n < 10 then n else (n - 7) mod 32 in
      let n = h (String.get s' 1) * 16 + h (String.get s' 2) in
      String.make 1 (char_of_int n)
    in
    Str.global_substitute esc_re f s

let escchars_re = Str.regexp "[^a-zA-Z0-9-_.!~*'()]"

let escape : string -> string =
  fun s ->
    let f s =
      let s' = Str.matched_string s in
      Printf.sprintf "%%%02x" (int_of_char (String.get s' 0))
    in
    Str.global_substitute escchars_re f s

let path_escchars_re = Str.regexp "[^a-zA-Z0-9-_.!~*'()/]"

let path_escape : string -> string =
  fun s ->
    let f s =
      let s' = Str.matched_string s in
      Printf.sprintf "%%%02x" (int_of_char (String.get s' 0))
    in
    Str.global_substitute path_escchars_re f s

(* Parses a URI string, as a subgrammar of RFC2396 <absoluteURI> *)
(* - we support only schemes file, http, https, ftp;

   - we do not allow uppercase in the scheme (contrary to S3.1,
   because we hope that this is normalised earlier - important if we
   ever hash resolvespecs);

   - these schemes are all hierarchical, so we need not consider
     <opaque_part>;

   - we require that a <net_path> is given, naming the authority;

   - these schemes all use server-based naming for authorities
     (S3.2.2).
*)

let parse : string -> uri =
  fun s ->
    if not (Str.string_match url_re s 0) then
      raise (BadURI "URL is not an RFC2396 hierarchical, server-based naming <absoluteURI>");
    (* XXX NOT THREAD SAFE! *)
    let scheme    = Str.matched_group n_scheme s in
    let authority = try Some (Str.matched_group n_authority s) with Not_found -> None in
    let path      = try Some (Str.matched_group n_path s) with Not_found -> None in
    let query     = try Some (Str.matched_group n_query s) with Not_found -> None in
    if not (List.mem scheme ["file";"http";"ftp"]) then
      raise (BadURI ("Unsupported URI scheme (protocol) "^scheme));
    if (List.mem scheme ["http";"ftp"] && not (is_Some authority)) then
      raise (BadURI ("Authority (hostname) must be specified for URI scheme "^scheme));
    if (scheme = "file" && is_Some authority) then
      raise (BadURI ("Authority (hostname) must not be specified for file URI"));
    if (scheme = "file" && is_Some query) then
      raise (BadURI ("Query part must not be present in a file URI"));
    { scheme    = scheme;
      authority = option_lift unescape authority;
      path      = option_lift unescape path;
      query     = option_lift unescape query;
    };;

let build : uri -> string =
  fun uri ->
    uri.scheme ^ ":" ^
    (match uri.authority with
      None   -> ""
    | Some s -> "//" ^ escape s) ^
    (match uri.path with
      None   -> ""
    | Some s -> path_escape s) ^
    (match uri.query with
      None   -> ""
    | Some s -> "?" ^ escape s)

let wget_command = "/usr/bin/wget";;

let wget : string -> in_channel =
  fun uri_str ->
    let (in0,out0) = Unix.pipe () in  (* data written to out will appear at in *)
    let (in1,out1) = Unix.pipe () in  (* data written to out will appear at in *)
    (* stderr redirection commented out; ideally we'd catch this data and
       present it in case of an error, but since it's asynchronous that's
       nontrivial.  For now, let it go to stderr of the main process (ick). *)
    (* let (in2,out2) = Unix.pipe () in *)  (* data written to out will appear at in *)

    (try Unix.access wget_command [Unix.X_OK] with
    Unix.Unix_error(_) -> raise (Failure ("Command "^wget_command^" is not executable!")));

    let pid = Unix.fork () in
    (if pid <> 0 then  (* we are parent *)
      (Unix.close in0;  (* we don't need these ends of the pipes *)
       Unix.close out1;
       (* Unix.close out2 *) )
    else  (* we are child *)
      ((* WARNING: the child must not make any slow call; if OCaml is
          induced to invoke enter_blocking_section(), then a race occurs
          which frequently ends in deadlock.  Sadly, this means that if
          the exec fails, it must do so silently.  Hmm, it's possible that
          even raising an exception is fatal :-( *)
       Unix.dup2 in0  Unix.stdin;  Unix.close in0;  Unix.close out0; (* redirect stdin *)
       Unix.dup2 out1 Unix.stdout; Unix.close out1; Unix.close in1;  (* redirect stdout *)
       (* Unix.dup2 out2 Unix.stderr; Unix.close out2; Unix.close in2; *) (* redirect stderr *)
       (try
         Unix.execv wget_command [| wget_command; "-nv"; "-O"; "-"; uri_str |]
       with
         _ -> ());
       exit 255));

    let out_child   = Unix.out_channel_of_descr out0 in  (* out to child's stdin *)
    let in_child    = Unix.in_channel_of_descr  in1  in  (* in from child's stdout *)
    (* let inerr_child = Unix.in_channel_of_descr  in2  in *)  (* in from child's stderr *)

    close_out out_child;  (* no need to tell wget anything *)

    in_child;;

let open_uri : uri -> in_channel =
  fun uri ->
    Debug.print (fun () -> "open_uri("^build uri^")");
    if uri.scheme = "file" then
      begin
        assert (not (is_Some uri.authority));
        assert (not (is_Some uri.query));
        assert (is_Some uri.path);
        Debug.print (fun () -> "open_uri doing open_in("^the uri.path^")");
        open_in (the uri.path);
      end
    else if List.mem uri.scheme ["ftp";"http"] then
      begin
        assert (is_Some uri.authority);
        (if not (is_Some uri.path) then
	  raise (BadURI ("URL doesn't have a path")));

	wget (build uri)
      end
    else
      raise (Util.Unimplemented ("URI scheme "^uri.scheme^" unimplemented; sorry!"))

