/******************************************************************************
 * socketcall.c
 * 
 * BSD sockets interface to TCP/IP stack.
 * 
 * Copyright (c) 1999-2000, K A Fraser
 * 
 * $Id: socketcall.c,v 3.2 1999/12/18 16:27:38 kaf24 Exp kaf24 $
 */

#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <errno.h>
#include <string.h>
#include <sys/types.h>
#include <asm/types.h>
#include <sys/socket.h>
#include <sys/ioctl.h>
#include <netinet/in.h>
#include <time.h>
#include <signal.h>
#include "thread.h"
#include "private.h"
#include <sys/mman.h>
#include <sched.h>
/*
 * Global table of live sockets: maps a pcb's uid (the descriptor value that
 * user_socket()/user_accept() hand back to callers) to its struct user_pcb.
 * Created by user_init() and destroyed by user_kill().
 */
hash_table_t   *pcbs = NULL;
#include "libusernet.h"
#undef printf
#undef fprintf
#include "lowlevel.h"

/******************************************************************************
 * If given condition is false, log the error, set errno to the given value
 * and return -1 from the ENCLOSING function.
 *
 * The body is wrapped in do { ... } while (0) so that an invocation followed
 * by ';' is exactly one statement, which keeps it safe inside an unbraced
 * if/else.
 */
#define ERROR_CHECK(_cond, _errnum, _errstr)                         \
    do {                                                             \
        if ( !(_cond) ) {                                            \
            DB_ERR(_errstr "(errno str: %s)", strerror(_errnum));    \
            errno = (_errnum);                                       \
            return(-1);                                              \
        }                                                            \
    } while ( 0 )
/*
 * Same contract as ERROR_CHECK, but reports through FDB_ERR instead of
 * DB_ERR. Wrapped in do { ... } while (0) so that an invocation followed by
 * ';' is exactly one statement (safe inside an unbraced if/else).
 */
#define FERROR_CHECK(_cond, _errnum, _errstr)                        \
    do {                                                             \
        if ( !(_cond) ) {                                            \
            FDB_ERR(_errstr "(errno str: %s)", strerror(_errnum));   \
            errno = (_errnum);                                       \
            return(-1);                                              \
        }                                                            \
    } while ( 0 )

/******************************************************************************
 * Look up the pcb associated with socket descriptor <_s> in the global
 * 'pcbs' table and store it in <_pcb>. If there is no such entry, set errno
 * to EBADF and return -1 from the ENCLOSING function.
 *
 * NB. No locking is performed by this macro.
 *
 * Wrapped in do { ... } while (0) so that an invocation followed by ';' is
 * exactly one statement (safe inside an unbraced if/else).
 */
#define PCB_FROM_FD(_s, _pcb)                                        \
    do {                                                             \
        if ( ((_pcb) = element_for_key(pcbs, (_s))) == NULL )        \
        {                                                            \
            errno = EBADF;                                           \
            return(-1);                                              \
        }                                                            \
    } while ( 0 )

/******************************************************************************
 * Unique identifier for next connection we create.
 *
 * Reset to 1 by user_init() and post-incremented by create_new_socket(); the
 * value taken becomes pcb->uid, which user_socket() and user_accept() return
 * to the caller as the socket descriptor. user_kill() also uses it as the
 * upper bound when closing every descriptor handed out so far.
 */
int next_connection_id;

/* XXX KAF -- test vars */
unsigned int histogram[NUM_BUCKETS];

/*
 * Nice fast gettimeofday(). However, it may be up to 10ms behind wall clock.
 *
 * 'tmaps' is filled in by user_init() from /proc/afuser_tmap. The time is
 * obtained by copying the struct timeval that tmaps.tv points at; the
 * timezone argument is ignored. Always returns 0.
 */
struct {
    unsigned long volatile *jiffies;
    struct timeval         *tv;
} tmaps;

int gettimeofday(struct timeval *__tv, struct timezone *__tz)
{
    *__tv = *tmaps.tv;
    return(0);
}

/******************************************************************************
 * user_init:
 *   Initialises the protocol stack. Should be called at start-of-day and
 *   never called again!
 */
/*
 * Bit flags, manipulated with test_and_set_bit()/test_and_clear_bit():
 *   bit 0 -- set by user_init(), cleared by user_kill().
 *   bit 1 -- set the first time user_init() runs, so that atexit(clearup)
 *            is registered only once.
 */
int initialised = 0;

/* atexit() hook registered by user_init(): shuts the stack down. */
static void clearup(void)
{
    user_kill();
}
/*
 * Thread entry point used by sigint_handler(): shuts the stack down and then
 * terminates the process with exit status 0. Never returns; <arg> is unused.
 */
static void *clearup_and_exit(void *arg)
{
    user_kill();
    exit(0);
}

/* Thread created by sigint_handler() to run clearup_and_exit(). */
pthread_t death_thread;

/*
 * Signal handler: starts a thread that runs clearup_and_exit(). The signal
 * number is ignored.
 *
 * NOTE(review): pthread_create() is not on the POSIX list of
 * async-signal-safe functions. The sigaction() call that would install this
 * handler is currently commented out in user_init().
 */
static void sigint_handler(int ignored) 
{ 
    pthread_create(&death_thread, NULL, clearup_and_exit, NULL);
}

/*
 * Start-of-day initialisation. Only the caller that flips bit 0 of
 * 'initialised' from clear to set does any work; everybody else returns
 * immediately. Any failure is fatal: a message is printed (where there is
 * one) and the process exits with status 1.
 */
void user_init(void)
{
    struct sigaction  act;
    FILE             *f;

    if ( test_and_set_bit(0, &initialised) ) return;

    /* The first thing we do is get the timer mappings! */
    f = fopen("/proc/afuser_tmap", "rb");
    if ( (f == NULL) ||
         (fread(&tmaps, 1, sizeof(tmaps), f) != sizeof(tmaps)) ||
         (tmaps.jiffies == NULL) ||
         (tmaps.tv == NULL) )
    {
        printf("Could not read from /proc/afuser_tmap\n");
        printf("Are the required kernel modules installed?\n");
        exit(1);
    }
    fclose(f);

    /* Thread scheduler must be up before anything else. */
    if ( !pth_init() )
    {
        printf("FATAL ERROR initialising thread scheduler\n");
        exit(1);
    }

    TRC_INIT();
    PROF_INIT();

    /* Descriptor numbering starts at 1; descriptor table starts empty. */
    next_connection_id = 1;
    pcbs = init_hash_table(16);
    if ( pcbs == NULL ) exit(1);

    init_upcalls();
    inet_proto_init(NULL);

    /* Bit 1 makes sure the atexit() hook is registered only once. */
    if ( !test_and_set_bit(1, &initialised) ) atexit(clearup);

    /*
     * Register signal handlers for proper cleanup. NB. The handler is
     * prepared here but its installation is currently disabled.
     */
    act.sa_handler = sigint_handler;
    sigemptyset(&act.sa_mask);
    act.sa_flags = 0;
    //sigaction(SIGINT, &act, NULL);
}


/******************************************************************************
 * user_kill:
 *   Kills the protocol stack.
 */
void user_kill(void)
{
    int i;
    struct user_pcb *pcb;

    if ( test_and_clear_bit(0, &initialised) )
    {
        /* Force every connection to start closing down. NB. This code sucks.*/
        for ( i = 0; i < next_connection_id; i++ ) user_close(i);

        /* Wait for all connections to go away. */
        wait_for_usd_conns_to_close();

        /* This guarantees us that the stack is inactive. */
        kill_upcalls();

        /* Noone will attempt to access hash table now -- time to kill it. */
        destroy_hash_table(pcbs);

        /* Now we knock the stack itself on the head. */
        inet_proto_shutdown();
        PROF_CLOSE();
        TRC_CLOSE();
        pth_kill();
    }
    else
    {
        /*
         * Should still wait for all connections to go away, or caller may
         * simply exit() as soon as we return!
         */
        wait_for_usd_conns_to_close();
    }
}


/******************************************************************************
 * create_new_socket:
 *   Allocates a new, zero-filled pcb for a connection of <type>, opens a
 *   usd device connection for it, assigns it the next connection id and
 *   sets up its locked transmit memory inside the shared data area.
 *
 *   Returns a pointer to the socket embedded in the new pcb, or NULL on
 *   failure (everything created so far is released again).
 *
 *   NOTE: No connection to the NIC is made until listen() or connect() is
 *   called. Also, this function does not call into the stack core: it is
 *   up to the caller to do inet_create().
 */
struct u_socket *create_new_socket(int type)
{
    struct user_pcb *pcb;

    DB("entered");

    /* calloc() hands back zeroed memory, so every pcb field starts at 0. */
    if ( (pcb = calloc(1, sizeof(*pcb))) == NULL ) return(NULL);

    if ( (pcb->usd_conn = usd_setup_device_connection(type, getpid(), pcb)) 
         == NULL )
    {
        DB_ERR("leaving, could not create connection to usd device");
        goto e3;
    }

    /* NB. The id is consumed even if a later step fails. */
    pcb->uid = next_connection_id++;

    if ( usd_get_shared_data_area(pcb->usd_conn, &pcb->shared_data_area) <
         SHARED_DATA_SIZE )
    {
        DB_ERR("leaving, insufficient shared data area");
        errno = ENOMEM;
        goto e2;
    }

    if ( init_locked_tx_mem(pcb, 
                            BUFFERS_PER_RING, 
                            MAX_HEADER_LEN, 
                            pcb->shared_data_area + TX_HEADER,
                            TCP_FIFO_SIZE + TCP_FIFO_GRANULARITY,
                            pcb->shared_data_area + TX_DATA) < 0 )
    {
        FDB_ERR("leaving, no skbuff memory");
        goto e2;
    }
    
    /* Setup pcb here */
    pcb->type = type;

    /* Socket setup -- massively cutdown version of code from socket.c */
    pcb->sock.type = pcb->type;
    init_waitqueue_head(&(pcb->sock.wait));

    FDB("leaving, succeeded");
    return(&(pcb->sock));

    /*
     * Clean up what has been created when an error occurs.
     */
e2: usd_close_device_connection(pcb->usd_conn);
e3: free(pcb);
    return(NULL);
}


/******************************************************************************
 * free_socket:
 *   Undoes 'create_new_socket': closes the usd device connection belonging
 *   to the pcb that contains <sock>, then frees that pcb.
 */
void free_socket(struct u_socket *sock)
{
    struct user_pcb *pcb;

    pcb = PCB_FROM_SOCKET(sock);

    usd_close_device_connection(pcb->usd_conn);
    free(pcb);
}


/******************************************************************************
 * bind_new_socket:
 *   Binds <sock> to the local address <saddr>:<sport> which should be
 *   specified in NETWORK BYTE ORDER.
 *
 *   <saddr> and <sport> are in/out: on success they are overwritten with
 *   the address and port that the usd layer reports back, and
 *   STATE_LOCAL_BOUND is set in the pcb state.
 *
 *   Returns 0 on success, -1 on failure. errno is not set by this function
 *   itself.
 *
 *   NOTE: It is the caller's responsibility to do inet_bind() -- this call
 *   simply calls into the NIC interface code and sets up a local filter.
 */
int bind_new_socket(struct u_socket *sock, u32 *saddr, u16 *sport)
{
    struct user_pcb *pcb = PCB_FROM_SOCKET(sock);
    struct sockaddr_in inaddr;

    /* Zero the whole structure so no stack garbage (sin_zero) is passed on. */
    memset(&inaddr, 0, sizeof(inaddr));
    inaddr.sin_family      = AF_INET;
    inaddr.sin_addr.s_addr = *saddr;
    inaddr.sin_port        = *sport;
    if ( usd_bind_to_local_address(pcb->usd_conn, &inaddr, NULL) )
    {
        FDB_ERR("leaving, could not bind to local address");
        return(-1);
    }
    
    /* Report back what we were actually bound to. */
    *saddr = inaddr.sin_addr.s_addr;
    *sport = inaddr.sin_port;
    set_bit(STATE_LOCAL_BOUND, &(pcb->state));

    FDB("bound to %d.%d.%d.%d:%d",
        *saddr&0xff, (*saddr>>8)&0xff, (*saddr>>16)&0xff, (*saddr>>24)&0xff, 
        ntohs(*sport));
        
    return(0);
}


/******************************************************************************
 * connect_new_socket:
 *   Connects <sock> to the remote address <daddr>:<dport> which should be
 *   specified in NETWORK BYTE ORDER.
 *
 *   After the usd layer has accepted the remote address, the receive ring
 *   is populated with BUFFERS_PER_RING buffers from the shared data area,
 *   rx/tx callbacks are requested, connection filtering is enabled and
 *   STATE_CONNECTED is set in the pcb state.
 *
 *   Returns 0 on success, -1 on failure.
 *
 *   NOTE: It is the caller's responsibility to do inet_connect() -- this
 *   call simply calls into the NIC interface code and sets up a local
 *   filter for the given remote address.
 */
int connect_new_socket(struct u_socket *sock, u32 daddr, u16 dport)
{
    struct user_pcb *pcb = PCB_FROM_SOCKET(sock);
    struct sockaddr_in inaddr;
    int i;

    /* Zero the whole structure so no stack garbage (sin_zero) is passed on. */
    memset(&inaddr, 0, sizeof(inaddr));
    inaddr.sin_family      = AF_INET;
    inaddr.sin_addr.s_addr = daddr;
    inaddr.sin_port        = dport;
    if ( usd_connect_to_remote_address(pcb->usd_conn, &inaddr) )
    {
        FDB_ERR("leaving, failed to connect to remote address");
        return(-1);
    }

    /*
     * Immediately after connecting, we set up the rx ring.
     * NOTE(review): the constant byte offsets (2 + 4, or 2 for split
     * headers) are presumably alignment padding -- confirm.
     */
    for ( i = 0; i < BUFFERS_PER_RING; i++ )
    {
#ifndef HDR_SPLIT
        usd_add_to_rx_queue(pcb->usd_conn, 
                            pcb->shared_data_area + RX_DATA + 2 + 4 +  
                            i * BYTES_PER_BUFFER,
                            BYTES_PER_BUFFER, TRUE);
#else
        usd_add_hdr_to_rx_queue(pcb->usd_conn,
                                pcb->shared_data_area + RX_HEADER + 2 +
                                i * MAX_HEADER_LEN,
                                MAX_HEADER_LEN);
        usd_add_data_to_rx_queue(pcb->usd_conn,
                                 pcb->shared_data_area + RX_DATA +
                                 i * BYTES_PER_BUFFER,
                                 BYTES_PER_BUFFER, TRUE);
#endif
    }
    usd_push_new_rx_bufs_to_nic(pcb->usd_conn);
#ifndef HDR_SPLIT
    usd_rx_req_callback(pcb->usd_conn, 1);
#else
    usd_rx_req_callback(pcb->usd_conn, 2);
#endif
    usd_tx_req_callback(pcb->usd_conn, 1);

    if ( usd_enable_connection_filtering(pcb->usd_conn) )
    {
        /* Do not fail silently: every other error path here is logged. */
        FDB_ERR("leaving, could not enable connection filtering");
        return(-1);
    }

    set_bit(STATE_CONNECTED, &(pcb->state));

    FDB("connected to %d.%d.%d.%d:%d",
        daddr&0xff, (daddr>>8)&0xff,
        (daddr>>16)&0xff, (daddr>>24)&0xff, ntohs(dport));

    return(0);
}


/******************************************************************************
 * bind_and_connect_new_socket:
 *   Runs 'bind_new_socket' followed by 'connect_new_socket' on <sock>. If
 *   the bind step fails, its (negative) result is returned and no connect
 *   is attempted; otherwise the result of the connect step is returned.
 *   This is only called from within the Linux stack, to create a new active
 *   connection from a listener.
 */
int bind_and_connect_new_socket(struct u_socket *sock, 
                                u32 saddr, 
                                u16 sport, 
                                u32 daddr, 
                                u16 dport)
{
    int ret = bind_new_socket(sock, &saddr, &sport);

    if ( ret < 0 )
    {
        return(ret);
    }

    return(connect_new_socket(sock, daddr, dport));
}


/*
 * user_accept:
 *   Accepts a pending connection on listening stream socket <s> and
 *   registers the new socket in the global descriptor table.
 *
 *   <addr>/<addrlen> follow the accept() convention: <addr> may be NULL if
 *   the caller does not want the peer address. Otherwise <addrlen> must
 *   point at the size of the buffer, which must be able to hold a
 *   struct sockaddr_in; on success the peer address is stored in <addr> and
 *   *<addrlen> is set to sizeof(struct sockaddr_in).
 *
 *   Returns the new socket descriptor, or -1 with errno set:
 *     EBADF       <s> is not a known descriptor
 *     EOPNOTSUPP  <s> is not a SOCK_STREAM socket
 *     EFAULT      bad <addrlen>
 *     ENOMEM      the new socket could not be added to the descriptor table
 *     other       the (negated) error returned by inet_accept()
 */
int user_accept(int s, void *addr, int *addrlen)
{
    struct user_pcb *pcb;
    struct user_pcb *newpcb;
    struct u_socket *newsock;
    struct sockaddr_in *inaddr = addr;
    int err;
    
    DB("entered");

    PCB_FROM_FD(s, pcb);

    FERROR_CHECK(pcb->type == SOCK_STREAM, EOPNOTSUPP, 
                 "leaving, wrong sort of socket");
    if ( inaddr != NULL )
    {
        /*
         * Compare as signed ints: comparing against the unsigned sizeof
         * value directly would let a negative length pass the check.
         */
        FERROR_CHECK((addrlen != NULL) &&
                     (*addrlen >= (int)sizeof(struct sockaddr_in)),
                     EFAULT, "leaving, failed");
    }

    if ( (err = inet_accept(&(pcb->sock), &newsock, 0)) < 0 )
    {
        FDB_ERR("user_accept() failed: %d (%s)", -err, strerror(-err));
        errno = -err;
        return(-1);
    }
    newpcb = PCB_FROM_SOCKET(newsock);

    if ( !insert_hash_entry(pcbs, newpcb->uid, newpcb) )
    {
        FDB_ERR("leaving, couldn't add socket to hash table");
        inet_release(&(newpcb->sock));
        /* Set errno last so that the cleanup above cannot clobber it. */
        errno = ENOMEM;
        return(-1);
    }

    if ( inaddr != NULL )
    {
        inaddr->sin_family      = AF_INET;
        inaddr->sin_addr.s_addr = newpcb->sk->daddr;
        inaddr->sin_port        = newpcb->sk->dport;
        *addrlen                = (int)sizeof(struct sockaddr_in);
    }

    FDB("leaving, succeeded");
    return(newpcb->uid);
}


/*
 * user_bind:
 *   Binds socket <s> to the local AF_INET address in <addr>. A zero address
 *   and/or port asks for a default to be chosen.
 *
 *   STATE_LOCAL_BOUND is claimed up front (so two racing binds cannot both
 *   proceed) and released again if either the NIC-level bind or inet_bind()
 *   fails.
 *
 *   Returns 0 on success, or -1 with errno set:
 *     EBADF   <s> is not a known descriptor
 *     EFAULT  <addr> is NULL or <addrlen> is too small for a sockaddr_in
 *     EINVAL  address family is not AF_INET, or socket is already bound
 *     other   the (negated) error returned by inet_bind()
 *   NB. When bind_new_socket() fails, errno is whatever the lower layer
 *   left behind.
 */
int user_bind(int s, const void *addr, int  addrlen)
{
    struct user_pcb          *pcb;
    const struct sockaddr_in *given = addr;
    struct sockaddr_in        inaddr;
    int                       err;

    DB("entered");

    PCB_FROM_FD(s, pcb);

    /*
     * Compare as signed ints: comparing against the unsigned sizeof value
     * directly would let a negative length pass the check.
     */
    FERROR_CHECK((given != NULL) &&
                 (addrlen >= (int)sizeof(struct sockaddr_in)), 
                 EFAULT, "leaving, bad addrlen");
    FERROR_CHECK(given->sin_family == AF_INET, 
                 EINVAL, "leaving, address family must be AF_INET");
    FERROR_CHECK(!test_and_set_bit(STATE_LOCAL_BOUND, &(pcb->state)), 
                 EINVAL, "leaving, already bound");

    /* Copy address, as not supposed to modify. */
    memcpy(&inaddr, given, sizeof(struct sockaddr_in));

    /*
     * Note that this will modify address and port if they haven't been
     * specified by caller (ie. set to zero).
     */
    if ( bind_new_socket(&(pcb->sock), 
                         &(inaddr.sin_addr.s_addr), 
                         &(inaddr.sin_port)) < 0 )
    {
        clear_bit(STATE_LOCAL_BOUND, &(pcb->state));
        FDB_ERR("bind_new_socket failed, error %d: %s", 
                errno, strerror(errno));
        return(-1);
    }

    /* Now tell the stack core about the (possibly updated) address. */
    if ( (err = inet_bind(&(pcb->sock), 
                          (struct sockaddr *)&inaddr, 
                          sizeof(struct sockaddr_in))) < 0 )
    {
        clear_bit(STATE_LOCAL_BOUND, &(pcb->state));
        FDB_ERR("inet_bind failed, error %d: %s", -err, strerror(-err));
        errno = -err;
        return(-1);
    }

    FDB("leaving, succeeded");
    return(0);
}


/*
 * user_connect:
 *   Connects socket <s> to the remote AF_INET address in <addr>.
 *
 *   STATE_CONNECTED is claimed up front and released again on every failure
 *   path. If the socket has not been bound yet, it is first bound to a
 *   default local address and port via user_bind(). The NIC-level connect
 *   is done before the stack-level one (stream or datagram, by socket type).
 *
 *   Returns 0 on success, or -1 with errno set:
 *     EBADF   <s> is not a known descriptor
 *     EFAULT  <addr> is NULL or <addrlen> != sizeof(struct sockaddr_in)
 *     EINVAL  address family is not AF_INET, or socket is listening or
 *             already connected
 *     other   the (negated) error returned by the inet connect routine
 *   NB. When user_bind() or connect_new_socket() fails, errno is whatever
 *   they left behind.
 */
int user_connect(int s, void *addr, int addrlen)
{
    struct user_pcb    *pcb;
    struct sockaddr_in *inaddr = (struct sockaddr_in *)addr;
    int                 err;

    DB("entered");

    PCB_FROM_FD(s, pcb);

    /* Reject a NULL address here rather than dereferencing it below. */
    FERROR_CHECK((inaddr != NULL) &&
                 (addrlen == sizeof(struct sockaddr_in)),
                 EFAULT, "leaving, bad addrlen");
    FERROR_CHECK(inaddr->sin_family == AF_INET,
                 EINVAL, "leaving, address family must be AF_INET");
    /*
     * Order matters: the listening test comes first so that the connected
     * bit is only claimed for sockets that are not listening.
     */
    FERROR_CHECK(!(test_bit(STATE_LISTENING, &(pcb->state)) || 
                   test_and_set_bit(STATE_CONNECTED, &(pcb->state))),
                   EINVAL, "leaving, already bound/listening");

    if ( !test_bit(STATE_LOCAL_BOUND, &(pcb->state)) )
    {
        struct sockaddr_in dummy_addr = {0};
        FDB("binding to default local address and port");
        dummy_addr.sin_family = AF_INET;

        if ( user_bind(s, &dummy_addr, sizeof(struct sockaddr_in)) < 0 )
        {
            clear_bit(STATE_LOCAL_BOUND, &(pcb->state));
            clear_bit(STATE_CONNECTED,   &(pcb->state));
            return(-1);
        }
    }

    /* NIC-level connect: remote filter and rx ring. */
    if ( connect_new_socket(&(pcb->sock), 
                            inaddr->sin_addr.s_addr, 
                            inaddr->sin_port) < 0 )
    {
        clear_bit(STATE_CONNECTED,   &(pcb->state));
        FDB_ERR("connect_new_socket failed, error %d; %s", 
                errno, strerror(errno));
        return(-1);
    } 

    /* Stack-level connect, chosen by socket type. */
    if ( (err = ((pcb->type == SOCK_STREAM) ? 
                 inet_stream_connect : inet_dgram_connect)
          (&(pcb->sock), addr, sizeof(struct sockaddr_in), 0)) < 0 )
    {
        clear_bit(STATE_CONNECTED,   &(pcb->state));
        FDB_ERR("inet_???_connect failed, error %d: %s",
                -err, strerror(-err));
        errno = -err;
        return(-1);
    }

    FDB("leaving, succeeded");
    return(0);
}

#if 0
/*
 * user_getpeername:
 *   Currently compiled out (enclosed in #if 0).
 *
 *   Copies the remote address and port recorded in the pcb of socket <s>
 *   into <addr> (treated as a struct sockaddr_in) and sets *<addrlen> to
 *   sizeof(struct sockaddr_in). Fails with ENOTCONN if the pcb is not
 *   marked connected, and with EFAULT if <addrlen> is NULL or too small.
 *
 *   NOTE(review): sin_family is never written. The pcb fields used here
 *   (connected, ipv4, ports) are not used by any enabled code visible in
 *   this file -- confirm they still exist before re-enabling.
 */
int user_getpeername(int s, void *addr, int *addrlen)
{
    struct user_pcb    *pcb;
    struct sockaddr_in *inaddr;

    DB("entered");

    PCB_FROM_FD(s, pcb);

    FERROR_CHECK(pcb->connected, ENOTCONN, "leaving, not connected");
    FERROR_CHECK(addrlen && *addrlen >= sizeof(struct sockaddr_in),
                 EFAULT, "leaving, bad addrlen");

    inaddr   = (struct sockaddr_in *)addr;
    *addrlen = sizeof(struct sockaddr_in);

    inaddr->sin_addr.s_addr = pcb->ipv4.un.visible.dst.dst_long;
    inaddr->sin_port        = pcb->ports.dst;

    FDB("leaving, succeeded");
    return(0);
}


/*
 * user_getsockname:
 *   Currently compiled out (enclosed in #if 0).
 *
 *   Copies the local address and port recorded in the pcb of socket <s>
 *   into <addr> (treated as a struct sockaddr_in) and sets *<addrlen> to
 *   sizeof(struct sockaddr_in). Fails with ENOTCONN if the pcb is not
 *   marked locally bound, and with EFAULT if <addrlen> is NULL or too small.
 *
 *   NOTE(review): sin_family is never written, and the log text says
 *   "not connected" although the test is on local_bound. The pcb fields
 *   used here (local_bound, ipv4, ports) are not used by any enabled code
 *   visible in this file -- confirm they still exist before re-enabling.
 */
int user_getsockname(int s, void *addr, int *addrlen)
{
    struct user_pcb    *pcb;
    struct sockaddr_in *inaddr;

    DB("entered");

    PCB_FROM_FD(s, pcb);

    FERROR_CHECK(pcb->local_bound, ENOTCONN, "leaving, not connected");
    FERROR_CHECK(addrlen && *addrlen >= sizeof(struct sockaddr_in),
                 EFAULT, "leaving, bad addrlen");

    inaddr   = (struct sockaddr_in *)addr;
    *addrlen = sizeof(struct sockaddr_in);

    inaddr->sin_addr.s_addr = pcb->ipv4.un.visible.src.src_long;
    inaddr->sin_port        = pcb->ports.src;

    FDB("leaving, succeeded");
    return(0);
}

/*
 * user_getsockopt:
 *   Currently compiled out (enclosed in #if 0).
 *
 *   SOL_SOCKET / SO_RCVLOWAT and SO_SNDLOWAT are answered from the pcb;
 *   every other option is passed on to getsockopt().
 *
 *   NOTE(review): as written this cannot compile -- <optlen> is declared as
 *   a plain int but is dereferenced (*optlen) below; it presumably should
 *   be a pointer. The failure path also unlocks pcb->lock although this
 *   function never locks it.
 */
int user_getsockopt(int s, int level, int optname, void *optval, int optlen)
{
    struct user_pcb    *pcb;

    DB("entered");

    PCB_FROM_FD(s, pcb);

    if ( level == SOL_SOCKET &&
         (optname == SO_RCVLOWAT || optname == SO_SNDLOWAT) )
    {
        FERROR_CHECK(*optlen >= sizeof(int),
                     EINVAL, "leaving, incorrect optlen");
        *optlen = sizeof(int);
        *((int *)optval) = (optname == SO_RCVLOWAT) ? 
            pcb->rcvlowat : pcb->sndlowat;
    }
    else if ( getsockopt(s, level, optname, optval, optlen) < 0 )
    {
	FDB_ERR("leaving, getsockopt failed");
	pthread_mutex_unlock(&(pcb->lock));
	return(-1);
    }
    
    FDB("leaving, succeeded");
    return(0);
}


/*
 * user_setsockopt:
 *   Currently compiled out (enclosed in #if 0).
 *
 *   SOL_SOCKET / SO_RCVLOWAT and SO_SNDLOWAT are stored in the pcb (the
 *   option value must be exactly one int, else EINVAL); every other option
 *   is passed on to setsockopt().
 *
 *   Returns 0 on success, -1 on failure.
 */
int user_setsockopt(int s, int level, int optname, void *optval, int optlen)
{
    struct user_pcb    *pcb;

    DB("entered");

    PCB_FROM_FD(s, pcb);

    if ( level == SOL_SOCKET &&
         (optname == SO_RCVLOWAT || optname == SO_SNDLOWAT) )
    {
        FERROR_CHECK(optlen == sizeof(int), 
                     EINVAL, "leaving, incorrect optlen");
        if ( optname == SO_RCVLOWAT )
        {
            pcb->rcvlowat = *((int *)optval);
        }
        else /* SO_SNDLOWAT */
        {
            pcb->sndlowat = *((int *)optval);
        }
    }
    else if ( setsockopt(s, level, optname, optval, optlen) < 0 )
    {
	FDB_ERR("leaving, setsockopt failed");
	return(-1);
    }
    
    FDB("leaving, succeeded");
    return(0);
}
#endif

int user_listen(int s, int backlog)
{
    struct user_pcb    *pcb;
    int                 i;

    DB("entered");

    PCB_FROM_FD(s, pcb);

    FERROR_CHECK(pcb->type == SOCK_STREAM,
                 EOPNOTSUPP, "leaving, wrong sort of socket");
    FERROR_CHECK(test_bit(STATE_LOCAL_BOUND, &(pcb->state)) && 
                 !test_bit(STATE_CONNECTED, &(pcb->state)) &&
                 !test_and_set_bit(STATE_LISTENING, &(pcb->state)),
                 ENOTCONN, "leaving, unbound/listening/connected");

    if ( (i = inet_listen(&(pcb->sock), backlog)) < 0 )
    {
        clear_bit(STATE_LISTENING, &(pcb->state));
        FDB_ERR("inet_listen() failed, error %d: %s", -i, strerror(-i));
        errno = -i;
        return(-1);
    }

    if ( usd_listen_for_incoming_connections(pcb->usd_conn, backlog) != 0 )
    {
        clear_bit(STATE_LISTENING, &(pcb->state));
	FDB_ERR("leaving, usd_listen failed");
	return(-1);
    }

    /* Immediately after connecting, we set up the rx ring. */
    for ( i = 0; i < BUFFERS_PER_RING; i++ )
    {
#ifndef HDR_SPLIT
        usd_add_to_rx_queue(pcb->usd_conn, 
                            pcb->shared_data_area + RX_DATA + 2 + 4 +  
                            i * BYTES_PER_BUFFER,
                            BYTES_PER_BUFFER, TRUE);
#else
        usd_add_hdr_to_rx_queue(pcb->usd_conn,
                                pcb->shared_data_area + RX_HEADER + 2 +
                                i * MAX_HEADER_LEN,
                                MAX_HEADER_LEN);
        usd_add_data_to_rx_queue(pcb->usd_conn,
                                 pcb->shared_data_area + RX_DATA +
                                 i * BYTES_PER_BUFFER,
                                 BYTES_PER_BUFFER, TRUE);
#endif
    }
    usd_push_new_rx_bufs_to_nic(pcb->usd_conn);
#ifndef HDR_SPLIT
    usd_rx_req_callback(pcb->usd_conn, 1);
#else
    usd_rx_req_callback(pcb->usd_conn, 2);
#endif
    usd_tx_req_callback(pcb->usd_conn, 1);


    if ( usd_enable_connection_filtering(pcb->usd_conn) ) return(-1);

    FDB("leaving, succeeded");
    return(0);
}


/*
 * user_shutdown:
 *   Hands the shutdown request <how> for socket <s> to inet_shutdown().
 *   Returns 0 on success; on failure returns -1 with errno set to the
 *   (negated) error from inet_shutdown(), or to EBADF if <s> is unknown.
 */
int user_shutdown(int s, volatile int how)
{
    int err;
    struct user_pcb    * volatile pcb;

    DB("entered");

    PCB_FROM_FD(s, pcb);

    err = inet_shutdown(&(pcb->sock), how);
    if ( err >= 0 )
    {
        FDB("leaving, succeeded");
        return(0);
    }

    FDB_WRN("inet_shutdown, %d: %s", -err, strerror(-err));
    errno = -err;
    return(-1);
}


/*
 * user_socket:
 *   Creates a new AF_INET socket of the given <type> and <protocol>:
 *   allocates the pcb (create_new_socket), initialises it in the stack core
 *   (inet_create) and stores it in the global descriptor table.
 *
 *   Returns the new descriptor (the pcb's uid), or -1 on failure. errno is
 *   EINVAL for a wrong address family, the (negated) inet_create() error,
 *   or ENOMEM if the table insert fails; after a create_new_socket()
 *   failure it is whatever that call left behind.
 */
int user_socket(int af, int type, int protocol)
{
    struct u_socket *sock;
    struct user_pcb *pcb;
    int              ret;

    DB("entered");

    ERROR_CHECK(af == AF_INET, EINVAL, "incorrect address family");

    sock = create_new_socket(type);
    if ( sock == NULL )
    {
        DB_ERR("create_new_socket failed, error %d: %s", 
               errno, strerror(errno));
        return(-1);
    }
    pcb = PCB_FROM_SOCKET(sock);

    ret = inet_create(sock, protocol);
    if ( ret < 0 )
    {
        FDB_ERR("inet_create failed, error %d: %s", -ret, strerror(-ret));
        free_socket(sock);
        errno = -ret;
        return(-1);
    }

    /* All done setting up the pcb. Store it away in the global hash table. */
    if ( insert_hash_entry(pcbs, pcb->uid, pcb) )
    {
        FDB("leaving, succeeded");
        return(pcb->uid);
    }

    /* Could not publish the socket: tear it down again. */
    FDB_ERR("leaving, could not insert new socket in hash table");
    inet_release(sock);
    free_socket(sock);
    errno = ENOMEM;
    return(-1);
}


/*
 * user_socketpair:
 *   Not implemented. Always logs an error and fails with errno set to
 *   EOPNOTSUPP; none of the arguments are examined.
 */
int user_socketpair(int af, int type, int protocol, int sv[2])
{
    (void)af;
    (void)type;
    (void)protocol;
    (void)sv;

    DB_ERR("operation not supported");
    errno = EOPNOTSUPP;
    return -1;
}


/*
 * user_close:
 *   Starts closing socket <s>. The first caller to set STATE_CLOSING
 *   releases the socket in the stack core and removes it from the global
 *   descriptor table; any later caller finds the bit already set and
 *   returns 0 without doing anything.
 *
 *   Returns 0 on success; on failure returns -1 with errno set to EBADF
 *   (unknown descriptor) or to the (negated) error from inet_release().
 */
int user_close(int s)
{
    int err;
    struct user_pcb    * volatile pcb;

    DB("entered");

    PCB_FROM_FD(s, pcb);

    /* Close already under way: nothing more to do. */
    if ( test_and_set_bit(STATE_CLOSING, &(pcb->state)) )
    {
        return(0);
    }

    err = inet_release(&(pcb->sock));
    if ( err < 0 )
    {
        FDB_WRN("inet_release, %d: %s", -err, strerror(-err));
        errno = -err;
        return(-1);
    }

    /* Okay, user can no longer use this socket. */
    remove_hash_entry(pcbs, pcb->uid);

    /* The pcb is freed here only if it is not attached to the NIC! */
    if ( pcb->usd_conn == NULL )
    {
        free(pcb);
    }

    DB("leaving, succeeded");
    return(0);
}


/*
 * user_send:
 *   Sends <len> bytes starting at <msg> on socket <s> by wrapping the
 *   buffer in a single-element iovec and passing it to inet_sendmsg().
 *
 *   Returns the value returned by inet_sendmsg() on success; on failure
 *   returns -1 with errno set to EBADF (unknown descriptor) or to the
 *   (negated) error from inet_sendmsg(). After every PACKETS_PER_YIELD
 *   successful sends on a socket the calling thread yields the processor.
 */
int user_send(int s, void *msg, int len, int flags)
{
    int              err;
    struct iovec     iov;
    struct msghdr    msgh;
    struct user_pcb *pcb;

    DB("entered");

    PCB_FROM_FD(s, pcb);

    /* One buffer, no destination address, no ancillary data. */
    iov.iov_base        = msg;
    iov.iov_len         = len;

    msgh.msg_iov        = &iov;
    msgh.msg_iovlen     = 1;
    msgh.msg_name       = NULL;
    msgh.msg_namelen    = 0;
    msgh.msg_control    = NULL;
    msgh.msg_controllen = 0;
    msgh.msg_flags      = flags;

    err = inet_sendmsg(&(pcb->sock), &msgh, len, NULL);
    if ( err < 0 )
    {
        FDB_ERR("inet_sendmsg, %d: %s", -err, strerror(-err));
        errno = -err;
        return(-1);
    }
    
    FDB("leaving, succeeded");

    /* Be polite: give other threads a go every so often. */
    pcb->sends_since_downcall++;
    if ( pcb->sends_since_downcall == PACKETS_PER_YIELD )
    {
        pcb->sends_since_downcall = 0;
        sched_yield();
    }

    return(err);
}


/*
 * user_recv:
 *   Receives up to <len> bytes into <msg> from socket <s> by wrapping the
 *   buffer in a single-element iovec and passing it to inet_recvmsg().
 *
 *   Returns the value returned by inet_recvmsg() on success; on failure
 *   returns -1 with errno set to EBADF (unknown descriptor) or to the
 *   (negated) error from inet_recvmsg().
 */
int user_recv(int s, void *msg, int len, int flags)
{
    struct iovec     iov;
    struct msghdr    msgh;
    int              err;
    struct user_pcb *pcb;

    DB("entered");

    PCB_FROM_FD(s, pcb);

    /* One buffer, no source address wanted, no ancillary data. */
    iov.iov_base        = msg;
    iov.iov_len         = len;

    msgh.msg_iov        = &iov;
    msgh.msg_iovlen     = 1;
    msgh.msg_name       = NULL;
    msgh.msg_namelen    = 0;
    msgh.msg_control    = NULL;
    msgh.msg_controllen = 0;
    msgh.msg_flags      = flags;

    err = inet_recvmsg(&(pcb->sock), &msgh, len, flags, NULL);
    if ( err >= 0 )
    {
        FDB("leaving, succeeded");
        return(err);
    }

    FDB_WRN("inet_recvmsg, %d: %s", -err, strerror(-err));
    errno = -err;
    return(-1);
}


/******************************************************************************
 * user_fcntl:
 *   Partially implemented descriptor-related hacks.
 *
 *   F_GETFL  returns the flags currently stored in the pcb.
 *   F_SETFL  stores <arg> as the pcb's flags and returns 0. Currently
 *            supported: O_NONBLOCK.
 *   Any other command fails with -1 and errno set to EINVAL.
 *   An unknown descriptor fails with -1 and errno set to EBADF.
 */
int user_fcntl(int fd, int cmd, int arg)
{
    struct user_pcb * volatile pcb;
    int ret;

    DB("entered");
    PCB_FROM_FD(fd, pcb);
    
    switch ( cmd )
    {
    case F_GETFL:
        /* GETFLAGS: get the current set of fcntl flags. */
        ret = pcb->fcntl_flags;
        break;

    case F_SETFL:
        /* SETFLAGS: set the fcntl flags for this socket. */
        pcb->fcntl_flags = arg;
        ret = 0;
        break;

    default:
        /*
         * Nothing else supported as yet! Leave straight away so that the
         * "succeeded" trace below is not emitted for a failure, and set
         * errno after logging so the log call cannot clobber it.
         */
        FDB_ERR("invalid command %d", cmd);
        errno = EINVAL;
        return(-1);
    }

    FDB("leaving, succeeded");
    return(ret);
}


/******************************************************************************
 * user_poll:
 *   Block on multiple sockets at the same time, waiting for work to do.
 */
/*
 * Mutex/condition variable pair that user_poll() holds and waits on while
 * checking its descriptors. NB. user_poll() and poll_check() are currently
 * compiled out (#if 0).
 */
pthread_mutex_t global_poll_mutex = PTHREAD_MUTEX_INITIALIZER;
pthread_cond_t  global_poll_cond  = PTHREAD_COND_INITIALIZER;
#if 0
/*
 * poll_check:
 *   Currently compiled out (enclosed in #if 0).
 *
 *   Walks the <nfds> entries of <fdarray> and fills in 'revents' for each
 *   one according to the state of its pcb. Entries with a negative fd, or
 *   already marked POLLNVAL, are skipped; entries whose fd is not in the
 *   descriptor table, or whose socket type is unsupported, are marked
 *   POLLNVAL.
 *
 *   Returns -1 if no entry referred to a usable socket, otherwise the
 *   number of usable entries that ended up with a non-zero 'revents'.
 *
 *   NOTE(review): the pcb fields used here (connected, listening,
 *   recv_data, send_data, rcvlowat, sndlowat, protocol_specific) are not
 *   used by any enabled code visible in this file -- confirm they still
 *   exist before re-enabling.
 */
static long poll_check(struct pollfd *fdarray, unsigned long nfds)
{
    unsigned long  i;
    struct pollfd *fdent;
    struct user_pcb * volatile pcb;
    long ready_socks = -1;

    for ( i = 0, fdent = fdarray; i < nfds; i++, fdent++ )
    {
        if ( fdent->fd < 0 || fdent->revents == POLLNVAL )
        {
            DB("invalid socket %d", fdent->fd);
            continue;
        }
        if ( (pcb = element_for_key(pcbs, fdent->fd)) == NULL )
        {
            DB("socket %d not in table, setting POLLNVAL", fdent->fd);
            fdent->revents = POLLNVAL;
            continue;
        }

        /*
         * NB. We don't bother locking the pcb as we change nothing. We can
         * be certain that the pcb won't disappear from underneath us, as
         * the global poll mutex is required to remove a hash table entry.
         */
        switch ( pcb->type )
        {
        case SOCK_STREAM:
            /* Readable: enough data queued, or a connection is pending. */
            if ( (fdent->events & POLLIN) &&
                 ((pcb->connected && 
                   amount_in_fifo(pcb->recv_data) >= pcb->rcvlowat) ||
                  (pcb->listening && pcb->protocol_specific.tcp.connections[
                      pcb->protocol_specific.tcp.first_slot] != NULL)) )
            {
                fdent->revents |= POLLIN;
            }
            /* Writable: connected, with enough room in the send fifo. */
            if ( (fdent->events & POLLOUT) &&
                 pcb->connected && 
                 amount_free_in_fifo(pcb->send_data) >= pcb->sndlowat )
            {
                fdent->revents |= POLLOUT;
            }
            break;

        case SOCK_DGRAM:
            if ( !fdent->revents ) usd_rx_req_callback(pcb->usd_conn, 1);
            if ( (fdent->events & POLLIN) &&
                 // XXX could replace this next call! -- KAF
                 usd_completed_buffers_in_rx_queue(pcb->usd_conn) )
            {
                fdent->revents |= POLLIN;
            }
            if ( (fdent->events & POLLOUT) &&
                 usd_completed_buffers_in_tx_queue(pcb->usd_conn) )
            {
                fdent->revents |= POLLOUT;
            }
            break;

        default:
            DB("socket %d not of supported type", fdent->fd);
            fdent->revents = POLLNVAL;
            continue;
        }
        /* At least one usable socket seen: switch from -1 to a real count. */
        if ( ready_socks == -1 ) ready_socks = 0;
        if ( fdent->revents ) ready_socks++;
    }

    /*
     * Caller should sleep only if there is work to wait for and nothing to
     * wake up for yet.
     */
    return(ready_socks);
}

/*
 * user_poll:
 *   Currently compiled out (enclosed in #if 0).
 *
 *   Clears 'revents' in every entry of <fdarray>, then, holding the global
 *   poll mutex, hands poll_check() to WAIT_NO_PCB together with the global
 *   poll condition variable and mutex.
 *
 *   Only an infinite timeout (INFTIM) is accepted; anything else fails with
 *   -1 and errno set to EINVAL.
 *
 *   Returns the last poll_check() result, with -1 mapped to 0.
 *
 *   NOTE(review): WAIT_NO_PCB is defined elsewhere; presumably it waits on
 *   the condition variable until the expression says there is something to
 *   report -- confirm.
 */
int user_poll(struct pollfd *fdarray, unsigned long nfds, int timeout)
{
    unsigned long i;
    struct pollfd *fdent;
    struct user_pcb * volatile pcb;
    long ready;

    DB("entered");

    /* XXX -- we don't support timeouts yet!!! */
    ERROR_CHECK(timeout == INFTIM, EINVAL, "timeout not supported!");

    /* Clear out return bitmasks. */
    for ( i = 0, fdent = fdarray; i < nfds; i++, fdent++ ) fdent->revents = 0;

    /* Wait for work to return. */
    pthread_mutex_lock(&(global_poll_mutex));
    WAIT_NO_PCB(ready = poll_check(fdarray, nfds), 
                global_poll_cond, 
                global_poll_mutex);
    pthread_mutex_unlock(&(global_poll_mutex));


    DB("leaving, succeeded");
    return((ready == -1) ? 0 : ready);
}
#endif
