/*
 * All modifications are copyright (c) 2000, K A Fraser.
 * 
 * Please note the original copyright and license details below.
 */

/*
 * Copyright (C) 1997 Massachusetts Institute of Technology 
 *
 * This software is being provided by the copyright holders under the
 * following license. By obtaining, using and/or copying this software,
 * you agree that you have read, understood, and will comply with the
 * following terms and conditions:
 *
 * Permission to use, copy, modify, distribute, and sell this software
 * and its documentation for any purpose and without fee or royalty is
 * hereby granted, provided that the full text of this NOTICE appears on
 * ALL copies of the software and documentation or portions thereof,
 * including modifications, that you make.
 *
 * THIS SOFTWARE IS PROVIDED "AS IS," AND COPYRIGHT HOLDERS MAKE NO
 * REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED. BY WAY OF EXAMPLE,
 * BUT NOT LIMITATION, COPYRIGHT HOLDERS MAKE NO REPRESENTATIONS OR
 * WARRANTIES OF MERCHANTABILITY OR FITNESS FOR ANY PARTICULAR PURPOSE OR
 * THAT THE USE OF THE SOFTWARE OR DOCUMENTATION WILL NOT INFRINGE ANY
 * THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS. COPYRIGHT
 * HOLDERS WILL BEAR NO LIABILITY FOR ANY USE OF THIS SOFTWARE OR
 * DOCUMENTATION.
 *
 * The name and trademarks of copyright holders may NOT be used in
 * advertising or publicity pertaining to the software without specific,
 * written prior permission. Title to copyright in this software and any
 * associated documentation will at all times remain with copyright
 * holders. See the file AUTHORS which should have accompanied this software
 * for a list of all copyright holders.
 *
 * This file may be derived from previously copyrighted software. This
 * copyright applies only to those changes made by the copyright
 * holders listed in the AUTHORS file. The rest of this file is covered by
 * the copyright notices, if any, listed below.
 */

/* 
 * Code generation machinery required to compile DPF trie.
 *
  TODO:
	1. Don't have to regen unless we went from eq -> ht or we had 
	   a collision.
	2. Need support for incremental linkage (vcode mod).
	3. Simplicity has been emphasized, at the cost of a few extra loads.
		Could eliminate them.
	4. Need to cleanup the vcode perl script so that it will work 
	on systems with fragile perls. 

	5. Rehash after collisions or hash table fillup.

   Some of the decisions we have made assume that work required is
   proportional to number of different protocols and that the number of
   protocols is relatively small (say 20).  If there is a near 1-1
   correspondence of filters and protocols or many protocols period we
   will perform badly on insertion/deletion.  While the fix requires
   work, it is quite feasible (and actually not hard, if one is willing
   to take a ~10-20% hit in demultiplexing speed) -- let me know if 
   you have a style of usage that corresponds to this situation.

  A possible optimization is to extend coalescing to glom together as
  many atoms as possible, ignoring machine word size, and use the xor
  trick to get rid of branches.  Might be more efficient on modern 
  machines.

  Every `or' point has a value associated with it, giving the maximum
  offset of the straight-line code that follows it.  On branches where a
  shift or an `or' occurs, we check whether that value exceeds the
  message size.
      Just pass a ptr to the last shift/or we have seen
      that is free and clear.

  During code generation we track the number of shifts we see.  At each
  shift point we store the current message pointer before overwriting
  it with the computed one.  If any node fails, we load the previous
  value up.  After code gen is complete, we allocate an array of size
  equal to the longest sequence of shifts.

 */
#include "dpf-internal.h"
#include "vcode/vcode.h"

/*
 * Registers shared by all of the compile_* routines while a filter is being
 * generated; they are assigned in dpf_compile().
 * Should just make these statically allocated registers (using T0 & co.).
 */
static v_reg_t src_r;			/* Value loaded from message */
static v_reg_t val_r;			/* Scratch: hash index/jump target and hash compare value */

/* Volatile -- SAFE as not required to stay the same across fn calls */
static v_reg_t t0;			/* Holds the compare immediate on the MIPS path of compile_eq() */

/* Function arguments -- SAFE as will not change!*/
static v_reg_t msg_r;	  /* Register holding message pointer                */
static v_reg_t nbytes_r;  /* Reg holding number of bytes in message          */
static v_reg_t hlen_r;    /* Reg to store header length                      */
static v_reg_t plen_r;    /* Reg to store packet length                      */

/*
 * These allow us to create position-independent code.
 * `base_r' is a register holding the address of a code point at run time
 * `hostbase' is the address of the same code point at compile time (ie. now)
 * By doing calculations on these we can create a value which is the address
 * of any code point at run time, allowing us to create position-independent
 * jump tables, for instance.
 */
static v_reg_t base_r;    /* Register holding base of code   */
static uint8  *hostbase;  /* Base of vcode in host mem. */

/* 
 * Kinds of state that save_shift_state()/restore_shift_state() know how to
 * save and restore: the message pointer and byte count (GEN_SHIFT), the
 * header length (GEN_HEADER_EXT) or the packet length (GEN_PACKET_EXT).
 */
enum { GEN_SHIFT, GEN_HEADER_EXT, GEN_PACKET_EXT };
/*
 * Vector that holds stack offsets to store the shifts (so we can restore
 * them if their associated filter does not accept).  It is sized to the
 * maximum number of simultaneous shifts that can occur.  Entries are
 * indexed by shift_sp after it has been incremented, so index 0 is never
 * used.
 */
static struct { int nbytes, msg; } shift_stackpos[DPF_MAXELEM];
/* Track the largest shift.  Needed to reuse stack locations.  */
static int shift_highwater;
static int shift_sp;	/* Current index into shift_stackpos. */ 

/*
 * Compile an atom and its `or' siblings.  Declared here because gen() and
 * the compile_* routines below call each other recursively.
 */
static void gen(Atom a, v_label_t elsel, int alignment, int shiftp);

/*
 * Reset the compile-time shift stack: no entries are in use and no stack
 * slots have been allocated yet for the function being generated.
 */
static void shift_init(void)
{
    shift_sp = 0;
    shift_highwater = 0;
}

/* 
 * Save nbytes & msg.  Note: this shift can be elided if failure of the 
 * current node will not lead to further message reads before (1)
 * a shift right above us restores the msg/nbyte state or (2) no more
 * filters exist.  These conditions probably hold in more cases than
 * one would suppose, but I'm not sure they are worth optimizing for.
 */
static void save_shift_state(int type) 
{
    if ( ++shift_sp > shift_highwater ) 
    {
        shift_highwater = shift_sp;
        shift_stackpos[shift_sp].nbytes = v_local(V_U);
        shift_stackpos[shift_sp].msg    = v_local(V_U);		
    }
    switch ( type )
    {
    case GEN_SHIFT:
        v_stui(nbytes_r, v_lp, shift_stackpos[shift_sp].nbytes);
        v_stpi(msg_r, v_lp, shift_stackpos[shift_sp].msg);
        break;
    case GEN_PACKET_EXT:
        v_stui(plen_r, v_lp, shift_stackpos[shift_sp].nbytes);
        break;
    case GEN_HEADER_EXT:
        v_stui(hlen_r, v_lp, shift_stackpos[shift_sp].nbytes);
        break;
    default:
        fatal(unknown shift type on save);
        break;
    }
}

/*
 * Emit code that reloads the state saved by the matching
 * save_shift_state(type) call, then pop the compile-time shift stack.
 *
 *   type: GEN_SHIFT      -- reload nbytes_r and msg_r;
 *         GEN_PACKET_EXT -- reload plen_r;
 *         GEN_HEADER_EXT -- reload hlen_r.
 */
static void restore_shift_state(int type)
{
    if ( type == GEN_SHIFT )
    {
        v_raw_load(v_ldui(nbytes_r, v_lp, shift_stackpos[shift_sp].nbytes), 1);
        v_raw_load(v_ldpi(msg_r, v_lp, shift_stackpos[shift_sp].msg), 1);
    }
    else if ( type == GEN_PACKET_EXT )
    {
        v_raw_load(v_ldui(plen_r, v_lp, shift_stackpos[shift_sp].nbytes), 1);
    }
    else if ( type == GEN_HEADER_EXT )
    {
        v_raw_load(v_ldui(hlen_r, v_lp, shift_stackpos[shift_sp].nbytes), 1);
    }
    else
    {
        fatal(unknown shift type on restore);
    }

    /* The entry is popped whatever the type was. */
    --shift_sp;
}

/*
 * Generate the code that runs once a hash table entry has matched.
 *
 *   hte:       the matched atom.
 *   elsel:     label to jump to when the entry's children reject and the
 *              entry itself carries no pid.
 *   alignment, shiftp: passed through to gen() for the children.
 *
 * A childless entry simply returns its pid.  Otherwise the children are
 * compiled; if the entry has a pid of its own, child failure lands on a
 * fresh label that returns that pid.
 */
static void compile_hte(Atom hte, v_label_t elsel, 
                        int alignment, int shiftp) 
{
    int pid = hte->pid;
    v_label_t on_reject;

    /* Leaf entry: nothing further to test, just return the id. */
    if ( !hte->child )
    {
        demand(pid, bogus node: must have a pid!);
        v_retii(pid);
        return;
    }

    /* Children reject to our own pid return if we have one, else to elsel. */
    if ( pid )
        on_reject = v_genlabel();
    else
        on_reject = elsel;

    gen(hte->child, on_reject, alignment, shiftp);

    if ( !pid )
        return;

    /* If the atoms for this node reject, jump here. */
    v_label(on_reject);
    v_retii(pid);
}

/*
 * Generate code for a hash table: an indexed jump through a table of
 * ht->htsz branch slots, followed by one compare chain per non-empty
 * bucket.
 *
 *   ht:        the hash table; at most 8 buckets (size of hashopts[]).
 *   l:         label jumped to for an empty bucket, or when no atom in the
 *              selected bucket's chain equals the loaded value.
 *   clabel:    passed to compile_hte() as the failure label for a matched
 *              entry's children.
 *   alignment, shiftp: passed through to compile_hte().
 *
 * Expects the value to dispatch on to already be in src_r; clobbers val_r.
 */
static void compile_ht(Ht ht, v_label_t l, v_label_t clabel, 
                       int alignment, int shiftp) 
{
    v_label_t elsel, hashopts[8];
    int i;
    Atom a;
    demand(ht->htsz <= 8, hash too big!);

    /* Compute the hash function. XXX KAF MIPs-only bogosity!!! (but BWGAF?) */
    /* Bucket index = src_r & (htsz-1); presumably htsz is a power of two -- TODO confirm. */
    v_andui(val_r, src_r, ht->htsz-1);
    /* Scale the index by 8: each table slot is taken to be 8 bytes (presumably a branch plus its delay slot -- TODO confirm). */
    v_lshui(val_r, val_r, 3);
    /* Add the run-time code base ... */
    v_addp(val_r, val_r, base_r);
    /*
     * ... and the compile-time offset of the jump table from that base.
     * the `3' here is particularly nasty, as it depends on the next two
     * vcode insts being converted to three real instructions.
     */
    v_addui(val_r, val_r, (uint8 *)(v_ip+3) - hostbase); /* 1 inst. */
    v_jp(val_r);                                         /* 2 inst. */   

    /* Generate jump table. */
    /* One unconditional branch per bucket; empty buckets branch straight to `l'. */
    for ( i = 0; i < ht->htsz; i++ ) 
    {
        hashopts[i] = ht->ht[i] ? v_genlabel() : l;
        beq(0, 0, hashopts[i]);
    }

    /* Generate jump table targets. */
    for ( i = 0; i < ht->htsz; i++ )
    {
        if ( ht->ht[i] ) v_label(hashopts[i]);
        /* Atoms that hashed to the same bucket are chained through `or'. */
        for ( a = ht->ht[i]; a; a = a->or )
        {
            /* On mismatch try the next atom in the chain, or give up at `l'. */
            elsel = a->or ? v_genlabel() : l;
            /* Now we do the hash walk. */
            v_setul(val_r, a->ir.eq.val);
            v_bnep(src_r, val_r, elsel);
            compile_hte(a, clabel, alignment, shiftp);
            if ( a->or ) v_label(elsel);
        }
    }
}

/* 
 * Emit a run-time bounds check before a message load, but only when one is
 * needed: either a shift has happened on this path (shiftp != 0) or the
 * offset lies beyond the minimal buffer space we are guaranteed
 * (DPF_MINMSG).  The emitted branch goes to `l' when nbytes_r <= offset.
 *
 * NOTE(review): only the starting offset is compared; the width of the
 * load that follows is not taken into account -- confirm that callers or
 * buffer padding make this safe.
 */
static void emit_bounds_check(int shiftp, uint32 offset, v_label_t l) 
{
    /* Statically known to be in bounds: emit nothing. */
    if ( !shiftp && offset <= DPF_MINMSG )
        return;

    v_bleui(nbytes_r, offset, l);
}

/*
 * Compile the lhs of a message ld: (msg[offset:nbits] & mask), leaving the
 * result in src_r.
 *
 *   e:         supplies offset, nbits and mask.
 *   l:         label the bounds check branches to on failure.
 *   alignment: alignment hint handed to the vcode load.
 *   shiftp:    non-zero if a shift occurred on this path (forces a
 *              bounds check).
 *
 * The mask is skipped when it covers every loaded bit.  A 24-bit field is
 * fetched with the 32-bit load and is always masked.
 */
static void compile_msgld(struct eq *e, v_label_t l, int alignment, int shiftp)
{    
    uint32 mask = e->mask;
    uint16 off = e->offset;
    int need_and = 0;

    emit_bounds_check(shiftp, off, l);

    /* Pick the load by field width and decide whether masking is required. */
    switch ( e->nbits )
    {
    case 8:
        v_alduci(src_r, msg_r, off, alignment);
        need_and = (mask != 0xff);
        break;
    case 16:
        v_aldusi(src_r, msg_r, off, alignment);
        need_and = (mask != 0xffff);
        break;
    case 24:
        v_aldui(src_r, msg_r, off, alignment);
        need_and = 1;
        break;
    case 32:
        v_aldui(src_r, msg_r, off, alignment);
        need_and = (mask != 0xffffffff);
        break;
    default:
        fatal(bogus number of bits);
        break;
    }

    if ( need_and )
    {
        v_andui(src_r, src_r, mask);
    }
}

/*
 * Compile an eq atom: load and mask the message field, then either dispatch
 * through the atom's hash table or compare against the atom's value and
 * continue with its children.
 *
 *   a:         the eq atom.
 *   l:         label to branch to when the load is out of bounds or the
 *              comparison fails.
 *   clabel:    failure label handed to the children (or hash entries).
 *   alignment, shiftp: passed through to the load and to the children.
 */
static void compile_eq(Atom a, v_label_t l, v_label_t clabel, 
                       int alignment, int shiftp) 
{
    struct eq *cmp = &a->ir.eq;

    compile_msgld(cmp, l, alignment, shiftp);

    /* A hash table takes over both the comparison and the children. */
    if ( a->ht )
    {
        compile_ht(a->ht, l, clabel, alignment, shiftp);
        return;
    }

#ifdef MIPS
    /* Fill delay with (possibly unnecessary) imm load. */
    v_nuke_nop();
    v_setu(t0, cmp->val);
    v_bneu(src_r, t0, l);
#else
    v_bneui(src_r, cmp->val, l);
#endif

    if ( a->child || a->ht )
    {
        v_nuke_nop();	/* I believe this is safe. */
    }
    gen(a->child, clabel, alignment, shiftp);
}

/*
 * Compile a variable shift atom: load a value from the message, scale it,
 * and use it to advance the message pointer and/or extend the header and
 * packet length counts, as selected by a->ir.shift.ext.
 *
 *   a:      the shift atom.
 *   l:      label to branch to if the initial load is out of bounds.
 *   clabel: label to jump to, after the saved state has been restored,
 *           when the children reject.
 *   align:  alignment hint for the initial load; the children are compiled
 *           with a->ir.shift.align instead.
 *   shiftp: non-zero if a shift already occurred earlier on this path.
 */
static void compile_shift(Atom a, v_label_t l, v_label_t clabel, 
                          int align, int shiftp) 
{
    uint8 shift;
    v_label_t restore;
    
    /* 
     * Possible optimization (NOT implemented here): if clabel is the exit
     * label, the save/restore could be skipped, since either we will
     * succeed (and return a pid) or we will fail and so it won't matter
     * if the msg/nbytes is saved.
     */

    /*
     * Load value into src_r.  The atom is read through its `eq' view here;
     * this assumes the shift and eq variants of a->ir agree on
     * offset/nbits/mask -- TODO confirm.
     */
    compile_msgld(&a->ir.eq, l, align, shiftp); 
    v_nuke_nop();

    /* Saves are pushed in this order and popped in reverse below. */
    if ( a->ir.shift.ext & SHIFT   ) save_shift_state(GEN_SHIFT);
    if ( a->ir.shift.ext & HDR_EXT ) save_shift_state(GEN_HEADER_EXT);
    if ( a->ir.shift.ext & PKT_EXT ) save_shift_state(GEN_PACKET_EXT);

    /* shift src by the required amount: negative shift => right shift. */
    if( (shift = a->ir.shift.shift) & 0x80 )
    {
        v_rshui(src_r, src_r, 0x100 - shift);
    }
    else if ( shift )
    {
        v_lshui(src_r, src_r, shift);
    }

    /* 
     * Subtract off the loaded value from nbytes & add it to the message
     * ptr.   This, of course, can be reduced to a single operation, but it
     * would make checking message offsets (ala eq and ht's) more expensive.
     *
     * NOTE(review): no check is emitted that src_r <= nbytes_r before the
     * subtraction -- confirm whether an oversized shift value can wrap
     * nbytes_r.
     */
    if ( a->ir.shift.ext & SHIFT )
    {
        v_subu(nbytes_r, nbytes_r, src_r);
        v_addp(msg_r, msg_r, src_r);
    }
    /* KAF: we may need to extend the packet and/or header length counts. */
    if ( a->ir.shift.ext & HDR_EXT ) v_addu(hlen_r, hlen_r, src_r);
    if ( a->ir.shift.ext & PKT_EXT ) v_addu(plen_r, plen_r, src_r);

    restore = v_genlabel();
    
    /*
     * Indicate that we encountered a shift.  A shift seen earlier on this
     * path must stay in force even when this atom only extends the length
     * counts, otherwise the children would lose their bounds checks.
     */
    gen(a->child, restore, a->ir.shift.align,
        shiftp || (a->ir.shift.ext & SHIFT));

    /* Only get here on failure. */
    v_label(restore);
    if ( a->ir.shift.ext & PKT_EXT ) restore_shift_state(GEN_PACKET_EXT);
    if ( a->ir.shift.ext & HDR_EXT ) restore_shift_state(GEN_HEADER_EXT);
    if ( a->ir.shift.ext & SHIFT   ) restore_shift_state(GEN_SHIFT);
    /* now jump to where we were supposed to on failure. */
    beq(0, 0, clabel);
}

/*
 * Compile a constant (immediate) shift atom: advance the message pointer
 * and/or extend the header and packet length counts by a fixed amount, as
 * selected by a->ir.shifti.ext.  Because the amount is a constant, failure
 * is undone with the inverse arithmetic rather than a save/restore.
 *
 *   a:      the shifti atom.
 *   l:      unused here (there is no initial test that can fail).
 *   clabel: label to jump to, after the state has been undone, when the
 *           children reject.
 *   align:  unused here; the children are compiled with
 *           a->ir.shifti.align.
 *   shiftp: non-zero if a shift already occurred earlier on this path.
 */
static void compile_shifti(Atom a, v_label_t l, v_label_t clabel, 
                           int align, int shiftp) 
{
    uint16 shift = a->ir.shifti.shift;
    v_label_t restore;

    /*
     * NOTE(review): no check is emitted that shift <= nbytes_r before the
     * subtraction -- confirm whether nbytes_r can wrap here.
     */
    if ( a->ir.shifti.ext & SHIFT )
    {
        v_subui(nbytes_r, nbytes_r, shift);
        v_addpi(msg_r, msg_r, shift);
    }
    /* KAF: we may need to extend the packet and/or header length counts. */
    if ( a->ir.shifti.ext & HDR_EXT ) v_addui(hlen_r, hlen_r, shift);
    if ( a->ir.shifti.ext & PKT_EXT ) v_addui(plen_r, plen_r, shift);

    restore = v_genlabel();

    /*
     * Indicate that we encountered a shift.  A shift seen earlier on this
     * path must stay in force even when this atom only extends the length
     * counts, otherwise the children would lose their bounds checks.
     */
    gen(a->child, restore, a->ir.shifti.align,
        shiftp || (a->ir.shifti.ext & SHIFT));

    /* Only get here on failure: undo everything done above. */
    v_label(restore);
    if ( a->ir.shifti.ext & PKT_EXT ) v_subui(plen_r, plen_r, shift);
    if ( a->ir.shifti.ext & HDR_EXT ) v_subui(hlen_r, hlen_r, shift);
    if ( a->ir.shifti.ext & SHIFT   )
    {
        v_subpi(msg_r, msg_r, shift);
        v_addui(nbytes_r, nbytes_r, shift);        
    }
    /* now jump to where we were supposed to on failure. */
    beq(0, 0, clabel);
}

/*
 * Compile an atom together with all of its `or' siblings.
 *
 *   a:         first atom of the or-chain (may be NULL: nothing is emitted).
 *   elsel:     label to jump to when the last atom in the chain fails.
 *   alignment, shiftp: passed through to the per-atom compile routines.
 *
 * Label rules:
 *   1. If an atom's initial comparison fails, jump to the next label.
 *      That label is generated here or, for the last atom, is elsel.
 *   2. If the initial comparison succeeds and the atom has a pid, the
 *      children get a new failure label, and a return of the pid is
 *      placed at that label (this gives longest match).
 *   3. If the initial comparison succeeds and the atom has no pid, child
 *      failure jumps to the next label.
 */
static void gen(Atom a, v_label_t elsel, int alignment, int shiftp) 
{
    /* Walk down each branch */
    while ( a )
    {
        int pid = a->pid;
        v_label_t fail_l;       /* where this atom's own test fails to   */
        v_label_t kid_fail_l;   /* where this atom's children fail to    */

        /* The last label is the elsel */
        if ( a->or )
            fail_l = v_genlabel();
        else
            fail_l = elsel;

        /* If this node has a pid, jump to pid return on fail in the child */
        if ( pid && a->child )
            kid_fail_l = v_genlabel();
        else
            kid_fail_l = fail_l;

        /* Dispatch on the kind of atom. */
        if ( dpf_isshifti(a->ir.eq.op) )
        {
            compile_shifti(a, fail_l, kid_fail_l, alignment, shiftp);
        }
        else if ( dpf_iseq(a->ir.eq.op) )
        {
            compile_eq(a, fail_l, kid_fail_l, alignment, shiftp);
        }
        else
        {
            compile_shift(a, fail_l, kid_fail_l, alignment, shiftp);
        }

        /* Return pid. */
        if ( pid )
        {
            if ( a->child )
            {
                v_label(kid_fail_l);
            }
            v_retii(pid);
        }

        /* Position label. */
        if ( a->or )
        {
            v_label(fail_l);
        }

        a = a->or;
    }
}

/*
 * Compile DPF trie.
 *
 * Generates a leaf function dpf_filter(msg, nbytes, hlen, plen) for the
 * given trie and returns a pointer to it.  The generated code returns the
 * pid of the matching filter, or 0 when nothing matches; on the no-match
 * path it also sets the hlen register to 14 and the plen register to
 * nbytes.
 *
 * The code is generated into a static buffer, so every call writes to the
 * same memory.
 */
v_iptr dpf_compile(Atom trie) 
{
    /* Memory to hold code in -- size is a hack. */
    static v_code insn[2048];      
    v_reg_t args[4];
    v_label_t no_match;
	
    /* dpf_filter(msg, nbytes, hlen, plen) */
    v_lambda("dpf-filter", "%p%u%u%u", args, V_LEAF, insn, sizeof insn);
    shift_init();

    /* So we can produce position-independent jump tables for hashing. */
    if( !v_getreg(&base_r, V_P, V_TEMP) ) fatal(Out of registers);
    v_getcodebase(base_r, (v_code **)&hostbase);
    
    if( !v_getreg(&src_r, V_UL, V_TEMP) ||
        !v_getreg(&t0, V_UL, V_TEMP) ||
        !v_getreg(&val_r, V_UL, V_TEMP) ) fatal(Out of registers);
    
    no_match = v_genlabel();
    msg_r    = args[0]; 
    nbytes_r = args[1];
    hlen_r   = args[2];
    plen_r   = args[3];

    /* Zero out header and packet length counts */
    v_seti(hlen_r, 0);
    v_seti(plen_r, 0);

    /* Walk down trie, Generating code for all elements. */
    gen(trie, no_match, DPF_MSG_ALIGN, 0);
    
    v_label(no_match);	/* jump here if no matches */
    v_seti(hlen_r, 14);
    v_movu(plen_r, nbytes_r);
    v_retii(0);
    
    /* Hand back every register we allocated above. */
    v_putreg(val_r, V_UL);
    v_putreg(src_r, V_UL);
    v_putreg(t0, V_UL);
    v_putreg(base_r, V_P);

    return v_end(0).i;
}

/* Entry point of the "no filters" function built by gen_ret_zero(). */
void *dpf_ret_zero;

/*
 * Generate a filter with the same signature as the compiled one that
 * matches nothing: it sets the third argument register to 14, copies the
 * second argument into the fourth, and returns 0.  The result is stored
 * in dpf_ret_zero.
 */
void gen_ret_zero(void)
{
    /*
     * XXX DANGER: if we make the code region too small, vcode will
     * happily squirt code into memory that doesn't belong to us! :-(
     */
    static v_code code_buf[1000];
    v_reg_t arg[4];

    v_lambda("dpf-zero-filt", "%p%u%u%u", arg, V_LEAF, code_buf, sizeof code_buf);
    v_seti(arg[2], 14);
    v_movu(arg[3], arg[1]);
    v_retii(0);

    dpf_ret_zero = v_end(0).i;
}
