// (C) 2009-11 D J Greaves
// (C) 2011-12 D J Greaves + M Puzovic
// University of Cambridge, Computer Laboratory.
// ACS P35 SoC D/M Classes - $Id: consistent_cache64.cpp,v 1.10 2011/08/22 13:30:19 djg11 Exp $


// Generic payload blocking transport consistent cache.

#include <stdint.h>
#include "systemc.h"
#include "tenos.h"
#include "consistent_cache64.h"
// include "../mtracer/tracedriven_core.h"
#include "llsc_extension.h"

/*
This is a set-associative cache with one-place write buffer.  It can
cope with reads and writes of burst length up to one cache line.  Read
and write operations on the lower port (here called secondary) are
always complete cache lines.

The write buffer provides dirty flags on a per-byte basis, supporting
byte writes.

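Reads use round-robin eviction: on each miss the victim way is chosen
by advancing a per-cache counter (named lru) by one way.  Writes are
put in the write buffer unless the set-associative array is already
warm with that cache line. All writes are only copied back to
secondary store on eviction.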


Consistency is implemented by calling every other cache in a
consistency group when cache lines change status using the MESI
protocol.  No data is directly passed between caches in a consistent
group: instead dirty lines are evicted to secondary store and then
re-served from that.

TODO: Add bypass for llsc and DMI.

 */

#define MANUAL_DELAY 0

using namespace std;

extern bool dynload_done;


// Constructor: builds one way of the set-associative array.
// Allocates, in this order, the way lock, the data RAM, the tag RAM and the
// per-line status array, and marks every line invalid.
cache64::cacheway::cacheway(cache64 *parent, int way):
  parent(parent),
  m_way(way)
{
  const auto &g = parent->geom;
  char label[64];

  // Each sub-component gets a name carrying the way index.
  snprintf(label, sizeof(label), "Lock_%d", way);
  waylock = new sc_mutex(label);

  snprintf(label, sizeof(label), "Data_%d", way);
  Data = new smallram8(label, 8, g.dmapping * sizeof(u8_t) * g.linesize);

  snprintf(label, sizeof(label), "Tags_%d", way);
  Tags = new smallram64(label, 64 - g.dmap_shift, g.dmapping);

  // One status entry per line in this way; all start out invalid.
  Status = (mesi_t *) malloc(g.dmapping * sizeof(mesi_t));
  int line = 0;
  while (line < g.dmapping)
    Status[line++] = invalid;
}

//
// Method called by local cache to update a given line's status.
//
// op      - the state to store for the line.
// addr    - line address (not used by this method).
// dmap    - index of the line within this way.
// changed - set to true when the stored state differed from op.
//
// Returns the state the line had before the call.  An op value outside the
// five known states trips sysc_assert1.
//
cache64::cacheway::mesi_t cache64::cacheway::operate(mesi_t op, u64_t addr, int dmap, bool &changed)
{
  const mesi_t old_state = Status[dmap];

  changed = (old_state != op);
  switch (op)
    {
    case invalid:
    case shared:
    case modified:
    case exclusive:
    case owned:
      // All valid states are handled the same way.  The exclusive and owned
      // cases previously had no break and fell through into the assertion
      // below, so they must leave the switch here.
      Status[dmap] = op;
      break;

    default:
      sysc_assert1(parent, 0); // Unknown state requested.
      break;
    }

  return old_state;
}


//
// Stores a new status for line dmap of the given way, holding that way's lock
// for the duration of the write.  addr and delay are not used by the body, and
// no other cache is contacted from here.
//
void cache64::operate(cacheway *cw, cacheway::mesi_t op, u64_t addr, int dmap, sc_time &delay)
{
  sysc_assert(cw);

  cacheway &way = *cw;
  way.waylock->lock();
  way.Status[dmap] = op;
  way.waylock->unlock();
}

// Write one line of this way (index dmap) out to secondary storage.
//
// delay - accumulated with the tag/data RAM latencies (when MANUAL_DELAY is 0)
//         and passed to b_transport on the parent's initiator socket.
// dmap  - index of the line within this way; only this one line is cleaned.
// cme   - not referenced in the body; the write transaction carries a local
//         cache_miss_extension instead.
// force - when false, the line is written only if it is modified or owned and
//         its status is set to invalid before the write is issued; when true
//         the line is written whatever its status and the status is left alone.
//
// A response error from the secondary write is reported with SC_REPORT_ERROR.
void cache64::cacheway::clean(sc_time &delay, 
					 int dmap, 
					 cache_miss_extension* cme, 
					 bool force)
{
  waylock->lock();
  // Nothing to do for a line that is not dirty, unless the caller forces it.
  if(Status[dmap] != modified 
     && Status[dmap] != owned
     && !force) {
    C1PTRC(0, fprintf(stdout, "%s: state of line is %d\n", parent->name(), Status[dmap]));
    waylock->unlock();
    return;
  }

  if(!force) 
    Status[dmap] = invalid;
  waylock->unlock();
    
  //printf("%s: clean %x %i\n", parent->name(), dmap, force);

  // we should set cache line to invalid immediately even before
  // we complete copying to the secondary storage. The reason why
  // we need to do it before is because if we use temporal decoupling
  // it can happen that one core is writing content of cache line to
  // the secondary storage and before it is completed we can have
  // another core reading the same cache line. If it is not set to invalid
  // before start of the transfer the second core will think that
  // line is still valid.

#if 0
  // This is now handled by the queue system.

  // Before we write check if there are other caches that are
  // waiting for this line if there are then their transaction
  // should be aborted and either this way should send transaction
  // on the bus or the core that was waiting needs to reissue... 
  if(parent->grp)
    for(int g = 0; g < parent->grp->target_members; g++) {
      if(parent->grp->Members[g] == parent) continue;
      
      cache64* c = parent->grp->Members[g];
..      
      if(c->secondary_storage_lreq == (*Tags)[dmap])
	c->secondary_storage_lreq = -1;		  
    }
#endif
  

  /*parent->ml.lock();
  u64_t oldAddr = 0;
  if(parent->busyFlag) {
    oldAddr = parent->busyAddr;
    parent->busyAddr = (*Tags)[dmap];
  }
  parent->ml.unlock();*/

  // as we are doing the cleaning (i.e. writing dirty line to secondary
  // storage) we can have some other core trying to access the same
  // cache line so set the status of this line to invalid before we send
  // write request to secondary storage 

  //for (int i=0; i<parent->geom.dmapping; i++)
  //{
  //if (Status[i] == modified)
  //{

  // for now, we should never need to clean an instruction cache line
  assert(parent->my_type == CT_DATA || parent->my_type == CT_UNIFORM);

  // NOTE(review): mm lives on this stack frame but is attached as an auto
  // extension; confirm that releasing the payload does not try to free it.
  cache_miss_extension mm;
  PW_TLM_PAYTYPE *trans = parent->cache_miss_mm.allocate();
  trans->set_auto_extension<cache_miss_extension>(&mm);
  trans->acquire();

  // data length will either be secondary block size if it is less than
  // the size of cache line otherwise it is going to be cacheline
  int dl = parent->geom.linesize >= parent->geom.secondary_blocksize_bytes ?
    parent->geom.secondary_blocksize_bytes :
    parent->geom.linesize;
  trans->set_data_length(dl);

  trans->set_byte_enable_length(0);
  trans->set_byte_enable_ptr(0); // All bytes to be operated on if ptr is null.
  trans->set_streaming_width(parent->geom.linesize);
  trans->set_write();

  // One write transaction per secondary block covering the line.
  for(int d=0; d<parent->geom.linesize; d += parent->geom.secondary_blocksize_bytes) {
    trans->set_response_status( tlm::TLM_INCOMPLETE_RESPONSE );

    POWER3(parent->pw_module_base::record_energy_use(Tags->m_read_energy_op)); // Read tags
    trans->set_address((*Tags)[dmap] + d);

    // Number of data RAM accesses needed to read a whole line.
    int ops = parent->geom.linesize * 8 / Data->width;
    //std::cout << "Tmp debug " << parent->name() << " ops=" << ops << "\n";
    POWER3(parent->pw_module_base::record_energy_use(Data->m_read_energy_op * ops)); // Read data

#if !MANUAL_DELAY
    delay += Tags->m_sr_latency;
    delay += Data->m_sr_latency * ops;
#endif

    // The payload points straight into this way's data RAM (no copy is made).
    trans->set_data_ptr(Data->read8p(dmap*parent->geom.linesize)+d);

    C1PTRC(trans->get_address(), printf("%s: for address 0x%lx sends ", parent->name(), (*Tags)[dmap]+d);	\
	 for(int k = 0; k < parent->geom.secondary_blocksize_bytes; k++) \
	   printf("%02x", *(Data->read8p(dmap*parent->geom.linesize)+d+k)); \
	 printf("\n"));

    POWER3(PW_TLM3(trans->pw_set_origin(parent, PW_TGP_DATA | PW_TGP_ADDRESS | PW_TGP_LANES | PW_TGP_ACCT_SRC, &parent->secondary_bus_tracker))); // Evict write.
    parent->inita_socket->b_transport(*trans, delay);
    POWER3(PW_TLM3(trans->pw_terminus(parent)));
    if (trans->is_response_error()) {
	char txt[256];
	snprintf(txt, 256, 
		 "%s: Error from cacheway secondary write addr=" PFX64 ", response status = %s", 
		 parent->name(), (*Tags)[dmap]+d, trans->get_response_string().c_str());
	SC_REPORT_ERROR(__FILE__, txt);
    } 
  }

  // NOTE(review): dmap is an int but is printed with %lx in the trace below.
  C1PTRC(trans->get_address(), printf("%s: status of line 0x%lx is invalid, dmap = %lx, way = %d\n", 
	      parent->name(), (*Tags)[dmap], dmap, m_way));

  trans->release();

  /*  parent->ml.lock();
  if(parent->busyFlag) {
    assert(parent->busyAddr == (*Tags)[dmap]);
    parent->busyAddr = oldAddr;
  }
  parent->ml.unlock();*/


  /*if(!force) {
    waylock->lock();
    Status[dmap] = invalid;
    waylock->unlock();
    }*/

}


//
// Adds a new line to a cache way.
//
// addr      - line address, stored as the tag for entry dmap.
// dmap      - index of the line within this way.
// cline     - source of the geom.linesize bytes copied into the data RAM.
// delay     - accumulated with tag/data RAM latencies when MANUAL_DELAY is 0.
// cme       - passed on to clean() when a dirty line has to be evicted first.
// new_state - status given to the line once the data is in place.
//
// If the entry currently holds a modified or owned line with a different
// address, that line is first evicted from the lower level and written back
// with clean().  Finding the same address already dirty is treated as a fatal
// error.  Always returns true.
//
bool cache64::cacheway::insert(u64_t addr, 
			       int dmap, 
			       u8_t *cline, 
			       sc_time &delay,
			       cache_miss_extension* cme,
			       mesi_t new_state)
{
  C1PTRC(addr, printf("%s: cacheway=%i, insert line dmap=0x%x, delay = %s\n", 
	      parent->name(), m_way, dmap, delay.to_string().c_str()));

  switch (Status[dmap])
    {
    // Clean or empty entries can simply be overwritten.
    case invalid:
    case exclusive: 
    case shared:
      break;

    case modified:
    case owned:
      // if address is different need to evict
      POWER3(parent->pw_module_base::record_energy_use(Tags->m_read_energy_op)); // Read tags
#if !MANUAL_DELAY
      delay += Tags->m_sr_latency;
#endif
      if (addr != (*Tags)[dmap])
	{
	  C1PTRC(addr, printf("%s way=%i Evict " PFX64 " for " PFX64 " - dmap alias?\n", parent->name(), m_way, (*Tags)[dmap], addr));
	  
	  u8_t* dd = Data->read8p(dmap * parent->geom.linesize);
	  C1PTRC(addr, fprintf(stdout, "previous data: ");	\
	       for(int i = 0; i < parent->geom.linesize; i++)	\
		 fprintf(stdout, "%02x", dd[i]);		\
	       fprintf(stdout, "\n"));
	  parent->evict_lower_level_line((*Tags)[dmap], 
					 parent->geom.linesize,
					 dd);

	  // Re-read the pointer for the trace after the lower-level eviction.
	  dd = Data->read8p(dmap * parent->geom.linesize);
	  C1PTRC(addr, fprintf(stdout, "new data: ");		\
	       for(int i = 0; i < parent->geom.linesize; i++)	\
		 fprintf(stdout, "%02x", dd[i]);		\
	       fprintf(stdout, "\n"));	  

	  // Write the old dirty line back before its storage is reused.
	  clean(delay, dmap, cme);
	  // stats.evict(... log this ... );
	  break;
	}

      else 
	{
	  printf("End of world: %s way=%i insert already present " PFX64 " for " PFX64 " - dmap alias?\n", parent->name(), m_way, (*Tags)[dmap], addr);
	  sysc_assert1(parent, 0); // Should never happen.
	}
      // No break here: control runs on into the default assertion below.

    default:
      sysc_assert1(parent, 0);

    }

  POWER3(parent->pw_module_base::record_energy_use(Tags->m_write_energy_op)); // Write tags
#if !MANUAL_DELAY
  delay += Tags->m_sr_latency;
#endif
  Tags->write(dmap, addr);

  // Number of data RAM accesses needed to write a whole line.
  int ops = parent->geom.linesize * 8 / Data->width;
  //std::cout << "Tmp debug " << parent->name() << " ops=" << ops << "\n";
  POWER3(parent->pw_module_base::record_energy_use(Data->m_write_energy_op * ops)); // Write data
#if !MANUAL_DELAY
  delay += Data->m_sr_latency * ops;
#endif
 
  memcpy(Data->read8p(dmap * parent->geom.linesize), cline, parent->geom.linesize);

  C1PTRC(addr, printf("%s: wrote data to address 0x%lx: ", parent->name(), addr); \
       for(int k = 0; k < parent->geom.linesize; k++)			\
	 printf("%02x", *(Data->read8p(dmap*parent->geom.linesize)+k)); \
       printf(", delay = %s", delay.to_string().c_str());
       printf("\n"));

  // Only the status update is done under the way lock; the tag and data
  // writes above are not.
  waylock->lock(); // This write non atomic?
  Status[dmap] = new_state;
  waylock->unlock();
  return true;
}

// Probe this way for a line.
//
// Returns true when entry dmap is not invalid and its tag equals addr.  On a
// hit with a non-null clinep, *clinep is pointed at the line's bytes in the
// data RAM.  max_delay is raised to (never summed with) the tag latency and,
// when data is read, the data latency: the ways are probed in parallel, so
// the caller wants the maximum over them.
bool cache64::cacheway::lookup(u64_t addr, int dmap, u8_t **clinep, sc_time &max_delay)
{
  POWER3(parent->pw_module_base::record_energy_use(Tags->m_read_energy_op)); // Read tags

#if !MANUAL_DELAY
  max_delay = max(max_delay, Tags->m_sr_latency);
#endif

  // Miss: entry empty or holding a different line.
  if (!(Status[dmap] != invalid && (*Tags)[dmap] == addr))
    return false;

  // Hit, but the caller only wanted to know about presence.
  if (!clinep)
    return true;

  // Hit with data wanted: account for reading the whole line.
  int ops = parent->geom.linesize * 8 / Data->width;
  POWER3(parent->pw_module_base::record_energy_use(Data->m_read_energy_op * ops)); // Read data
#if !MANUAL_DELAY
  max_delay = max(max_delay, Data->m_sr_latency * ops);
#endif
  CTRC(cout << parent->name() << ": ops = " << ops 
       << " + m_sr_latency = " << Data->m_sr_latency << endl);
  *clinep = Data->read8p(dmap * parent->geom.linesize);
  return true;
}


// clean: write out dirty contents: only some lanes of a burst may be dirty and some bursts may not need issuing.
//
// delay - passed to b_transport on the parent's initiator socket for each write issued.
// ext   - attached to the outgoing transaction as its cache_miss_extension.
//
// If the buffer is empty (m_addr == -1) nothing happens.  Otherwise the entry
// is released straight away (m_addr set to -1, evict_addr set to the address
// being written) and the buffered line is written to secondary storage one
// block at a time, using byte enables taken from the Dirty flags.  Blocks with
// no dirty byte are not written.  evict_addr goes back to -1 at the end.
// The Dirty flags are not cleared here.
void cache64::write_buffer::clean(sc_time&delay, cache_miss_extension* ext)
{
  wbl.lock();
  if (m_addr == -1)
    { // Empty buffer: nothing to write back.
      wbl.unlock();
      return;
    }

  // Release the entry while remembering which address is being evicted.
  u64_t addr_m = m_addr;
  evict_addr = m_addr;
  m_addr = -1;
  wbl.unlock();

  // If a read of the same line is in flight from secondary storage, flag it
  // so that it is marked in invalid_reqs.
  parent->ml.lock();
  if(parent->secondary_reqs.find(addr_m) != parent->secondary_reqs.end())
    parent->invalid_reqs.insert(addr_m);
  parent->ml.unlock();

  PW_TLM_PAYTYPE* trans = parent->cache_miss_mm.allocate(); 
  trans->set_auto_extension<cache_miss_extension>(ext);
  trans->acquire();

  u8_t data[MAX_LINESIZE];
  u8_t lanemask[MAX_LINESIZE]; 
  for(int i = 0; i < MAX_LINESIZE; i++) {
    lanemask[i] = 0x00;
    data[i] = 0;
  }

  // data length will either be secondary block size if it is less than
  // the size of cache line otherwise it is going to be cacheline
  int dl = linesize >= parent->geom.secondary_blocksize_bytes ?
    parent->geom.secondary_blocksize_bytes :
    linesize;
  trans->set_data_length(dl);

  trans->set_byte_enable_length(linesize);
  trans->set_byte_enable_ptr((u8_t *)&lanemask); // All bytes to be operated on if ptr is null.
  trans->set_streaming_width(linesize);//??

  trans->set_write();

  for (int d=0; d<linesize; d += parent->geom.secondary_blocksize_bytes)       // linesize read in one go?
    {
      // Build the byte-enable mask for this block from the Dirty flags.
      // Irrespective of the system being modelled, the cache and main memory use simulator workstation endianness. This is little for x86.
      bool w = false;
      for (int i=0; i<dl; i++) 
	{
	  if (Dirty[d+i]) 
	    {
	      lanemask[i] = 0xFF;
	      w = true;
	    }
	  else 
	    lanemask[i] = 0x00;
	}

      if (!w)
	{
	  CTRC(printf("%s: there are no dirty bits in a line 0x%lx\n", parent->name(), evict_addr));
	  continue;
	}

      // before writing buffer to secondary memory we need to
      // invalidate cache line from consistency group, but by
      // MESI definition we should not have cache line in 
      // other consistency group

      CTRC(printf("%s: line size = %d, secondary blocksize bytes = %d\n",
		  parent->name(), linesize, parent->geom.secondary_blocksize_bytes));
      // Only dl bytes are transferred, so only dl bytes are traced.
      C1PTRC(addr_m+d, printf("%s: write buffer addr: 0x%lx sends ", 
		  parent->name(), addr_m+d);
	   for(int k = 0; k < dl; k++)
	     printf("%02x", *(cline.read8p(d)+k));
	   printf(" lanes = ");
	   for(int k = 0; k < dl; k++)
	     printf("%02x", lanemask[k]);
	   printf("\n"));

      trans->set_response_status( tlm::TLM_INCOMPLETE_RESPONSE );
      trans->set_address(addr_m + d);

      // we need to make a copy of the write buffer content as
      // we have already released the entry to be overwritten
      // by someone else.
      // Copy exactly dl bytes: that is the transaction's data length, and it
      // never exceeds linesize.  The old bound was secondary_blocksize_bytes,
      // which may be larger than the line and ran past both cline and data[].
      for(int k = 0; k < dl; k++)
	data[k] = *(cline.read8p(d)+k);
      // Reference to a byte view of the private copy.
      trans->set_data_ptr(&data[0]);

      POWER3(PW_TLM3(trans->pw_set_origin(parent, PW_TGP_DATA | PW_TGP_LANES | PW_TGP_ADDRESS | PW_TGP_ACCT_SRC, &parent->secondary_bus_tracker))); // Write out
      parent->inita_socket->b_transport(*trans, delay);
      POWER3(PW_TLM3(trans->pw_terminus(parent)));
      if (trans->is_response_error())
	{
	  char txt[256];
	  snprintf(txt, 256, "%s: Error from secondary write addr=" PFX64 ", response status = %s", 
		   parent->name(), addr_m, trans->get_response_string().c_str());
	  SC_REPORT_ERROR(__FILE__, txt);
	} 
    }
  trans->release();

  // Eviction finished: no address is in flight any more.
  wbl.lock();
  evict_addr = -1;
  wbl.unlock();
}

// Cleans every line of every way, then the write buffer if there is one.
// Each cacheway::clean call is made without an extension and without forcing.
void cache64::clean(sc_time &delay)
{
  for (int way = 0; way < geom.ways; ++way)
    {
      cacheway *const cw = Cont[way];
      for (int line = 0; line < geom.dmapping; ++line)
	cw->clean(delay, line, 0);
    }

  // Todo: some of this might be done in parallel, so accumulating a total in delay is not correct - but this is only used on IPL (initial program load).
  if (buf0)
    {
      buf0->clean(delay, 0);
    }
}



// Read one cache line from the next level of memory over the initiator socket.
//
// line_addr - address of the first byte of the line.
// cline     - destination buffer; geom.linesize bytes are requested in
//             blocks of geom.secondary_blocksize_bytes.
// delay     - passed to b_transport for each block.
// w         - not used in the body.
// cme       - attached to the transaction as its cache_miss_extension.
//
// A response error is reported with SC_REPORT_ERROR.  The function itself
// always returns 0.
// Open questions kept from the original: should this socket have width equal
// to secondary_width, and should DMI / uncached / LLSC traffic pass straight
// through?
int cache64::secondary_lookup(u64_t line_addr, 
			      u8_t *cline, 
			      sc_time &delay, 
			      int w,
			      cache_miss_extension* cme)
{
  // Choose the outgoing port from the bank bits of the address (port 0 when
  // there is a single bank).
  int bank = 0;
  if (geom.secondary_banks != 1)
    bank = (line_addr >> geom.secondary_linesize_bits) & (geom.secondary_banks - 1);

  PW_TLM_PAYTYPE *req = cache_miss_mm.allocate();
  req->set_auto_extension<cache_miss_extension>(cme);
  req->acquire();

  // Transfer length per transaction: the smaller of line size and block size.
  int dl = geom.linesize;
  if (geom.linesize >= geom.secondary_blocksize_bytes)
    dl = geom.secondary_blocksize_bytes;

  req->set_data_length(dl);
  req->set_byte_enable_length(0);
  req->set_byte_enable_ptr(0); // All bytes to be operated on if ptr is null.
  req->set_streaming_width(geom.linesize);
  req->set_read();

  int d = 0;
  while (d < geom.linesize)       // linesize read in one go?
    {
      req->set_response_status( tlm::TLM_INCOMPLETE_RESPONSE );
      req->set_address(line_addr + d);
      req->set_data_ptr(cline+d);

      POWER3(PW_TLM3(req->pw_set_origin(this, PW_TGP_ADDRESS | PW_TGP_ACCT_SRC, &secondary_bus_tracker))); // Read a line
      if (inita_socket.size() > 0) // is this IF really needed?
	{
	  inita_socket[bank]->b_transport(*req, delay);
	}
      else
	{
	  inita_socket->b_transport(*req, delay);
	}
      POWER3(PW_TLM3(req->pw_terminus(this)));

      if (req->is_response_error())
	{
	  char txt[256];
	  snprintf(txt, 256, "%s: Error from secondary access addr=" PFX64 ", response status = %s", 
		   name(), line_addr, req->get_response_string().c_str());
	  SC_REPORT_ERROR(__FILE__, txt);
	}

      d += geom.secondary_blocksize_bytes;
    }

  C1TRC(req->get_address(), cout << name() << " : after secondary lookup  delay = " 
            << delay << " and time = " << sc_time_stamp() << endl);
  req->release();

  return 0;
}


// Returns true only when all eight bytes of the 64-bit word at word index
// `offset` are flagged dirty in the write buffer.
bool cache64::write_buffer::word_present(int offset)
{
  const int first_byte = offset * 8;
  for (int b = 0; b < 8; ++b)
    {
      if (!Dirty[first_byte + b])
	return false; // A clean byte means the word is incomplete.
    }
  return true;
}

#if 0
// Disabled code, excluded from compilation by the surrounding #if 0.
// Would read from the write buffer line at byte offset offset*8.
u64_t cache64::write_buffer::word_get(int offset)
{
  return cline.read64(offset*8);
}
#endif

// Dumps entry dmap of this way to fd: a header line giving the entry index,
// its status value and its tag, followed by the line's bytes in hex.
void cache64::cacheway::print(FILE *fd, int dmap)
{
  fprintf(fd, "Cacheway %i mesi=%i " PFX64 ":\n", dmap, Status[dmap], (*Tags)[dmap]);

  const auto line_bytes = parent->geom.linesize;
  const auto first = dmap * line_bytes;
  int off = 0;
  while (off < line_bytes)
    {
      fprintf(fd, "%02X", (*Data)[first + off]);
      ++off;
    }
  fprintf(fd, "\n");
}


// Lookup function (not for load-linked).
// Servicef is read/not-write for local work and false for snoop operations.
//
// line_addr - address of the line being looked for.
// dmap      - index of the line within each way.
// loffset   - offset within the line in 64-bit words; used to set datap1.
// datap1    - on a hit or a serviced miss, set to point at the requested word.
// delay     - accumulated with lookup, snoop, fill and insert delays.
// cme       - miss extension passed to the snoop, the fill and the insert.
// nested    - forwarded to do_a_snoop.
//
// Returns the way holding the line, or 0 when the line is absent and servicef
// is false (or when the secondary lookup reports a negative result).
//
// Sequence: probe the ways; on a hit, count it and return.  On a miss, snoop,
// advance the lru counter by one way to choose where the line goes, and
// insert either the snooped data (as shared) or a line fetched from secondary
// storage.  If another request for the same line is already outstanding, wait
// for it to complete and then re-probe the ways instead of fetching again.
cache64::cacheway *cache64::lookup(bool servicef, 
				   u64_t line_addr, 
				   int dmap, 
				   int loffset, 
				   u64_t * &datap1, 
				   sc_time &delay, 
				   cache_miss_extension* cme,
				   bool nested) {
  bool foundf = 0;
  cacheway *cw = 0;
  u8_t *cline = 0;
  sc_time max_lookup_delay = SC_ZERO_TIME;
  C1TRC(line_addr, printf("%s: performing lookup for 0x%lx, dmap = %d\n", name(), line_addr, dmap));
  // Probe the ways in turn; stop at the first that holds the line.
  for (int w=0; w<geom.ways; w++)
    {
      cw = Cont[w];
      bool present = cw->lookup(line_addr, dmap, &cline, max_lookup_delay);
      if (present) {
	C1TRC(line_addr, 
	      (cout << name() << ": found the cache line: " << hex << line_addr 
	      << " delay: " << max_lookup_delay << endl));
	break;
      }
    }
#if !MANUAL_DELAY
  delay += max_lookup_delay;
#endif

  // NOTE(review): this trace is emitted whether or not a line was found.
  CTRC(cout << name() << ": didn't find any lines, delay = " << delay << endl);

  // Miss on an inquiry: report absence without servicing it.
  if (!cline && !servicef) 
    {
      C1TRC(line_addr, printf("inq "));
      return 0;
    }
  if (cline) 
    { // Read hit
      stats.hits += 1;
      datap1 = ((u64_t*)(cline)) + loffset; // loffset in words
      C1TRC(line_addr, printf("%s hit, line_addr=" PFX64 " + %x 0x" PFX64 ", wa = %d\n", name(), line_addr, loffset * 8, *datap1, cw->m_way));
#if MANUAL_DELAY
      delay += clock_period * hit_cycle_time;
#endif

#if 0
      // already accounted in lookup
      int ops = parent->geom.linesize * 8 / Data->width;
      //std::cout << "Tmp debug " << parent->name() << " ops=" << ops << "\n";
      POWER3(parent->pw_module_base::record_energy_use(Data->m_read_energy_op * ops)); // Read data
      delay += Data->m_sr_latency * ops;
#endif

      //CTRC(cw->print(stdout, loffset));

      operate_miss_type(cw, dmap, cme);

      return cw;
    }


  // Miss that must be serviced: snoop first.
  u8_t snooped_data[MAX_LINESIZE];
  int exist = do_a_snoop(nested, servicef, line_addr, delay, cme, &snooped_data[0]);

  // Choose the way that will receive the line by stepping the lru counter.
  lru = (lru == geom.ways - 1) ? 0 : lru + 1;
  cw = Cont[lru];

  bool ins = true;
  if(exist > 0) {
    // NOTE(review): snooped_data is a local array, so this assertion cannot fail.
    assert(snooped_data != 0);
    C1TRC(line_addr, fprintf(stdout, "%s: adding snooped data: ", name());	\
	 for(int i = 0; i < geom.linesize; i++)				\
	   fprintf(stdout, "%02x", snooped_data[i]);			\
	 fprintf(stdout, "\n"));
    stats.snooped_reads += 1;
    ins = insert_line(cw, line_addr, dmap, snooped_data, delay, cme, cacheway::mesi_t::shared);
    assert(ins);
  }
  else {
    assert(exist == 0);
    // Service a read miss from main memory
    u8_t cline10[MAX_LINESIZE];
    // Register this line as being fetched, unless a fetch is already outstanding.
    ml.lock();
    bool requested = true;
    if(secondary_reqs.find(line_addr) == secondary_reqs.end()) {
      secondary_reqs.insert(line_addr);
      requested = false;
    }
    ml.unlock();
    if(!requested) {
      // Fetch, and fetch again for as long as the line is found in
      // invalid_reqs after the fetch.
      do {
	int rc = secondary_lookup(line_addr, cline10, delay, lru, cme);
	// NOTE(review): this early return leaves line_addr in secondary_reqs.
	if (rc<0) return 0;
	ml.lock();
	if(invalid_reqs.find(line_addr) != invalid_reqs.end())
	  invalid_reqs.erase(line_addr);
	else {
	  ml.unlock();
	  break;
	}
	ml.unlock();
      } while(true);

      cacheway::mesi_t ns = cacheway::mesi_t::invalid;

      // Under MOESI the response carried by the extension gives the new state;
      // otherwise the line is installed as exclusive.
      // NOTE(review): cme is dereferenced here without a null check, although
      // it is checked for null near the end of this function.
      if(cc_protocol == MOESI_CC) {
	if(cme->resp == CT_MISS_SHARED)
	  ns = cacheway::mesi_t::shared;
	else if(cme->resp == CT_MISS_EXCLUSIVE)
	  ns = cacheway::mesi_t::exclusive;
	else if(cme->resp == CT_MISS_DIRTY)
	  ns = cacheway::mesi_t::modified;
	else if(cme->resp == CT_MISS_OWNED)
	  ns = cacheway::mesi_t::owned;
	else 
	  assert(0);
	
	assert(ns != cacheway::mesi_t::invalid);
      }
      else
	ns = cacheway::mesi_t::exclusive;

      ins = insert_line(cw, line_addr, dmap, cline10, delay, cme, ns);
      // Fetch complete: deregister and wake anyone waiting on this line.
      ml.lock();
      secondary_reqs.erase(line_addr);
      secondary_req_event.notify();
      ml.unlock();
    }
    else {
      // Someone else is fetching this line: wait until it leaves secondary_reqs.
      bool w = true;
      do {
	wait(secondary_req_event);
	ml.lock();
	w = !(secondary_reqs.find(line_addr) == secondary_reqs.end());
	ml.unlock();
      } while(w);
      // search again in case the data has been loaded in the meantime. find the way
      for (int w=0; w<geom.ways; w++) {
	cw = Cont[w];
	bool present = cw->lookup(line_addr, dmap, &cline, delay);
	if (present) break;
      }
      assert(cline);
      
    }
  }
  
  // A failed insert restarts the whole lookup.
  if(!ins)
    return lookup(servicef, line_addr, dmap, loffset, datap1, delay, cme, nested); // do it again

  else {
    bool present = cw->lookup(line_addr, dmap, &cline, delay); // should hit this time!
    sysc_assert(present);
    //for(int i=0;i<geom.linesize;i++) printf("%02X", cline[i]); printf(" B loffset=%i\n", loffset);
    datap1 = ((u64_t*)cline) + loffset; // loffset in words
    //printf("Miss serviced line_addr=" PFX64 ", loffset=%i %p\n", line_addr, loffset, datap1);
    stats.misses += 1;
    C1TRC(line_addr,  printf("%s: cache data miss\n", name()));
    C1TRC(line_addr, printf("%s: Miss serviced ok (way=%i) addr=" PFX64 ", dmap=%x, loffset=%i 1stdata=0x" PFX64 " \n", name(), cw->m_way, line_addr, dmap, loffset, *datap1));
    // Record the miss for this cache level and type in the extension, if any.
    if(cme) 
      cme->miss[level][my_type] = true;
    return cw;
  }
}

// Writes the two-line column header for the tabular statistics report to fd.
// msg is not used.
void cache64::stat_header(const char *msg, FILE *fd) {
  static const char *const header_lines[] = {
    "   Hits    Misses   Sharing Evictions  Ratio   Writes   Reads   Id\n",
    "------------------------------------------------------------------\n"
  };
  for (int i = 0; i < 2; ++i)
    fputs(header_lines[i], fd);
}

// Prints this cache's statistics to fd (skipped when fd is null) and then,
// when resetf is set, resets the counters.
//
// msg       - label included in the one-line form.
// no_header - selects the labelled one-line form; otherwise a fixed-width row
//             matching the columns printed by stat_header is written.
void cache64::stat_report(const char *msg, FILE *fd, bool resetf, bool no_header)
{
  // Textual names for the configured policies.
  const char *f1 = "WRITE-BACK";
  if (write_policy ==  WRITE_THROUGH)
    f1 = "WRITE_THROUGH";

  const char *f2 = "NO_WRITE_ALLOCATE";
  if (write_miss == READ_ALLOCATE) f2 = "READ_ALLOCATE";
  else if (write_miss == NO_READ_ALLOCATE) f2 = "NO_READ_ALLOCATE";
  else if (write_miss == WRITE_ALLOCATE) f2 = "WRITE_ALLOCATE";

  const char *f3 = "NO_WRITE_ALLOCATE";
  if (store_miss == READ_ALLOCATE) f3 = "READ_ALLOCATE";
  else if (store_miss == NO_READ_ALLOCATE) f3 = "NO_READ_ALLOCATE";
  else if (store_miss == WRITE_ALLOCATE) f3 = "WRITE_ALLOCATE";

  // Hit ratio over all lookups, or zero when there were none.
  int s = stats.hits + stats.misses;
  const double ratio = (s) ? float(stats.hits) / float(s) : 0.0;

  if (fd)
    {
      if (no_header)
	{
	  fprintf(fd, "%s: %s: Hits=%i  Misses=%i ratio=%1.3f. SharingEvictions=%i SnoopedReads=%i  Writes=%i, Reads=%i %s/%s/%s\n", 
		  name(), msg,
		  stats.hits, stats.misses, ratio,
		  stats.sharing_evictions, stats.snooped_reads,
		  stats.writes, stats.reads,
		  f1, f2, f3);
	}
      else
	{
	  fprintf(fd, "%7d%10d%20d%7.3f%9d%8d %s/%s/%s  %s\n",
		  stats.hits, stats.misses, stats.sharing_evictions, ratio,
		  stats.writes, stats.reads,
		  f1, f2, f3,
		  name());
	}
    }

  if (resetf)
    stats.reset();
}

// Adds this cache's hit, miss, sharing-eviction, write and read counters into
// total.  The snooped_reads counter is not accumulated here.
void cache64::stat_accumulate(stats_t& total) {
  const auto &mine = stats;

  total.hits              += mine.hits;
  total.misses            += mine.misses;
  total.sharing_evictions += mine.sharing_evictions;
  total.writes            += mine.writes;
  total.reads             += mine.reads;
}

// Prints a separator rule followed by one fixed-width row holding these
// counters and the hit ratio (zero when there were no lookups).
void cache64::stats_t::totals(FILE* fd) {
  int s = hits + misses;
  const double ratio = (s) ? float(hits) / float(s) : 0.0;

  fputs("------------------------------------------------------------------\n", fd);
  fprintf(fd, "%7d%10d%20d%7.3f%9d%8d\n",
	  hits, misses, sharing_evictions, ratio, writes, reads);
}


void cache64::end_of_simulation()
{
  stat_report("End of simulation", stdout);
}

// Constructor: creates an empty one-line write buffer.
// Both the buffered address and the evicting address start at -1 (none), and
// the first `linesize` per-byte dirty flags are cleared.
cache64::write_buffer::write_buffer(cache64 * parent, int linesize, int secondary_width):
  parent(parent),
  cline("writebuffer_cline", 8, linesize),
  linesize(linesize),
  secondary_width_bytes(secondary_width/8),
  wbl("write_buffer_lock")
{
  m_addr = -1; // invalid
  evict_addr = -1;

  int byte = 0;
  while (byte < linesize)
    {
      Dirty[byte] = false;
      ++byte;
    }
}




// If the buffer currently holds line `addr`, returns a pointer to the 64-bit
// word at word offset `loffset` within it; otherwise returns 0.
// TODO: energy and delay of reading the write buffer are not accounted.
u64_t *cache64::write_buffer::hit(u64_t addr, int loffset) 
{ 
  if (addr != m_addr)
    return 0;

  return (u64_t *)(cline.read8p(loffset*8));
}



// Flags bytes of the buffered line as dirty.
//
// len     - number of bytes written, starting at word offset loffset.
// loffset - offset of the first byte, in 64-bit words.
// lanes   - byte-enable array, or null to mark every byte.
// bel     - length of lanes; the array is applied repeatedly (index modulo
//           bel).  Zero also means mark every byte.
void cache64::write_buffer::mark_dirty(int len, int loffset, u8_t *lanes, int bel)
{
  const bool every_byte = (!lanes || !bel);
  const int first = 8*loffset;

  for (int i = 0; i < len; ++i)
    {
      if (every_byte || lanes[i % bel])
	Dirty[i+first] = true;
    }
}

// Takes ownership of the (empty) write buffer for line `addr`.
//
// Under the MOESI protocol a data-less write transaction for the line is
// first sent on the parent's initiator socket (the invalidate message to the
// other cores), and it is asserted that no error came back.  Then the buffer's
// address and dmap are recorded and all dirty flags are cleared under the
// buffer lock.
//
// Returns a pointer to the 64-bit word at word offset loffset in the buffer.
// The buffer must be empty on entry (asserted).
u64_t *cache64::write_buffer::init(u64_t addr, 
				   int loffset, 
				   int dmap,
				   cache_miss_extension* ext,
				   sc_time& delay)
{
  sysc_assert1(parent, m_addr == -1); // must be empty  

  if (parent->cc_protocol == MOESI_CC)
    {
      // also send the invalid message to all other cores
      PW_TLM_PAYTYPE *inval = parent->cache_miss_mm.allocate();
      inval->set_auto_extension<cache_miss_extension>(ext);
      inval->acquire();

      inval->set_data_length(parent->geom.linesize);
      inval->set_data_ptr(0);
      inval->set_write();
      inval->set_response_status(tlm::TLM_INCOMPLETE_RESPONSE);
      inval->set_address(addr);

      parent->inita_socket->b_transport(*inval, delay);
      assert(inval->is_response_error() == false);

      inval->release();
    }

  wbl.lock();
  m_addr = addr;
  m_dmap = dmap;
  // A fresh entry starts with no dirty bytes.
  int byte = 0;
  while (byte < linesize)
    Dirty[byte++] = false;
  wbl.unlock();

  // TODO initial power and delay need accounting here.
  return (u64_t *)(cline.read8p(loffset*8));
}



// Constructor
//
// Records the configuration, derives the cache geometry, allocates the ways
// and the optional write buffer, registers the target-socket callbacks and
// creates report items for four statistics counters.
//
// bytes              - total cache size in bytes.
// ways               - associativity (1 means directly mapped).
// linesize           - line size in bytes; must be a multiple of 8 and no
//                      larger than MAX_LINESIZE.
// secondary_width    - width of the secondary port in bits (it is divided by 8
//                      to get bytes); must be a multiple of 8 and at least 64.
// secondary_bl       - multiplied by the port width in bytes to give the
//                      secondary block size.
// secondary_linesize - used (when not smaller than linesize) to derive the
//                      address shift applied when selecting a secondary bank.
// secondary_banks    - number of banks on the secondary side.
// The remaining arguments are stored in members of the same or similar name.
cache64::cache64(sc_module_name mname,  
		 int bytes, 
		 int ways, 
		 int linesize, 
		 int secondary_width,
		 sc_time clock_period,
		 int cache_level,
		 cache_ty ty,
		 uint32_t hct,
		 int secondary_bl, 
		 on_miss_ty store_miss,
		 on_miss_ty write_miss,
		 write_policy_ty write_policy,
		 bool use_write_buffer,
		 int secondary_linesize,
		 int secondary_banks, 
		 bool big_endian,
		 cc_protocol_ty protocol):
  sc_core::sc_module(mname), 
#ifdef TLM_POWER3
  pw_module("power_config_cache64.txt") ,
#endif
  targ_socket("targ_socket"), 
  inita_socket("inita_socket"),
  buf0(0),
  level(cache_level),
  secondary_storage_lreq(-1),
  store_miss(store_miss),
  write_miss(write_miss),
  write_policy(write_policy),
  use_write_buffer(use_write_buffer),
  my_type(ty),
  hit_cycle_time(hct),
  clock_period(clock_period),
  busyFlag(false),
  busyAddr(0),
#ifdef TLM_POWER3
#if PW_TLM_PAYLOAD > 0
  primary_bus_tracker(this),
  secondary_bus_tracker(this),  
#endif
#endif
  ml("cache_lock"),
  cc_protocol(protocol)
  //, busy_flag(false)
{
  traceregions = 0;
  geom.big_endian = big_endian;
  geom.ways = ways;
  geom.linesize = linesize;
  geom.bytes = bytes;
  // Bytes moved by one secondary transaction: port width in bytes times secondary_bl.
  geom.secondary_blocksize_bytes= secondary_width/8 * secondary_bl;
  geom.secondary_bl = secondary_bl;
  // Input sizes are all in bytes.
  sysc_assert((secondary_width % 8) == 0);
  sysc_assert(geom.linesize > 0 && (geom.linesize % 8)==0); // Each line must be a multiple of 64 bits.
  sysc_assert(geom.ways > 0); // When ways=1 we are directly mapped
  sysc_assert(bytes > 0 && (bytes % (8 * geom.ways * geom.linesize)) == 0); // total size needs to be a valid multiple.
  sysc_assert(geom.linesize <= MAX_LINESIZE);
  sysc_assert(secondary_width >= 64);
  //  if (geom.secondary_blocksize_bytes < geom.linesize)
    {
      /*CTRC(std::cout << name() << " secondary blocksize= " << geom.secondary_blocksize_bytes << ";  linesize=" << geom.linesize << "\n");*/
    }
  // A secondary block must hold a whole number of lines (at least one).
  sysc_assert(geom.secondary_blocksize_bytes >= geom.linesize);
  sysc_assert((geom.secondary_blocksize_bytes % geom.linesize)==0);
  // number of cache lines that are in one way
  geom.dmapping = bytes / (geom.ways * geom.linesize);
  sysc_assert(bound_log2(geom.dmapping) > bound_log2(geom.dmapping-1)); // ie dmapping is a power of 2.

  // size of the cache lines in words - 1 word = 8 bytes
  geom.word64s_per_line = geom.linesize / 8;

  /*printf("%s: ways=%i, linesize=%i bytes, dmapping=%i (totalsize=%i bytes).\n", 
	 name(), 
	 geom.ways, 
	 geom.linesize, 
	 geom.dmapping, 
	 geom.bytes));*/

  // Need to shift by: (3 for bytes per word) + log words_per_line;
  geom.dmap_shift =  3 + bound_log2(geom.word64s_per_line);

  geom.loffset_shift = 3; // Always 3 for 64 bit words when byte addressed.

  // The bank-select shift is taken from whichever of the two line sizes is larger.
  if(secondary_linesize < linesize)
    geom.secondary_linesize_bits = bound_log2(linesize);
  else
    geom.secondary_linesize_bits = bound_log2(secondary_linesize);
  geom.secondary_banks = secondary_banks;

  // End of geometry computations.
  // One cacheway object per way; lru is the counter used to pick the next way to fill.
  Cont = (cacheway **) malloc(geom.ways * sizeof(cacheway*));
  for (int w=0; w<geom.ways; w++)
    {
      Cont[w] = new cacheway(this, w);
    }
  lru = 0;
  //clean(delay); // should power up as clean!ss

#ifdef TLM_POWER3
  std_energy_op = pw_energy((double) (0.1 * 64), pw_energy_unit::PW_pJ); 
  // based on: just made up!
  //POWER3(set_excess_area(pw_length(150 * geom.ways, PW_um), pw_length(150,  PW_um)));

#endif

  // Register callbacks for incoming interface method calls
  targ_socket.register_b_transport(this, &cache64::b_transport);
  targ_socket.register_nb_transport_fw(this, &cache64::nb_transport_fw);
  targ_socket.register_get_direct_mem_ptr(this, &cache64::get_direct_mem_ptr);

  // write buffer
  if(use_write_buffer)
    buf0 = new write_buffer(this, linesize, secondary_width);

  // Report items for the statistics counters; the returned pointers are not
  // kept here.
  new tenos_report_item(name(), "snooped_reads", &stats.snooped_reads);
  new tenos_report_item(name(), "hits", &stats.hits);
  new tenos_report_item(name(), "misses", &stats.misses);
  new tenos_report_item(name(), "sharing_evictions", &stats.sharing_evictions);

}

//
// 1 of 2 write operations.
//
void cache64::write_through(int id,
				       PW_TLM_PAYTYPE &trans, 
				       sc_time &delay,
				       u64_t addr) 
{
  assert(0);
  // find out to which memory bank we need to send the transaction
  int mask = geom.secondary_banks == 1 ? 0x1 : geom.secondary_banks - 1;
  int bank = (addr >> geom.secondary_linesize_bits) & mask;

  PW_TLM_PAYTYPE fwd; 

  fwd.set_write();
  fwd.set_address(trans.get_address());
  fwd.set_data_length(trans.get_data_length());
  fwd.set_data_ptr(trans.get_data_ptr());
  fwd.set_byte_enable_length(trans.get_byte_enable_length());
  fwd.set_byte_enable_ptr(trans.get_byte_enable_ptr());
  fwd.set_streaming_width(trans.get_streaming_width());
  fwd.set_dmi_allowed(false); 
  fwd.set_response_status(tlm::TLM_INCOMPLETE_RESPONSE);

  POWER3(PW_TLM3(fwd.pw_set_origin(this, PW_TGP_DATA | PW_TGP_ADDRESS | PW_TGP_LANES | PW_TGP_ACCT_SRC, &secondary_bus_tracker))); // Write thru.
  inita_socket[bank]->b_transport(fwd, delay);
  POWER3(PW_TLM3(fwd.pw_terminus(this)));

  if(fwd.is_response_error()) {
    char txt[100];
    sprintf(txt, "Error from b_write_through addr=" PFX64 ", response status = %s",
	     addr, trans.get_response_string().c_str());
    SC_REPORT_ERROR("cache64", txt);
  }
  
}

//
// 2 of 2 write operations.
//
// Write-back path for a cached write.
//
// The destination word pointer (wdatap) is found in this order:
//   1. the write buffer, if it already holds this line (coalescing write);
//   2. a way of the set-associative array that already holds the line, which
//      is then moved to the modified state via operate();
//   3. otherwise the write buffer is cleaned, the line is invalidated via
//      line_invalidation(), and the buffer is re-initialised for this line.
//      When no write buffer is configured the transaction is forwarded to
//      the secondary socket instead and the function returns.
// Finally the payload is copied to wdatap under byte-lane control.
//
// Under MOESI a write with a null data pointer is treated as a data-less
// invalidation message and is forwarded for the whole line.
//
//   trans     - the write transaction
//   delay     - annotated delay, increased by the modelled latencies
//   line_addr - addr with the within-line bits cleared
//   loffset   - offset within the line (b_access computes it in 64-bit words)
//   addr      - original byte address (used for tracing only)
//   len       - payload length in bytes
//   dmap      - index into each way for this address
//   ptr       - payload data pointer
//   cme       - optional cache-miss extension handed on to the helpers
void cache64::write_back(PW_TLM_PAYTYPE &trans,
                         sc_time &delay,
                         u64_t line_addr,
                         int loffset,
                         u64_t addr,
                         u32_t len,
                         int dmap,
                         u8_t* ptr,
                         cache_miss_extension* cme)
{

  if(cc_protocol == MOESI_CC && trans.is_write() && trans.get_data_ptr() == 0) {
    // this is invalidation message broadcasted from lower level
    // caches, as there is no data we just need to forward it
    trans.set_data_length(geom.linesize);
    trans.set_address(line_addr);

    inita_socket->b_transport(trans, delay);
    assert(trans.is_response_error() == false);

    return;
  }

  stats.writes += 1;
  u8_t *lanes = trans.get_byte_enable_ptr();
  u64_t *wdatap = 0;
  int bel = trans.get_byte_enable_length();
  if(buf0) {
    // The write-buffer lock is held only around the hit() probe.
    buf0->wbl.lock();
    wdatap = buf0->hit(line_addr, loffset);
    buf0->wbl.unlock();
  }

  if(wdatap) // Write coalesc with existing dirty data?
    {
      C1TRC(addr, printf("%s: warm buffered write op addr=" PFX64 " len=%i bel=%i dmap=0x%x    dmap_shift=%i\n", name(), addr, len, bel, dmap, geom.dmap_shift));
      buf0->mark_dirty(len, loffset, lanes, bel);
      // wbl was already released right after the probe above, so it must
      // not be unlocked a second time here.

      delay += buf0->data_latency(len);
    }
  else  // If the line is in the cache then update that, else use write buffer.
    {
      cacheway *rp = lookup(false, line_addr, dmap, loffset, wdatap, delay, cme);
      if (wdatap)
        {
          C1TRC(addr, printf("%s: warm write op addr=" PFX64 " dmap=0x%x    dmap_shift=%i\n", name(), addr, dmap, geom.dmap_shift));

          operate(rp, cacheway::mesi_t::modified, line_addr, dmap, delay);
        }
      else
        {
          if(!buf0) {
            // No write buffer: pass the write straight to the next level.
            inita_socket->b_transport(trans, delay);
            return;
          }
          else {
            C1TRC(addr, printf("%s: cold buffered write op addr=" PFX64
                        " len=%i bel=%i " PFX64
                        " dmap=0x%x    dmap_shift=%i\n",
                        name(), addr, len, bel, line_addr, dmap, geom.dmap_shift));

            // Flush whatever the one-place buffer currently holds, then make
            // sure no cache in the group still holds this line.
            buf0->clean(delay, cme);
            line_invalidation(line_addr, loffset, dmap, cme, delay);

            C1TRC(addr, printf("%s: writting to buffer addr = " PFX64
                        "\n", name(), addr));
            u8_t* data; u8_t* flags;
            tie(data, flags) = check_lower_level_lines(line_addr,
                                                       buf0->linesize,
                                                       cacheway::invalid);
            wdatap = buf0->init(line_addr, loffset, dmap, cme, delay);
            if(data != NULL) {
              // Merge bytes reported by check_lower_level_lines (flag 0xFF)
              // into the write buffer and mark them dirty.
              assert(flags != NULL);
              u8_t* wb_data = buf0->cline.read8p(0);
              for(int i = 0; i < buf0->linesize; i++) {
                if(flags[i] == 0xFF) {
                  wb_data[i] = data[i];
                  buf0->Dirty[i] = true;
                }
              }
            }

            C1TRC(addr, fprintf(stdout, "data in write buffer is: ");	\
                  u8_t* wb_data = buf0->cline.read8p(0);		\
                  for(int i = 0; i < buf0->linesize; i++) 		\
                    fprintf(stdout, "%02x", wb_data[i]);		\
                  fprintf(stdout, "; lane is: ");			\
                  for(int i = 0; i < buf0->linesize; i++) 		\
                    if(buf0->Dirty[i])					\
                      fprintf(stdout, "ff");				\
                    else						\
                      fprintf(stdout, "00");				\
                  fprintf(stdout, "\n"));

            buf0->mark_dirty(len, loffset, lanes, bel);
          }
        }
    }
  sysc_assert(wdatap); // We should, by here, know where we are writing.
  for (int dd =0; dd<len; dd+=8)
    {
      assert(dd+loffset < geom.linesize);
      // Whole-word copy when there are no byte enables or all eight lanes of
      // this word are enabled; otherwise copy the enabled bytes one by one.
      if (!lanes || bel==0 || ((u64_t *)lanes)[(dd % bel)/8] == 0xFFFFffffFFFFffffLLU)
        {
          //CTRC(printf("%s Write64\n", name(), *(u64_t *)lanes));
          wdatap[dd/8] = ((u64_t *)ptr)[dd/8];
        }
      else
        {
          u8_t *ipw = (u8_t *)wdatap;
          C1TRC(addr, printf("%s Write  (lanes=" PFX64 "...)\n", name(), *(u64_t *)lanes));
          for (int l=0;l<8;l++)
            {
              //printf("%i %i  %i   %p %p g=%i data=0x%02X\n", dd, l, bel, ipw, ptr, lanes[(dd+l)%bel], ptr[l+dd]);
              if (lanes[(dd+l)%bel]) ipw[l+dd] = ptr[l+dd];
            }
        }
    }

}

void cache64::b_transport(int id, PW_TLM_PAYTYPE &trans, sc_time &delay) {
  u64_t addr = trans.get_address();
  u64_t line_addr = addr & ~(geom.linesize-1);

  ml.lock();
  
  if(busyFlag) {
    assert(0);
  }
  
  busyFlag = true;
  busyAddr = line_addr;
  ml.unlock();
  //printf("%s: start %x " PFX64 "\n", name(), addr);
  b_access(id, trans, delay, addr);
  //printf("%s: end %x " PFX64 "\n", name(), addr);
  ml.lock();
  busyFlag = false;
  busyAddr = 0;
  ml.unlock();

  C1TRC(addr, fprintf(stdout, "%s: finished access to the cache\n", name()));
}

// TLM-2 blocking transport method
// Performs one read or write on behalf of b_transport().
//
// Flow:
//   * A transaction carrying an llsc_extension first flushes the write
//     buffer (when one exists) and invalidates the line in the group.
//   * Uncached addresses and llsc transactions are passed straight to the
//     selected secondary bank and the function returns.
//   * Otherwise a read is served from the write buffer (if the whole word is
//     present there) or from the cache array via lookup(); a write is
//     dispatched according to write_policy.
//
//   id    - port id passed in by b_transport (see the comment below)
//   trans - the transaction; its response status is set to OK on the normal
//           path and to TLM_BURST_ERROR_RESPONSE when len is not a multiple
//           of 8
//   delay - annotated delay, increased by the modelled latencies
//   addr  - byte address of the access
void cache64::b_access(int id,
                       PW_TLM_PAYTYPE &trans,
                       sc_time &delay,
                       u64_t addr)
{
  // we can call this method with id either equal to zero or equal to one.
  // when it is equal to zero it means that request is coming from the local core
  // while if it is equal to one it means that it is coming from remote node

  // the requests from remote node can come at any time so we need to order them
  // if they are trying to access the same cache line that the local core
  // is accessing at the moment we would need to wait for that event to finish
  // before we process the cache line
  u8_t *ptr = trans.get_data_ptr();
  int dmap = (addr >> (u64_t)geom.dmap_shift) & (geom.dmapping-1LLU);
  int loffset  = (addr >> geom.loffset_shift) & (geom.word64s_per_line-1LLU); // loffset in words
  u64_t line_addr = addr & ~(geom.linesize-1);
  u32_t len = trans.get_data_length();

  sc_time start_time = delay + sc_time_stamp();
  tlm::tlm_command cmd = trans.get_command();

  llsc_extension* linked = 0;
  trans.get_extension(linked);


  cache_miss_extension* c_ext = 0;
  trans.get_extension(c_ext);

  if (linked)
    { // Do an invalidate (on all caches in grp even if we do not have it! ).
      // buf0 only exists when the cache was built with a write buffer, so
      // guard the flush exactly as the other users of buf0 do.
      if (buf0) buf0->clean(delay, c_ext);
      line_invalidation(line_addr, loffset, dmap, c_ext, delay);
      //printf("%s .. cache llsc invalidate\n", name());
    }
  if (UNCACHED_ADDRESS_SPACE64(addr) || linked)
    {
      bool tf = traceregions && traceregions->check(addr, TENOS_TRACE_MEM_READ|TENOS_TRACE_MEM_WRITE);
      int bank = geom.secondary_banks == 1
        ? 0
        : ((addr >> geom.secondary_linesize_bits) & (geom.secondary_banks - 1));

#ifdef TLM_POWER3
      PW_TLM3(trans.pw_log_hop(this,  0,  &secondary_bus_tracker)); // It will pass to the secondary bus.
#endif
      inita_socket[bank]->b_transport(trans, delay);
      if (tf) printf("%s uncached or linked op pass at " PFX64 " op=%s\n", name(), addr, cmd == tlm::TLM_READ_COMMAND ?"read": "write");
#ifdef TLM_POWER3
      PW_TLM3(trans.pw_log_hop(this,  0,  &primary_bus_tracker));
#endif
      return;
    }


  POWER3(PW_TLM3(pw_agent_record l_agent =
                 trans.pw_log_hop(this,
                                  (cmd==tlm::TLM_READ_COMMAND ?
                                   PW_TGP_DATA: PW_TGP_NOFIELDS)
                                  | PW_TGP_ACCT_CKP,
                                  &primary_bus_tracker))); // Read or write but not passthrough.

  sysc_assert(trans.get_data_length() <= 8);

#if 0
  // if id is greater then zero then message wasn't recieved from core
  // but from other module (in this case from L2). This message consists
  // of address and way that we need to invalidate
  if(id > 0) {
    way_extension* ext = 0;
    trans.get_extension(ext);
    assert(ext);
    ml->lock();
    // for now we let messages to race to lock but should consider
    // implementing order based on the timestamp at which msg was received
    invq.push(make_pair(dmap, ext->way));
    ml->unlock();

    trans.set_response_status(tlm::TLM_OK_RESPONSE);
    return;
  }
  // message received form the core
  else {
    ml->lock();
    // invalidate all entries first
    while(!invq.empty()) {
      pair<int, int> e;
      e = invq.front();
      C1TRC(addr, printf("%s: Need to invalidate way %d whose dmap is %lx\n",
                  name(), e.second, e.first));
      Cont[e.second]->Status[e.first] = cacheway::invalid;
      invq.pop();
    }
    ml->unlock();
  }
#endif

  if (len % 8)  {
    C1TRC(addr, printf("%s: Illegal len or wid: len=%i wid=%i\n", name(), len, 64));
    trans.set_response_status(tlm::TLM_BURST_ERROR_RESPONSE);
    return;
  }


  if (cmd == tlm::TLM_READ_COMMAND) {
    bool tf = traceregions && traceregions->check(addr, TENOS_TRACE_MEM_READ);
    if (tf) printf("%s: cache read op at " PFX64 " len=%i dmap=0x%x, dmap_shift=%i,delay=%s\n",
                       name(), addr, len, dmap, geom.dmap_shift, delay.to_string().c_str());

    C1TRC(addr, printf("%s: cache read op at " PFX64 " len=%i dmap=0x%x, dmap_shift=%i,delay=%s\n",
                       name(), addr, len, dmap, geom.dmap_shift, delay.to_string().c_str()));

    stats.reads+=1;
    u64_t *rdatap1 = 0;
    if(buf0) {
      rdatap1 = buf0->hit(line_addr, loffset);
    }

    if(rdatap1) {
      // We can service from write buffer if a complete word is present, otherwise clean write buffer
      // and miss as usual.
      // An improvement would be to check only for the requested read lanes.

      if (buf0->word_present(loffset)) {
        //xprintf("%s: read op addr=" PFX64 " dmap=0x%x    dmap_shift=%i\n",
        //name(), addr, dmap, geom.dmap_shift);
        // Ignore byte lanes on read and return complete word.

        delay += buf0->data_latency(len);
        // TODO power...

        if (c_ext) c_ext->resp = CT_MISS_DIRTY;
      }
      else {
        rdatap1 = 0; // Discard partial hit
        buf0->clean(delay, c_ext); // Write it all out and then lookup using reload on miss.
      }
    }

    if (!rdatap1) {
      C1TRC(addr, printf("%s: line_addr=0x%lx, dmap=0x%x, loffest=0x%x\n", name(), line_addr, dmap, loffset));
      cacheway *rc = lookup(true, line_addr, dmap, loffset, rdatap1, delay, c_ext);
      if (rc) {
        // Ignore byte lanes on read and return complete word.
      }
      else
        sysc_assert(0); // incomplete response
    }

    sysc_assert(rdatap1);
    u64_t ans =  *rdatap1;
    *((u64_t *)ptr) = ans;
    //Data copy: one word (burst length words) from cache to buffer pointed at in payload.
    C1TRC(addr, printf("%s: cache read op result ans=" PFX64 " len=%i\n", name(), ans, len));
    assert(len == 8);
  }
  else if (cmd == tlm::TLM_WRITE_COMMAND) {
    /*C1TRC(addr, printf("%s: cache write op addr=" PFX64 " len=%i dmap=0x%x, dmap_shift=%i, delay=%s, data=" PFX64 "\n",
      name(), addr, len, dmap, geom.dmap_shift, delay.to_string().c_str(), ((u64_t*)ptr)[0]));*/

    switch(write_policy) {
    case WRITE_THROUGH:
      write_through(id, trans, delay, addr);
      break;
    case WRITE_BACK:
      write_back(trans, delay, line_addr, loffset, addr, len, dmap, ptr, c_ext);
      break;
    default:
      assert(0); // Unknown write policy
      assert(len == 8);
    }
  }


  trans.set_response_status(tlm::TLM_OK_RESPONSE);

  sc_time end_time = delay + sc_time_stamp();

  C1TRC(addr, cout << name() << " : at the end delay = " << delay << " and time = "
       << sc_time_stamp() << endl);

#if PW_TLM_PAYLOAD > 0
  POWER3(l_agent.record_energy_use(std_energy_op));
#else
  POWER3(record_energy_use(std_energy_op));
#endif
}



// DMI request handler (registered on targ_socket by the constructor).
// The request is passed unchanged to the secondary initiator socket and the
// result of that call is returned to the requester.
//
//   n        - index of the target socket the request arrived on (unused)
//   trans    - the DMI request payload
//   dmi_data - DMI descriptor filled in by the downstream target
bool cache64::get_direct_mem_ptr(int n, PW_TLM_PAYTYPE &trans, tlm::tlm_dmi &dmi_data)
{
  // The function is declared bool, so the downstream answer must be
  // returned; falling off the end of a non-void function is undefined.
  return inita_socket/*[n]*/->get_direct_mem_ptr(trans, dmi_data);
}

// Overrides the SRAM latency of every way: the data RAM of each way is
// configured from (data_words, data_width) and its tag RAM from
// (tag_words, tag_width); 'factor' is handed to both unchanged.
void cache64::set_sr_latencies(int tag_words,
                               int tag_width,
                               int data_words,
                               int data_width,
                               double factor) {
  int way = 0;
  while(way < geom.ways) {
    Cont[way]->Data->set_sr_latency(data_words, data_width, factor);
    Cont[way]->Tags->set_sr_latency(tag_words, tag_width, factor);
    ++way;
  }
}

// constructor
// Builds a small RAM of 'words' entries, each up to 64 bits wide, stored one
// entry per u64_t.  The storage is not cleared.
//
//   name_ - SystemC module name
//   width - entry width in bits (at most 64)
//   words - number of entries (must be positive)
cache64::smallram64::smallram64(sc_core::sc_module_name name_, int width, int words):
  sc_module(name_),
#ifdef TLM_POWER3
  pw_module("power_config_smallram.txt"),
#endif
  words(words),
  width(width)
{
  sysc_assert(width <= 64);
  sysc_assert(words > 0);
  // Total capacity in bits.  The product is formed in 64 bits so that a
  // large array cannot overflow the int multiplication.
  u64_t l_bits = ((u64_t)words) * ((u64_t)width);
  //CTRC(printf("%s created, %llu bytes\n", name(), l_bits / 8LLU));

#ifdef TLM_POWER3
  pw_power leakage = pw_power(82.0 * l_bits, PW_nW);
  set_static_power(leakage);

  set_fixed_area(pw_area(13359.0 + 4.93/8 * l_bits, PW_squm)); // or from tech file
#endif

  // Base access latency grows with the square root of the capacity.
  m_sr_latency =  sc_time(0.21 + 3.8e-4 *sqrt(float(l_bits)), SC_NS);

  recompute_pvt_parameters();
  Data = (u64_t *)malloc(words * sizeof(u64_t));
  sysc_assert(Data); // allocation must have succeeded
}


// Returns entry 'idx' of the RAM.  One read-energy event is recorded first
// (POWER3 builds), then the index is range-checked.
u64_t cache64::smallram64::read(u64_t idx)
{
  POWER3(pw_module_base::record_energy_use(m_read_energy_op));
  sysc_assert(idx >= 0 && idx < words);
  const u64_t value = *(Data + idx);
  return value;
}

// Stores 'd' into entry 'idx' of the RAM.  One write-energy event is
// recorded first (POWER3 builds), then the index is range-checked.
void cache64::smallram64::write(u64_t idx, u64_t d)
{
  POWER3(pw_module_base::record_energy_use(m_write_energy_op));
  sysc_assert(idx >= 0 && idx < words);
  u64_t *slot = Data + idx;
  *slot = d;
}

// constructor
// Builds a small byte-organised RAM of 'words' entries, each up to 8 bits
// wide, stored one entry per u8_t.  The storage is not cleared.
//
//   name_ - SystemC module name
//   width - entry width in bits (at most 8)
//   words - number of entries, i.e. bytes (must be positive)
cache64::smallram8::smallram8(sc_core::sc_module_name name_, int width, int words):
  sc_module(name_),
#ifdef TLM_POWER3
  pw_module("power_config_smallram.txt"),
#endif
  words(words),
  width(width)
{
  sysc_assert(width <= 8);
  sysc_assert(words > 0);
  // Total capacity in bits.  The product is formed in 64 bits so that a
  // large array cannot overflow the int multiplication.
  u64_t l_bits = ((u64_t)words) * ((u64_t)width);
  //CTRC(printf("%s created, %llu bytes\n", name(), l_bits / 8LLU));

#ifdef TLM_POWER3
  pw_power leakage = pw_power(82.0 * l_bits, PW_nW);
  set_static_power(leakage);

  set_fixed_area(pw_area(13359.0 + 4.93/8 * l_bits, PW_squm)); // or from tech file
#endif

  // Base access latency grows with the square root of the capacity.
  m_sr_latency =  sc_time(0.21 + 3.8e-4 *sqrt(float(l_bits)), SC_NS);

  recompute_pvt_parameters();
  Data = (u8_t *)malloc(words * sizeof(u8_t));
  sysc_assert(Data); // allocation must have succeeded
}


// Returns the byte at offset 'idx'.  One read-energy event is recorded
// first (POWER3 builds), then the offset is range-checked.
u8_t cache64::smallram8::read(u64_t idx)
{
  POWER3(pw_module_base::record_energy_use(m_read_energy_op));
  sysc_assert(idx >= 0 && idx < words);
  const u8_t value = *(Data + idx);
  return value;
}


// Returns the 64-bit word that contains byte offset 'idx' (the low three
// bits of idx are dropped by the >>3 when indexing).  Eight read-energy
// units are recorded (POWER3 builds).
u64_t cache64::smallram8::read64(u64_t idx) // idx is a byte offset, as always.
{
  POWER3(record_energy_use(m_read_energy_op * 8));
  // The eight bytes starting at idx must lie inside the array.  Written as
  // idx+8 <= words so that the last word (idx == words-8) is accepted and a
  // RAM smaller than 8 bytes cannot make the bound wrap around.
  sysc_assert(idx + 8 <= (u64_t)words);
  return ((u64_t *)Data)[idx>>3LLU];
}

// Returns a pointer to the byte at offset 'idx' inside the RAM storage,
// giving the caller direct access to the bytes from there on.
u8_t *cache64::smallram8::read8p(u64_t idx) // idx is a byte offset, as always.
{
  // log power, assuming it will be used.
  POWER3(record_energy_use(m_read_energy_op * 8));
  sysc_assert(idx >= 0 && idx < words);
  u8_t *where = Data + idx;
  return where;
}


// Writes up to 'blen' bytes from 'd' into the RAM starting at byte offset
// 'idx'.  Byte w is stored only when lanes[w] is non-zero.  A null 'lanes'
// pointer means "all bytes enabled", matching how write_back() treats a
// transaction without a byte-enable array.  One write-energy event is
// recorded per byte position (POWER3 builds), whether or not it is enabled.
void cache64::smallram8::write(u64_t idx, int blen, u8_t *d, u8_t *lanes)
{
  for (int w=0;w<blen;w++)
    {
      POWER3(record_energy_use(m_write_energy_op));
      sysc_assert(idx+w >= 0 && idx+w < words);
      if (!lanes || lanes[w]) Data[idx+w] = d[w]; // lanes: todo.
    }
}


// Recomputes the per-access energies and rescales the latency by the supply
// voltage.  Does nothing unless TLM_POWER3 is defined.
// NOTE(review): every call divides the current m_sr_latency by Vcc again, so
// repeated calls compound the scaling -- confirm that is intended.
void cache64::smallram8::recompute_pvt_parameters() // Called when Vcc is changed and so on.
{
#ifdef TLM_POWER3
  u64_t array_bits = words * width;

  // Read energy: a fixed part plus a part proportional to the array size.
  m_read_energy_op = pw_energy(5.0 + 1.2e-4 / 8.0 *array_bits, pw_energy_unit::PW_pJ);
  // rule of thumb!
  m_write_energy_op = 2.0 * m_read_energy_op;

  m_sr_latency = m_sr_latency / get_vcc().to_volts();
#endif
  /*CTRC(cout << name () << ": smallram8 basic latency = " << m_sr_latency
    << " bits = " << array_bits << "\n");*/
}

// Replaces the RAM latency with the value given by the size model
// (0.21 + 3.8e-4 * sqrt(bits)) nanoseconds, divided by 'factor', where
// bits = _words * _width.
void cache64::smallram8::set_sr_latency(int _words,
                                        int _width,
                                        double factor) {
  const float bits = float(_words * _width);
  m_sr_latency = sc_time((0.21 + 3.8e-4 * sqrt(bits))/factor, SC_NS);

  /*CTRC(cout << ": smallram8 basic latency = " << m_sr_latency
    << " bits = " << _words * _width << endl);*/
}

// Recomputes the per-access energies and rescales the latency by the supply
// voltage.  Does nothing unless TLM_POWER3 is defined.
// NOTE(review): every call divides the current m_sr_latency by Vcc again, so
// repeated calls compound the scaling -- confirm that is intended.
void cache64::smallram64::recompute_pvt_parameters() // Called when Vcc is changed and so on.
{
#ifdef TLM_POWER3
  u64_t l_bits = words * width;
  m_read_energy_op = pw_energy(5.0 + 1.2e-4 / 8.0 *l_bits, pw_energy_unit::PW_pJ);
  // smallram64::write() records m_write_energy_op, so it has to be derived
  // here too; use the same rule of thumb as smallram8.
  m_write_energy_op = 2.0 * m_read_energy_op;

  pw_voltage vcc = get_vcc();
  m_sr_latency = m_sr_latency / vcc.to_volts();
#endif

  /*CTRC(cout << name() << ": smallram64 basic ltency = " << m_sr_latency
    << " bits = " << l_bits << "\n");*/

}

// Replaces the RAM latency with the value given by the size model
// (0.21 + 3.8e-4 * sqrt(bits)) nanoseconds, divided by 'factor', where
// bits = _words * _width.
void cache64::smallram64::set_sr_latency(int _words,
                                         int _width,
                                         double factor) {
  const float bits = float(_words * _width);
  m_sr_latency = sc_time((0.21 + 3.8e-4 * sqrt(bits))/factor, SC_NS);

  /*CTRC(cout << name() << ": smallram64 basic latency = " << m_sr_latency
    << " bits = " << _words * _width << endl);*/
}


// Latency charged for moving 'len_in_bytes' bytes to or from the write
// buffer: 200 ps for every (16 + len) / 16 step, using integer division.
sc_time cache64::write_buffer::data_latency(int len_in_bytes)
{
  const int steps = (16 + len_in_bytes) / 16;
  return sc_time (steps * 200, SC_PS); // Made up figure.
}



// Hands out a transaction object: a recycled one from the free list when
// available, otherwise a newly constructed one managed by this pool.  The
// list is only touched while 'lck' is held.  If the object's reference
// count is negative it is acquired until the count reaches zero; the count
// is asserted to be zero on return.
PW_TLM_PAYTYPE* cache_miss_mm_t::allocate() {
  PW_TLM_PAYTYPE* ptr;

  lck.lock();
  if(!free_list) {
    // Nothing to recycle.
    ptr = new (PW_TLM_PAYTYPE)(this);
  }
  else {
    // Take the transaction from the head node; that node is now the spare
    // (empty) node and the list head advances.
    ptr = free_list->trans;
    empties = free_list;
    free_list = free_list->next;
  }
  lck.unlock();

  if(ptr->get_ref_count() < 0) {
    while(ptr->get_ref_count() != 0) {
      ptr->acquire();
    }
  }

  assert(ptr->get_ref_count() == 0);

  return ptr;
}


// Returns a transaction to the pool.  Any cache_miss_extension is detached
// first (clearing is enough: the original author notes the extensions live
// on the stack and need no release), the transaction is reset, and it is
// then stored in a list node under 'lck' for allocate() to reuse.
void cache_miss_mm_t::free(PW_TLM_PAYTYPE* trans)
{
  cache_miss_extension* miss_ext = 0;
  trans->get_extension(miss_ext);
  if(miss_ext != 0) {
    trans->clear_extension<cache_miss_extension>(miss_ext);
  }

  // clears the extension pointers for sure
  trans->reset();

  lck.lock();
  if(empties == 0) {
    // No spare node: create one and link it in front of the current head.
    access* fresh = new access;
    fresh->next = free_list;
    fresh->prev = 0;
    if(free_list != 0) {
      free_list->prev = fresh;
    }
    empties = fresh;
  }
  // The spare node becomes the new head and carries the freed transaction;
  // the node before it (if any) is the next spare.
  free_list = empties;
  free_list->trans = trans;
  empties = free_list->prev;
  lck.unlock();
}

// Constructs a cache that takes part in a consistency group.  All geometry
// and policy arguments are forwarded to the cache64 base constructor; when a
// group is supplied the new cache registers itself with it.
consistent_cache64::consistent_cache64(sc_module_name name,
                                       consistent_group* grp,
                                       int bytes,
                                       int ways,
                                       int linesize,
                                       int secondary_width,
                                       int cache_level,
                                       cache_ty ty,
                                       sc_time clock_period,
                                       uint32_t hit_cycles,
                                       int bl,
                                       on_miss_ty store_miss,
                                       on_miss_ty write_miss,
                                       write_policy_ty write_policy,
                                       bool use_write_buffer,
                                       int secondary_linesize,
                                       int secondary_banks,
                                       bool big_endian,
                                       cc_protocol_ty protocol)
  : cache64(name,
            bytes,
            ways,
            linesize,
            secondary_width,
            clock_period,
            cache_level,
            ty,
            hit_cycles,
            bl,
            store_miss,
            write_miss,
            write_policy,
            use_write_buffer,
            secondary_linesize,
            secondary_banks,
            big_endian,
            protocol),
    grp(grp)
{
  // A null group means the cache runs without consistency peers.
  if(grp != 0) {
    grp->add(this);
  }
}

// Asks every other member of the consistency group whether it holds the
// line at 'addr' and returns how many of them reported it present.
//
//   nested       - when true no peer is consulted and 0 is returned
//   servicef     - peers are snooped with exclude = !servicef (for a write
//                  we will want to evict others on a share)
//   addr         - address being snooped
//   delay        - annotated delay, passed to each peer in turn
//   cme          - optional cache-miss extension passed to the peers
//   snooped_data - buffer handed to each peer's grp_snoop()
//
// On snoopy hardware the peers would be queried in parallel.  Delays are
// accumulated serially here; care is needed because part of a peer's delay
// may be genuinely serialised (e.g. contention for the next-level store).
//
// NOTE(review): the closing sanity checks require all reported states to be
// identical whenever more than one peer has the line -- confirm this also
// holds under MOESI, where an owner and sharers could coexist.
int consistent_cache64::do_a_snoop(bool nested,
                                   bool servicef,
                                   u64_t addr,
                                   sc_time& delay,
                                   cache_miss_extension* cme,
                                   u8_t* snooped_data) {
  int holders = 0;
  set<cacheway::mesi_t> states;

  // Enquire for this cache line in my consistent neighbours if any.
  const bool enquire = grp && !nested;
  if (enquire) {
    for (int member = 0; member < grp->target_members; member++) {
      if (grp->Members[member] != this) {
        cacheway::mesi_t state;
        const bool exclude = !servicef;
        if (grp->Members[member]->grp_snoop(addr, exclude, delay, cme,
                                            state, snooped_data)) {
          holders += 1;
          states.insert(state);
        }
      }
    }
  }

  // Safety check on the combination of states reported by the peers.
  if(holders > 1) {
    assert(states.size() == 1);
    assert(*states.begin() == cacheway::mesi_t::shared);
  }
  else if(holders == 1) {
    // A single holder may even be in the shared state: the caches it was
    // shared with may have replaced the line by some other line since.
    assert(states.size() == 1);
    assert(*states.begin() == cacheway::mesi_t::exclusive
           || *states.begin() == cacheway::mesi_t::modified
           || *states.begin() == cacheway::mesi_t::owned
           || *states.begin() == cacheway::mesi_t::shared);
  }

  return holders;
}

// Answers a snoop from another member of the consistency group.
//
// If the write buffer holds the line it is flushed and false is returned.
// Otherwise the line is looked up in the ways.  When found, lower-level
// lines are checked for newer bytes (merged into this copy when present),
// the line's status is adjusted according to the protocol and its current
// state, the line contents are copied to 'data', 'state' reports the state
// the line was found in (modified when lower-level data was merged) and
// true is returned.  When the line is not held, 'state' is set to invalid
// and false is returned.
//
//   addr    - address being snooped
//   exclude - true when the requester intends to write
//   delay   - annotated delay, increased by lookups and evictions
//   cme     - optional cache-miss extension passed to the helpers
//   state   - out: state reported to the requester (not written on the
//             write-buffer path)
//   data    - out: receives geom.linesize bytes when true is returned
bool consistent_cache64::grp_snoop(u64_t addr,
                                   bool exclude,
                                   sc_time &delay,
                                   cache_miss_extension* cme,
                                   cacheway::mesi_t& state,
                                   u8_t* data)
{
  C1TRC(addr, printf("%s: consistency request for " PFX64 "\n",  name(), addr));
  int dmap = (addr >> (u64_t)geom.dmap_shift) & (geom.dmapping-1LLU);
  int loffset  = (addr >> geom.loffset_shift) & (geom.word64s_per_line-1LLU);
  u64_t line_addr = addr & ~(geom.linesize-1);

  if (buf0) {
    buf0->wbl.lock();
    if(buf0->hit(line_addr, loffset)) {
      C1TRC(addr, printf("%s: has dirty wb for line " PFX64 " + %x and is evicting\n", name(), addr, loffset));
      stats.sharing_evictions += 1;
      buf0->wbl.unlock();
      buf0->clean(delay, cme); // turf it out!
      return false;
    }
    else
      buf0->wbl.unlock();
  }

  u64_t *rdatap1=0;
  cacheway *rc = lookup(false, line_addr, dmap, loffset, rdatap1, delay, cme, true);
  if (rc) {
    C1TRC(addr, printf("%s: snoop has a copy of line " PFX64 " in way %d\n", name(), addr, rc->m_way));
    // check if cache lines at the lower level have been modified
    u8_t* ll_data; u8_t* flags;
    cacheway::mesi_t new_state = cacheway::mesi_t::shared;
    if(cc_protocol == MOESI_CC)
      new_state = cacheway::mesi_t::owned;

    tie(ll_data, flags) = check_lower_level_lines(line_addr,
                                                  geom.linesize,
                                                  new_state);

    u8_t* upper_level_data = rc->Data->read8p(dmap * geom.linesize);

    if(ll_data != NULL) {
      C1TRC(addr, printf("%s: there is data in lower level cache\n", name()));
      assert(flags != NULL);
      // Merge the bytes flagged 0xFF by the lower level into our copy.
      for(int i = 0; i < geom.linesize; i++)
        if(flags[i] == 0xFF)
          upper_level_data[i] = ll_data[i];

      state = cacheway::mesi_t::modified;
      memcpy(data, upper_level_data, geom.linesize);

      switch(cc_protocol) {
        case MESI_CC:
          rc->clean(delay, dmap, cme); // turf it out!
          stats.sharing_evictions += 1;

          rc->waylock->lock();
          rc->Status[dmap] = cacheway::mesi_t::shared;
          rc->waylock->unlock();

          return true;
        case MOESI_CC:
          rc->waylock->lock();
          rc->Status[dmap] = cacheway::mesi_t::owned;
          rc->waylock->unlock();

          return true;
        default:
          assert(0);
      }
    }
    else {
      switch(rc->Status[dmap]) {
        case cacheway::mesi_t::exclusive:
          C1TRC(addr, printf("%s: has exclusive copy of line " PFX64 " + %x\n",
                             name(), addr, loffset));

          // Do not need to write back if exclusive
          //rc->clean(delay, dmap, cme); // turf it out!
          //stats.sharing_evictions += 1;
          state = cacheway::mesi_t::exclusive;
          memcpy(data, rc->Data->read8p(dmap * geom.linesize), geom.linesize);
          rc->waylock->lock();
          rc->Status[dmap] = cacheway::mesi_t::shared;
          rc->waylock->unlock();

          return true;

        case cacheway::mesi_t::shared: {
          if (exclude) {

            assert(0); // not sure if we ever get in here?
            // Snooping a write, shared is no longer an option here,
            // so a sharing eviction.
            // (Spaces around PFX64 are required: without them C++11 lexes
            // the macro name as a user-defined literal suffix.)
            C1TRC(addr,
                  printf("%s: snoop has a copy of line " PFX64 " + %x and is evicting\n",
                         name(), addr, loffset));
            stats.sharing_evictions += 1;
            rc->clean(delay, dmap, cme); // turf it out!
            return false; // Do not stop - other caches may have it in shared mode.
          }
          else {
            C1TRC(addr,
                  printf("%s: snoop has a copy of line " PFX64 " + %x and it is shared",
                         name(), addr, loffset));
            // We have it, and might serve from here in the future.
            // Stop since can't be exclusive elsewhere.
            state = cacheway::mesi_t::shared;
            memcpy(data, rc->Data->read8p(dmap * geom.linesize), geom.linesize);

            return true;
          }
        }

        case cacheway::mesi_t::modified: {
          C1TRC(addr,
                printf("%s: snoop has a modified copy of line " PFX64
                       " + %x and is evicting\n",
                       name(), addr, loffset));
          rc->clean(delay, dmap, cme); // turf it out!
          stats.sharing_evictions += 1;

          state = cacheway::mesi_t::modified;

          memcpy(data, rc->Data->read8p(dmap * geom.linesize), geom.linesize);

          rc->waylock->lock();
          rc->Status[dmap] = cacheway::mesi_t::shared;
          rc->waylock->unlock();

          return true;
        }

        case cacheway::mesi_t::invalid:
          printf("%s: invalid should not happen\n", name());
          sysc_assert(0);
          break;

        case cacheway::mesi_t::owned:
          C1TRC(addr,
                printf("%s: snoop has a copy of line " PFX64 " + %x and it is owned",
                       name(), addr, loffset));

          state = cacheway::mesi_t::owned; // stays in the same state
          memcpy(data, rc->Data->read8p(dmap * geom.linesize), geom.linesize);

          return true;

        default:
          printf("%s: Should not happen\n", name());
          sysc_assert(0);
      }
    }
  }

  state = cacheway::mesi_t::invalid;
  return false;
}

// Applies state change 'op' to the line at index 'dmap' of way 'cw'.
// If the line was previously shared or owned and the state is actually
// changing, every copy of the line held by the other group members is
// marked invalid first.  The base-class operate() then does the update.
//
//   cw    - the way holding the line (must not be null)
//   op    - the new state requested
//   addr  - line address, used to find the copies held by peers
//   dmap  - index of the line within each way
//   delay - annotated delay; also handed to the peers' lookups
void consistent_cache64::operate(cacheway *cw,
                                 cacheway::mesi_t op,
                                 u64_t addr,
                                 int dmap,
                                 sc_time &delay)
{
  sysc_assert(cw);

  // Sample the current state under the way lock.
  cw->waylock->lock();
  cacheway::mesi_t previous = cw->Status[dmap];
  C1TRC(addr, printf("%s: the previous state of cache line was %d\n", name(), previous));
  cw->waylock->unlock();

  const bool held_elsewhere =
    (previous == cacheway::shared) || (previous == cacheway::owned);

  if(previous != op && held_elsewhere && grp) {
    for(int g = 0; g < grp->target_members; g++) {
      if(grp->Members[g] == this) continue;

      cache64* c = grp->Members[g];

      for(int w = 0; w < c->geom.ways; w++) {
        cacheway* peer_way = c->Cont[w];

        peer_way->waylock->lock();
        if(peer_way->lookup(addr, dmap, 0, delay)) {
          peer_way->Status[dmap] = cacheway::invalid;
          C1TRC(addr, printf("%s: invalidated addr=0x%lx, dmap=0x%x\n", c->name(), addr, dmap));
        }
        peer_way->waylock->unlock();
      }
    }
  }

  // call the method from the base class
  cache64::operate(cw, op, addr, dmap, delay);
}

// Inserts a line into way 'me' and then checks that no group peer holds the
// same line in the modified state while we are inserting it as exclusive.
// In that case our freshly inserted copy is marked invalid and false is
// returned; otherwise the result of cacheway::insert() is returned.
//
// Background (scenario observed by MP): C3 holds line L1 modified.  C0 asks
// for L1, so C3's copy is sent to secondary storage.  Before C0 gets its
// reply, C2 also requests L1; the line is still modified (C0 has not been
// answered yet) so it is sent to secondary storage again.  C2 then gets its
// reply before C0 and writes L1, making whatever C0 finally receives stale.
// This arises from temporal decoupling: each thread keeps a local view of
// time and may run ahead of the kernel's synchronised time until it has to
// synchronise, so wait() calls must be placed carefully for cycle-accurate
// simulation.  The check below catches a peer that modified the line between
// the request and now.  The original author describes it as debug-only with
// no delay/power to model.
// NOTE(review): 'delay' is nevertheless handed to the peers' lookup() --
// confirm that call does not add to it.
bool consistent_cache64::insert_line(cacheway* me,
                                     u64_t addr,
                                     int dmap,
                                     u8_t* data,
                                     sc_time &delay,
                                     cache_miss_extension* cme,
                                     cacheway::mesi_t ns) {

  const bool ins = me->insert(addr, dmap, data, delay, cme, ns);

  if(!grp)
    return ins;

  for(int g = 0; g < grp->target_members; g++) {
    consistent_cache64* peer = grp->Members[g];
    if(peer == this) continue;

    for(int w = 0; w < peer->geom.ways; w++) {
      cacheway* peer_way = peer->Cont[w];
      peer_way->waylock->lock();

      const bool present = peer_way->lookup(addr, dmap, 0, delay);
      const bool overtaken = present
        && peer_way->Status[dmap] == cacheway::mesi_t::modified
        && ns == cacheway::mesi_t::exclusive;
      if(overtaken)
        me->Status[dmap] = cacheway::mesi_t::invalid;

      peer_way->waylock->unlock();

      if(overtaken)
        return false;
    }
  }

  return ins;
}

// Invalidates the line at 'addr' in the caches of the consistency group.
//
// Every group member is visited (this cache is not skipped).  For each:
//   * if its write buffer holds the line, the buffer is cleaned and the
//     whole traversal stops;
//   * otherwise each way is probed; a present line is marked invalid, and a
//     modified or owned line is additionally cleaned.  Finding a modified
//     copy ends the traversal.
//
//   addr    - line address to invalidate
//   loffset - offset within the line, used for the write-buffer probe
//   dmap    - index of the line within each way
//   ext     - optional cache-miss extension passed to clean()
//   delay   - NOTE(review): not updated here; the per-member
//             max_lookup_delay is computed and then dropped -- confirm
//             whether it should be added to 'delay'.
void consistent_cache64::line_invalidation(u64_t addr,
                                           int loffset,
                                           int dmap,
                                           cache_miss_extension* ext,
                                           sc_time& delay) {
  // check to see if other caches in the consistent group have this line
  // if they have it, then they need to be invalidated
  bool traverse = true;
  if(grp)
    for(int g = 0; g < grp->target_members && traverse; g++) {
      sc_time max_lookup_delay = SC_ZERO_TIME;
      cache64* c = grp->Members[g];

      // check first write buffer
      if(c->buf0) {
        c->buf0->wbl.lock();
        if(c->buf0->hit(addr, loffset)) {
          c->buf0->wbl.unlock();
          c->buf0->clean(max_lookup_delay, ext);
          break;
        }
        else c->buf0->wbl.unlock();
      }
      for(int w = 0; w < c->geom.ways && traverse; w++) {
        // check to see if the line exists
        cacheway* cw = c->Cont[w];
        cw->waylock->lock();
        bool present = cw->lookup(addr, dmap, 0, max_lookup_delay);
        //printf("%s: check grp member %i, present=%i\n", name(), g, present);
        if(present)
          switch(cw->Status[dmap]) {
          case cacheway::modified:
            cw->Status[dmap] = cacheway::invalid;
            cw->waylock->unlock();
            cw->clean(max_lookup_delay, dmap, ext, true);
            traverse = false;
            break;
          case cacheway::shared:
          case cacheway::exclusive:
            cw->Status[dmap] = cacheway::invalid;
            cw->waylock->unlock();
            break;
          case cacheway::owned:
            cw->Status[dmap] = cacheway::invalid;
            cw->waylock->unlock();
            cw->clean(max_lookup_delay, dmap, ext, true);
            break;
          default:
            // Unexpected state: release the way lock before flagging it, so
            // a build with assert() compiled out does not leave it held.
            cw->waylock->unlock();
            assert(0);
          }
        else
          cw->waylock->unlock();
      }
    }

  // done
}


// Expands the project's TENOS_KIND_DEFINITION macro for class cache64.
// NOTE(review): the macro is defined outside this file (presumably in
// tenos.h); confirm there what per-class definitions it emits.
TENOS_KIND_DEFINITION(cache64)

// eof
