// Copyright (c) 2016, University of Cambridge
// All rights reserved.
//
// This code was written based on Intel application note 324264-001: How to Benchmark
// Code Execution Times on Intel IA-32 and IA-64 Instruction Set Architectures
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice, this
//   list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright notice,
//   this list of conditions and the following disclaimer in the documentation
//   and/or other materials provided with the distribution.
//
// * Neither the name of the project, the copyright holder nor the names of its
//  contributors may be used to endorse or promote products derived from
//   this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/errno.h>
#include <linux/hardirq.h>
#include <linux/preempt.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/sort.h>

// When defined, do_rdtscp() expands to rdtscpll(); otherwise it falls back to rdtscll().
#define USE_RDTSCP
// A gap between the two TSC reads larger than this many cycles is stored as a sample.
#define THRESHOLD_CYCLES  40
// The measurement loop ends once this many cycles have elapsed since the start timestamp.
#define TIME_CYCLES       12600000000000
// Capacity of the sample buffer; the loop also ends when a sample arrives and the buffer is full.
#define MAXSAMPLES      10000
// Not referenced in the visible part of this file.
#define DEFAULT_TYPE    "accurate"
// Threshold (in cycles) of the first histogram bin; later bins are derived from it.
#define CNT_TH      24
// Number of histogram bins (thresholds, event counters and time accumulators).
#define NUM_OF_BINS 100
// Largest 64-bit unsigned value; not referenced in the visible part of this file.
#define UINT64_MAX (18446744073709551615ULL)
// Not referenced in the visible part of this file.
#define SAVE_TIMING 0

// Fallback definition, used only if rdtscll is not already defined as a macro.
// rdtscll(val): executes RDTSC and stores the 64-bit timestamp in val, taking
// the low half from EAX and the high half from EDX.
// NOTE(review): the "<<32" on an unsigned long assumes a 64-bit long - confirm
// this is only ever built for x86-64.
#ifndef rdtscll
#define rdtscll(val) do { \
     unsigned int __a,__d; \
     asm volatile("rdtsc" : "=a" (__a), "=d" (__d)); \
     (val) = ((unsigned long)__a) | (((unsigned long)__d)<<32); \
} while(0)
#endif
// Fallback definition, used only if rdtscpll is not already defined as a macro.
// rdtscpll(val, aux): emits the raw opcode bytes 0F 01 F9 (the RDTSCP encoding
// per the Intel SDM), stores the 64-bit timestamp built from EDX:EAX in val and
// the value returned in ECX in aux.
// NOTE(review): the "<<32" on an unsigned long assumes a 64-bit long - confirm
// this is only ever built for x86-64.
#ifndef rdtscpll
#define rdtscpll(val, aux) do { \
        unsigned long __a, __d; \
        asm volatile (".byte 0x0f,0x01,0xf9" : "=a" (__a), "=d" (__d), "=c" (aux)); \
        (val) = (__d << 32) | __a; \
} while (0)
#endif


// do_rdtscp(val, cpu): timestamp read selected at compile time.
// With USE_RDTSCP defined it is rdtscpll(val, cpu); without it, it is
// rdtscll(val) and cpu is simply set to 0.
// This macro is not referenced in the visible part of this file.
#ifdef USE_RDTSCP
#define do_rdtscp(val, cpu) rdtscpll(val, cpu)
#else
#define do_rdtscp(val, cpu) do { rdtscll(val); cpu=0; } while (0)
#endif


// Path of the cpuinfo file and the prefix of its "cpu MHz" line.
// Neither macro is referenced in the visible part of this file.
#define CPU_MHZ_FILE        "/proc/cpuinfo"
#define CPU_MHZ_PREFIX        "cpu MHz\t\t: "

// Three-way comparison callback for sorting an array of pointers to
// unsigned long long records.
//
// pa and pb each point at one array slot, i.e. at a pointer to a record.
// Records are ordered by their first element only.
// Returns -1, 0 or +1 when the first record's key is respectively smaller
// than, equal to, or larger than the second record's key.
static int compare_ull(const void *pa, const void *pb)
{
        const unsigned long long *const *lhs_slot = pa;
        const unsigned long long *const *rhs_slot = pb;
        const unsigned long long lhs_key = (*lhs_slot)[0];
        const unsigned long long rhs_key = (*rhs_slot)[0];

        if (lhs_key < rhs_key)
                return -1;
        if (lhs_key > rhs_key)
                return +1;
        return 0;
}

// Module entry point: repeatedly measures the gap between two back-to-back,
// serialized TSC reads and reports how those gaps are distributed.
//
// Each iteration disables preemption and local interrupts, reads the TSC with
// CPUID+RDTSC, reads it again with RDTSCP+CPUID, and re-enables both. The gap
// between the two reads is then:
//   * added to the global event and cycle totals;
//   * counted in every histogram bin whose threshold it exceeds (bins are
//     cumulative; thresholds start at CNT_TH and grow by 1 for bins 1-19, by 10
//     for bins 20-49, by 100 for bins 50-89 and by a factor of 5 afterwards);
//   * stored in buffer[], together with its offset from the start timestamp,
//     when it exceeds THRESHOLD_CYCLES.
// The loop ends when a gap above the threshold arrives while the buffer is
// already full, or once more than TIME_CYCLES cycles have elapsed. The stored
// gaps are then sorted with compare_ull() and their percentiles, followed by
// the histogram, are written to the kernel log.
//
// The inline assembly names 64-bit registers, so this builds for x86-64 only.
//
// Returns 0 on success, or -ENOMEM when an allocation fails; in both cases
// every allocation made here is released before returning.
static int __init hello_start(void)
{
    unsigned long long tsc, tsc2, start, delta;
    unsigned int i, samples = 0;
    uint64_t **buffer = NULL;
    unsigned int samples_num, threshold_usec;
    uint64_t time_cycles;
    uint64_t *samples_th = NULL;
    uint64_t *samples_count = NULL;
    uint64_t *samples_time = NULL;
    uint64_t global_count = 0, global_time = 0;
    unsigned cycles_low, cycles_high, cycles_low1, cycles_high1;
    volatile int variable = 0;
    volatile int done = 0;
    unsigned long flags;
    int ret = -ENOMEM;

    time_cycles = TIME_CYCLES;
    threshold_usec = THRESHOLD_CYCLES;
    samples_num = MAXSAMPLES;

    printk(KERN_ERR "\nCUCL: Kernel TSC test started...");

    // Histogram storage. kcalloc() zeroes the counters and sizes each array
    // by its element type rather than by a pointer type.
    samples_th = kcalloc(NUM_OF_BINS, sizeof(*samples_th), GFP_KERNEL);
    if (!samples_th) {
        printk(KERN_ERR "CUCL: unable to allocate memory for samples_th\n");
        goto out_free;
    }

    samples_count = kcalloc(NUM_OF_BINS, sizeof(*samples_count), GFP_KERNEL);
    if (!samples_count) {
        printk(KERN_ERR "CUCL: unable to allocate memory for samples_count\n");
        goto out_free;
    }

    samples_time = kcalloc(NUM_OF_BINS, sizeof(*samples_time), GFP_KERNEL);
    if (!samples_time) {
        printk(KERN_ERR "CUCL: unable to allocate memory for samples_time\n");
        goto out_free;
    }

    // Bin thresholds, in cycles: fine-grained first, then increasingly coarse.
    samples_th[0] = CNT_TH;
    for (i = 1; i < NUM_OF_BINS; i++) {
        if (i < (NUM_OF_BINS - 80))
            samples_th[i] = samples_th[i - 1] + 1;
        else if (i < (NUM_OF_BINS - 50))
            samples_th[i] = samples_th[i - 1] + 10;
        else if (i < (NUM_OF_BINS - 10))
            samples_th[i] = samples_th[i - 1] + 100;
        else
            samples_th[i] = samples_th[i - 1] * 5;
    }

    // Sample storage: an array of pointers to {gap, offset-from-start} pairs.
    // The pointer array is zeroed so that the cleanup path can kfree() every
    // slot even when only some rows were allocated.
    buffer = kcalloc(samples_num, sizeof(*buffer), GFP_KERNEL);
    if (!buffer) {
        printk(KERN_ERR "CUCL: unable to allocate memory for buffer\n");
        goto out_free;
    }

    for (i = 0; i < samples_num; i++) {
        buffer[i] = kcalloc(2, sizeof(**buffer), GFP_KERNEL);
        if (!buffer[i]) {
            printk(KERN_ERR "CUCL: unable to allocate memory for buffer[%u]\n", i);
            goto out_free;
        }
    }

    // Run the measurement sequence twice before timing anything, so the
    // instructions have been executed at least once. The first timestamp of
    // the last warm-up pass becomes the start of the run.
    for (i = 0; i < 2; i++) {
        asm volatile ("CPUID\n\t"
                      "RDTSC\n\t"
                      "mov %%edx, %0\n\t"
                      "mov %%eax, %1\n\t"
                      : "=r" (cycles_high), "=r" (cycles_low)
                      :
                      : "%rax", "%rbx", "%rcx", "%rdx");
        asm volatile ("RDTSCP\n\t"
                      "mov %%edx, %0\n\t"
                      "mov %%eax, %1\n\t"
                      "CPUID\n\t"
                      : "=r" (cycles_high1), "=r" (cycles_low1)
                      :
                      : "%rax", "%rbx", "%rcx", "%rdx");
    }

    tsc = (((uint64_t)cycles_high << 32) | cycles_low);
    start = tsc;

    // Polling mode
    while (!done) {
        variable = 0;
        preempt_disable();
        raw_local_irq_save(flags);
        // First read
        asm volatile ("CPUID\n\t"
                      "RDTSC\n\t"
                      "mov %%edx, %0\n\t"
                      "mov %%eax, %1\n\t"
                      : "=r" (cycles_high), "=r" (cycles_low)
                      :
                      : "%rax", "%rbx", "%rcx", "%rdx");
        // Second read
        asm volatile ("RDTSCP\n\t"
                      "mov %%edx, %0\n\t"
                      "mov %%eax, %1\n\t"
                      "CPUID\n\t"
                      : "=r" (cycles_high1), "=r" (cycles_low1)
                      :
                      : "%rax", "%rbx", "%rcx", "%rdx");
        raw_local_irq_restore(flags);
        preempt_enable();

        tsc = (((uint64_t)cycles_high << 32) | cycles_low);
        tsc2 = (((uint64_t)cycles_high1 << 32) | cycles_low1);

        // The operands are unsigned, so a backwards-running clock has to be
        // detected by comparing them; their difference can never be negative.
        if (tsc2 < tsc) {
            printk(KERN_ERR "\n\nCUCL: >>>>>>>>>>>>>> CRITICAL ERROR IN TAKING THE TIME!!!!!!\n start = %llu,   end = %llu, variable = %u\n", tsc, tsc2, variable);
        }

        delta = tsc2 - tsc;

        if (delta > THRESHOLD_CYCLES) {
            // Stop, without recording, once the sample buffer is full.
            if (samples >= MAXSAMPLES || samples >= samples_num) {
                done = 1;
                break;
            }
            buffer[samples][0] = delta;
            buffer[samples][1] = tsc2 - start;
            samples++;
        }

        global_count++;
        global_time += delta;
        for (i = 0; i < NUM_OF_BINS; i++) {
            if (delta > samples_th[i]) {
                samples_count[i]++;
                samples_time[i] += delta;
            }
        }

        if (tsc2 - start > TIME_CYCLES) {
            done = 1;
            break;
        }
    }

    printk(KERN_ERR "\nCUCL: now save");
    printk(KERN_ERR "\nCUCL: Test parameters: Threshold %d cycles %llu samples %d\n", threshold_usec, time_cycles, samples_num);

    // Order the recorded samples by gap size so percentiles can be read off.
    sort(buffer, samples, sizeof(buffer[0]), compare_ull, NULL);
    printk(KERN_ERR "CUCL: number of events:\t%u\n", samples);
    printk(KERN_ERR "CUCL: run time:    \t%llu cycles\n\n", (tsc - start));
    if (samples > 0) {
        printk(KERN_ERR "CUCL: min\t\t%llu\n" "1%%\t\t%llu\n" "5%%\t\t%llu\n" "10%%\t\t%llu\n" "25%%\t\t%llu\n" "median\t\t%llu\n" "75%%\t\t%llu\n" "90%%\t\t%llu\n" "95%%\t\t%llu\n" "99%%\t\t%llu\n" "99.9%%\t\t%llu\n"  "max\t\t%llu\n",
               buffer[0][0],
               buffer[samples * 1 / 100][0],
               buffer[samples * 5 / 100][0],
               buffer[samples * 10 / 100][0],
               buffer[samples * 25 / 100][0],
               buffer[samples * 50 / 100][0],
               buffer[samples * 75 / 100][0],
               buffer[samples * 90 / 100][0],
               buffer[samples * 95 / 100][0],
               buffer[samples * 99 / 100][0],
               buffer[samples * 999 / 1000][0],
               buffer[samples - 1][0]);
    }

    printk(KERN_ERR "\n\nCUCL: Global statistics: Total Events\t%llu\tRun time[cycles]\t%llu \n", global_count, global_time);
    for (i = 0; i < NUM_OF_BINS; i++) {
        // Print a non-empty bin only when it holds more events than the next
        // one; the last bin has no successor and is printed whenever it is
        // non-empty. The short-circuit operators keep samples_count[i + 1]
        // from being read past the end of the array.
        if (samples_count[i] > 0 &&
            (i + 1 == NUM_OF_BINS || samples_count[i] > samples_count[i + 1])) {
            printk(KERN_ERR "CUCL: Threshold[cycles]\t%llu\t\tEvents\t%llu\t\tTime[cycles]\t%llu\n", 1 + samples_th[i], samples_count[i], samples_time[i]);
        }
    }
    printk(KERN_ERR "CUCL: successfully ended\n\n");

    ret = 0;

out_free:
    // kfree(NULL) is a no-op, so rows that were never allocated are harmless.
    if (buffer) {
        for (i = 0; i < samples_num; i++)
            kfree(buffer[i]);
    }
    kfree(buffer);
    kfree(samples_time);
    kfree(samples_count);
    kfree(samples_th);
    return ret;
}

// Module exit hook: writes a single KERN_INFO line to the kernel log and does
// nothing else.
static void __exit hello_end(void)
{
    printk(KERN_INFO "Latency - The End\n");
}

// Register hello_start() as the module's init routine and hello_end() as its
// exit routine.
module_init(hello_start);
module_exit(hello_end);


