Orangepath/HPR Logic Synthesis Project: Hardware and Embedded Software Synthesis from Executable Specifications.
Compilation from .net CIL Bytecode (second example)

Kiwi Scientific Acceleration:CSharp Floating Point HPC Early Demo

Source File

The following CSharp file was compiled with mono mcs.

//
// Kiwi Scientific Acceleration Example - Simple floating point tests.
// (C) 2014 DJ Greaves, University of Cambridge, Computer Laboratory.
//
using System;
using System.Text;
using KiwiSystem;
using System.Diagnostics;


public class test49
{
  const int problemSize = 6;

  static double [] data = new double [problemSize];

  static volatile int volx = 100; // This defeats compile-time constant propagation.

  public static void test49_phase0()
        {
           Console.WriteLine("Kiwi Demo - Test49 phase0 starting.");
           for (int i=0; i<problemSize; i++)
           {
             double qfp0 = (double)((volx+i)*3330.2);
             Kiwi.Pause();
             Console.WriteLine("data {0}  qfp0={1}", i, qfp0);
             float qfp1 = (float) qfp0;
             Kiwi.Pause();
             float qfp2 = 7.12345f * (float) i;
             Kiwi.Pause();
             int qfp3 = (int) qfp1;
             Console.WriteLine("                  qfp1={0}  qfp2={1}  qfp3={2}", qfp1, qfp2, qfp3);
             }
           }

  public static void test49_phase1()
        {
           Console.WriteLine("Kiwi Demo - Test49 phase1 starting.");
           Kiwi.Pause();
           for (int i=0; i<problemSize; i++) data[i] = 3.1415;
           data[problemSize-1] = 2.71;

           for (int it=0; it<3; it++)
           {
             Kiwi.Pause();
             data[1] *= 100.0;
             data[2] -= 100.0;
             data[3] /= 100.0;
             data[4] += 100.0;
             for (int i=0; i<problemSize; i++)
             {
               Console.WriteLine("data {0}  is {1}", i, data[i]);
             }
           }
    }
  [Kiwi.HardwareEntryPoint()]
  public static void Main()
        {
           Console.WriteLine("Kiwi Demo - Test49 starting.");
           Kiwi.Pause();
           test49_phase0();
           test49_phase1();
           Console.WriteLine("Test49 done.");
        }
}

Generated RTL from KiwiC

module DUT(input clk, input reset);
function [31:0] hpr_dbl2flt4;
input [63:0] arg;

reg          signi;
reg [10:0]   expi;
reg [51:0]   manti;
reg [7:0]    expo;
reg [22:0]   manto;
reg  overflow, scase_inf, scase_zero, scase_nan, fail;

begin
  { signi, expi, manti } = arg;  // Deconstruct input arg
  scase_zero = (arg[62:0] == 63'd0);
  scase_inf = (expi == 11'h7ff) && (manti == 0);
  scase_nan = (expi == 11'h7ff) && (manti != 0);
// We can report fail on overflow but better to report infinity.
  fail = 0;
  overflow = (expi[10] == expi[9]) ||(expi[10] == expi[8]) ||(expi[10] == expi[7]);
  expo = { expi[10], expi[6:0]};
  manto = manti[51:51-22];
  scase_inf = scase_inf || overflow;
  hpr_dbl2flt4[31]    = signi;
  hpr_dbl2flt4[30:23] = (scase_inf)? 8'hff: (scase_nan)? 8'hff: (scase_zero)? 8'd0: expo;
  hpr_dbl2flt4[22:0]  = (scase_inf)? 23'd0: (scase_nan)? -23'd1: (scase_zero)? 23'd0: manto;
end
endfunction

function [63:0] hpr_flt2dbl3;
input [31:0] darg;
hpr_flt2dbl3 = {darg[31], darg[30], {3{~darg[30]}}, darg[29:23], darg[22:0], {29{1'b0}}};
endfunction

 ... snip ...

module DUT(input clk, input reset);
function [31:0] hpr_dbl2flt4;
input [63:0] arg;

reg          signi;
reg [10:0]   expi;
reg [51:0]   manti;
reg [7:0]    expo;
reg [22:0]   manto;
reg  overflow, scase_inf, scase_zero, scase_nan, fail;

begin
  { signi, expi, manti } = arg;  // Deconstruct input arg
  scase_zero = (arg[62:0] == 63'd0);
  scase_inf = (expi == 11'h7ff) && (manti == 0);
  scase_nan = (expi == 11'h7ff) && (manti != 0);
  fail = 0;
  overflow = (expi[10] == expi[9]) ||(expi[10] == expi[8]) ||(expi[10] == expi[7]);
  expo = { expi[10], expi[6:0]};
  manto = manti[51:51-22];
  scase_inf = scase_inf || overflow;
  hpr_dbl2flt4[31]    = signi;
  hpr_dbl2flt4[30:23] = (scase_inf)? 8'hff: (scase_nan)? 8'hff: (scase_zero)? 8'd0: expo;
  hpr_dbl2flt4[22:0]  = (scase_inf)? 23'd0: (scase_nan)? -23'd1: (scase_zero)? 23'd0: manto;
end
endfunction

function [63:0] hpr_flt2dbl3;
input [31:0] darg;
hpr_flt2dbl3 = {darg[31], darg[30], {3{~darg[30]}}, darg[29:23], darg[22:0], {29{1'b0}}};
endfunction

Full RTL output file: test49.v (Verilog).

Simulation Test Bench

//                                                                                                                                                                                                                   
// Kiwi Scientific Acceleration                                                                                                                                                                                      
// University of Cambridge, Computer Laboratory                                                                                                                                                                      
//                                                                                                                                                                                                                   
// vsys.v - A test wrapper for simulating very simple tests with clock and reset.                                                                                                                                    
// (C) 2010-16 DJ Greaves, University of Cambridge.                                                                                                                                                                  
//                                                                                                                                                                                                                   
//                                                                                                                                                                                                                   
//                                                                                                                                                                                                                   
`timescale 1ns/1ns

module SIMSYS();
   
   reg clk, reset;
   initial begin reset = 1; clk = 1; # 33 reset = 0; end
   always #5 clk = !clk; // 10ns period for clock = 100 MHz (lowish FPGA clock freq!)                                                                                                                                
   initial begin # (100 * 1000 * 1000) $display("Finish HDL simulation on timeout %t.", $time); $finish(); end
   initial begin $dumpfile("vcd.vcd"); $dumpvars(); end

   
   DUT the_dut(.clk(clk), .reset(reset));
   endmodule

Console Output

The above, generated RTL is run on the Icarus verilog simulator.

iverilog vsys.v test49.v /home/djg11/d320/hprls/kiwipro/kiwic/distro/lib/cvgates.v  /home/djg11/d320/hprls/kiwipro/kiwic/distro/lib/cv_fparith.v
./a.out
VCD info: dumpfile vcd.vcd opened for output.
Kiwi Demo - Test49 starting.
Kiwi Demo - Test49 phase0 starting.
data 0  qfp0=333020.000000
                  qfp1=333019.968750  qfp2=0.000000  qfp3=333019
data 1  qfp0=336350.200000
                  qfp1=336350.187500  qfp2=7.123450  qfp3=336350
data 2  qfp0=339680.400000
                  qfp1=339680.375000  qfp2=14.246900  qfp3=339680
data 3  qfp0=343010.600000
                  qfp1=343010.593750  qfp2=21.370348  qfp3=343010
data 4  qfp0=346340.800000
                  qfp1=346340.781250  qfp2=28.493799  qfp3=346340
data 5  qfp0=349671.000000
                  qfp1=349670.968750  qfp2=35.617249  qfp3=349670
Kiwi Demo - Test49 phase1 starting.
phase1: data 0  is 3.141500
phase1: data 1  is 314.150000
phase1: data 2  is -96.858500
phase1: data 3  is 0.031415
phase1: data 4  is 103.141500
phase1: data 5  is 2.710000
phase1: data 0  is 3.141500
phase1: data 1  is 31415.000000
phase1: data 2  is -196.858500
phase1: data 3  is 0.000802
phase1: data 4  is 203.141500
phase1: data 5  is 2.710000
phase1: data 0  is 3.141500
phase1: data 1  is 3141500.000000
phase1: data 2  is -296.858500
phase1: data 3  is 0.000008
phase1: data 4  is 303.141500
phase1: data 5  is 2.710000
Kiwi Demo - Test49 phase1 finished.
Test49 done.
cp vcd.vcd ~/Dropbox

For comparison, output when the same .exe file is run on Mono

MONO_PATH=/home/djg11/d320/hprls/kiwipro/kiwic/distro/support mono test49.exe
Kiwi Demo - Test49 starting.
Kiwi Demo - Test49 phase0 starting.
data 0  qfp0=333020
                  qfp1=333020  qfp2=0  qfp3=333020
data 1  qfp0=336350.2
                  qfp1=336350.2  qfp2=7.12345  qfp3=336350
data 2  qfp0=339680.4
                  qfp1=339680.4  qfp2=14.2469  qfp3=339680
data 3  qfp0=343010.6
                  qfp1=343010.6  qfp2=21.37035  qfp3=343010
data 4  qfp0=346340.8
                  qfp1=346340.8  qfp2=28.4938  qfp3=346340
data 5  qfp0=349671
                  qfp1=349671  qfp2=35.61725  qfp3=349671
Kiwi Demo - Test49 phase1 starting.
data 0  is 3.1415
data 1  is 314.15
data 2  is -96.8585
data 3  is 0.031415
data 4  is 103.1415
data 5  is 2.71  
data 0  is 3.1415
data 1  is 31415
data 2  is -196.8585
data 3  is 0.00031415
data 4  is 203.1415
data 5  is 2.71  
data 0  is 3.1415
data 1  is 3141500  
data 2  is -296.8585
data 3  is 3.1415E-06
data 4  is 303.1415
data 5  is 2.71
Test49 done.

We see that the FPGA rounding is not quite right in every case. Badly wrong in a couple of cases. This is being fixed.

Conclusion

Floating point implementation on FPGA has, traditionally, shown less speed-up and power advantage than integer or bit-level computation. But FPGA vendors are increasingly adding floating point support to the point where FPGA has become (or is becoming) competitive with ASIC.


Updated April 2016               UP.