L/U Decomposition for Simultaneous Equations

Guassian elimination solves simultaneous equations. When there are multiple right-hand sides to solve, the lower/upper decomposition approach is preferred.

Here is the single-threaded implementation.

Introduction

Source Code - Single Threaded, Non-blocking Version

This is the single-threaded version. Parallel speedup is gained here through automatic unwind of the inner loops.

// Kiwi Scientific Acceleration Example - Lower/Upper Decomposition and Equation Solving.
// (C) 2014 DJ Greaves, University of Cambridge, Computer Laboratory.
//
//  Simultaneous Equations Solving using L/U decomposition. 
//  This is the single-threaded version.  
//  Parallel speedup is gained here through automatic unwind of the inner loops.
//  An explictly parallel implementation is in another demo.

using System;
using System.Text;
using KiwiSystem;
using System.Diagnostics;

//http://halcyon.usc.edu/~pk/prasannawebsite/papers/ERSAvikashd_gg.pdf
//Efficient Floating-point based Block LU Decomposition on FPGAs Vikash Daga


public class SimuSolve
{
    double [,] tmp;
    double [] yy;
    double [] xx;
    int problemSize;
    public SimuSolve(int ps) // constructor
    {
      problemSize = ps;
      tmp = new double [problemSize,problemSize];
      yy = new double[problemSize];
      xx = new double[problemSize];
    }


    
    public void LUdecompose(double [,] LL, double [,] Adata, bool pivot_enable)
    {
        //int problemSize = yy.Length;

        for (int k=0; k<problemSize; k++) { LL[k,k] = 1.0; }
        for (int k=0; k<problemSize; k++)
            {
                for (int z=0; z<problemSize; z++) { LL[z,z] = 1.0; }
/*              if (pivot_enable)
                    {
                        double p = 0.0;
                        int k1 = 0;
                        for(int i=k; i<problemSize; i++)
                            {
                                if (Math.Abs(Adata[i,k]) > p) // Pivoting : find largest 
                                    { p = Math.Abs(Adata[i,k]);
                                      k1 = k;
                                    }
                            }           
                        //Console.WriteLine("Pivot %d %f", k1, p);
                        double [] t = Adata[k]; Adata[k] = Adata[k1]; Adata[k1] = t;
                    }
*/
                for (int i=k+1; i<problemSize; i++)
                    {
                        Debug.Assert(Adata[k,k] != 0.0); // Singular matrix!
                        double mu = Adata[i,k]/Adata[k,k];
                        Console.WriteLine("mu at {0} {1} is {2}", i, k, mu);
                        Adata[i,k] = 0.0;
                        LL [i,k] = mu; // This simple store is all you need to form L.
                        for (int j=k+1; j<problemSize; j++)
                            {
                                Adata[i,j] = Adata[i,j] - mu * Adata[k,j];
                            }
                    }

            }
    }


    // Decompose to Lower and Upper. Upper is done in-place in G.
    public void DecomposeVerbose(double [,] LL, double [,] G)
    {
        Console.WriteLine("L/U Decomposition - single-threaded version.\n");
        Console.WriteLine("Initial Coefficient Matrix Pre L/U Decomposition:\n"); MatrixLib.printa(G);
        LUdecompose(LL, G, true);
        Console.WriteLine("UU="); MatrixLib.printa(G);
        Console.WriteLine("LL="); MatrixLib.printa(LL);
        Console.WriteLine("Recombine LL and RR: Should result in the original:"); MatrixLib.mpx(tmp, LL, G); MatrixLib.printa(tmp);
    }

    public double [] FwdsSubst(double [,] LL, double [] b)    // Forwards substitution to solve Ly=b
    {
        yy[0] = b[0]; 
        for (int i=1; i < problemSize; i++)
            {
                double sum = 0.0;
                for (int j=0; j < i; j++) sum += LL[i,j] *  yy[j];
                yy[i] = b[i] - sum;
            }
        return yy;
    }

    public double [] BackSubst(double [,] UU, double [] yy)    // Back substitution to solve Ux=y
    {
        for (int i=problemSize-1; i >= 0; i--)
            {
                double sum = 0.0;
                for (int j=i+1; j < problemSize; j++) sum += UU[i,j] * xx[j];
                xx[i] = (yy[i] - sum)/UU[i,i];
            }
        return xx;
    }

    public double [] SolveVerbose(double [,] LL, double [,] UU, double [] rhs)
    {
        double [] yy = FwdsSubst(LL, rhs); 
        Console.Write("After fwds subst="); MatrixLib.printa(yy);
        double [] xx = BackSubst(UU, yy);
        Console.Write("After back subst="); MatrixLib.printa(xx);
        return xx; // Return the final solution.
    }
}


class MatrixLib // It would be more OO to apply these methods to matrix instances.
{
    //------------------------------------------------------------------------
    // Matrix support functions now follow (there is similar code in java.utils).

    public static void printa (double [,] A)
    {
        for (int i=0; i<A.GetLength(0); i++)
            {
                Console.Write("    ");
                for (int j=0; j<A.GetLength(1); j++)
                    {
                        if (A[i,j]==0.0) Console.Write("-.------ ", A[i,j]);
                        else Console.Write(" {0:f4} ", A[i,j]);
                    }
                Console.WriteLine("");
            }
        Console.WriteLine("");
    }

    public static void printa (double [] AA)
    {
        Console.Write("{");
        for (int j=0; j<AA.Length; j++)
            {
            //          if (AA[j]==0.0) Console.Write("-.------ ", AA[j]);         else
                Console.Write("{0:f4} ", AA[j]);
            }
        Console.WriteLine("}");
    }


    public static double [] mpx(double [] Ans, double[,] AA, double[] BB) // Matrix multiply oblong array by column array.
    {
        for (int i=0; i<BB.Length; i++) 
            {
                double ss = 0.0;
                for (int j=0; j<BB.Length; j++) ss += AA[i,j] * BB[j];
                Ans[i] = ss;
            }
        return Ans;
    }

    public static double [,] mpx(double [,] Ans, double[,] AA, double[,] BB) // Matrix multiply oblong arrays.
    {
        //Console.WriteLine(" mpx %d,%d with %d,%d\n", AA.Length, AA[0].Length, BB.Length, BB[0].Length);       
        //Debug.Assert(AA.GetLength(1) == BB.GetLength(0));
        for (int i=0; i<AA.GetLength(0); i++)
            for (int k=0; k < BB.GetLength(1); k++)
                {
                    double sum = 0.0;
                    for (int j=0; j<AA.GetLength(1); j++) 
                        {
                            //Console.WriteLine(" mpx %d,%d with %d,%d %d %d %d\n", AA.Length, AA[0].Length, BB.Length, BB[0].Length, i, k, j); 
                            sum += AA[i,j] * BB[j,k];
                        }
                    Ans[i,k] = sum;
                }
        return Ans;
    }

    public static double[] [] transpose(double[] [] Ans, double[] [] AA)
    {
        for (int i=0; i<AA.Length; i++) for (int j=0; j<AA[0].Length; j++) Ans[j] [i] = AA[i] [j];
        return Ans;
    }

    public static double [,] copy2d(double[,] copy, double [,] original) // A deep clone - can we not just do a structure assign?.
     {
        for(int i = 0; i < original.GetLength(0); i++)
            {
                for (int j=0; j<original.GetLength(1); j++) copy[i,j] = original[i,j];
            }
        return copy;
    }
}


public class luTest
{
  const int problemSize = 8;
  static double [] target_rhs = new double [problemSize];
  static double [] res = new double [problemSize];
  static double [,] coefficients = new double [problemSize, problemSize];
  static double [,] LLtop = new double [problemSize, problemSize];
  static double [,] UUtop = new double [problemSize, problemSize];


  static void generate_example_rhs(int no, bool printme)
  {
     for (int i=0; i<problemSize; i++) target_rhs[i] = 1.0 + 2.0 * (double)i + 10.0 * (double)no;
     target_rhs[problemSize-1] = 2.71;
     if (printme) for (int i=0; i<problemSize; i++)
     {   
       Console.WriteLine("  test={2}  target_rhs [{0}] == {1}", i, target_rhs[i], no);
     }
  }

  static void generate_example_coefficients()
  {
        double pp = 10.0;
        for (int rr=0; rr<problemSize; rr++)
	    for (int cc=0; cc<problemSize; cc++)
		 { coefficients[rr,cc] = pp; pp *= 1.1; } 
        for (int i=0; i<problemSize; i++)  // += requires Array.Address backdoor to work
        { double p = coefficients[i,i];
          coefficients[i,i] = p + 10.0;
        }

        Console.WriteLine("Kiwi L/U demo - L/U decomposition target_rhs generated.");
  }

  [Kiwi.HardwareEntryPoint()]
  public static void Main()
    {
       Console.WriteLine("Kiwi Demo - L/U decomposition");
       SimuSolve ssolve = new SimuSolve(problemSize);
       Kiwi.Pause();

       generate_example_coefficients();
       MatrixLib.copy2d(UUtop, coefficients);   
       ssolve.DecomposeVerbose(LLtop, UUtop);
       Console.WriteLine("Kiwi L/U demo - coefficient matrix decomposed.");

       for (int test=0; test<3; test++)
        {
           Console.WriteLine("\nKiwi L/U demo - L/U decomposition test with rhs no {0}.", test);
           generate_example_rhs(test, true);


           double [] sol = ssolve.SolveVerbose(LLtop, UUtop, target_rhs);

           // Now see if it works as a solution
           Console.Write("Substitute back - rhs given by solution is: y="); 
           MatrixLib.printa(MatrixLib.mpx(res, coefficients, sol));     

        }
      Console.WriteLine("Kiwi L/U demo - L/U decomposition demo complete.");
   }
}
// eof

KiwiC Compile

Here we restricted Kiwi to using only one multiplier, divider and add/sub unit per thread. And there was only one thread. This will give the slowest execution on FPGA as a baseline.

gmcs lu-decomp.cs -r:/home/djg11/d320/hprls/kiwipro/kiwic/distro/support/Kiwi.dll 
/home/djg11/d320/hprls/kiwipro/kiwic/distro/bin/kiwic -give-backtrace -vnl-rootmodname=DUT -vnl=lu-decomp.v lu-decomp.exe 
 -vnl-resets=synchronous -kiwic-cil-dump=combined -kiwic-kcode-dump=enable -res2-loadstore-port-count=0 
 -vnl-roundtrip=disable -max_no_int_divs=1 -max_no_fp_divs=1 -max_no_int_muls=1 -max_no_fp_muls=1 -int_fl_limit_mul=20

Generated Verilog RTL

// CBG Orangepath HPR L/S System

// Verilog output file generated at 02/06/2016 00:02:49
// Kiwi Scientific Acceleration (KiwiC .net/CIL/C# to Verilog/SystemC compiler): Version alpha 2.15b : 1st-June-2016 Unix 3.19.8.100
//  /home/djg11/d320/hprls/kiwipro/kiwic/distro/lib/kiwic.exe -give-backtrace -vnl-rootmodname=DUT -vnl=lu-decomp.v lu-decomp.exe -sim 10000 -vnl-resets=synchronous -kiwic-cil-dump=combined -kiwic-kcode-dump=enable -res2-loadstore-port-count=0 -vnl-roundtrip=disable -max_no_int_divs=1 -max_no_fp_divs=1 -max_no_int_muls=1 -max_no_fp_muls=1 -int_fl_limit_mul=20
`timescale 1ns/1ns


module DUT(input clk, input reset);
  integer lTMT4Main_V_1;
  reg [63:0] Tlge0_8_V_0;
  integer Tlge0_8_V_1;
  integer Tlge0_8_V_2;
  integer Tlge0_8_V_3;

 ... snip ...
  CV_SP_SSRAM_FL1 #(7'd64, 3'd6, 7'd64, 7'd64) A_FPD_CC_SCALbx32_ARA0(clk, reset, A_FPD_CC_SCALbx32_ARA0_RDD0, A_FPD_CC_SCALbx32_ARA0_AD0
, A_FPD_CC_SCALbx32_ARA0_WEN0, A_FPD_CC_SCALbx32_ARA0_REN0, A_FPD_CC_SCALbx32_ARA0_WRD0);

  CV_FP_FL4_DP_ADDER CVFPADDER10(clk, reset, CVFPADDER10_FPRR, CVFPADDER10_A0, CVFPADDER10_A1, CVFPADDER10_fail);
  CV_SP_SSRAM_FL1 #(7'd64, 3'd6, 7'd64, 7'd64) A_FPD_CC_SCALbx36_ARC0(clk, reset, A_FPD_CC_SCALbx36_ARC0_RDD0, A_FPD_CC_SCALbx36_ARC0_AD0
, A_FPD_CC_SCALbx36_ARC0_WEN0, A_FPD_CC_SCALbx36_ARC0_REN0, A_FPD_CC_SCALbx36_ARC0_WRD0);

  CV_FP_CVT_FL2_F64_I32 ifpcvt10(clk, fpcvt10_result, fpcvt10_arg, fpcvt10_fail);
  CV_FP_FL3_DP_MULTIPLIER CVFPMULTIPLIER10(clk, reset, CVFPMULTIPLIER10_FPRR, CVFPMULTIPLIER10_A0, CVFPMULTIPLIER10_A1, CVFPMULTIPLIER10_fail
);
  CV_FP_CVT_FL2_F64_I32 ifpcvt12(clk, fpcvt12_result, fpcvt12_arg, fpcvt12_fail);
  CV_SP_SSRAM_FL1 #(7'd64, 2'd3, 4'd8, 7'd64) A_FPD_CC_SCALbx40_ARE0(clk, reset, A_FPD_CC_SCALbx40_ARE0_RDD0, A_FPD_CC_SCALbx40_ARE0_AD0
, A_FPD_CC_SCALbx40_ARE0_WEN0, A_FPD_CC_SCALbx40_ARE0_REN0, A_FPD_CC_SCALbx40_ARE0_WRD0);

  CV_SP_SSRAM_FL1 #(7'd64, 2'd3, 4'd8, 7'd64) A_FPD_CC_SCALbx42_ARF0(clk, reset, A_FPD_CC_SCALbx42_ARF0_RDD0, A_FPD_CC_SCALbx42_ARF0_AD0
, A_FPD_CC_SCALbx42_ARF0_WEN0, A_FPD_CC_SCALbx42_ARF0_REN0, A_FPD_CC_SCALbx42_ARF0_WRD0);

  CV_SP_SSRAM_FL1 #(7'd64, 2'd3, 4'd8, 7'd64) A_FPD_CC_SCALbx44_ARG0(clk, reset, A_FPD_CC_SCALbx44_ARG0_RDD0, A_FPD_CC_SCALbx44_ARG0_AD0
, A_FPD_CC_SCALbx44_ARG0_WEN0, A_FPD_CC_SCALbx44_ARG0_REN0, A_FPD_CC_SCALbx44_ARG0_WRD0);

  CV_SP_SSRAM_FL1 #(7'd64, 2'd3, 4'd8, 7'd64) A_FPD_CC_SCALbx46_ARH0(clk, reset, A_FPD_CC_SCALbx46_ARH0_RDD0, A_FPD_CC_SCALbx46_ARH0_AD0
, A_FPD_CC_SCALbx46_ARH0_WEN0, A_FPD_CC_SCALbx46_ARH0_REN0, A_FPD_CC_SCALbx46_ARH0_WRD0);

  CV_SP_SSRAM_FL1 #(7'd64, 3'd6, 7'd64, 7'd64) A_FPD_CC_SCALbx34_ARB0(clk, reset, A_FPD_CC_SCALbx34_ARB0_RDD0, A_FPD_CC_SCALbx34_ARB0_AD0
, A_FPD_CC_SCALbx34_ARB0_WEN0, A_FPD_CC_SCALbx34_ARB0_REN0, A_FPD_CC_SCALbx34_ARB0_WRD0);

  CV_FP_FL5_DP_DIVIDER CVFPDIVIDER10(clk, reset, CVFPDIVIDER10_FPRR, CVFPDIVIDER10_NN, CVFPDIVIDER10_DD, CVFPDIVIDER10_fail);
  CV_SP_SSRAM_FL1 #(7'd64, 3'd6, 7'd64, 7'd64) A_FPD_CC_SCALbx38_ARD0(clk, reset, A_FPD_CC_SCALbx38_ARD0_RDD0, A_FPD_CC_SCALbx38_ARD0_AD0
, A_FPD_CC_SCALbx38_ARD0_WEN0, A_FPD_CC_SCALbx38_ARD0_REN0, A_FPD_CC_SCALbx38_ARD0_WRD0);

endmodule

//Report from verilog_render:
//1 vectors of width 8
//48 vectors of width 1
//36 vectors of width 64
//4 vectors of width 6
//4 vectors of width 3
//2 vectors of width 32
//1056 bits in scalar variables
//Total state bits in module = 3516 bits.
//837 continuously assigned (wire/non-state) bits 
//Total number of leaf cells = 0
//
// eof (HPR L/S Verilog)

The full RTL output is here Verilog RTL (2400 lines).

Simulation Output

iverilog lu-decomp.v vsys.v /home/djg11/d320/hprls/kiwipro/kiwic/distro/lib/cv_fparith.v    /home/djg11/d320/hprls/kiwipro/kiwic/distro/lib/cvgates.v
./a.out
VCD info: dumpfile vcd.vcd opened for output.
Kiwi Demo - L/U decomposition
Kiwi L/U demo - L/U decomposition target_rhs generated.
L/U Decomposition - single-threaded version.

Initial Coefficient Matrix Pre L/U Decomposition:

     20.000000  11.000000  12.100000  13.310000  14.641000  16.105100  17.715610  19.487171 
     21.435888  33.579477  25.937425  28.531167  31.384284  34.522712  37.974983  41.772482 
     45.949730  50.544703  65.599173  61.159090  67.274999  74.002499  81.402749  89.543024 
     98.497327  108.347059  119.181765  141.099942  144.209936  158.630930  174.494023  191.943425 
     211.137767  232.251544  255.476699  281.024368  319.126805  340.039486  374.043434  411.447778 
     452.592556  497.851811  547.636992  602.400692  662.640761  738.904837  801.795321  881.974853 
     970.172338  1067.189572  1173.908529  1291.299382  1420.429320  1562.472252  1728.719477  1890.591425 
     2079.650567  2287.615624  2516.377186  2768.014905  3044.816395  3349.298035  3684.227838  4062.650622 

UU=
     20.000000  11.000000  12.100000  13.310000  14.641000  16.105100  17.715610  19.487171 
    -.------  21.789738  12.968712  14.265584  15.692142  17.261356  18.987492  20.886241 
    -.------ -.------  22.758109  14.033920  15.437312  16.981044  18.679148  20.547063 
    -.------ -.------ -.------  23.218565  14.540421  15.994463  17.593910  19.353301 
    -.------ -.------ -.------ -.------  23.424036  14.766439  16.243083  17.867392 
    -.------ -.------ -.------ -.------ -.------  23.513117  14.864429  16.350872 
    -.------ -.------ -.------ -.------ -.------ -.------  23.551255  14.906380 
    -.------ -.------ -.------ -.------ -.------ -.------ -.------  23.567494 

LL=
     1.000000 
     1.071794  1.000000 
     2.297486  1.159828  1.000000 
     4.924866  2.486195  1.201688  1.000000 
     10.556888  5.329379  2.575924  1.220367  1.000000 
     22.629628  11.423997  5.521723  2.615965  1.228465  1.000000 
     48.508617  24.488352  11.836303  5.607553  2.633324  1.231932  1.000000 
     103.982528  52.492957  25.372166  12.020288  5.644764  2.640756  1.233409  1.000000 

Recombine LL and RR: Should result in the original:
    
    
    
    
    
    
    
     2079.650567  2287.615624  2516.377186  2768.014905  3044.816395  3349.298035  3684.227838  4062.650622 

Kiwi L/U demo - coefficient matrix decomposed.

Kiwi L/U demo - L/U decomposition test with rhs no 0.
  test=0  target_rhs [0] == 1.000000
  test=0  target_rhs [1] == 3.000000
  test=0  target_rhs [2] == 5.000000
  test=0  target_rhs [3] == 7.000000
  test=0  target_rhs [4] == 9.000000
  test=0  target_rhs [5] == 11.000000
  test=0  target_rhs [6] == 13.000000
  test=0  target_rhs [7] == 2.710000
After fwds subst={1.000000 1.928206 0.466126 -3.278899 -9.032273 -16.557946 -25.674637 -48.525204 }
After back subst={0.088796 0.275984 0.448519 0.589646 0.663447 0.592927 0.213043 -2.058989 }
Substitute back - rhs given by solution is: y={1.000000 3.000000 5.000000 7.000000 9.000000 11.000000 13.000000 2.710000 }

Kiwi L/U demo - L/U decomposition test with rhs no 1.
  test=1  target_rhs [0] == 11.000000
  test=1  target_rhs [1] == 13.000000
  test=1  target_rhs [2] == 15.000000
  test=1  target_rhs [3] == 17.000000
  test=1  target_rhs [4] == 19.000000
  test=1  target_rhs [5] == 21.000000
  test=1  target_rhs [6] == 23.000000
  test=1  target_rhs [7] == 2.710000
After fwds subst={11.000000 1.210262 -11.676047 -26.151513 -41.584660 -57.783292 -74.693883 -114.577444 }
After back subst={1.075320 1.247095 1.386594 1.456904 1.378902 0.982981 -0.094430 -4.861673 }
Substitute back - rhs given by solution is: y={11.000000 13.000000 15.000000 17.000000 19.000000 21.000000 23.000000 2.710000 }

Kiwi L/U demo - L/U decomposition test with rhs no 2.
  test=2  target_rhs [0] == 21.000000
  test=2  target_rhs [1] == 23.000000
  test=2  target_rhs [2] == 25.000000
  test=2  target_rhs [3] == 27.000000
  test=2  target_rhs [4] == 29.000000
  test=2  target_rhs [5] == 31.000000
  test=2  target_rhs [6] == 33.000000
  test=2  target_rhs [7] == 2.710000
After fwds subst={21.000000 0.492317 -23.818220 -49.024128 -74.137047 -99.008638 -123.713129 -180.629685 }
After back subst={2.061843 2.218207 2.324669 2.324162 2.094358 1.373035 -0.401903 -7.664357 }
Substitute back - rhs given by solution is: y={21.000000 23.000000 25.000000 27.000000 29.000000 31.000000 33.000000 2.710000 }
Kiwi L/U demo - L/U decomposition demo complete.
cp vcd.vcd ~/Dropbox

Process make finished

Run Under Mono

For comparison, the same .exe file is run under mono:

MONO_PATH=/home/djg11/d320/hprls/kiwipro/kiwic/distro/support mono lu-decomp.exe
Kiwi Demo - L/U decomposition
Kiwi L/U demo - L/U decomposition target_rhs generated.
L/U Decomposition - single-threaded version.

Initial Coefficient Matrix Pre L/U Decomposition:

     20.00  11.00  12.10  13.31  14.64  16.11  17.72  19.49 
     21.44  33.58  25.94  28.53  31.38  34.52  37.97  41.77 
     45.95  50.54  65.60  61.16  67.27  74.00  81.40  89.54 
     98.50  108.35  119.18  141.10  144.21  158.63  174.49  191.94 
     211.14  232.25  255.48  281.02  319.13  340.04  374.04  411.45 
     452.59  497.85  547.64  602.40  662.64  738.90  801.80  881.97 
     970.17  1067.19  1173.91  1291.30  1420.43  1562.47  1728.72  1890.59 
     2079.65  2287.62  2516.38  2768.01  3044.82  3349.30  3684.23  4062.65 

UU=
     20.00  11.00  12.10  13.31  14.64  16.11  17.72  19.49 
    -.------  21.79  12.97  14.27  15.69  17.26  18.99  20.89 
    -.------ -.------  22.76  14.03  15.44  16.98  18.68  20.55 
    -.------ -.------ -.------  23.22  14.54  15.99  17.59  19.35 
    -.------ -.------ -.------ -.------  23.42  14.77  16.24  17.87 
    -.------ -.------ -.------ -.------ -.------  23.51  14.86  16.35 
    -.------ -.------ -.------ -.------ -.------ -.------  23.55  14.91 
    -.------ -.------ -.------ -.------ -.------ -.------ -.------  23.57 

LL=
     1.00 -.------ -.------ -.------ -.------ -.------ -.------ -.------ 
     1.07  1.00 -.------ -.------ -.------ -.------ -.------ -.------ 
     2.30  1.16  1.00 -.------ -.------ -.------ -.------ -.------ 
     4.92  2.49  1.20  1.00 -.------ -.------ -.------ -.------ 
     10.56  5.33  2.58  1.22  1.00 -.------ -.------ -.------ 
     22.63  11.42  5.52  2.62  1.23  1.00 -.------ -.------ 
     48.51  24.49  11.84  5.61  2.63  1.23  1.00 -.------ 
     103.98  52.49  25.37  12.02  5.64  2.64  1.23  1.00 

Recombine LL and RR: Should result in the original:
     20.00  11.00  12.10  13.31  14.64  16.11  17.72  19.49 
     21.44  33.58  25.94  28.53  31.38  34.52  37.97  41.77 
     45.95  50.54  65.60  61.16  67.27  74.00  81.40  89.54 
     98.50  108.35  119.18  141.10  144.21  158.63  174.49  191.94 
     211.14  232.25  255.48  281.02  319.13  340.04  374.04  411.45 
     452.59  497.85  547.64  602.40  662.64  738.90  801.80  881.97 
     970.17  1067.19  1173.91  1291.30  1420.43  1562.47  1728.72  1890.59 
     2079.65  2287.62  2516.38  2768.01  3044.82  3349.30  3684.23  4062.65 

Kiwi L/U demo - coefficient matrix decomposed.

Kiwi L/U demo - L/U decomposition test with rhs no 0.
  test=0  target_rhs [0] == 1
  test=0  target_rhs [1] == 3
  test=0  target_rhs [2] == 5
  test=0  target_rhs [3] == 7
  test=0  target_rhs [4] == 9
  test=0  target_rhs [5] == 11
  test=0  target_rhs [6] == 13
  test=0  target_rhs [7] == 2.71
After fwds subst={1.00 1.93 0.47 -3.28 -9.03 -16.56 -25.67 -48.53 }
After back subst={0.09 0.28 0.45 0.59 0.66 0.59 0.21 -2.06 }
Substitute back - rhs given by solution is: y={1.00 3.00 5.00 7.00 9.00 11.00 13.00 2.71 }

Kiwi L/U demo - L/U decomposition test with rhs no 1.
  test=1  target_rhs [0] == 11
  test=1  target_rhs [1] == 13
  test=1  target_rhs [2] == 15
  test=1  target_rhs [3] == 17
  test=1  target_rhs [4] == 19
  test=1  target_rhs [5] == 21
  test=1  target_rhs [6] == 23
  test=1  target_rhs [7] == 2.71
After fwds subst={11.00 1.21 -11.68 -26.15 -41.58 -57.78 -74.69 -114.58 }
After back subst={1.08 1.25 1.39 1.46 1.38 0.98 -0.09 -4.86 }
Substitute back - rhs given by solution is: y={11.00 13.00 15.00 17.00 19.00 21.00 23.00 2.71 }

Kiwi L/U demo - L/U decomposition test with rhs no 2.
  test=2  target_rhs [0] == 21
  test=2  target_rhs [1] == 23
  test=2  target_rhs [2] == 25
  test=2  target_rhs [3] == 27
  test=2  target_rhs [4] == 29
  test=2  target_rhs [5] == 31
  test=2  target_rhs [6] == 33
  test=2  target_rhs [7] == 2.71
After fwds subst={21.00 0.49 -23.82 -49.02 -74.14 -99.01 -123.71 -180.63 }
After back subst={2.06 2.22 2.32 2.32 2.09 1.37 -0.40 -7.66 }
Substitute back - rhs given by solution is: y={21.00 23.00 25.00 27.00 29.00 31.00 33.00 2.71 }
Kiwi L/U demo - L/U decomposition demo complete.

Allowing greater silicon use

To increase parallelism, we increase the multiplier and adder budget available and we run each right-hand-side on its own thread. The new guidelines to KiwiC were:

    -max_no_int_divs=10 
    -max_no_fp_divs=10 
    -max_no_int_muls=10
    -max_no_fp_muls=10 
    -int_fl_limit_mul=20

and KiwiC ended up using the following cells:

//   cell CV_SP_SSRAM_FL1 count=8
//   cell CV_FP_FL4_DP_ADDER count=10
//   cell CV_FP_CVT_FL2_F64_I32 count=2
//   cell CV_FP_FL3_DP_MULTIPLIER count=8
//   cell CV_FP_FL5_DP_DIVIDER count=2
// Total number of leaf cells = 30

Performance changed by ...

Conclusions

For this to useful on FPGA we need it to be exploit the parallelism of the FPGA.