Guassian elimination solves simultaneous equations. When there are multiple right-hand sides to solve, the lower/upper decomposition approach is preferred.
Here is the single-threaded implementation.
This is the single-threaded version. Parallel speedup is gained here through automatic unwind of the inner loops.
// Kiwi Scientific Acceleration Example - Lower/Upper Decomposition and Equation Solving. // (C) 2014 DJ Greaves, University of Cambridge, Computer Laboratory. // // Simultaneous Equations Solving using L/U decomposition. // This is the single-threaded version. // Parallel speedup is gained here through automatic unwind of the inner loops. // An explictly parallel implementation is in another demo. using System; using System.Text; using KiwiSystem; using System.Diagnostics; //http://halcyon.usc.edu/~pk/prasannawebsite/papers/ERSAvikashd_gg.pdf //Efficient Floating-point based Block LU Decomposition on FPGAs Vikash Daga public class SimuSolve { double [,] tmp; double [] yy; double [] xx; int problemSize; public SimuSolve(int ps) // constructor { problemSize = ps; tmp = new double [problemSize,problemSize]; yy = new double[problemSize]; xx = new double[problemSize]; } public void LUdecompose(double [,] LL, double [,] Adata, bool pivot_enable) { //int problemSize = yy.Length; for (int k=0; k<problemSize; k++) { LL[k,k] = 1.0; } for (int k=0; k<problemSize; k++) { for (int z=0; z<problemSize; z++) { LL[z,z] = 1.0; } /* if (pivot_enable) { double p = 0.0; int k1 = 0; for(int i=k; i<problemSize; i++) { if (Math.Abs(Adata[i,k]) > p) // Pivoting : find largest { p = Math.Abs(Adata[i,k]); k1 = k; } } //Console.WriteLine("Pivot %d %f", k1, p); double [] t = Adata[k]; Adata[k] = Adata[k1]; Adata[k1] = t; } */ for (int i=k+1; i<problemSize; i++) { Debug.Assert(Adata[k,k] != 0.0); // Singular matrix! double mu = Adata[i,k]/Adata[k,k]; Console.WriteLine("mu at {0} {1} is {2}", i, k, mu); Adata[i,k] = 0.0; LL [i,k] = mu; // This simple store is all you need to form L. for (int j=k+1; j<problemSize; j++) { Adata[i,j] = Adata[i,j] - mu * Adata[k,j]; } } } } // Decompose to Lower and Upper. Upper is done in-place in G. public void DecomposeVerbose(double [,] LL, double [,] G) { Console.WriteLine("L/U Decomposition - single-threaded version.\n"); Console.WriteLine("Initial Coefficient Matrix Pre L/U Decomposition:\n"); MatrixLib.printa(G); LUdecompose(LL, G, true); Console.WriteLine("UU="); MatrixLib.printa(G); Console.WriteLine("LL="); MatrixLib.printa(LL); Console.WriteLine("Recombine LL and RR: Should result in the original:"); MatrixLib.mpx(tmp, LL, G); MatrixLib.printa(tmp); } public double [] FwdsSubst(double [,] LL, double [] b) // Forwards substitution to solve Ly=b { yy[0] = b[0]; for (int i=1; i < problemSize; i++) { double sum = 0.0; for (int j=0; j < i; j++) sum += LL[i,j] * yy[j]; yy[i] = b[i] - sum; } return yy; } public double [] BackSubst(double [,] UU, double [] yy) // Back substitution to solve Ux=y { for (int i=problemSize-1; i >= 0; i--) { double sum = 0.0; for (int j=i+1; j < problemSize; j++) sum += UU[i,j] * xx[j]; xx[i] = (yy[i] - sum)/UU[i,i]; } return xx; } public double [] SolveVerbose(double [,] LL, double [,] UU, double [] rhs) { double [] yy = FwdsSubst(LL, rhs); Console.Write("After fwds subst="); MatrixLib.printa(yy); double [] xx = BackSubst(UU, yy); Console.Write("After back subst="); MatrixLib.printa(xx); return xx; // Return the final solution. } } class MatrixLib // It would be more OO to apply these methods to matrix instances. { //------------------------------------------------------------------------ // Matrix support functions now follow (there is similar code in java.utils). public static void printa (double [,] A) { for (int i=0; i<A.GetLength(0); i++) { Console.Write(" "); for (int j=0; j<A.GetLength(1); j++) { if (A[i,j]==0.0) Console.Write("-.------ ", A[i,j]); else Console.Write(" {0:f4} ", A[i,j]); } Console.WriteLine(""); } Console.WriteLine(""); } public static void printa (double [] AA) { Console.Write("{"); for (int j=0; j<AA.Length; j++) { // if (AA[j]==0.0) Console.Write("-.------ ", AA[j]); else Console.Write("{0:f4} ", AA[j]); } Console.WriteLine("}"); } public static double [] mpx(double [] Ans, double[,] AA, double[] BB) // Matrix multiply oblong array by column array. { for (int i=0; i<BB.Length; i++) { double ss = 0.0; for (int j=0; j<BB.Length; j++) ss += AA[i,j] * BB[j]; Ans[i] = ss; } return Ans; } public static double [,] mpx(double [,] Ans, double[,] AA, double[,] BB) // Matrix multiply oblong arrays. { //Console.WriteLine(" mpx %d,%d with %d,%d\n", AA.Length, AA[0].Length, BB.Length, BB[0].Length); //Debug.Assert(AA.GetLength(1) == BB.GetLength(0)); for (int i=0; i<AA.GetLength(0); i++) for (int k=0; k < BB.GetLength(1); k++) { double sum = 0.0; for (int j=0; j<AA.GetLength(1); j++) { //Console.WriteLine(" mpx %d,%d with %d,%d %d %d %d\n", AA.Length, AA[0].Length, BB.Length, BB[0].Length, i, k, j); sum += AA[i,j] * BB[j,k]; } Ans[i,k] = sum; } return Ans; } public static double[] [] transpose(double[] [] Ans, double[] [] AA) { for (int i=0; i<AA.Length; i++) for (int j=0; j<AA[0].Length; j++) Ans[j] [i] = AA[i] [j]; return Ans; } public static double [,] copy2d(double[,] copy, double [,] original) // A deep clone - can we not just do a structure assign?. { for(int i = 0; i < original.GetLength(0); i++) { for (int j=0; j<original.GetLength(1); j++) copy[i,j] = original[i,j]; } return copy; } } public class luTest { const int problemSize = 8; static double [] target_rhs = new double [problemSize]; static double [] res = new double [problemSize]; static double [,] coefficients = new double [problemSize, problemSize]; static double [,] LLtop = new double [problemSize, problemSize]; static double [,] UUtop = new double [problemSize, problemSize]; static void generate_example_rhs(int no, bool printme) { for (int i=0; i<problemSize; i++) target_rhs[i] = 1.0 + 2.0 * (double)i + 10.0 * (double)no; target_rhs[problemSize-1] = 2.71; if (printme) for (int i=0; i<problemSize; i++) { Console.WriteLine(" test={2} target_rhs [{0}] == {1}", i, target_rhs[i], no); } } static void generate_example_coefficients() { double pp = 10.0; for (int rr=0; rr<problemSize; rr++) for (int cc=0; cc<problemSize; cc++) { coefficients[rr,cc] = pp; pp *= 1.1; } for (int i=0; i<problemSize; i++) // += requires Array.Address backdoor to work { double p = coefficients[i,i]; coefficients[i,i] = p + 10.0; } Console.WriteLine("Kiwi L/U demo - L/U decomposition target_rhs generated."); } [Kiwi.HardwareEntryPoint()] public static void Main() { Console.WriteLine("Kiwi Demo - L/U decomposition"); SimuSolve ssolve = new SimuSolve(problemSize); Kiwi.Pause(); generate_example_coefficients(); MatrixLib.copy2d(UUtop, coefficients); ssolve.DecomposeVerbose(LLtop, UUtop); Console.WriteLine("Kiwi L/U demo - coefficient matrix decomposed."); for (int test=0; test<3; test++) { Console.WriteLine("\nKiwi L/U demo - L/U decomposition test with rhs no {0}.", test); generate_example_rhs(test, true); double [] sol = ssolve.SolveVerbose(LLtop, UUtop, target_rhs); // Now see if it works as a solution Console.Write("Substitute back - rhs given by solution is: y="); MatrixLib.printa(MatrixLib.mpx(res, coefficients, sol)); } Console.WriteLine("Kiwi L/U demo - L/U decomposition demo complete."); } } // eof
Here we restricted Kiwi to using only one multiplier, divider and add/sub unit per thread. And there was only one thread. This will give the slowest execution on FPGA as a baseline.
gmcs lu-decomp.cs -r:/home/djg11/d320/hprls/kiwipro/kiwic/distro/support/Kiwi.dll /home/djg11/d320/hprls/kiwipro/kiwic/distro/bin/kiwic -give-backtrace -vnl-rootmodname=DUT -vnl=lu-decomp.v lu-decomp.exe -vnl-resets=synchronous -kiwic-cil-dump=combined -kiwic-kcode-dump=enable -res2-loadstore-port-count=0 -vnl-roundtrip=disable -max_no_int_divs=1 -max_no_fp_divs=1 -max_no_int_muls=1 -max_no_fp_muls=1 -int_fl_limit_mul=20
// CBG Orangepath HPR L/S System // Verilog output file generated at 02/06/2016 00:02:49 // Kiwi Scientific Acceleration (KiwiC .net/CIL/C# to Verilog/SystemC compiler): Version alpha 2.15b : 1st-June-2016 Unix 3.19.8.100 // /home/djg11/d320/hprls/kiwipro/kiwic/distro/lib/kiwic.exe -give-backtrace -vnl-rootmodname=DUT -vnl=lu-decomp.v lu-decomp.exe -sim 10000 -vnl-resets=synchronous -kiwic-cil-dump=combined -kiwic-kcode-dump=enable -res2-loadstore-port-count=0 -vnl-roundtrip=disable -max_no_int_divs=1 -max_no_fp_divs=1 -max_no_int_muls=1 -max_no_fp_muls=1 -int_fl_limit_mul=20 `timescale 1ns/1ns module DUT(input clk, input reset); integer lTMT4Main_V_1; reg [63:0] Tlge0_8_V_0; integer Tlge0_8_V_1; integer Tlge0_8_V_2; integer Tlge0_8_V_3; ... snip ... CV_SP_SSRAM_FL1 #(7'd64, 3'd6, 7'd64, 7'd64) A_FPD_CC_SCALbx32_ARA0(clk, reset, A_FPD_CC_SCALbx32_ARA0_RDD0, A_FPD_CC_SCALbx32_ARA0_AD0 , A_FPD_CC_SCALbx32_ARA0_WEN0, A_FPD_CC_SCALbx32_ARA0_REN0, A_FPD_CC_SCALbx32_ARA0_WRD0); CV_FP_FL4_DP_ADDER CVFPADDER10(clk, reset, CVFPADDER10_FPRR, CVFPADDER10_A0, CVFPADDER10_A1, CVFPADDER10_fail); CV_SP_SSRAM_FL1 #(7'd64, 3'd6, 7'd64, 7'd64) A_FPD_CC_SCALbx36_ARC0(clk, reset, A_FPD_CC_SCALbx36_ARC0_RDD0, A_FPD_CC_SCALbx36_ARC0_AD0 , A_FPD_CC_SCALbx36_ARC0_WEN0, A_FPD_CC_SCALbx36_ARC0_REN0, A_FPD_CC_SCALbx36_ARC0_WRD0); CV_FP_CVT_FL2_F64_I32 ifpcvt10(clk, fpcvt10_result, fpcvt10_arg, fpcvt10_fail); CV_FP_FL3_DP_MULTIPLIER CVFPMULTIPLIER10(clk, reset, CVFPMULTIPLIER10_FPRR, CVFPMULTIPLIER10_A0, CVFPMULTIPLIER10_A1, CVFPMULTIPLIER10_fail ); CV_FP_CVT_FL2_F64_I32 ifpcvt12(clk, fpcvt12_result, fpcvt12_arg, fpcvt12_fail); CV_SP_SSRAM_FL1 #(7'd64, 2'd3, 4'd8, 7'd64) A_FPD_CC_SCALbx40_ARE0(clk, reset, A_FPD_CC_SCALbx40_ARE0_RDD0, A_FPD_CC_SCALbx40_ARE0_AD0 , A_FPD_CC_SCALbx40_ARE0_WEN0, A_FPD_CC_SCALbx40_ARE0_REN0, A_FPD_CC_SCALbx40_ARE0_WRD0); CV_SP_SSRAM_FL1 #(7'd64, 2'd3, 4'd8, 7'd64) A_FPD_CC_SCALbx42_ARF0(clk, reset, A_FPD_CC_SCALbx42_ARF0_RDD0, A_FPD_CC_SCALbx42_ARF0_AD0 , A_FPD_CC_SCALbx42_ARF0_WEN0, A_FPD_CC_SCALbx42_ARF0_REN0, A_FPD_CC_SCALbx42_ARF0_WRD0); CV_SP_SSRAM_FL1 #(7'd64, 2'd3, 4'd8, 7'd64) A_FPD_CC_SCALbx44_ARG0(clk, reset, A_FPD_CC_SCALbx44_ARG0_RDD0, A_FPD_CC_SCALbx44_ARG0_AD0 , A_FPD_CC_SCALbx44_ARG0_WEN0, A_FPD_CC_SCALbx44_ARG0_REN0, A_FPD_CC_SCALbx44_ARG0_WRD0); CV_SP_SSRAM_FL1 #(7'd64, 2'd3, 4'd8, 7'd64) A_FPD_CC_SCALbx46_ARH0(clk, reset, A_FPD_CC_SCALbx46_ARH0_RDD0, A_FPD_CC_SCALbx46_ARH0_AD0 , A_FPD_CC_SCALbx46_ARH0_WEN0, A_FPD_CC_SCALbx46_ARH0_REN0, A_FPD_CC_SCALbx46_ARH0_WRD0); CV_SP_SSRAM_FL1 #(7'd64, 3'd6, 7'd64, 7'd64) A_FPD_CC_SCALbx34_ARB0(clk, reset, A_FPD_CC_SCALbx34_ARB0_RDD0, A_FPD_CC_SCALbx34_ARB0_AD0 , A_FPD_CC_SCALbx34_ARB0_WEN0, A_FPD_CC_SCALbx34_ARB0_REN0, A_FPD_CC_SCALbx34_ARB0_WRD0); CV_FP_FL5_DP_DIVIDER CVFPDIVIDER10(clk, reset, CVFPDIVIDER10_FPRR, CVFPDIVIDER10_NN, CVFPDIVIDER10_DD, CVFPDIVIDER10_fail); CV_SP_SSRAM_FL1 #(7'd64, 3'd6, 7'd64, 7'd64) A_FPD_CC_SCALbx38_ARD0(clk, reset, A_FPD_CC_SCALbx38_ARD0_RDD0, A_FPD_CC_SCALbx38_ARD0_AD0 , A_FPD_CC_SCALbx38_ARD0_WEN0, A_FPD_CC_SCALbx38_ARD0_REN0, A_FPD_CC_SCALbx38_ARD0_WRD0); endmodule //Report from verilog_render: //1 vectors of width 8 //48 vectors of width 1 //36 vectors of width 64 //4 vectors of width 6 //4 vectors of width 3 //2 vectors of width 32 //1056 bits in scalar variables //Total state bits in module = 3516 bits. //837 continuously assigned (wire/non-state) bits //Total number of leaf cells = 0 // // eof (HPR L/S Verilog)
The full RTL output is here Verilog RTL (2400 lines).
iverilog lu-decomp.v vsys.v /home/djg11/d320/hprls/kiwipro/kiwic/distro/lib/cv_fparith.v /home/djg11/d320/hprls/kiwipro/kiwic/distro/lib/cvgates.v ./a.out VCD info: dumpfile vcd.vcd opened for output. Kiwi Demo - L/U decomposition Kiwi L/U demo - L/U decomposition target_rhs generated. L/U Decomposition - single-threaded version. Initial Coefficient Matrix Pre L/U Decomposition: 20.000000 11.000000 12.100000 13.310000 14.641000 16.105100 17.715610 19.487171 21.435888 33.579477 25.937425 28.531167 31.384284 34.522712 37.974983 41.772482 45.949730 50.544703 65.599173 61.159090 67.274999 74.002499 81.402749 89.543024 98.497327 108.347059 119.181765 141.099942 144.209936 158.630930 174.494023 191.943425 211.137767 232.251544 255.476699 281.024368 319.126805 340.039486 374.043434 411.447778 452.592556 497.851811 547.636992 602.400692 662.640761 738.904837 801.795321 881.974853 970.172338 1067.189572 1173.908529 1291.299382 1420.429320 1562.472252 1728.719477 1890.591425 2079.650567 2287.615624 2516.377186 2768.014905 3044.816395 3349.298035 3684.227838 4062.650622 UU= 20.000000 11.000000 12.100000 13.310000 14.641000 16.105100 17.715610 19.487171 -.------ 21.789738 12.968712 14.265584 15.692142 17.261356 18.987492 20.886241 -.------ -.------ 22.758109 14.033920 15.437312 16.981044 18.679148 20.547063 -.------ -.------ -.------ 23.218565 14.540421 15.994463 17.593910 19.353301 -.------ -.------ -.------ -.------ 23.424036 14.766439 16.243083 17.867392 -.------ -.------ -.------ -.------ -.------ 23.513117 14.864429 16.350872 -.------ -.------ -.------ -.------ -.------ -.------ 23.551255 14.906380 -.------ -.------ -.------ -.------ -.------ -.------ -.------ 23.567494 LL= 1.000000 1.071794 1.000000 2.297486 1.159828 1.000000 4.924866 2.486195 1.201688 1.000000 10.556888 5.329379 2.575924 1.220367 1.000000 22.629628 11.423997 5.521723 2.615965 1.228465 1.000000 48.508617 24.488352 11.836303 5.607553 2.633324 1.231932 1.000000 103.982528 52.492957 25.372166 12.020288 5.644764 2.640756 1.233409 1.000000 Recombine LL and RR: Should result in the original: 2079.650567 2287.615624 2516.377186 2768.014905 3044.816395 3349.298035 3684.227838 4062.650622 Kiwi L/U demo - coefficient matrix decomposed. Kiwi L/U demo - L/U decomposition test with rhs no 0. test=0 target_rhs [0] == 1.000000 test=0 target_rhs [1] == 3.000000 test=0 target_rhs [2] == 5.000000 test=0 target_rhs [3] == 7.000000 test=0 target_rhs [4] == 9.000000 test=0 target_rhs [5] == 11.000000 test=0 target_rhs [6] == 13.000000 test=0 target_rhs [7] == 2.710000 After fwds subst={1.000000 1.928206 0.466126 -3.278899 -9.032273 -16.557946 -25.674637 -48.525204 } After back subst={0.088796 0.275984 0.448519 0.589646 0.663447 0.592927 0.213043 -2.058989 } Substitute back - rhs given by solution is: y={1.000000 3.000000 5.000000 7.000000 9.000000 11.000000 13.000000 2.710000 } Kiwi L/U demo - L/U decomposition test with rhs no 1. test=1 target_rhs [0] == 11.000000 test=1 target_rhs [1] == 13.000000 test=1 target_rhs [2] == 15.000000 test=1 target_rhs [3] == 17.000000 test=1 target_rhs [4] == 19.000000 test=1 target_rhs [5] == 21.000000 test=1 target_rhs [6] == 23.000000 test=1 target_rhs [7] == 2.710000 After fwds subst={11.000000 1.210262 -11.676047 -26.151513 -41.584660 -57.783292 -74.693883 -114.577444 } After back subst={1.075320 1.247095 1.386594 1.456904 1.378902 0.982981 -0.094430 -4.861673 } Substitute back - rhs given by solution is: y={11.000000 13.000000 15.000000 17.000000 19.000000 21.000000 23.000000 2.710000 } Kiwi L/U demo - L/U decomposition test with rhs no 2. test=2 target_rhs [0] == 21.000000 test=2 target_rhs [1] == 23.000000 test=2 target_rhs [2] == 25.000000 test=2 target_rhs [3] == 27.000000 test=2 target_rhs [4] == 29.000000 test=2 target_rhs [5] == 31.000000 test=2 target_rhs [6] == 33.000000 test=2 target_rhs [7] == 2.710000 After fwds subst={21.000000 0.492317 -23.818220 -49.024128 -74.137047 -99.008638 -123.713129 -180.629685 } After back subst={2.061843 2.218207 2.324669 2.324162 2.094358 1.373035 -0.401903 -7.664357 } Substitute back - rhs given by solution is: y={21.000000 23.000000 25.000000 27.000000 29.000000 31.000000 33.000000 2.710000 } Kiwi L/U demo - L/U decomposition demo complete. cp vcd.vcd ~/Dropbox Process make finished
For comparison, the same .exe file is run under mono:
MONO_PATH=/home/djg11/d320/hprls/kiwipro/kiwic/distro/support mono lu-decomp.exe Kiwi Demo - L/U decomposition Kiwi L/U demo - L/U decomposition target_rhs generated. L/U Decomposition - single-threaded version. Initial Coefficient Matrix Pre L/U Decomposition: 20.00 11.00 12.10 13.31 14.64 16.11 17.72 19.49 21.44 33.58 25.94 28.53 31.38 34.52 37.97 41.77 45.95 50.54 65.60 61.16 67.27 74.00 81.40 89.54 98.50 108.35 119.18 141.10 144.21 158.63 174.49 191.94 211.14 232.25 255.48 281.02 319.13 340.04 374.04 411.45 452.59 497.85 547.64 602.40 662.64 738.90 801.80 881.97 970.17 1067.19 1173.91 1291.30 1420.43 1562.47 1728.72 1890.59 2079.65 2287.62 2516.38 2768.01 3044.82 3349.30 3684.23 4062.65 UU= 20.00 11.00 12.10 13.31 14.64 16.11 17.72 19.49 -.------ 21.79 12.97 14.27 15.69 17.26 18.99 20.89 -.------ -.------ 22.76 14.03 15.44 16.98 18.68 20.55 -.------ -.------ -.------ 23.22 14.54 15.99 17.59 19.35 -.------ -.------ -.------ -.------ 23.42 14.77 16.24 17.87 -.------ -.------ -.------ -.------ -.------ 23.51 14.86 16.35 -.------ -.------ -.------ -.------ -.------ -.------ 23.55 14.91 -.------ -.------ -.------ -.------ -.------ -.------ -.------ 23.57 LL= 1.00 -.------ -.------ -.------ -.------ -.------ -.------ -.------ 1.07 1.00 -.------ -.------ -.------ -.------ -.------ -.------ 2.30 1.16 1.00 -.------ -.------ -.------ -.------ -.------ 4.92 2.49 1.20 1.00 -.------ -.------ -.------ -.------ 10.56 5.33 2.58 1.22 1.00 -.------ -.------ -.------ 22.63 11.42 5.52 2.62 1.23 1.00 -.------ -.------ 48.51 24.49 11.84 5.61 2.63 1.23 1.00 -.------ 103.98 52.49 25.37 12.02 5.64 2.64 1.23 1.00 Recombine LL and RR: Should result in the original: 20.00 11.00 12.10 13.31 14.64 16.11 17.72 19.49 21.44 33.58 25.94 28.53 31.38 34.52 37.97 41.77 45.95 50.54 65.60 61.16 67.27 74.00 81.40 89.54 98.50 108.35 119.18 141.10 144.21 158.63 174.49 191.94 211.14 232.25 255.48 281.02 319.13 340.04 374.04 411.45 452.59 497.85 547.64 602.40 662.64 738.90 801.80 881.97 970.17 1067.19 1173.91 1291.30 1420.43 1562.47 1728.72 1890.59 2079.65 2287.62 2516.38 2768.01 3044.82 3349.30 3684.23 4062.65 Kiwi L/U demo - coefficient matrix decomposed. Kiwi L/U demo - L/U decomposition test with rhs no 0. test=0 target_rhs [0] == 1 test=0 target_rhs [1] == 3 test=0 target_rhs [2] == 5 test=0 target_rhs [3] == 7 test=0 target_rhs [4] == 9 test=0 target_rhs [5] == 11 test=0 target_rhs [6] == 13 test=0 target_rhs [7] == 2.71 After fwds subst={1.00 1.93 0.47 -3.28 -9.03 -16.56 -25.67 -48.53 } After back subst={0.09 0.28 0.45 0.59 0.66 0.59 0.21 -2.06 } Substitute back - rhs given by solution is: y={1.00 3.00 5.00 7.00 9.00 11.00 13.00 2.71 } Kiwi L/U demo - L/U decomposition test with rhs no 1. test=1 target_rhs [0] == 11 test=1 target_rhs [1] == 13 test=1 target_rhs [2] == 15 test=1 target_rhs [3] == 17 test=1 target_rhs [4] == 19 test=1 target_rhs [5] == 21 test=1 target_rhs [6] == 23 test=1 target_rhs [7] == 2.71 After fwds subst={11.00 1.21 -11.68 -26.15 -41.58 -57.78 -74.69 -114.58 } After back subst={1.08 1.25 1.39 1.46 1.38 0.98 -0.09 -4.86 } Substitute back - rhs given by solution is: y={11.00 13.00 15.00 17.00 19.00 21.00 23.00 2.71 } Kiwi L/U demo - L/U decomposition test with rhs no 2. test=2 target_rhs [0] == 21 test=2 target_rhs [1] == 23 test=2 target_rhs [2] == 25 test=2 target_rhs [3] == 27 test=2 target_rhs [4] == 29 test=2 target_rhs [5] == 31 test=2 target_rhs [6] == 33 test=2 target_rhs [7] == 2.71 After fwds subst={21.00 0.49 -23.82 -49.02 -74.14 -99.01 -123.71 -180.63 } After back subst={2.06 2.22 2.32 2.32 2.09 1.37 -0.40 -7.66 } Substitute back - rhs given by solution is: y={21.00 23.00 25.00 27.00 29.00 31.00 33.00 2.71 } Kiwi L/U demo - L/U decomposition demo complete.
To increase parallelism, we increase the multiplier and adder budget available and we run each right-hand-side on its own thread. The new guidelines to KiwiC were:
-max_no_int_divs=10 -max_no_fp_divs=10 -max_no_int_muls=10 -max_no_fp_muls=10 -int_fl_limit_mul=20
and KiwiC ended up using the following cells:
// cell CV_SP_SSRAM_FL1 count=8 // cell CV_FP_FL4_DP_ADDER count=10 // cell CV_FP_CVT_FL2_F64_I32 count=2 // cell CV_FP_FL3_DP_MULTIPLIER count=8 // cell CV_FP_FL5_DP_DIVIDER count=2 // Total number of leaf cells = 30
Performance changed by ...
For this to useful on FPGA we need it to be exploit the parallelism of the FPGA.