// // Kiwi Scientific Acceleration // K-Means Clustering demo - Lloyds Algorithm on FPGA. // (C) 2009 DJ Greaves - University of Cambridge, Computer Laboratory // // // Let's explore the parallelism in a straighforward (naive) implementation of Lloyd's Algorithm for K-Means. // // This selects NMeans central points that best fit NPoints data points // which are in a multi-dimensional space. Typically NPoints is at least two order of magnitude larger than NMeans. In this article we use 2-D space. // // There are two mutable data structures needed. One is just a vector of means that is initialised with the seeds and returned as the final result. The other is // is an array of size NPoints where the entries range over 0..NMeans-1 saying which group a point is currently in. // Some points about the algorithm are: // 1. It is non-deterministic, in the sense that the answer can depend on the initial seeding of the central points. // 2. It is iterative and will decrease an error metric on each iteration. Some number of iterations, such as 5 to 50, is useful in practice. // 3. The input data needs to be frequently accessed and so can be replicated to increase read bandwidth. // 4. There is considerable opportunity for data parallelism which we hope an HLS tool can automatically find. // 5. There is also task-level parallelism when we make several attempts from different starting seeds, but we do not discuss that here since it raises no further points of interest. // Each iteration of the algorithm runs two phases, one after the other, and these have different data access patterns. // Question: Is there a good layout of data that will boost performance. We need to consider three // scales of operation according to the amount of data: // 1. Small data size, up to a megabyte or so, that can be stored in SRAM on an FPGA. // 2. Larger data that can be held in a DRAM bank // 3. Big data that must be streamed from the file-server to the compute engine as few times as possible. // The phases are allocateCentresToPoints and adaptCentres. // The allocateCentresToPoints loop nest computes a norm for each data point with respect to the current vector of centres. It compares all data items with all centres and so has NP*NM // cost. It is embarrassingly parallel in the data direction and for each datum it is a map reduce where the reduction operator is the associative and commutative 'keep the best so far' operator. // The obvious ideal implementation is to farm a mirror of the centres to each of N units that each handles NP/N data points. // // The adaptCentres loop again has cost NP*NM but with the parallelism paradigms interchanged. It is embarrassingly parallel in the centres direction and is a map reduce over the data dimension where the associative reduction operator is the addition intrinsic to computing the arithmetic mean. using System; using KiwiSystem; public class KMeansDemo { public const int NPoints = 32; public const int NMeans = 3; const int manual_unwind_factor = 4; public struct kpoint { public float xx, yy; public float m2(kpoint other) { float dx = other.xx - xx, dy = other.yy - yy; Console.Write(" Compare ({0}, {1}) ", xx, yy); Console.Write(" with ({0}, {1}) ", other.xx, other.yy); Console.WriteLine(" distances: dx={0} dy={1}", dx, dy); return dx * dx + dy * dy; } } public kpoint [] Points = new kpoint [NPoints]; public int [] Mapping = new int [NPoints]; public kpoint [] Centroids = new kpoint [NMeans]; public bool [] Deads = new bool [NMeans]; int prbv; int randy() { Kiwi.Pause(); int nv = 1 & ((prbv >> 14) ^ (prbv >> 13)); prbv = ((prbv << 1) + nv) & 0x7FFF; return prbv; } public void GenerateData(bool randf, int seeder) { Kiwi.Pause(); prbv = seeder; randf = true; for (int i=0; i<NPoints; i++) { int xx = (randf ? randy(): 3 + i); int yy = (randf ? randy(): 3 - i); int qq = randy(); Kiwi.Pause(); kpoint pp; pp.xx = (float)xx; pp.yy = (float)yy; Points[i] = pp; Kiwi.Pause(); } Kiwi.Pause(); } const int scale_factor = 32768/512; static readonly byte[] colourmap = new byte[4] { 3, 12, 128, 15 }; void PlotDatum(float xx, float yy, byte col) { int xi = (int)xx; int yi = (int)yy; ushort xxs = (ushort)(xi/scale_factor); ushort yys = (ushort)(yi/scale_factor); Console.WriteLine(" Data plot ushort {0} {1}", xxs, yys); FrameStoreDriver.framestore_draw_square(xxs, yys, 3, col); } public void PlotData() { Kiwi.Pause(); for (int pnt=0; pnt<NPoints; pnt++) { Kiwi.Pause(); kpoint pp = Points[pnt]; Console.WriteLine(" Data plot {0} {1} {2}", pnt, pp.xx, pp.yy); int d = Mapping[pnt]; byte col = colourmap[d]; PlotDatum(pp.xx, pp.yy, col); } Kiwi.Pause(); } public void PlotCentroids() { Kiwi.Pause(); for (int m=0; m<NMeans; m++) { Kiwi.Pause(); kpoint pp = Centroids[m]; Console.Write(" Centroid plot {0} ", m); // if (Deads[m]) Console.WriteLine(" Dead"); // else { Console.WriteLine(" ({0}, {1})", pp.xx, pp.yy); PlotDatum(pp.xx, pp.yy, 30); } } Kiwi.Pause(); } public void PrintData() { Kiwi.Pause(); for (int i=0; i<NPoints; i++) { Kiwi.Pause(); kpoint pp = Points[i]; Console.WriteLine(" Data print {0} {1} {2}", i, pp.xx, pp.yy); } Kiwi.Pause(); } public void SeedCentroids(int seeder) { for (int m=0; m<NMeans; m++) { Kiwi.Pause(); int pt = (m * seeder) % NPoints; Centroids[m] = Points[pt]; } Kiwi.Pause(); } public void PrintCentroids() { Kiwi.Pause(); for (int m=0; m<NMeans; m++) { Kiwi.Pause(); kpoint pp = Centroids[m]; Console.Write(" Centroid print {0} ", m); // if (Deads[m]) Console.WriteLine(" Dead"); // else Console.WriteLine(" ({0}, {1})", pp.xx, pp.yy); } Kiwi.Pause(); } float AllocateCentresToPoints(int pnt) // Aim to be pauseless within this body { int centre = 0; float best_value = 0.0f; // mcs requires these clears. Kiwi.Pause(); for (int m=0; m<NMeans; m++) { Kiwi.Pause(); float dx = Points[pnt].m2(Centroids[m]); if (m==0 || dx < best_value) { centre = m; best_value = dx; } } Mapping[pnt] = centre; Console.WriteLine(" chose {0} for point {1}", centre, pnt); return best_value; } public float AllocateCentresToPointsAll() { float errorsum = 0.0f; for (int i=0; i<NPoints-1; i++) { Kiwi.Pause(); errorsum += AllocateCentresToPoints(i); } return errorsum / (float)NPoints; } void AdaptCentre(int centre) // Aim to do each call to this in parallel. { int count = 0; float avx = 0.0f, avy = 0.0f; int pnt = 0; while (pnt<NPoints) { Kiwi.Pause(); for (int pt=0; pt < manual_unwind_factor; pt ++) { if (Mapping[pnt] == centre) { count += 1; avx += Points[pnt].xx; avy += Points[pnt].yy; Console.WriteLine(" asummer ({0}, {1})", avx, avy); } pnt++; } } Kiwi.Pause(); if (count == 0) { if (!Deads[centre]) Console.WriteLine("Centre {0} now dead.", centre); Deads[centre] = true; } else { float fc = (float) count; kpoint qq; qq.xx = avx / fc; qq.yy = avy / fc; Console.WriteLine(" Adapt ({0}, {1})", qq.xx, qq.yy); Centroids[centre] = qq; } } public void AdaptCentresAll() { for (int m=0; m<NMeans; m++) AdaptCentre(m); } public float SingleThreadedIteration() { float error = AllocateCentresToPointsAll(); AdaptCentresAll(); return error; } public void RePlot() { if (false) { PlotData(); PlotCentroids(); } } public void RunTest(int seeder) { Kiwi.Pause(); return; GenerateData(true, seeder); Console.WriteLine("KMeans Data Generated. NPoints={0} NMeans={1}", KMeansDemo.NPoints, KMeansDemo.NMeans); Kiwi.KppMark("DATA GENERATED"); PlotData(); SeedCentroids(seeder); SingleThreadedIteration(); PlotCentroids(); return; Kiwi.Pause(); PrintData(); Kiwi.Pause(); Console.WriteLine("Start find"); for (int iteration = 0; iteration < 5; iteration ++) { Kiwi.Pause(); PrintCentroids(); float error = SingleThreadedIteration(); Console.WriteLine("Iteration {0}, error={1}", iteration, error); } } } class InteractiveApp { const int no_of_sticks = 2; static short [] origin_x = new short[no_of_sticks]; static short [] origin_y = new short[no_of_sticks]; static void calibrate_origins() { for (int s = 0; s< no_of_sticks; s++) { int q = JoyStickUnit.ReadStick(s); short xpos_raw = (short)(q & 0xFFF); short ypos_raw = (short)((q>>16) & 0xFFF); origin_x[s] = xpos_raw; origin_y[s] = ypos_raw; } } static bool read_stick_button(int s) { int q = JoyStickUnit.ReadStick(s); return (q < 0); // When the stick is pushed down, the msb is set. } static void read_stick_nice(int s, out short xx, out short yy) { int q = JoyStickUnit.ReadStick(s); if (q < 0) { xx = 0; // When the stick is pushed its value is perturbed so mask here. yy = 0; } else { short xpos_raw = (short)(q & 0xFFF); short ypos_raw = (short)((q>>16) & 0xFFF); xx = (short)((xpos_raw - origin_x[s])/16); yy = (short)((ypos_raw - origin_y[s])/16); } } [Kiwi.InputBitPort("rtc50")] static bool rtc50; // 50 Hz clock or so [Kiwi.OutputWordPort("mon32")] static int mon32; // Debug monitoring [Kiwi.InputBitPort("recal")] static bool recal; static int recalibrate_time = 1000; static void adapt_centroids(KMeansDemo kmd) { kmd.SingleThreadedIteration(); kmd.RePlot(); } public static void RunInt(int seeder, KMeansDemo kmd) { Kiwi.Pause(); FrameStoreDriver.framestore_draw_diagonal(); Console.WriteLine("Diag"); Kiwi.KppMark("DIAG DONE"); FrameStoreDriver.framestore_draw_square(4, 4, 20); Kiwi.KppMark("SQ DONE"); calibrate_origins(); mon32 = 100; Kiwi.KppMark("CALIBRATED"); if (false) for (int x=0; x<512;x+= 22) for (int y=0; y<512; y+=22) FrameStoreDriver.framestore_draw_square((uint)x, (uint)y, 3); Console.WriteLine("Sq1"); Kiwi.KppMark("SQ1"); FrameStoreDriver.framestore_draw_square(30, 30, 30); Console.WriteLine("Sq2"); Kiwi.KppMark("SQ2"); const int width = 512; const int height = 512; short pos_x = width/2, pos_y = height/2; int cursor_col = 0; bool old_rtc50 = false; bool npush, oldpush = false; Kiwi.KppMark("MAINLOOP"); while(true) { if (recalibrate_time > 0) { if (recalibrate_time == 1) calibrate_origins(); recalibrate_time -= 1; } npush = read_stick_button(0); if (npush && !oldpush) { if (false) adapt_centroids(kmd); } oldpush = npush; while (old_rtc50 == rtc50) { Kiwi.Pause(); } old_rtc50 = rtc50; if (recal) { calibrate_origins(); continue; } // for (int delay=0;delay<32768; delay++) Kiwi.Pause(); Kiwi.Pause(); cursor_col += 1; short stick_xx, stick_yy; read_stick_nice(0, out stick_xx, out stick_yy); short vel_x = stick_xx; short vel_y = stick_yy; pos_x = (short)(pos_x - vel_x); // Correct joystick sign and graphical +ve y being down. pos_y = (short)(pos_y - vel_y); const int denom = 32; Kiwi.Pause(); if (pos_x < 0) pos_x = 0; else if (pos_x >= 512*denom) pos_x = 511*denom; if (pos_y < 0) pos_y = 0; else if (pos_y >= 512*denom) pos_y = 511*denom; Kiwi.Pause(); mon32 = ((stick_xx) << 16) + (stick_yy); // mon32 = ((pos_x/denom) << 16) + (pos_y/denom); FrameStoreDriver.framestore_draw_crosshair_cursor((ushort)(pos_x/denom), (ushort)(pos_y/denom), (uint)cursor_col); Kiwi.Pause(); } return; } } public static class KMeansBench { [Kiwi.OutputBitPort("done")] static bool done; public static void Main() { RunHW(); } [Kiwi.HardwareEntryPoint()] public static void RunHW() { Console.WriteLine("Kiwi K-Means Clustering demo - Lloyds Algorithm on FPGA --- Start"); Kiwi.KppMark("START"); var kmd = new KMeansDemo(); kmd.RunTest(1); Console.WriteLine("Kiwi K-Means Clustering demo - Plotted"); // InteractiveApp.RunInt(1, kmd); Kiwi.KppMark("FINISH"); Console.WriteLine("KMeans Demo Finish"); done = true; } } // eof