Burrows-Wheeler String Search coded in C#

using System;

//
// CBG Squanderer: Burrows-Wheeler Transform and String Searcher in C#.
// (C) 2009 David J Greaves, University of Cambridge Computer Laboratory.
// With decimated rank index for linear space lookup.

public class fwd_bwt
{
  char eos;            // Terminator.
  char [] string_chars;
  int [] ptrs;
  int length;
  int [] tots = new int [256];     // Character occurrence counts
  int [] Tots_before = new int [256];
  int [,] Ranks;
  int alphabet_size = 0;
  int [] compress_map = new int[256];  // Mapping of each char in use to a small integer

  int rank_decimation_factor = 1;

  int compress(char cc, bool insert_if_missing=false) // Map each character in use to a small integer in the range 0 to alphabet_size-1.
  {
    if (cc == eos)
         { Console.WriteLine("Eos char appears in needle (search string).");
           Environment.Exit(1);
         }  

    if (compress_map[cc]==0) // The mapping holds that integer + 1.
    {
      if (!insert_if_missing) return -1;
      compress_map[cc] = ++ alphabet_size; 
    }
    return compress_map[cc] - 1;
  }

  int bound_log2(int n)
  {
    int r = 1;
    while (r < n) r *= 2;
    return r;
  }


  void compute_ranks(bool disable_decimate=false) 
  {
    Console.WriteLine ("BWT: Starting Alphabet Arity Find.");
    for (int i=0; i 5 * alphabet_size) rank_decimation_factor = bound_log2(alphabet_size);
    Console.WriteLine ("BWT: Starting Ranking with decimation_factor {0}.", rank_decimation_factor);

    Ranks = new int [(length + rank_decimation_factor -1) / rank_decimation_factor, alphabet_size]; 
    
    int o=0;
    int [] lranks = new int [alphabet_size];
    for (int x=0; x= 0; xx--)  ranks += Ranks[r0, xx] + " ";
         }  }
      }
      Console.WriteLine("   hay= {4} sort= {2}  bwt= {3}    {0}  {1}   {5}", i, ptrs[i], string_chars[ptrs[i]], bwt(i), string_chars[i], ranks); // print sorted and bwt
    }
  }

  int ranks(int row, int xx, bool upwards)
  {
    int r0 =  row / rank_decimation_factor;
    int rem = row % rank_decimation_factor;

    int ans = Ranks[r0, xx];
    if (rem != 0)
    {
      r0 *= rank_decimation_factor;
      while (rem != 0) 
      {
	 rem --;
	 r0 ++;
	 if (compress(bwt(r0)) == xx) ans ++;
	 //Console.WriteLine("row find {3}: scan {0} {1} ans={2}", rem, r0, ans, row);
       }
    }
    return ans;

  }

  public int find_string(char [] query) // The search function
  {
     int start = 1;
     int end   = length;
     for (int idx = query.Length-1; idx >= 0; idx --)
        {
 	  char ccc = query[idx];
          Console.WriteLine("Search {3}th char, '{0}', in the range {1}..{2}", ccc, start, end-1, idx);
          int xx = compress(ccc);
	  if (xx < 0)
	  {
            Console.WriteLine("{0} is not in haystack alphabet, so string not found.", ccc);
            return -1;
          }
	  int nn = Tots_before [ccc];
	  Console.WriteLine("           {1}th letter of compressed alphabet, {0} tots before", nn, xx);
          start = nn + ranks(start-1, xx, true); 
          end   = nn + ranks(end-1,   xx, false);
          Console.WriteLine("    will need to look in region starting {0} with range {1}..{2}", nn, start, end-1);
	  if (start == end)
	  {
             Console.WriteLine("Suffix not found");
             return -1;
          }
        }
    Console.WriteLine("Search done in {0}..{1}, {2} instances found.", start, end-1, end-start);
    return ptrs[start];
    // TODO: Could return the range of occurrences using (map ptrs [start .. end-1]).
  }

  public fwd_bwt(char [] data, char terminator) // constructor
  {
    eos = terminator;
    string_chars = data;
    length = data.Length;
    ptrs  = new int [length];
    tots  = new int [256];
    for (int i=0; i cb)  { /*Console.WriteLine("  stop {0} > {1} true => false", ca, cb); */ans = false; break; }

      a = (a == length-1) ? 0: a+1;
      b = (b == length-1) ? 0: b+1;
    }
    // Strings are the same
    if (x==length) ans = false;

    //    Console.WriteLine("is_lower {0} {1} {2} ", ls, rs, ans);
    return ans;
  }

  bool is_higher(int a, int b)
  {
  //    Console.WriteLine("      is_higher {0} {1}  start", a, b);
    for (int x=0; x cb) return true;
      else if (ca < cb) return false;

      a = (a == length-1) ? 0: a+1;
      b = (b == length-1) ? 0: b+1;
    }
   // Strings are the same
   return false;
  }

  void quicksort(int low, int high)
  {
    //Console.WriteLine ("quicksort {0}..{1}", low, high);
    if (high - low < 7) bubblesort(low, high);
    else if (low < high)
      {
        int pivot_loc = partition(low, high);
	quicksort(low, pivot_loc);
	//Console.WriteLine("Done low {0}", pivot_loc);
        quicksort(pivot_loc+1, high);
	//Console.WriteLine("Done high {0}", pivot_loc);
      }
  }

  void bubblesort(int low, int high)
  {
    //    Console.WriteLine ("bubblesort {0}..{1}", low, high);
    for (int i=low; i= high) return low;
    while (low < high)
    {
       while (low < high)
       {
           if (is_lower(ptrs[low], pivot)) low ++;
           else break;
       }
       while (low < high)
       {
           if (is_higher(ptrs[high], pivot)) high --;
           else break;
       }
       if (low < high) swap(low, high);
       low ++; high --;
    }
    return low;
  }

  void swap(int a, int b)
  {
    int temp = ptrs[a];
    ptrs[a] = ptrs[b];
    ptrs[b] = temp;
  }
}


public static class bwt_test
{


  static string lyrics = @"
    (C) Northern Songs 196X.
    She loves you, yeah, yeah, yeah
    She loves you, yeah, yeah, yeah
    She loves you, yeah, yeah, yeah, yeah

    You think you lost your love
    When I saw her yesterday
    It's you she's thinking of
    And she told me what to say
    She says she loves you
    And you know that can't be bad
    Yes, she loves you
    And you know you should be glad

    She said you hurt her so
    She almost lost her mind
    And now she says she knows
    You're not the hurting kind
    She says she loves you
    And you know that can't be bad
    Yes, she loves you
    And you know you should be glad, ooh

    She loves you, yeah, yeah, yeah
    She loves you, yeah, yeah, yeah
    And with a love like that
    You know you should be glad

    You know it's up to you
    I think it's only fair
    Pride can hurt you too
    Apologize to her
    Because she loves you
    And you know that can't be bad
    Yes, she loves you
    And you know you should be glad, ooh

    She loves you, yeah, yeah, yeah
    She loves you, yeah, yeah, yeah
    With a love like that
    You know you should be glad
    With a love like that
    You know you should be glad
    With a love like that
    You know you should be glad
    Yeah, yeah, yeah
    Yeah, yeah, yeah, yeah.\u0003";


  static public void bwt_verbose_test(string needle0, string needle1, string haystack)
  {
    fwd_bwt bwt = new fwd_bwt(haystack.ToCharArray(), '$');
    bwt.stats();
    int l0 = bwt.find_string (needle0.ToCharArray());
    Console.WriteLine("Found string {0} at {1}", needle0, l0);
    int l1 = bwt.find_string (needle1.ToCharArray());
    Console.WriteLine("Found string {0} at {1}", needle1, l1);
  }

  static public void bwt_beatles_test()
  {
    string needle0 = "love";
    string needle1 = "told";

    fwd_bwt bwt = new fwd_bwt(lyrics.ToCharArray(), '\u0003');
    bwt.stats();
    int l0 = bwt.find_string (needle0.ToCharArray());
    Console.WriteLine("Found Beatles string {0} at {1}", needle0, l0);
    int l1 = bwt.find_string (needle1.ToCharArray());
    Console.WriteLine("Found Beatles string {0} at {1}", needle1, l1);
  }
}
// eof

Example Output

BWT:Starting Sort
BWT: Starting Cumulative Index
BWT:   Index $ range 0..0
BWT:   Index I range 1..4
BWT:   Index M range 5..5
BWT:   Index P range 6..7
BWT:   Index S range 8..11
BWT: Starting Alphabet Arity Find.
Alphabet has 4 characters.
BWT: Starting Ranking with decimation_factor 1.
BWT:Ranking done.
   hay= M sort= $  bwt= I    0  11   ranks=0 0 1 0 
   hay= I sort= I  bwt= P    1  10   ranks=1 0 1 0 
   hay= S sort= I  bwt= S    2  7   ranks=1 1 1 0 
   hay= S sort= I  bwt= S    3  4   ranks=1 2 1 0 
   hay= I sort= I  bwt= M    4  1   ranks=1 2 1 1 
   hay= S sort= M  bwt= $    5  0   ranks=1 2 1 1 
   hay= S sort= P  bwt= P    6  9   ranks=2 2 1 1 
   hay= I sort= P  bwt= I    7  8   ranks=2 2 2 1 
   hay= P sort= S  bwt= S    8  6   ranks=2 3 2 1 
   hay= P sort= S  bwt= S    9  3   ranks=2 4 2 1 
   hay= I sort= S  bwt= I    10  5   ranks=2 4 3 1 
   hay= $ sort= S  bwt= I    11  2   ranks=2 4 4 1 
Search 2th char, 'S', in the range 1..11
           2th letter of compressed alphabet, 8 tots before
    will need to look in region starting 8 with range 8..11
Search 1th char, 'I', in the range 8..11
           1th letter of compressed alphabet, 1 tots before
    will need to look in region starting 1 with range 3..4
Search 0th char, 'S', in the range 3..4
           2th letter of compressed alphabet, 8 tots before
    will need to look in region starting 8 with range 9..9
Search done in 9..9, 1 instances found.

A Decimated Search

BWT: Starting Alphabet Arity Find.
Alphabet has 45 characters.
BWT: Starting Ranking with decimation_factor 32.
BWT:Ranking done.

Search 3th char, 'e', in the range 1..1303
           10th letter of compressed alphabet, 619 tots before
    will need to look in region starting 619 with range 619..718
Search 2th char, 'v', in the range 619..718
           21th letter of compressed alphabet, 1203 tots before
    will need to look in region starting 1203 with range 1203..1220
Search 1th char, 'o', in the range 1203..1220
           6th letter of compressed alphabet, 940 tots before
    will need to look in region starting 940 with range 1007..1024
Search 0th char, 'l', in the range 1007..1024
           20th letter of compressed alphabet, 857 tots before
    will need to look in region starting 857 with range 880..897
Search done in 880..897, 18 instances found.
Found Beatles string love at 173
Search 3th char, 'd', in the range 1..1303
           32th letter of compressed alphabet, 587 tots before
    will need to look in region starting 587 with range 587..618
Search 2th char, 'l', in the range 587..618
           20th letter of compressed alphabet, 857 tots before
    will need to look in region starting 857 with range 864..871
Search 1th char, 'o', in the range 864..871
           6th letter of compressed alphabet, 940 tots before
    will need to look in region starting 940 with range 949..949
Search 0th char, 't', in the range 949..949
           8th letter of compressed alphabet, 1108 tots before
    will need to look in region starting 1108 with range 1149..1149
Search done in 1149..1149, 1 instances found.
Found Beatles string told at 250
0.07user 0.02system 0:00.09elapsed 116%CPU (0avgtext+0avgdata 6224maxresident)k
0inputs+0outputs (0major+1713minor)pagefaults 0swaps