Boyer-Moore Algorithm Examples

From CodeCodex

Implementations[edit]

Here are example implementations of the Boyer-Moore algorithm in Java, C, Scala, and Ruby.

Java[edit]

import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;

public class BoyerMoore {
	public static List<Integer> match(String pattern, String text) {
		List<Integer> matches = new ArrayList<Integer>();
		int m = text.length();
		int n = pattern.length();
		Map<Character, Integer> rightMostIndexes = preprocessForBadCharacterShift(pattern);
		int alignedAt = 0;
		while (alignedAt + (n - 1) < m) {
			for (int indexInPattern = n - 1; indexInPattern >= 0; indexInPattern--) {
			int indexInText = alignedAt + indexInPattern;
			char x = text.charAt(indexInText);
			char y = pattern.charAt(indexInPattern);
				if (indexInText >= m) break;
				if (x != y) {
					Integer r = rightMostIndexes.get(x);
					if (r == null) {
						alignedAt = indexInText + 1;
					}
					else {
						int shift = indexInText - (alignedAt + r);
						alignedAt += shift > 0 ? shift : 1;
					}
					break;
				}
				else if (indexInPattern == 0) {
					matches.add(alignedAt);
					alignedAt++;
				}
			}
		}
		return matches;
	}
	private static Map<Character, Integer> preprocessForBadCharacterShift(
			String pattern) {
		Map<Character, Integer> map = new HashMap<Character, Integer>();
		for (int i = pattern.length() - 1; i >= 0; i--) {
			char c = pattern.charAt(i);
			if (!map.containsKey(c)) map.put(c, i);
		}
		return map;
	}
	public static void main(String[] args) {
		List<Integer> matches = match("ana", "bananas");
		for (Integer integer : matches) System.out.println("Match at: " + integer);
		System.out.println((matches.equals(Arrays.asList(1, 3)) ? "OK" : "Failed"));
	}
}

C[edit]

Note: The method of constructing the good-match table (skip[]) in this example is slower than it needs to be (for simplicity of implementation). It does not make a fair comparison to other algorithms, should you try to compare their speed. A faster method should be used instead.

#include <string.h>
#include <limits.h>

/* This helper function checks, whether the last "portion" bytes
 * of "needle" (which is "nlen" bytes long) exist within the "needle"
 * at offset "offset" (counted from the end of the string),
 * and whether the character preceding "offset" is not a match.
 * Notice that the range being checked may reach beyond the
 * beginning of the string. Such range is ignored.
 */
static int boyermoore_needlematch
    (const unsigned char* needle, size_t nlen, size_t portion, size_t offset)
{
    ssize_t virtual_begin = nlen-offset-portion;
    ssize_t ignore = 0;
    if(virtual_begin < 0) { ignore = -virtual_begin; virtual_begin = 0; }
    
    if(virtual_begin > 0 && needle[virtual_begin-1] == needle[nlen-portion-1])
        return 0;

    return
        memcmp(needle + nlen - portion + ignore,
               needle + virtual_begin,
               portion - ignore) == 0;
}   

static size_t max(ssize_t a, ssize_t b) { return a>b ? a : b; }

/* Returns a pointer to the first occurrence of "needle"
 * within "haystack", or NULL if not found.
 */
const unsigned char* memmem_boyermoore
    (const unsigned char* haystack, size_t hlen,
     const unsigned char* needle,   size_t nlen)
{
    size_t skip[nlen]; /* Array of shifts with self-substring match check */
    ssize_t occ[UCHAR_MAX+1]; /* Array of last occurrence of each character */
    size_t a, hpos;
    
    if(nlen > hlen || nlen <= 0 || !haystack || !needle) return NULL;

    /* Preprocess #1: init occ[]*/
    
    /* Initialize the table to default value */
    for(a=0; a<UCHAR_MAX+1; ++a) occ[a] = -1;
    
    /* Then populate it with the analysis of the needle */
    /* But ignoring the last letter */
    for(a=0; a<nlen-1; ++a) occ[needle[a]] = a;
    
    /* Preprocess #2: init skip[] */  
    /* Note: This step could be made a lot faster.
     * A simple implementation is shown here. */
    for(a=0; a<nlen; ++a)
    {
        size_t value = 0;
        while(value < nlen && !boyermoore_needlematch(needle, nlen, a, value))
            ++value;
        skip[nlen-a-1] = value;
    }
    
    /* Search: */
    for(hpos=0; hpos <= hlen-nlen; )
    {
        size_t npos=nlen-1;
        while(needle[npos] == haystack[npos+hpos])
        {
            if(npos == 0) return haystack + hpos;
            --npos;
        }
        hpos += max(skip[npos], npos - occ[haystack[npos+hpos]]);
    }
    return NULL;
}

Ruby[edit]

# Hash impersonator that accepts regular expressions as keys.  But the regular
# expression lookups are slow, so don't use them (unless you have to). 
class RichHash
  def initialize
    @regexps = {}
    @regular = {}
  end

  def [](k)
    regular = @regular[k]
    return regular if regular
    if @regexps.size > 0
      @regexps.each do |regex,v| # linear search is going to be slow
        return v if regex.match(k) 
      end
    end
    nil
  end

  def []=(k,v)
    if k.kind_of?(Regexp)
      @regexps[k] = v
    else
      @regular[k] = v
    end
  end
end

# ported directly from this version wikipedia:
# http://en.wikipedia.org/w/index.php?title=Boyer%E2%80%93Moore_string_search_algorithm&diff=391986850&oldid=391398281
# it's not very rubyish but it works
module BoyerMoore    

  def self.compute_prefix(str) 
    size = str.length
    k = 0
    result = [0]
    1.upto(size - 1) do |q|
      while (k > 0) && (str[k] != str[q])
        k = result[k-1]
      end
      k += 1 if(str[k] == str[q])
      result[q] = k
    end
    result
  end

  def self.prepare_badcharacter_heuristic(str)
    result = RichHash.new
    0.upto(str.length - 1) do |i|
      result[str[i]] = i
    end
    result
  end

  def self.prepare_goodsuffix_heuristic(normal)
    size = normal.size
    result = []

    reversed = normal.dup.reverse
    prefix_normal = compute_prefix(normal)
    prefix_reversed = compute_prefix(reversed)

    0.upto(size) do |i|
      result[i] = size - prefix_normal[size-1]
    end

    0.upto(size-1) do |i|
      j = size - prefix_reversed[i]
      k = i - prefix_reversed[i]+1
      result[j] = k if result[j] > k
    end
    result
  end

  def self.search(haystack, needle)
    needle_len = needle.size
    haystack_len = haystack.size

    return nil if haystack_len == 0
    return haystack if needle_len == 0

    badcharacter = self.prepare_badcharacter_heuristic(needle)
    goodsuffix   = self.prepare_goodsuffix_heuristic(needle)

    s = 0
    while s <= haystack_len - needle_len
      j = needle_len
      while (j > 0) && self.needle_matches?(needle[j-1], haystack[s+j-1])
        j -= 1
      end

      if(j > 0)
        k = badcharacter[haystack[s+j-1]]
        k = -1 unless k
        if (k < j) && (m = j-k-1) > goodsuffix[j]
          s += m
        else
          s += goodsuffix[j]
        end
      else
        return s
      end
    end
    return nil
  end

  def self.needle_matches?(needle, haystack)
    if needle.kind_of?(Regexp)
      needle.match(haystack) ? true : false 
    else
      needle == haystack
    end
  end
end

Scala[edit]

//http://blog.aunndroid.com/2011/11/learning-scala-boyermoore-search-1.html
object boyer_more {
	def main(args : Array[String]) = {
		if (args.length == 2 && args(0).length >= args(1).length) {
			val result = boyer_more_search(args(0), args(1))
			println("Match found : " + result.length +
				" Pos : " + result.mkString(" "))
		} else {
			println
			println("Usage: boyer_more string_body search_string")
			val searchString = "EXAMPLE"
			//val searchString="ANPANMAN"
			val searchBody =  "HERE IS A SIMPLE EXAMPLE"
			println("e.g. : boyer_more \"" + searchBody +
				"\" \"" + searchString + "\"")
			val result = boyer_more_search(searchBody, searchString)
			println("Match found : " + result.length +
				" Pos : " + result.mkString(" "))
		}
	}

	def boyer_more_search(searchBody : String, searchString : String) : List[Int] = {
		val table2 = createTable2(searchString)
		println(table2)
		val table1 = createTable1(searchString)
		println(table1)
		startSearch(searchBody, searchString, 0, table1, table2)
	}

	def startSearch(searchBody : String, searchString : String,
		pos : Int, table1 : Map[Int, Int],
		table2 : Map[Char, Int]) : List[Int] = {

		def search(pos : Int, inc : Int = 0) : List[Int] = {
			if (pos >= searchBody.length - 1) return List[Int]()
			println(" " * pos + searchString)
			println(searchBody)
			println(" " * (pos + (searchString.length - 1 - inc)) + "-")
			val jump = matchChar(pos + searchString.length - 1, inc)
			if (jump > 0) return search(pos + jump, 0)
			/* found a match character, check the next character */
			if (jump == 0) return search(pos, inc + 1)
			/* found a search string, jump the whole search string's length */
			if (jump == -1) return pos :: search(pos + searchString.length, 0)
			Nil
		}

		def matchChar(pos : Int, inc : Int) : Int = {
			if (!searchBody.isDefinedAt(pos - inc)) return -2

			/* invalid char at the end of String, return search string length */
			if (!table2.isDefinedAt(searchBody(pos))) return searchString.length
			if (searchBody(pos - inc) == searchString(searchString.length - 1 - inc)) {
				if (searchString.length - 1 - (inc + 1) == -1) return -1
				return 0
			}
			val result1 = table1.get(inc).get

			/* invalid char inside the String, return table1 jump value */
			if (!table2.isDefinedAt(searchBody(pos - inc))) return result1			
			val result2 = table2.get(searchBody(pos - inc)).get
			if (result2 > result1) return result2
			result1
		}

		search(pos)
	}

	import scala.collection.immutable._

	def createTable2(str : String) : Map[Char, Int] = {
	  /* reverse: to use foldLeft, 
	      tail: the last(now head) charcter, which is not neede, is dropped */
    val result = str.reverse.tail.foldLeft(Map[Char,Int](),1)  {
      case ((mMap,pos),ch) if ! mMap.isDefinedAt(ch) => (mMap ++ Map(ch -> pos), pos+1)
      case ((mMap,pos),ch)                           => (mMap,pos +1)
    }
    return result._1
	}

	def createTable1(str : String) : Map[Int, Int] = {
		val strOps = str.reverse

		/* create a list of valid characters from the str */
		val strList = strOps.foldLeft(List[Char]()) {
			case (l, ch) if !l.contains(ch) => ch :: l
			case (l, ch) => l
		}

		val result = strOps.foldLeft(Map[Int, Int](), 0, "") {
			case ((mMap, pos, subStr), ch) if pos != 0 => (mMap ++ Map(pos -> calculate(pos, subStr, `str`,
			  /* a valid characters list is reconstructed again without its actual character */
				`strList`.filter {
					case x if x == ch => false
					case _ => true
				})), pos + 1, ch + subStr)
			case ((mMap, pos, subStr), ch) => (Map(0 -> 1), pos + 1, ch + subStr)
		}
		result._1
	}

	def calculate(pos : Int, subStr : String,
		str : String, strList : List[Char]=Nil) : Int = {

		/* strList will be empty when the recursive call to this function 
		    for backtracking subStr is invoked*/
		if (strList.isEmpty) {
			val jListMin = subSetJump(subStr, str)

			/* the current subStr is not a valid sub string, 
			  backtrack 1 and calculate to find again*/
			if (jListMin < 0)
				return calculate(pos - 1, subStr.tail,str)

			/* the current subStr is a valid sub String 
			    but the jump is 0 and no moving. backtrack 1 and calculate */		
			if (str.length - 1 - pos - jListMin == 0)
				return calculate(pos - 1, subStr.tail,str)

			/* difference of current pos(counting from front) 
			  with the position first of the first valid subset */		
			return str.length - 1 - pos - jListMin
		}

    /* get the positions of all valid sub strings 
      constructed with valid characters */
		val jList = strList.foldLeft(List[Int]()) {
			(j, ch) => subSetJump(ch + `subStr`, `str`) :: j
		}
    /* jList.max is less than 0, the rest will be the same */
		if (jList.max < 0) {
			if (subStr.length == 1)	return str.length
			return calculate(pos - 1, subStr,str)
		}

		/* we will use the minimum positive value including 0*/
		val jListmin=(jList.filter{
		  case x if x >= 0 => true
		  case x           => false}).min

		if (str.length - 1 - pos - jListmin == 0)
			return calculate(pos - 1, subStr.tail,str)
		return str.length - 1 - pos - jListmin
	}

	def subSetJump(subStr : String, str : String) : Int = {
		val list = (for (i <- 0 until str.length)
			yield str.slice(i, i + subStr.length)).toList
		list.indexOf(subStr)
	}
}