/// <summary>
/// Precomputes the Boyer-Moore shift tables for <paramref name="pattern"/>: the per-position
/// <c>_positive</c> table and the per-character <c>_negativeASCII</c> / <c>_negativeUnicode</c> tables.
/// </summary>
/// <param name="pattern">Literal text to search for. Must not be empty (asserted in debug builds).</param>
/// <param name="caseInsensitive">
/// When true, the pattern is lower-cased one character at a time with <paramref name="culture"/>
/// before any table is built.
/// </param>
/// <param name="rightToLeft">
/// When true, the pattern's first character is treated as the tail, the pattern is walked in the
/// opposite direction, and every shift stored in the tables is zero or negative.
/// </param>
/// <param name="culture">Culture whose <c>TextInfo</c> performs the lower-casing; stored in <c>_culture</c>.</param>
internal RegexBoyerMoore(string pattern, bool caseInsensitive, bool rightToLeft, CultureInfo culture)
{
    // Boyer-Moore cannot search for an empty pattern; callers are expected never to pass one.
    Debug.Assert(pattern.Length != 0, "RegexBoyerMoore called with an empty string. This is bad for perf");

    if (caseInsensitive)
    {
        // Lower-case char by char rather than the whole string at once. A whole-string ToLower
        // could alter a surrogate pair; that would be more correct linguistically, but Regex
        // does not support surrogates, so consistency matters more here.
        StringBuilder lowered = StringBuilderCache.Acquire(pattern.Length);
        foreach (char c in pattern)
        {
            lowered.Append(culture.TextInfo.ToLower(c));
        }

        pattern = StringBuilderCache.GetStringAndRelease(lowered);
    }

    _pattern = pattern;
    _rightToLeft = rightToLeft;
    _caseInsensitive = caseInsensitive;
    _culture = culture;

    // tail     : index of the pattern character that is compared first
    // sentinel : index one step past the far end of the pattern (never dereferenced)
    // step     : +1 when scanning left-to-right, -1 when scanning right-to-left
    int tail;
    int sentinel;
    int step;
    if (rightToLeft)
    {
        sentinel = pattern.Length;
        tail = 0;
        step = -1;
    }
    else
    {
        sentinel = -1;
        tail = pattern.Length - 1;
        step = 1;
    }

    // PART I - the good-suffix shift table
    //
    // If position "i" is the first one (counting from the tail) that fails to match,
    // the matcher may advance by _positive[i].
    //
    // This is a simplified variant of the standard Boyer-Moore good-suffix calculation.
    _positive = new int[pattern.Length];

    char tailChar = pattern[tail];
    _positive[tail] = step;

    // Visit every interior occurrence of the tail character, nearest to the tail first.
    for (int candidate = tail - step; candidate != sentinel; candidate -= step)
    {
        if (pattern[candidate] != tailChar)
        {
            continue;
        }

        // Walk both cursors away from the tail while the characters keep agreeing.
        int match = tail;
        int scan = candidate;
        while (scan != sentinel && pattern[match] == pattern[scan])
        {
            scan -= step;
            match -= step;
        }

        // Record the distance between the interior occurrence and the tail suffix (not the
        // length of the agreement). Only the first candidate to reach a slot sets it.
        if (_positive[match] == 0)
        {
            _positive[match] = match - scan;
        }
    }

    // Positions for which no interior re-occurrence produced a shift fall back to a single
    // step. A larger value (tail - sentinel) was used here historically; the single step is
    // less aggressive, costing at worst a little extra work rather than a skipped match.
    for (int position = tail - step; position != sentinel; position -= step)
    {
        if (_positive[position] == 0)
        {
            _positive[position] = step;
        }
    }

    // PART II - the bad-character shift table
    //
    // If "ch" is the rejected character, the matcher may slide by _negative[ch]
    // (_negative[ch] = str.Length - 1 - str.LastIndexOf(ch) in the left-to-right case).
    //
    // The lookup is split into an ASCII table and lazily created 256-entry Unicode pages;
    // only pages for characters that actually occur in the pattern are allocated.

    // Shift assigned to any character that does not occur in the pattern; also serves as the
    // "slot not yet assigned" marker while the tables are being filled.
    int absentShift = tail - sentinel;

    _negativeASCII = new int[128];
    for (int slot = 0; slot < 128; slot++)
    {
        _negativeASCII[slot] = absentShift;
    }

    _lowASCII = 127;
    _highASCII = 0;

    for (int index = tail; index != sentinel; index -= step)
    {
        char current = pattern[index];

        if (current < 128)
        {
            // Track the range of ASCII characters seen in the pattern.
            if (current < _lowASCII)
            {
                _lowASCII = current;
            }

            if (current > _highASCII)
            {
                _highASCII = current;
            }

            // The occurrence closest to the tail wins; later (farther) ones are ignored.
            if (_negativeASCII[current] == absentShift)
            {
                _negativeASCII[current] = tail - index;
            }
        }
        else
        {
            int high = current >> 8;
            int low = current & 0xFF;

            if (_negativeUnicode == null)
            {
                _negativeUnicode = new int[256][];
            }

            int[] page = _negativeUnicode[high];
            if (page == null)
            {
                page = new int[256];
                for (int slot = 0; slot < 256; slot++)
                {
                    page[slot] = absentShift;
                }

                if (high == 0)
                {
                    // Page 0 covers the ASCII range as well: carry over the entries computed
                    // so far and make the ASCII table and page 0 the very same array.
                    Array.Copy(_negativeASCII, 0, page, 0, 128);
                    _negativeASCII = page;
                }

                _negativeUnicode[high] = page;
            }

            if (page[low] == absentShift)
            {
                page[low] = tail - index;
            }
        }
    }
}