private static string ConvertRegex(string perlExpr, PerlRegexOptions opt, Encoding/*!*/ encoding)
{
    // Rewrites a Perl-style pattern so that it can be handed to the .NET regex engine.
    //
    // perlExpr: the pattern text to convert.
    // opt:      conversion flags; this method consults Anchored, Ungreedy and
    //           DollarMatchesEndOfStringOnly.
    // encoding: forwarded to ParseEscapeCode and CountRange.
    //
    // Returns the converted pattern.
    //
    // Throws ArgumentException when:
    //   - the pattern ends with a lone backslash,
    //   - two quantifiers follow each other (checked only with Ungreedy),
    //   - a POSIX character class is unknown or negated,
    //   - a bracket range cannot be expanded.
    //
    // The scanner is a small state machine driven by 'state':
    //   0 - outside of a character class
    //   1 - at the first character of a character class (just after '[')
    //   2 - inside of a character class
    //   3 - inside of a character class, the previous character was a range '-'

    // Assume no conversion will be performed and size the builder to the input length;
    // it only grows when a range is expanded. The +1 is for the '^' added by Anchored.
    StringBuilder result = new StringBuilder(perlExpr.Length + 1);

    // Anchored means the pattern may match only at the start of the subject:
    // prepend '^' unless the pattern already starts with one.
    if ((opt & PerlRegexOptions.Anchored) != 0 && (perlExpr.Length == 0 || perlExpr[0] != '^'))
        result.Append('^');

    // Set after a quantifier is consumed; a second quantifier right behind it is an error.
    bool last_quantifier = false;

    // Set when a range has just been completed (state 3 -> 2), e.g. the "a-b" in "a-b-c".
    // The '-' that follows must be emitted literally rather than starting another range.
    bool leaving_range = false;

    bool escaped = false;
    int state = 0;

    // Tracks the start of a group construct so that the 'P' of "(?P<name>...)" can be dropped:
    //   0 - nothing pending, 1 - just read '(', 2 - just read "(?"
    int group_state = 0;

    int i = 0;
    while (i < perlExpr.Length)
    {
        char ch = perlExpr[i];
        escaped = false;

        // NOTE(review): ParseEscapeCode is defined elsewhere; it presumably translates the
        // escape sequences it recognizes itself (updating i, ch and escaped) and returns
        // false for the rest, which are handled generically below - confirm.
        if (ch == '\\' && !ParseEscapeCode(encoding, perlExpr, ref i, ref ch, ref escaped))
        {
            i++;

            // A lone trailing backslash has nothing to escape; report it as an invalid
            // pattern instead of indexing past the end of the string.
            if (i >= perlExpr.Length)
                throw new ArgumentException("Regex cannot end with backslash.");

            ch = perlExpr[i];

            // some characters (like '_') don't need to be escaped in .net
            escaped = (ch != '_');
        }

        switch (state)
        {
            case 0: // outside of character class
                if (escaped)
                {
                    result.Append('\\');
                    result.Append(ch);
                    last_quantifier = false;

                    // An escaped character interrupts a pending "(" / "(?" prefix.
                    group_state = 0;
                    break;
                }

                // In perl regexps, named groups are written like this: "(?P<name> ... )".
                // When a group starts here, the 'P' right after "(?" is skipped.
                switch (group_state)
                {
                    case 0:
                        group_state = (ch == '(') ? 1 : 0;
                        break;

                    case 1:
                        // Another '(' is itself a possible group start, e.g. "((?P<n>...))".
                        group_state = (ch == '?') ? 2 : ((ch == '(') ? 1 : 0);
                        break;

                    case 2:
                        if (ch == 'P')
                        {
                            // Drop the 'P'. The prefix is consumed, so later 'P'
                            // characters are ordinary literals again.
                            group_state = 0;
                            i++;
                            continue;
                        }

                        // Not a "(?P" construct; leave the "(?" state so that it cannot
                        // swallow an unrelated 'P' further on.
                        group_state = (ch == '(') ? 1 : 0;
                        break;
                }

                if ((opt & PerlRegexOptions.Ungreedy) != 0)
                {
                    // match quantifier ?,*,+,{n,m} at the position i:
                    Match m = quantifiers.Match(perlExpr, i);

                    // A '?' directly after '(' belongs to a grouping construct "(?" and is
                    // not a quantifier.
                    if (m.Success && (m.Value != "?" || i == 0 || perlExpr[i - 1] != '('))
                    {
                        // two quantifiers:
                        if (last_quantifier)
                            throw new ArgumentException("regexp_duplicate_quantifier");

                        // append quantifier:
                        result.Append(perlExpr, i, m.Length);
                        i += m.Length;

                        if (i < perlExpr.Length && perlExpr[i] == '?')
                        {
                            // Ungreedy inverts greediness: skip the question mark so the
                            // quantifier becomes greedy.
                            i++;
                        }
                        else if (i < perlExpr.Length && perlExpr[i] == '+')
                        {
                            // TODO: possessive quantifiers are not supported yet; the '+'
                            // modifier is silently dropped.
                            i++;
                        }
                        else if (result.Length == 0 || result[result.Length - 1] != '?')
                        {
                            // add question mark to make the quantifier lazy.
                            // HACK: "??" is deliberately never produced because .NET does not
                            // interpret it correctly (for example "^(ab)??$" matches "abab",
                            // but it shouldn't), so a '?' quantifier is left as it is.
                            result.Append('?');
                        }

                        last_quantifier = true;
                        continue;
                    }
                }

                last_quantifier = false;

                if (ch == '$' && (opt & PerlRegexOptions.DollarMatchesEndOfStringOnly) != 0)
                {
                    // replaces '$' with '\z':
                    result.Append(@"\z");
                    break;
                }

                if (ch == '[')
                    state = 1;

                result.Append(ch);
                break;

            case 1: // first character of character class
                if (escaped)
                {
                    result.Append('\\');
                    result.Append(ch);
                    state = 2;
                    break;
                }

                // special characters:
                if (ch == '^' || ch == ']' || ch == '-')
                {
                    result.Append(ch);
                }
                else
                {
                    // Other characters are not consumed here; for example "[[:space:]abc]"
                    // would not match if the inner '[' were appended at this point.
                    state = 2;
                    goto case 2;
                }
                break;

            case 2: // inside of character class
                if (escaped)
                {
                    result.Append('\\');
                    result.Append(ch);
                    leaving_range = false;
                    break;
                }

                if (ch == '-' && !leaving_range)
                {
                    // Possible range; the '-' itself is emitted (or expanded) in state 3.
                    state = 3;
                    break;
                }

                leaving_range = false;

                // posix character classes
                Match match = posixCharClasses.Match(perlExpr.Substring(i), 0);
                if (match.Success)
                {
                    // Group 2 holds the class name, group 1 the negation marker.
                    string chars = CountCharacterClass(match.Groups[2].Value);
                    if (chars == null)
                        throw new ArgumentException(/*TODO*/ String.Format("Unknown character class '{0}'", match.Groups[2].Value));

                    if (match.Groups[1].Value.Length > 0)
                        throw new ArgumentException(/*TODO*/ "POSIX character classes negation not supported.");

                    result.Append(chars);
                    i += match.Length - 1; // +1 is added just behind the switch
                    break;
                }

                if (ch == ']')
                    state = 0;

                // A '-' reaching this point directly follows a completed range and is
                // emitted as a hex escape so that it is taken literally.
                if (ch == '-')
                    result.Append("\\x2d");
                else
                    result.Append(ch);
                break;

            case 3: // range, previous character was '-'
                if (!escaped && ch == ']')
                {
                    // "-]": the '-' was the last character of the class, i.e. a literal.
                    result.Append("-]");
                    state = 0;
                    break;
                }

                // The last character written so far is used as the lower bound of the range.
                string range;
                int error;
                if (!CountRange(result[result.Length - 1], ch, out range, out error, encoding))
                {
                    // NOTE(review): error code 1 appears to mean the range may still be
                    // expressible by CountUnicodeRange; any other failure is reported - confirm.
                    if ((error != 1) || (!CountUnicodeRange(result[result.Length - 1], ch, out range)))
                    {
                        //Debug.Assert(error == 2);
                        throw new ArgumentException("range_first_character_greater");
                    }
                }

                // left boundary is duplicated, but doesn't matter...
                result.Append(EscapeBracketExpressionSpecialChars(range));
                state = 2;
                leaving_range = true;
                break;
        }

        i++;
    }

    return result.ToString();
}