Udger.Parser.PerlRegExpConverter.ConvertRegex C# (CSharp) Method

ConvertRegex() private static method

Converts Perl match expression (only, without delimiters, options etc.) to .NET regular expression.
private static ConvertRegex ( string perlExpr, PerlRegexOptions opt, Encoding encoding ) : string
perlExpr string Perl regular expression to convert.
opt PerlRegexOptions Regexp options - some of them must be processed by changes in match string.
encoding System.Text.Encoding Encoding.
Returns string
        /// <summary>
        /// Converts Perl match expression (only, without delimiters, options etc.) to .NET regular expression.
        /// </summary>
        /// <remarks>
        /// The expression is scanned once by a small state machine kept in the local <c>state</c>:
        /// 0 = outside of a character class, 1 = at the first member of a character class,
        /// 2 = inside of a character class, 3 = just behind a range-forming '-' inside a character class.
        /// </remarks>
        /// <param name="perlExpr">Perl regular expression to convert.</param>
        /// <param name="opt">Regexp options - some of them must be processed by changes in match string.</param>
        /// <param name="encoding">Encoding handed over to the escape-code and range helpers.</param>
        /// <returns>The converted expression.</returns>
        /// <exception cref="ArgumentException">
        /// The expression ends with a backslash, contains two quantifiers in a row (checked only with
        /// <c>PerlRegexOptions.Ungreedy</c>), uses an unknown or negated POSIX character class, or contains
        /// a range that the range helpers reject.
        /// </exception>
        private static string ConvertRegex(string perlExpr, PerlRegexOptions opt, Encoding/*!*/ encoding)
        {
            // Ranges in bracket expressions should be replaced with appropriate characters

            // assume no conversion will be performed, create string builder with exact length. Only in
            // case there is a range StringBuilder would be prolonged, +1 for Anchored
            StringBuilder result = new StringBuilder(perlExpr.Length + 1);

            // Anchored means that the string should match only at the start of the string, add '^'
            // at the beginning if there is no one
            if ((opt & PerlRegexOptions.Anchored) != 0 && (perlExpr.Length == 0 || perlExpr[0] != '^'))
                result.Append('^');

            // set to true after a quantifier is matched, if there is second quantifier just behind the
            // first it is an error
            bool last_quantifier = false;

            // true when a range has just been completed (state 3 -> state 2), e.g. after "a-b" in "a-b-c"
            // (we need to make a difference here because the second "-" shouldn't be expanded)
            bool leaving_range = false;

            // true while a '^' would still be the negation marker of the character class just opened
            bool class_start = false;

            bool escaped = false;
            int state = 0;

            // detection of "(?P": 0 = nothing seen, 1 = just behind '(', 2 = just behind "(?"
            int group_state = 0;

            int i = 0;
            while (i < perlExpr.Length)
            {
                char ch = perlExpr[i];

                escaped = false;
                if (ch == '\\' && !ParseEscapeCode(encoding, perlExpr, ref i, ref ch, ref escaped))
                {
                    i++;

                    // a lone backslash at the very end has nothing to escape; report it as a malformed
                    // expression instead of reading behind the end of the string
                    if (i >= perlExpr.Length)
                        throw new ArgumentException("Regex cannot end with backslash.");

                    ch = perlExpr[i];

                    // some characters (like '_') don't need to be escaped in .net
                    escaped = (ch != '_');
                }

                switch (state)
                {
                    case 0: // outside of character class
                        if (escaped)
                        {
                            result.Append('\\');
                            result.Append(ch);
                            last_quantifier = false;

                            // an escaped character can be neither the '(' nor the '?' of "(?P"
                            group_state = 0;
                            break;
                        }

                        // In perl regexps, named groups are written like this: "(?P<name> ... )"
                        // If the group is starting here, we need to skip the 'P' character.
                        switch (group_state)
                        {
                            case 0: group_state = (ch == '(') ? 1 : 0; break;
                            case 1: group_state = (ch == '?') ? 2 : 0; break;
                            case 2:
                                // Only the character directly behind "(?" may be the 'P' to skip; the
                                // detection has to be restarted here, otherwise every later 'P' of the
                                // expression would be dropped as well.
                                if (ch == 'P')
                                {
                                    group_state = 0;
                                    i++;
                                    continue;
                                }
                                group_state = (ch == '(') ? 1 : 0;
                                break;
                        }

                        if ((opt & PerlRegexOptions.Ungreedy) != 0)
                        {
                            // match quantifier ?,*,+,{n,m} at the position i:
                            Match m = quantifiers.Match(perlExpr, i);

                            // quantifier matched; quantifier '?' must not be preceded by '(' - a grouping construct '(?'
                            if (m.Success && (m.Value != "?" || i == 0 || perlExpr[i - 1] != '('))
                            {
                                // two quantifiers: 
                                if (last_quantifier)
                                    throw new ArgumentException("regexp_duplicate_quantifier");

                                // append quantifier:
                                result.Append(perlExpr, i, m.Length);
                                i += m.Length;

                                if (i < perlExpr.Length && perlExpr[i] == '?')
                                {
                                    // skip question mark to make the quantifier greedy:
                                    i++;
                                }
                                else if (i < perlExpr.Length && perlExpr[i] == '+')
                                {
                                    // TODO: we do not yet support possessive quantifiers
                                    //       so we just skip the attribute and pray
                                    //       nobody will ever realize :-)
                                    i++;
                                }
                                else
                                {
                                    // add question mark to make the quantifier lazy:
                                    if (result.Length != 0 && result[result.Length - 1] == '?')
                                    {
                                        // HACK: Due to the issue in .NET regex we can't use "??" because it isn't interpreted correctly!!
                                        // (for example "^(ab)??$" matches with "abab", but it shouldn't!!)
                                    }
                                    else
                                        result.Append('?');
                                }

                                last_quantifier = true;
                                continue;
                            }
                        }

                        last_quantifier = false;

                        if (ch == '$' && (opt & PerlRegexOptions.DollarMatchesEndOfStringOnly) != 0)
                        {
                            // replaces '$' with '\z': 
                            result.Append(@"\z");
                            break;
                        }

                        if (ch == '[')
                        {
                            state = 1;
                            class_start = true;
                        }

                        result.Append(ch);
                        break;

                    case 1: // first character of character class
                        if (escaped)
                        {
                            result.Append('\\');
                            result.Append(ch);
                            state = 2;
                            break;
                        }

                        if (ch == '^' && class_start)
                        {
                            // negation marker directly behind '[': the following character is still
                            // the first member of the class, so stay in this state
                            result.Append(ch);
                            class_start = false;
                            break;
                        }

                        if (ch == ']' || ch == '-')
                        {
                            // ']' or '-' as the first member is a literal; everything behind it is an
                            // ordinary class member, so that the next ']' closes the class
                            result.Append(ch);
                            state = 2;
                            break;
                        }

                        // other characters are not consumed here, for example [[:space:]abc] will not match if the first
                        // [ is appended here.
                        state = 2;
                        goto case 2;

                    case 2: // inside of character class
                        if (escaped)
                        {
                            result.Append('\\');
                            result.Append(ch);
                            leaving_range = false;
                            break;
                        }

                        if (ch == '-' && !leaving_range)
                        {
                            // possible range; the '-' is emitted (or expanded) once the next character is known
                            state = 3;
                            break;
                        }
                        leaving_range = false;

                        // posix character classes
                        Match match = posixCharClasses.Match(perlExpr.Substring(i), 0);
                        if (match.Success)
                        {
                            string chars = CountCharacterClass(match.Groups[2].Value);
                            if (chars == null)
                                throw new ArgumentException(/*TODO*/ String.Format("Unknown character class '{0}'", match.Groups[2].Value));

                            if (match.Groups[1].Value.Length > 0)
                                throw new ArgumentException(/*TODO*/ "POSIX character classes negation not supported.");

                            result.Append(chars);
                            i += match.Length - 1; // +1 is added just behind the switch
                            break;
                        }

                        if (ch == ']')
                            state = 0;

                        // a '-' directly behind a finished range is a literal, emit it in a form that
                        // cannot be taken for another range operator
                        if (ch == '-')
                            result.Append("\\x2d");
                        else
                            result.Append(ch);
                        break;

                    case 3: // range previous character was '-'
                        if (!escaped && ch == ']')
                        {
                            // the '-' was the last member of the class, so it is a literal
                            result.Append("-]");
                            state = 0;
                            break;
                        }

                        string range;
                        int error;
                        if (!CountRange(result[result.Length - 1], ch, out range, out error, encoding))
                        {
                            // error code 1 gets a second chance through the Unicode range helper
                            if ((error != 1) || (!CountUnicodeRange(result[result.Length - 1], ch, out range)))
                            {
                                //Debug.Assert(error == 2);
                                throw new ArgumentException("range_first_character_greater");
                            }
                        }
                        result.Append(EscapeBracketExpressionSpecialChars(range)); // left boundary is duplicated, but doesn't matter...
                        state = 2;
                        leaving_range = true;
                        break;
                }

                i++;
            }

            return result.ToString();
        }