RTools.Util.StreamTokenizer.NextToken C# (CSharp) Method

NextToken() public method

Get the next token. The last token will be an EofToken unless there's an unterminated quote or unterminated block comment and Settings.DoUntermCheck is true, in which case this throws an exception of type StreamTokenizerUntermException or sub-class.
public NextToken ( Token &token ) : bool
token Token The output token.
return bool
        /// <summary>
        /// Get the next token by running a character-driven state machine.
        /// The character following the emitted token is left in nextTokenSb
        /// so that the next call can resume from it.
        /// </summary>
        /// <param name="token">The output token. It is null if the state
        /// machine hits an unrepresented state.</param>
        /// <returns>true if a token was built by the state machine loop;
        /// false if the emitted token is an EofToken, or if an unrepresented
        /// state was hit.</returns>
        /// <exception cref="StreamTokenizerUntermQuoteException">Eof was reached
        /// inside a quote and settings.DoUntermCheck is true.</exception>
        /// <exception cref="StreamTokenizerUntermCommentException">Eof was reached
        /// inside a block comment and settings.DoUntermCheck is true.</exception>
        public bool NextToken(out Token token)
        {
            token = null;
            int thisChar = 0; // current character
            byte ctype; // type of this character

            NextTokenState state = NextTokenState.Start;
            int prevChar = 0; // previous character
            byte prevCtype = (byte)CharTypeBits.Eof;

            // get previous char from nextTokenSb if there
            // (nextTokenSb is a StringBuilder containing the characters
            //  of the next token to be emitted)
            if (nextTokenSb.Length > 0)
            {
                prevChar = nextTokenSb[nextTokenSb.Length - 1];

                // The carried-over character was classified by the guarded lookup
                // in the main loop on the previous call, so it may lie outside the
                // CharTypes table. Apply the same guard here instead of indexing
                // past the end of the table.
                if (prevChar >= settings.CharTypes.Length)
                {
                    // outside the char type table, treat as word character
                    prevCtype = (byte)CharTypeBits.Word;
                }
                else prevCtype = settings.CharTypes[prevChar];

                state = PickNextState(prevCtype, prevChar);
            }

            // extra state for number parse
            int seenDot = 0; // how many .'s in the number
            int seenE = 0; // how many e's or E's have we seen in the number
            bool seenDigit = false; // seen any digits (numbers can start with -)

            // lineNumber can change with each GetNextChar()
            // tokenLineNumber is the line on which the token started
            int tokenLineNumber = lineNumber;

            // State Machine: Produces a single token.
            // Enter a state based on a single character.
            // Generally, being in a state means we're currently collecting chars 
            // in that type of token.
            // We do state machine until it builds a token (Eof is a token), then
            // return that token.
            thisChar = prevChar;  // for first iteration, since prevChar is set to this 
            bool done = false; // optimization
            while (!done)
            {
                prevChar = thisChar;
                thisChar = GetNextChar();
                if (thisChar >= settings.CharTypes.Length)
                {
                    // greater than 7-bit ascii, treat as word character
                    ctype = (byte)CharTypeBits.Word;
                }
                else ctype = settings.CharTypes[thisChar];

#if DEBUG
                log.Debug("Before switch: state = {0}, thisChar = '{1}'", state, (char)thisChar);
#endif

                // see if we need to change states, or emit a token
                switch (state)
                {
                    case NextTokenState.Start:
                        // RESET
                        state = PickNextState(ctype, thisChar);
                        tokenLineNumber = lineNumber;
                        break;

                    case NextTokenState.Char:
                        // single-character token: the character itself was read on
                        // the previous iteration, so it is prevChar here
                        token = new CharToken((char)prevChar, tokenLineNumber);
                        done = true;
                        nextTokenSb.Length = 0;
                        break;

                    case NextTokenState.Word:
                        // a word continues across both word and digit characters
                        if ((!settings.IsCharType(ctype, CharTypeBits.Word))
                            && (!settings.IsCharType(ctype, CharTypeBits.Digit)))
                        {
                            // end of word, emit
                            token = new WordToken(nextTokenSb.ToString(), tokenLineNumber);
                            done = true;
                            nextTokenSb.Length = 0;
                        }
                        break;

                    case NextTokenState.Whitespace:
                        // when GrabEol is set, a newline (10) ends the whitespace run
                        // even though it may itself be typed as whitespace
                        if (!settings.IsCharType(ctype, CharTypeBits.Whitespace)
                            || (settings.GrabEol && (thisChar == 10)))
                        {
                            // end of whitespace, emit
                            if (settings.GrabWhitespace)
                            {
                                token = new WhitespaceToken(nextTokenSb.ToString(), tokenLineNumber);
                                done = true;
                                nextTokenSb.Length = 0;
                            }
                            else
                            {
                                // RESET
                                nextTokenSb.Length = 0;
                                tokenLineNumber = lineNumber;
                                state = PickNextState(ctype, thisChar);
                            }
                        }
                        break;

                    case NextTokenState.EndQuote:
                        // we're now 1 char after end of quote
                        token = new QuoteToken(nextTokenSb.ToString(), tokenLineNumber);
                        done = true;
                        nextTokenSb.Length = 0;
                        break;

                    case NextTokenState.Quote:
                        // looking for end quote matching char that started the quote
                        if (thisChar == nextTokenSb[0])
                        {
                            // handle escaped backslashes: count the immediately prior backslashes 
                            // - even (including 0) means it's not escaped 
                            // - odd means it is escaped 
                            int backSlashCount = 0;
                            for (int i = nextTokenSb.Length - 1; i >= 0; i--)
                            {
                                if (nextTokenSb[i] == '\\') backSlashCount++;
                                else break;
                            }

                            if ((backSlashCount % 2) == 0)
                            {
                                state = NextTokenState.EndQuote;
                            }
                        }

                        // hit Eof without finding the closing quote
                        if ((state != NextTokenState.EndQuote) && (thisChar == Eof))
                        {
                            if (settings.DoUntermCheck)
                            {
                                nextTokenSb.Length = 0;
                                throw new StreamTokenizerUntermQuoteException("Unterminated quote");
                            }

                            // not checking: emit what was collected as a quote
                            token = new QuoteToken(nextTokenSb.ToString(), tokenLineNumber);
                            done = true;
                            nextTokenSb.Length = 0;
                        }
                        break;

                    case NextTokenState.MaybeComment:
                        if (thisChar == Eof)
                        {
                            // nothing follows, so the collected text is emitted as a CharToken
                            token = new CharToken(nextTokenSb.ToString(), tokenLineNumber);
                            done = true;
                            nextTokenSb.Length = 0;
                        }
                        else
                        {
                            // if we get the right char, we're in a comment
                            if (settings.SlashSlashComments && (thisChar == '/'))
                                state = NextTokenState.LineComment;
                            else if (settings.SlashStarComments && (thisChar == '*'))
                                state = NextTokenState.BlockComment;
                            else
                            {
                                // not a comment start after all
                                token = new CharToken(nextTokenSb.ToString(), tokenLineNumber);
                                done = true;
                                nextTokenSb.Length = 0;
                            }
                        }
                        break;

                    case NextTokenState.LineComment:
                        // a line comment ends at Eof or at a newline; both are handled
                        // the same way
                        if (thisChar == Eof)
                        {
                            if (settings.GrabComments)
                            {
                                token = new CommentToken(nextTokenSb.ToString(), tokenLineNumber);
                                done = true;
                                nextTokenSb.Length = 0;
                            }
                            else
                            {
                                // RESET
                                nextTokenSb.Length = 0;
                                tokenLineNumber = lineNumber;
                                state = PickNextState(ctype, thisChar);
                            }
                        }
                        else
                        {
                            if (thisChar == '\n')
                            {
                                if (settings.GrabComments)
                                {
                                    token = new CommentToken(nextTokenSb.ToString(), tokenLineNumber);
                                    done = true;
                                    nextTokenSb.Length = 0;
                                }
                                else
                                {
                                    // RESET
                                    nextTokenSb.Length = 0;
                                    tokenLineNumber = lineNumber;
                                    state = PickNextState(ctype, thisChar);
                                }
                            }
                        }
                        break;

                    case NextTokenState.BlockComment:
                        if (thisChar == Eof)
                        {
                            if (settings.DoUntermCheck)
                            {
                                nextTokenSb.Length = 0;
                                throw new StreamTokenizerUntermCommentException("Unterminated comment.");
                            }

                            if (settings.GrabComments)
                            {
                                token = new CommentToken(nextTokenSb.ToString(), tokenLineNumber);
                                done = true;
                                nextTokenSb.Length = 0;
                            }
                            else
                            {
                                // RESET
                                nextTokenSb.Length = 0;
                                tokenLineNumber = lineNumber;
                                state = PickNextState(ctype, thisChar);
                            }
                        }
                        else
                        {
                            // NOTE(review): this only looks at the previous character, so
                            // the '*' of the opening "/*" appears to also count as the '*'
                            // of a closing "*/" (input "/*/") - confirm whether intended.
                            if ((thisChar == '/') && (prevChar == '*'))
                            {
                                state = NextTokenState.EndBlockComment;
                            }
                        }
                        break;

                    // special case for 2-character token termination
                    case NextTokenState.EndBlockComment:
                        if (settings.GrabComments)
                        {
                            token = new CommentToken(nextTokenSb.ToString(), tokenLineNumber);
                            done = true;
                            nextTokenSb.Length = 0;
                        }
                        else
                        {
                            // RESET
                            nextTokenSb.Length = 0;
                            tokenLineNumber = lineNumber;
                            state = PickNextState(ctype, thisChar);
                        }
                        break;

                    case NextTokenState.MaybeHex:
                        // previous char was 0
                        if (thisChar != 'x')
                        {
                            // back up and try non-hex
                            // back up to the 0
                            nextTokenSb.Append((char)thisChar);
                            backString.Append(nextTokenSb);
                            nextTokenSb.Length = 0;

                            // reset state and don't choose MaybeNumber state.
                            // pull char from backString
                            thisChar = backString[0];
                            backString.Remove(0, 1);
                            state = PickNextState(settings.CharTypes[thisChar], (int)thisChar,
                                NextTokenState.MaybeHex);
#if DEBUG
                            log.Debug("HexGot0x: Next state on '{0}' is {1}", (char)thisChar,
                                state);
#endif
                        }
                        else state = NextTokenState.HexGot0x;
                        break;

                    case NextTokenState.HexGot0x:
                        if (!settings.IsCharType(ctype, CharTypeBits.HexDigit))
                        {
                            // got 0x but now a non-hex char
                            // back up to the 0
                            nextTokenSb.Append((char)thisChar);
                            backString.Append(nextTokenSb);
                            nextTokenSb.Length = 0;

                            // reset state and don't choose MaybeNumber state.
                            // pull char from backString
                            thisChar = backString[0];
                            backString.Remove(0, 1);
                            state = PickNextState(settings.CharTypes[thisChar], (int)thisChar,
                                NextTokenState.MaybeHex);
#if DEBUG
                            log.Debug("HexGot0x: Next state on '{0}' is {1}", ((char)thisChar).ToString(),
                                state.ToString());
#endif
                        }
                        else state = NextTokenState.HexNumber;
                        break;

                    case NextTokenState.HexNumber:
                        if (!settings.IsCharType(ctype, CharTypeBits.HexDigit))
                        {
                            // emit the hex number we've collected
#if DEBUG
                            log.Debug("Emit hex IntToken from string '{0}'", nextTokenSb);
#endif
                            token = IntToken.ParseHex(nextTokenSb.ToString(), tokenLineNumber);
                            done = true;
                            nextTokenSb.Length = 0;
                        }
                        break;

                    case NextTokenState.MaybeNumber:
                        //
                        // Determine whether or not to stop collecting characters for
                        // the number parse.  We terminate when it's clear it's not
                        // a number or no longer a number.
                        //
                        bool term = false;

                        if (settings.IsCharType(ctype, CharTypeBits.Digit)
                            || settings.IsCharType(prevChar, CharTypeBits.Digit)) seenDigit = true;

                        // term conditions
                        if (thisChar == '.')
                        {
                            seenDot++;
                            if (seenDot > 1) term = true;  // more than one dot, it aint a number
                        }
                        else if (((thisChar == 'e') || (thisChar == 'E')))
                        {
                            seenE++;
                            if (!seenDigit) term = true;  // e before any digits is bad
                            else if (seenE > 1) term = true;  // more than 1 e is bad
                            else
                            {
                                term = true; // done regardless

                                // scan the exponent, put its characters into
                                // nextTokenSb, if there are any
                                char c;
                                expSb.Clear();
                                expSb.Append((char)thisChar);
                                if (GrabInt(expSb, true, out c))
                                {
                                    // we got a good exponent, tack it on
                                    nextTokenSb.Append(expSb);
                                    thisChar = c; // and continue after the exponent's characters
                                }
                            }
                        }
                        else if (thisChar == Eof) term = true;
                        // or a char that can't be in a number
                        else if ((!settings.IsCharType(ctype, CharTypeBits.Digit)
                            && (thisChar != 'e') && (thisChar != 'E')
                            && (thisChar != '-') && (thisChar != '.'))
                            || ((thisChar == '+') && (seenE == 0)))
                        {
                            // it's not a normal number character
                            term = true;
                        }
                        // or a dash not after e
                        else if ((thisChar == '-') && (!((prevChar == 'e') || (prevChar == 'E')))) term = true;

                        if (term)
                        {
                            // we are terminating a number, or it wasn't a number
                            if (seenDigit)
                            {
                                if ((nextTokenSb.IndexOf('.') >= 0)
                                    || (nextTokenSb.IndexOf('e') >= 0)
                                    || (nextTokenSb.IndexOf('E') >= 0)
                                    || (nextTokenSb.Length >= 19) // probably too large for Int64, use float
                                    )
                                {
                                    token = new FloatToken(nextTokenSb.ToString(), tokenLineNumber);
#if DEBUG
                                    log.Debug("Emit FloatToken from string '{0}'", nextTokenSb);
#endif
                                }
                                else
                                {
#if DEBUG
                                    log.Debug("Emit IntToken from string '{0}'", nextTokenSb);
#endif
                                    token = new IntToken(nextTokenSb.ToString(), tokenLineNumber);
                                }
                                done = true;
                                nextTokenSb.Length = 0;
                            }
                            else
                            {
                                // -whatever or -.whatever
                                // didn't see any digits, must have gotten here by a leading -
                                // and no digits after it
                                // back up to -, pick next state excluding numbers
                                nextTokenSb.Append((char)thisChar);
                                backString.Append(nextTokenSb);
                                nextTokenSb.Length = 0;

                                // restart on the - and don't choose MaybeNumber state
                                // pull char from backString
                                thisChar = backString[0];
                                backString.Remove(0, 1);
                                state = PickNextState(settings.CharTypes[thisChar], (int)thisChar,
                                    NextTokenState.MaybeNumber);
#if DEBUG
                                log.Debug("MaybeNumber: Next state on '{0}' is {1}", (char)thisChar,
                                    state);
#endif
                            }
                        }
                        break;

                    case NextTokenState.Eol:
                        // tokenLineNumber - 1 because the newline char is on the previous line
                        token = new EolToken(tokenLineNumber - 1);
                        done = true;
                        nextTokenSb.Length = 0;
                        break;

                    case NextTokenState.Eof:
                        // Eof is still delivered as a token, but the return value is
                        // false so callers can tell it is the last one
                        token = new EofToken(tokenLineNumber);
                        done = true;
                        nextTokenSb.Length = 0;
                        return (false);

                    case NextTokenState.Invalid:
                    default:
                        // not a good sign, some unrepresented state?
                        log.Error("NextToken: Hit unrepresented state {0}", state.ToString());
                        return (false);
                }

                // use a StringBuilder to accumulate characters which are part of this token
                // (when a token was just emitted, this character becomes the carried-over
                //  start of the next token)
                if (thisChar != Eof) nextTokenSb.Append((char)thisChar);
#if DEBUG
                log.Debug("After switch: state = {0}, nextTokenSb = '{1}', backString = '{2}'",
                    state, nextTokenSb, backString);
#endif
            }

#if DEBUG
            log.Debug("Got token {0}", token.ToDebugString());
#endif
            return (true);
        }