/// <summary>
/// Produce the next token by feeding characters from <c>GetNextChar()</c>
/// through a state machine until one complete token has been collected.
/// </summary>
/// <param name="token">
/// Receives the token that was built. It is an <c>EofToken</c> when the
/// Eof state is reached, and stays null if an unrepresented state is hit.
/// </param>
/// <returns>
/// true when a token other than end-of-file was produced; false when the
/// Eof state was reached or the state machine hit an unrepresented state.
/// </returns>
/// <exception cref="StreamTokenizerUntermQuoteException">
/// End of input was reached inside a quote and <c>settings.DoUntermCheck</c> is set.
/// </exception>
/// <exception cref="StreamTokenizerUntermCommentException">
/// End of input was reached inside a block comment and <c>settings.DoUntermCheck</c> is set.
/// </exception>
public bool NextToken(out Token token)
{
    token = null;
    int thisChar = 0; // current character
    byte ctype; // type of this character
    NextTokenState state = NextTokenState.Start;
    int prevChar = 0; // previous character
    byte prevCtype = (byte)CharTypeBits.Eof;

    // Get the previous char from nextTokenSb if there is one.
    // nextTokenSb is the buffer holding the characters of the next token to
    // be emitted. The character that terminated the previous token is
    // appended to it at the bottom of the loop below, so on entry it may
    // already hold the first character of this token.
    if (nextTokenSb.Length > 0)
    {
        prevChar = nextTokenSb[nextTokenSb.Length - 1];
        // The carried-over character can lie outside the CharTypes table;
        // classify it exactly as the main loop does (as a word character)
        // instead of indexing past the end of the table.
        if (prevChar >= settings.CharTypes.Length)
        {
            prevCtype = (byte)CharTypeBits.Word;
        }
        else prevCtype = settings.CharTypes[prevChar];
        state = PickNextState(prevCtype, prevChar);
    }

    // extra state for number parse
    int seenDot = 0; // how many .'s in the number
    int seenE = 0; // how many e's or E's have we seen in the number
    bool seenDigit = false; // seen any digits (numbers can start with -)

    // lineNumber can change with each GetNextChar()
    // tokenLineNumber is the line on which the token started
    int tokenLineNumber = lineNumber;

    // State Machine: Produces a single token.
    // Enter a state based on a single character.
    // Generally, being in a state means we're currently collecting chars
    // in that type of token.
    // We do state machine until it builds a token (Eof is a token), then
    // return that token.
    thisChar = prevChar; // for first iteration, since prevChar is set to this
    bool done = false; // loop exit flag, set once a token has been built
    while (!done)
    {
        prevChar = thisChar;
        thisChar = GetNextChar();
        if (thisChar >= settings.CharTypes.Length)
        {
            // outside the CharTypes table, treat as word character
            ctype = (byte)CharTypeBits.Word;
        }
        else ctype = settings.CharTypes[thisChar];

#if DEBUG
        log.Debug("Before switch: state = {0}, thisChar = '{1}'", state, (char)thisChar);
#endif

        // see if we need to change states, or emit a token
        switch (state)
        {
            case NextTokenState.Start:
                // RESET
                state = PickNextState(ctype, thisChar);
                tokenLineNumber = lineNumber;
                break;

            case NextTokenState.Char:
                // single-character token: the character is the previous one read
                token = new CharToken((char)prevChar, tokenLineNumber);
                done = true;
                nextTokenSb.Length = 0;
                break;

            case NextTokenState.Word:
                if ((!settings.IsCharType(ctype, CharTypeBits.Word))
                    && (!settings.IsCharType(ctype, CharTypeBits.Digit)))
                {
                    // end of word, emit
                    token = new WordToken(nextTokenSb.ToString(), tokenLineNumber);
                    done = true;
                    nextTokenSb.Length = 0;
                }
                break;

            case NextTokenState.Whitespace:
                if (!settings.IsCharType(ctype, CharTypeBits.Whitespace)
                    || (settings.GrabEol && (thisChar == 10)))
                {
                    // end of whitespace, emit
                    if (settings.GrabWhitespace)
                    {
                        token = new WhitespaceToken(nextTokenSb.ToString(), tokenLineNumber);
                        done = true;
                        nextTokenSb.Length = 0;
                    }
                    else
                    {
                        // RESET: discard the whitespace and start over on thisChar
                        nextTokenSb.Length = 0;
                        tokenLineNumber = lineNumber;
                        state = PickNextState(ctype, thisChar);
                    }
                }
                break;

            case NextTokenState.EndQuote:
                // we're now 1 char after end of quote
                token = new QuoteToken(nextTokenSb.ToString(), tokenLineNumber);
                done = true;
                nextTokenSb.Length = 0;
                break;

            case NextTokenState.Quote:
                // looking for end quote matching char that started the quote
                if (thisChar == nextTokenSb[0])
                {
                    // handle escaped backslashes: count the immediately prior backslashes
                    // - even (including 0) means it's not escaped
                    // - odd means it is escaped
                    int backSlashCount = 0;
                    for (int i = nextTokenSb.Length - 1; i >= 0; i--)
                    {
                        if (nextTokenSb[i] == '\\') backSlashCount++;
                        else break;
                    }

                    if ((backSlashCount % 2) == 0)
                    {
                        state = NextTokenState.EndQuote;
                    }
                }

                if ((state != NextTokenState.EndQuote) && (thisChar == Eof))
                {
                    // input ended inside the quote
                    if (settings.DoUntermCheck)
                    {
                        nextTokenSb.Length = 0;
                        throw new StreamTokenizerUntermQuoteException("Unterminated quote");
                    }

                    // no check requested: emit what was collected so far
                    token = new QuoteToken(nextTokenSb.ToString(), tokenLineNumber);
                    done = true;
                    nextTokenSb.Length = 0;
                }
                break;

            case NextTokenState.MaybeComment:
                if (thisChar == Eof)
                {
                    token = new CharToken(nextTokenSb.ToString(), tokenLineNumber);
                    done = true;
                    nextTokenSb.Length = 0;
                }
                else
                {
                    // if we get the right char, we're in a comment
                    if (settings.SlashSlashComments && (thisChar == '/'))
                        state = NextTokenState.LineComment;
                    else if (settings.SlashStarComments && (thisChar == '*'))
                        state = NextTokenState.BlockComment;
                    else
                    {
                        // not a comment after all, emit the collected char(s)
                        token = new CharToken(nextTokenSb.ToString(), tokenLineNumber);
                        done = true;
                        nextTokenSb.Length = 0;
                    }
                }
                break;

            case NextTokenState.LineComment:
                if (thisChar == Eof)
                {
                    if (settings.GrabComments)
                    {
                        token = new CommentToken(nextTokenSb.ToString(), tokenLineNumber);
                        done = true;
                        nextTokenSb.Length = 0;
                    }
                    else
                    {
                        // RESET
                        nextTokenSb.Length = 0;
                        tokenLineNumber = lineNumber;
                        state = PickNextState(ctype, thisChar);
                    }
                }
                else
                {
                    // a newline ends the line comment
                    if (thisChar == '\n')
                    {
                        if (settings.GrabComments)
                        {
                            token = new CommentToken(nextTokenSb.ToString(), tokenLineNumber);
                            done = true;
                            nextTokenSb.Length = 0;
                        }
                        else
                        {
                            // RESET
                            nextTokenSb.Length = 0;
                            tokenLineNumber = lineNumber;
                            state = PickNextState(ctype, thisChar);
                        }
                    }
                }
                break;

            case NextTokenState.BlockComment:
                if (thisChar == Eof)
                {
                    // input ended inside the block comment
                    if (settings.DoUntermCheck)
                    {
                        nextTokenSb.Length = 0;
                        throw new StreamTokenizerUntermCommentException("Unterminated comment.");
                    }

                    if (settings.GrabComments)
                    {
                        token = new CommentToken(nextTokenSb.ToString(), tokenLineNumber);
                        done = true;
                        nextTokenSb.Length = 0;
                    }
                    else
                    {
                        // RESET
                        nextTokenSb.Length = 0;
                        tokenLineNumber = lineNumber;
                        state = PickNextState(ctype, thisChar);
                    }
                }
                else
                {
                    // NOTE(review): right after the opening "/*" prevChar is
                    // already '*', so the input "/*/" satisfies this test -
                    // confirm that is intended.
                    if ((thisChar == '/') && (prevChar == '*'))
                    {
                        state = NextTokenState.EndBlockComment;
                    }
                }
                break;

            // special case for 2-character token termination
            case NextTokenState.EndBlockComment:
                if (settings.GrabComments)
                {
                    token = new CommentToken(nextTokenSb.ToString(), tokenLineNumber);
                    done = true;
                    nextTokenSb.Length = 0;
                }
                else
                {
                    // RESET
                    nextTokenSb.Length = 0;
                    tokenLineNumber = lineNumber;
                    state = PickNextState(ctype, thisChar);
                }
                break;

            case NextTokenState.MaybeHex:
                // previous char was 0
                if (thisChar != 'x')
                {
                    // back up and try non-hex
                    // back up to the 0
                    nextTokenSb.Append((char)thisChar);
                    backString.Append(nextTokenSb);
                    nextTokenSb.Length = 0;

                    // reset state and don't choose MaybeNumber state.
                    // pull char from backString
                    thisChar = backString[0];
                    backString.Remove(0, 1);
                    state = PickNextState(settings.CharTypes[thisChar], (int)thisChar,
                        NextTokenState.MaybeHex);
#if DEBUG
                    log.Debug("MaybeHex: Next state on '{0}' is {1}", (char)thisChar,
                        state);
#endif
                }
                else state = NextTokenState.HexGot0x;
                break;

            case NextTokenState.HexGot0x:
                if (!settings.IsCharType(ctype, CharTypeBits.HexDigit))
                {
                    // got 0x but now a non-hex char
                    // back up to the 0
                    nextTokenSb.Append((char)thisChar);
                    backString.Append(nextTokenSb);
                    nextTokenSb.Length = 0;

                    // reset state and don't choose MaybeNumber state.
                    // pull char from backString
                    thisChar = backString[0];
                    backString.Remove(0, 1);
                    state = PickNextState(settings.CharTypes[thisChar], (int)thisChar,
                        NextTokenState.MaybeHex);
#if DEBUG
                    log.Debug("HexGot0x: Next state on '{0}' is {1}", ((char)thisChar).ToString(),
                        state.ToString());
#endif
                }
                else state = NextTokenState.HexNumber;
                break;

            case NextTokenState.HexNumber:
                if (!settings.IsCharType(ctype, CharTypeBits.HexDigit))
                {
                    // emit the hex number we've collected
#if DEBUG
                    log.Debug("Emit hex IntToken from string '{0}'", nextTokenSb);
#endif
                    token = IntToken.ParseHex(nextTokenSb.ToString(), tokenLineNumber);
                    done = true;
                    nextTokenSb.Length = 0;
                }
                break;

            case NextTokenState.MaybeNumber:
                //
                // Determine whether or not to stop collecting characters for
                // the number parse. We terminate when it's clear it's not
                // a number or no longer a number.
                //
                bool term = false;

                if (settings.IsCharType(ctype, CharTypeBits.Digit)
                    || settings.IsCharType(prevChar, CharTypeBits.Digit)) seenDigit = true;

                // term conditions
                if (thisChar == '.')
                {
                    seenDot++;
                    if (seenDot > 1) term = true; // more than one dot, it aint a number
                }
                else if (((thisChar == 'e') || (thisChar == 'E')))
                {
                    seenE++;
                    if (!seenDigit) term = true; // e before any digits is bad
                    else if (seenE > 1) term = true; // more than 1 e is bad
                    else
                    {
                        term = true; // done regardless

                        // scan the exponent, put its characters into
                        // nextTokenSb, if there are any
                        char c;
                        expSb.Clear();
                        expSb.Append((char)thisChar);
                        if (GrabInt(expSb, true, out c))
                        {
                            // we got a good exponent, tack it on
                            nextTokenSb.Append(expSb);
                            thisChar = c; // and continue after the exponent's characters
                        }
                    }
                }
                else if (thisChar == Eof) term = true;
                // or a char that can't be in a number
                else if ((!settings.IsCharType(ctype, CharTypeBits.Digit)
                          && (thisChar != 'e') && (thisChar != 'E')
                          && (thisChar != '-') && (thisChar != '.'))
                         || ((thisChar == '+') && (seenE == 0)))
                {
                    // it's not a normal number character
                    term = true;
                }
                // or a dash not after e
                else if ((thisChar == '-') && (!((prevChar == 'e') || (prevChar == 'E')))) term = true;

                if (term)
                {
                    // we are terminating a number, or it wasn't a number
                    if (seenDigit)
                    {
                        if ((nextTokenSb.IndexOf('.') >= 0)
                            || (nextTokenSb.IndexOf('e') >= 0)
                            || (nextTokenSb.IndexOf('E') >= 0)
                            || (nextTokenSb.Length >= 19) // probably too large for Int64, use float
                            )
                        {
                            token = new FloatToken(nextTokenSb.ToString(), tokenLineNumber);
#if DEBUG
                            log.Debug("Emit FloatToken from string '{0}'", nextTokenSb);
#endif
                        }
                        else
                        {
#if DEBUG
                            log.Debug("Emit IntToken from string '{0}'", nextTokenSb);
#endif
                            token = new IntToken(nextTokenSb.ToString(), tokenLineNumber);
                        }

                        done = true;
                        nextTokenSb.Length = 0;
                    }
                    else
                    {
                        // -whatever or -.whatever
                        // didn't see any digits, must have gotten here by a leading -
                        // and no digits after it
                        // back up to -, pick next state excluding numbers
                        nextTokenSb.Append((char)thisChar);
                        backString.Append(nextTokenSb);
                        nextTokenSb.Length = 0;

                        // restart on the - and don't choose MaybeNumber state
                        // pull char from backString
                        thisChar = backString[0];
                        backString.Remove(0, 1);
                        state = PickNextState(settings.CharTypes[thisChar], (int)thisChar,
                            NextTokenState.MaybeNumber);
#if DEBUG
                        log.Debug("MaybeNumber: Next state on '{0}' is {1}", (char)thisChar,
                            state);
#endif
                    }
                }
                break;

            case NextTokenState.Eol:
                // tokenLineNumber - 1 because the newline char is on the previous line
                token = new EolToken(tokenLineNumber - 1);
                done = true;
                nextTokenSb.Length = 0;
                break;

            case NextTokenState.Eof:
                // end of input: hand back an EofToken but report false
                token = new EofToken(tokenLineNumber);
                done = true;
                nextTokenSb.Length = 0;
                return (false);

            case NextTokenState.Invalid:
            default:
                // not a good sign, some unrepresented state?
                log.Error("NextToken: Hit unrepresented state {0}", state.ToString());
                return (false);
        }

        // Accumulate the characters which are part of this token. When a
        // token was just emitted the buffer has been cleared, so this seeds
        // it with the terminating character for the next call.
        if (thisChar != Eof) nextTokenSb.Append((char)thisChar);
#if DEBUG
        log.Debug("After switch: state = {0}, nextTokenSb = '{1}', backString = '{2}'",
            state, nextTokenSb, backString);
#endif
    }

#if DEBUG
    log.Debug("Got token {0}", token.ToDebugString());
#endif
    return (true);
}