/// <summary>
/// Splits <paramref name="source"/> into a sequence of tokens according to the grammar's
/// syntax definition (syntax chars, symbols, constants, quote char and line-comment marker).
/// Literal lexemes (numbers, dates, times, durations, GUIDs, hex values, strings, JSON)
/// are converted to their CLR values and emitted as literal tokens.
/// </summary>
/// <param name="source">The expression text to tokenize.</param>
/// <returns>The tokens recognized in <paramref name="source"/>, in order of appearance.</returns>
/// <exception cref="XPressionException">
/// Thrown for invalid/unknown tokens; any other exception raised while tokenizing is caught
/// and wrapped in an XPressionException carrying the position of the token being read.
/// </exception>
public virtual IEnumerable <Token> Tokenize(string source)
{
var tokens = new TokenList();
var syntaxchars = _grammar.Syntax.SyntaxChars;
var symbols = _grammar.Syntax.Symbols;
var constants = _grammar.Syntax.Constants;
var quote = _grammar.Syntax.Quote;
// When true, the previous token was the syntax-escape char: the next token is forced to be
// read as a plain numeric/identifier instead of being interpreted as grammar syntax.
var escapedSyntax = false;
// Tracked outside the loop so the catch block can report where tokenizing failed.
int position = 0;
var tokenizer = new Tokenizer(source, _grammar.Syntax.NonBreakingIdentifierChars, _grammar.Syntax.Quote, _grammar.Syntax.LineComment);
try
{
while (!tokenizer.Eof())
{
tokenizer.SkipSpaces();
if (tokenizer.Eof())
{
// Trailing whitespace only: done.
return(tokens);
}
// Position of the token about to be read (+1: presumably converts to a 1-based
// column for error messages — TODO confirm against XPressionException usage).
position = tokenizer.Position + 1;
var ch = tokenizer.Peek();
Token token;
TokenType tokenType;
if (!escapedSyntax && syntaxchars.TryGetValue(ch, out tokenType))
{
#region Handle syntax chars first
switch (tokenType)
{
case TokenType.SyntaxEscape:
// Consume the escape char itself and flag the next token as escaped.
tokenizer.Next();
escapedSyntax = true;
continue;
case TokenType.Negate:
// The negate char is ambiguous: it may be unary negation, or the start of a
// longer syntax sequence, or a binary operator (e.g. subtraction).
TokenType?tokenType1;
token = new Token
{
Type = tokenType, // negate
Lexeme = ReadSyntax(tokenizer, out tokenType1),
Position = position,
Source = source
};
if (tokenType1 != null)
{
// more chars have been read, so the longer lexeme has another meaning...
token.Type = tokenType1.Value;
}
else
{
var ptoken = tokens.LastOrDefault();
if (ptoken != null && !ptoken.IsDelimiter() && !_grammar.IsOperator(ptoken) &&
!ptoken.IsLParen() && !ptoken.IsLBracket())
{
// if any previous token, and ptoken is not: ',' '(' '[' or operator, then it is NOT a negation
// does it have another meaning, e.g., subtraction?
if (symbols.TryGetValue(token.Lexeme, out tokenType))
{
// yes: reinterpret as the binary operator
token.Type = tokenType;
}
else
{
// there was no additional (binary) meaning defined, so throw error
throw new XPressionException(source, "invalid negate: " + (char)ch, position);
}
}
}
break;
default:
// Any other syntax char: read the (possibly multi-char) syntax lexeme; t is
// non-null when the longer lexeme resolved to a different token type.
TokenType?t;
var lex = ReadSyntax(tokenizer, out t);
token = Token.Create(t.GetValueOrDefault(tokenType), source, lex, position);
if (token.Type == TokenType.Insignificant)
{
// e.g. formatting-only syntax: skip without emitting a token
continue;
}
if (token.Type == TokenType.UnKnown)
{
// explicitly marked as unknown character, probably in incomplete or wrong context
throw new XPressionException(source, "UnKnown character: " + (char)ch, position);
}
break;
}
tokens.Add(token);
// NOTE: skips the escapedSyntax reset at the bottom — escape cannot precede a syntax char here
// because the syntaxchars lookup above is gated on !escapedSyntax.
continue;
#endregion
}
string lexeme;
Tokenizer.StringTokenType stringTokenType;
if (escapedSyntax)
{
// try read a numeric or identifier token (not a date, time, duration, etc...)
if (!tokenizer.TryReadNumeric(out lexeme, out stringTokenType))
{
if (!tokenizer.TryReadIdentifier(out lexeme, out stringTokenType))
{
// could this happen?
// ignore escape and fall back to normal tokenizing
escapedSyntax = false;
lexeme = tokenizer.NextToken(out stringTokenType);
}
}
}
else
{
// let the tokenizer decide what will be the next token
lexeme = tokenizer.NextToken(out stringTokenType);
}
// Convert the raw lexeme into a typed token.
switch (stringTokenType)
{
case Tokenizer.StringTokenType.Comment:
// comments are dropped entirely
continue;
case Tokenizer.StringTokenType.Identifier:
if (tokenizer.PeekChar() == quote)
{
// it is a type specifier (e.g. typename'value'), followed by a quoted string value
// ignore escape
TypeParser parser;
if (!_grammar.TryGetTypeParser(lexeme, out parser))
{
throw new XPressionException(source, "invalid type: " + lexeme, position);
}
// re-anchor position at the quoted value, then parse it with the type's parser
position = tokenizer.Position;
lexeme = tokenizer.NextToken(out stringTokenType);
token = LiteralToken.CreateLiteral(source, lexeme, position, parser.Parse(lexeme));
break;
}
if (!escapedSyntax)
{
// find out if the identifier is a syntax related symbol
object value;
if (constants.TryGetValue(lexeme, out value))
{
// it is a constant, like: true,false,null,....
token = LiteralToken.CreateLiteral(source, lexeme, position, value);
break;
}
if (symbols.TryGetValue(lexeme, out tokenType))
{
// it may be an operator, like: like,mod,contains,....
token = Token.Create(tokenType, source, lexeme, position);
}
else
{
// default: it is an identifier
token = Token.Create(TokenType.Identifier, source, lexeme, position);
}
}
else
{
// escaped: always a plain identifier, never a symbol/constant
token = Token.Create(TokenType.Identifier, source, lexeme, position);
}
break;
case Tokenizer.StringTokenType.String:
token = LiteralToken.CreateLiteral(source, lexeme, position, lexeme);
break;
case Tokenizer.StringTokenType.Date:
case Tokenizer.StringTokenType.DateTime:
// XSD-format date(time) without offset: interpreted as local time
token = LiteralToken.CreateLiteral(source, lexeme, position,
XmlConvert.ToDateTime(lexeme, XmlDateTimeSerializationMode.Local));
break;
case Tokenizer.StringTokenType.DateUtc:
case Tokenizer.StringTokenType.DateTimeUtc:
token = LiteralToken.CreateLiteral(source, lexeme, position,
XmlConvert.ToDateTime(lexeme, XmlDateTimeSerializationMode.Utc));
break;
case Tokenizer.StringTokenType.DateOffset:
case Tokenizer.StringTokenType.DateTimeOffset:
token = LiteralToken.CreateLiteral(source, lexeme, position, XmlConvert.ToDateTimeOffset(lexeme));
break;
case Tokenizer.StringTokenType.Time:
// Time-of-day only: prepend a dummy date so XmlConvert can parse it, keep the TimeOfDay part
token = LiteralToken.CreateLiteral(source, lexeme, position, XmlConvert.ToDateTime("2000-01-01T" + lexeme, XmlDateTimeSerializationMode.Local).TimeOfDay);
break;
case Tokenizer.StringTokenType.TimeUtc:
case Tokenizer.StringTokenType.TimeOffset:
// Same dummy-date trick; normalize to UTC before taking the time-of-day
token = LiteralToken.CreateLiteral(source, lexeme, position, XmlConvert.ToDateTimeOffset("2000-01-01T" + lexeme).ToUniversalTime().TimeOfDay);
break;
case Tokenizer.StringTokenType.Duration:
// XSD duration (e.g. P1DT2H) -> TimeSpan
token = LiteralToken.CreateLiteral(source, lexeme, position, XmlConvert.ToTimeSpan(lexeme));
break;
case Tokenizer.StringTokenType.Int16:
token = LiteralToken.CreateLiteral(source, lexeme, position,
Int16.Parse(lexeme, CultureInfo.InvariantCulture));
break;
case Tokenizer.StringTokenType.Int32:
token = LiteralToken.CreateLiteral(source, lexeme, position,
Int32.Parse(lexeme, CultureInfo.InvariantCulture));
break;
case Tokenizer.StringTokenType.Int64:
token = LiteralToken.CreateLiteral(source, lexeme, position,
Int64.Parse(lexeme, CultureInfo.InvariantCulture));
break;
case Tokenizer.StringTokenType.Single:
token = LiteralToken.CreateLiteral(source, lexeme, position,
Single.Parse(lexeme, CultureInfo.InvariantCulture));
break;
case Tokenizer.StringTokenType.Double:
token = LiteralToken.CreateLiteral(source, lexeme, position,
Double.Parse(lexeme, CultureInfo.InvariantCulture));
break;
case Tokenizer.StringTokenType.Decimal:
token = LiteralToken.CreateLiteral(source, lexeme, position,
Decimal.Parse(lexeme, CultureInfo.InvariantCulture));
break;
case Tokenizer.StringTokenType.Guid:
token = LiteralToken.CreateLiteral(source, lexeme, position, Guid.Parse(lexeme));
break;
case Tokenizer.StringTokenType.JSON:
token = LiteralToken.CreateLiteral(source, lexeme, position, lexeme); //or else parse JObject??
break;
case Tokenizer.StringTokenType.Hexadecimal:
// Pick the smallest unsigned type that can hold the value, judged by lexeme length
// (thresholds 4/6/10 suggest the lexeme includes a "0x" prefix — TODO confirm in Tokenizer).
object hexvalue;
if (lexeme.Length <= 4)
{
hexvalue = Convert.ToByte(lexeme, 16);
}
else if (lexeme.Length <= 6)
{
hexvalue = Convert.ToUInt16(lexeme, 16);
}
else if (lexeme.Length <= 10)
{
hexvalue = Convert.ToUInt32(lexeme, 16);
}
else
{
hexvalue = Convert.ToUInt64(lexeme, 16);
}
token = LiteralToken.CreateLiteral(source, lexeme, position, hexvalue);
break;
//case Tokenizer.StringTokenType.Char:
default:
throw new XPressionException(source, "invalid token: " + lexeme, position);
}
// The escape applies to at most one token; clear it before reading the next.
escapedSyntax = false;
tokens.Add(token);
}
return(tokens);
}
catch (Exception ex)
{
// Wrap anything that escaped (including parse failures above) with the source and the
// position of the token being processed, preserving the original as inner exception.
throw new XPressionException(source, ex.Message, position, ex);
}
}