/// <summary>
/// Segments <paramref name="text"/> with <c>Cut</c> and returns each word as a
/// <c>Token</c> carrying its start and end offsets.
/// </summary>
/// <param name="text">The text to segment; passed straight through to <c>Cut</c>.</param>
/// <param name="mode">
/// With <c>TokenizerMode.Default</c> only the words produced by <c>Cut</c> are returned.
/// With any other mode, each word is preceded by its 2-character and then 3-character
/// substrings that <c>WordDict.ContainsWord</c> accepts.
/// </param>
/// <param name="hmm">Forwarded to <c>Cut</c> as its <c>hmm</c> argument.</param>
/// <returns>
/// The collected tokens. Offsets are a running sum of the lengths of the words
/// returned by <c>Cut</c>; the end offset is the start offset plus the token length.
/// </returns>
public IEnumerable<Token> Tokenize(string text, TokenizerMode mode = TokenizerMode.Default, bool hmm = true)
{
    var tokens = new List<Token>();
    var includeSubWords = mode != TokenizerMode.Default;
    var offset = 0;

    foreach (var word in Cut(text, hmm: hmm))
    {
        var length = word.Length;

        if (includeSubWords)
        {
            // Scan all 2-grams first, then all 3-grams. A gram size is only
            // scanned when the word is strictly longer than the gram itself,
            // so the word is never reported twice.
            for (var size = 2; size <= 3; size++)
            {
                if (length <= size)
                {
                    continue;
                }

                for (var pos = 0; pos + size <= length; pos++)
                {
                    var gram = word.Substring(pos, size);
                    if (!WordDict.ContainsWord(gram))
                    {
                        continue;
                    }

                    var gramStart = offset + pos;
                    tokens.Add(new Token(gram, gramStart, gramStart + size));
                }
            }
        }

        // The whole word always comes after any of its sub-words.
        var next = offset + length;
        tokens.Add(new Token(word, offset, next));
        offset = next;
    }

    return tokens;
}