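public Tokenize ( string text, TokenizerMode mode = TokenizerMode.Default, bool hmm = true ) : IEnumerable<Token>

| Parameter | Type | Description |
|---|---|---|
| text | string | The text to segment; it is passed to `Cut`. |
| mode | TokenizerMode | `TokenizerMode.Default` yields one token per segmented word; any other mode also yields the 2- and 3-character substrings of longer words that are found in the dictionary. |
| hmm | bool | Forwarded to `Cut` as its `hmm` argument. |
| return | IEnumerable<Token> | The tokens, each carrying its word and its start/end offsets. |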
/// <summary>
/// Segments <paramref name="text"/> with <c>Cut</c> and returns one
/// <c>Token</c> per word, each carrying the word and its start/end offsets.
/// </summary>
/// <param name="text">The text to segment; passed straight to <c>Cut</c>.</param>
/// <param name="mode">
/// With <c>TokenizerMode.Default</c> only the segmented words are returned.
/// With any other mode, each word longer than two characters additionally
/// contributes every 2-character substring found in <c>WordDict</c>, and each
/// word longer than three characters every 3-character substring found there.
/// These sub-tokens are emitted before the token of the word itself.
/// </param>
/// <param name="hmm">Forwarded to <c>Cut</c> as its <c>hmm</c> argument.</param>
/// <returns>
/// A fully built list of tokens. The end offset of a token is its start
/// offset plus the token length.
/// </returns>
/// <remarks>
/// Offsets are obtained by summing the lengths of the words returned by
/// <c>Cut</c>, starting at zero. NOTE(review): they therefore match positions
/// in <paramref name="text"/> only if <c>Cut</c> returns every character of
/// the input, in order - presumably it does; confirm against <c>Cut</c>.
/// </remarks>
public IEnumerable<Token> Tokenize(string text, TokenizerMode mode = TokenizerMode.Default, bool hmm = true)
{
    var tokens = new List<Token>();
    var emitSubWords = mode != TokenizerMode.Default;
    var offset = 0;

    foreach (var word in Cut(text, hmm: hmm))
    {
        var length = word.Length;

        if (emitSubWords)
        {
            // First all dictionary 2-grams, then all dictionary 3-grams.
            for (var size = 2; size <= 3; size++)
            {
                if (length <= size)
                {
                    // Only words strictly longer than the gram size are scanned;
                    // a word too short for this size is too short for the next.
                    break;
                }

                for (var pos = 0; pos + size <= length; pos++)
                {
                    var gram = word.Substring(pos, size);
                    if (WordDict.ContainsWord(gram))
                    {
                        tokens.Add(new Token(gram, offset + pos, offset + pos + size));
                    }
                }
            }
        }

        // The whole word always comes after any of its sub-word tokens.
        tokens.Add(new Token(word, offset, offset + length));
        offset += length;
    }

    return tokens;
}
/// <summary>
/// Creates a tokenizer over <paramref name="input"/>, segmenting the whole
/// string up front.
/// </summary>
/// <param name="seg">The segmenter used to tokenize <paramref name="input"/>; kept in <c>segmenter</c>.</param>
/// <param name="input">The complete text to tokenize.</param>
/// <remarks>
/// Registers the term, offset and type attributes via <c>AddAttribute</c>,
/// then runs <c>Tokenize</c> in <c>TokenizerMode.Search</c> and stores the
/// result as a list in <c>tokens</c>, so segmentation happens once here.
/// NOTE(review): the attribute interfaces look like Lucene.Net's token
/// attribute API - confirm against the enclosing class's base type.
/// </remarks>
public JiebaTokenizer(JiebaSegmenter seg, string input)
{
    segmenter = seg;

    termAtt = AddAttribute<ITermAttribute>();
    offsetAtt = AddAttribute<IOffsetAttribute>();
    typeAtt = AddAttribute<ITypeAttribute>();

    tokens = segmenter.Tokenize(input, TokenizerMode.Search).ToList();
}