/// <summary>
/// Produces the next output token: either an n-gram (bigram or unigram) flushed
/// from the buffered CJK codepoints, or a non-CJK token returned as-is.
/// </summary>
/// <returns>
/// <c>true</c> when a token has been produced; <c>false</c> when the input is
/// exhausted and no unigram remains buffered.
/// </returns>
public override bool IncrementToken()
{
    while (true)
    {
        // Case 1: more than one codepoint is still buffered, so an n-gram can be emitted now.
        if (HasBufferedBigram)
        {
            if (!outputUnigrams)
            {
                FlushBigram();
                return true;
            }

            // Unigram output is enabled: alternate between the unigram and the bigram.
            // The unigram goes out first, then index is stepped back so the bigram that
            // starts at the same position is visited on the following call.
            // An input of ABC therefore yields A, AB, B, BC, C; the trailing C is
            // handled through HasBufferedUnigram rather than here.
            if (ngramState)
            {
                FlushBigram();
            }
            else
            {
                FlushUnigram();
                index--;
            }
            ngramState = !ngramState;
            return true;
        }

        // Case 3: no bigram is buffered and there is no more input.
        // Emit the leftover unigram if there is one, otherwise signal end of stream.
        if (!DoNext())
        {
            if (!HasBufferedUnigram)
            {
                return false;
            }
            FlushUnigram();
            return true;
        }

        // Case 2: a new input token is available; its type decides whether it joins the n-gram buffer.
        string tokenType = typeAtt.Type;
        bool formsNGrams = tokenType == doHan
            || tokenType == doHiragana
            || tokenType == doKatakana
            || tokenType == doHangul;

        // An acceptable CJK token that starts exactly where the previous one ended
        // simply extends the current buffer.
        if (formsNGrams && offsetAtt.StartOffset() == lastEndOffset)
        {
            Refill();
            continue;
        }

        // From here on the token cannot be combined with what is buffered: it is
        // either not a CJK type or its offsets are unaligned with the buffer.
        if (HasBufferedUnigram)
        {
            // We peeked ahead hoping to form a bigram but cannot. Capture the state of
            // the peeked token so it is revisited on the next pass, and emit the
            // buffered unigram by itself.
            loneState = CaptureState();
            FlushUnigram();
            return true;
        }

        if (!formsNGrams)
        {
            // Not a CJK type: return the token as-is.
            // NOTE(review): index/bufferLen are left untouched on this path; they are
            // only cleared below when a later CJK token is unaligned - confirm intended.
            return true;
        }

        // Unaligned CJK token: clear the buffer and start over with this token.
        index = 0;
        bufferLen = 0;
        Refill();
    }
}