Lucene.Net.Analysis.Cjk.CJKBigramFilter.IncrementToken C# (CSharp) Method

CJKBigramFilter Class Documentation Show file Open project: apache/lucenenet
IncrementToken() public method

public IncrementToken ( ) : bool
return	bool
        /// <summary>
        /// Emits the next token: a pending bigram or unigram from the internal buffer when
        /// one is available, otherwise whatever <c>DoNext()</c> supplies (buffering CJK
        /// tokens for n-gram formation and passing other tokens through unchanged).
        /// </summary>
        /// <returns>
        /// <c>true</c> when a token was emitted; <c>false</c> once <c>DoNext()</c> reports
        /// no further input and no unigram remains buffered.
        /// </returns>
        public override bool IncrementToken()
        {
            for (; ; )
            {
                if (HasBufferedBigram)
                {
                    // Case 1: several codepoints are still buffered, so a bigram can go out.
                    if (!outputUnigrams)
                    {
                        FlushBigram();
                        return true;
                    }

                    // When unigrams are requested too, the unigram is emitted first and
                    // index is then stepped back so the bigram at that position is revisited.
                    // Input ABC thus comes out as A, AB, B, BC, C; the final C is covered
                    // by the HasBufferedUnigram logic even though it had CJK neighbours.
                    if (ngramState)
                    {
                        FlushBigram();
                    }
                    else
                    {
                        FlushUnigram();
                        index--;
                    }
                    ngramState = !ngramState;
                    return true;
                }

                if (!DoNext())
                {
                    // Case 3: at most one codepoint is buffered and there is no more input.
                    // Emit the leftover unigram if there is one; otherwise the stream is done.
                    if (!HasBufferedUnigram)
                    {
                        return false;
                    }
                    FlushUnigram(); // last remaining unigram
                    return true;
                }

                // Case 2: a new token was read; its type decides whether n-grams are formed.
                string tokenType = typeAtt.Type;
                bool formsNgrams = tokenType == doHan || tokenType == doHiragana || tokenType == doKatakana || tokenType == doHangul;

                if (!formsNgrams)
                {
                    // Not a CJK type, so the token is returned as-is. If a unigram was
                    // waiting on this peek to become a bigram, that cannot happen now:
                    // save the peeked token's state for the next pass and emit the
                    // unigram in its place.
                    if (HasBufferedUnigram)
                    {
                        loneState = CaptureState();
                        FlushUnigram();
                    }
                    return true;
                }

                // Accepted CJK type: append to the buffer as long as the offsets line up
                // with the previous token; otherwise the buffer has to start over.
                if (offsetAtt.StartOffset() != lastEndOffset)
                {
                    if (HasBufferedUnigram)
                    {
                        // The buffered unigram cannot pair with this unaligned token.
                        // Save the peeked token's state for the next pass and emit
                        // the unigram now.
                        loneState = CaptureState();
                        FlushUnigram();
                        return true;
                    }

                    // Nothing pending: drop the stale buffer contents.
                    index = 0;
                    bufferLen = 0;
                }
                Refill();
            }
        }
CJKBigramFilter
DoNext
FlushBigram
FlushUnigram
IncrementToken
Refill
Reset