/// <summary>
/// Builds the initial token list for <paramref name="text"/> via <c>GetInitSegment</c> and
/// refines it in place, node by node, according to each node's <c>WordType</c>.
/// </summary>
/// <remarks>
/// Handling per word type, as implemented below:
/// <list type="bullet">
/// <item><description>Space: the node is removed when <c>_Options.IgnoreSpace</c> is set.</description></item>
/// <item><description>SimplifiedChinese: the node is replaced by the words returned from
/// <c>ChsFullTextMatch.Match</c>; optionally a converted (simplified/traditional) variant is
/// inserted before each matched word.</description></item>
/// <item><description>English: rank is assigned, optional lower-case / stem variants and
/// letter/digit fragments are inserted before the node, then <c>MergeEnglishSpecialWord</c>
/// is given the chance to move the cursor.</description></item>
/// <item><description>Numeric: word is normalized, rank is assigned, then
/// <c>MergeEnglishSpecialWord</c> is consulted.</description></item>
/// <item><description>Symbol: rank is assigned.</description></item>
/// </list>
/// Lower-casing uses the invariant culture so that the generated variants do not depend on
/// the current thread culture (e.g. Turkish dotless i).
/// </remarks>
/// <param name="text">The text to segment; also forwarded to <c>MergeEnglishSpecialWord</c>.</param>
/// <returns>The list obtained from <c>GetInitSegment</c>, after in-place refinement.</returns>
private SuperLinkedList<WordInfo> PreSegment(String text)
{
    SuperLinkedList<WordInfo> result = GetInitSegment(text);
    SuperLinkedListNode<WordInfo> cur = result.First;

    while (cur != null)
    {
        // Drop whitespace tokens entirely when the caller asked to ignore them.
        if (_Options.IgnoreSpace && cur.Value.WordType == WordType.Space)
        {
            SuperLinkedListNode<WordInfo> spaceNode = cur;
            cur = cur.Next;
            result.Remove(spaceNode);
            continue;
        }

        switch (cur.Value.WordType)
        {
            case WordType.SimplifiedChinese:
                string inputText = cur.Value.Word;
                WordType originalWordType = WordType.SimplifiedChinese;

                if (_Options.TraditionalChineseEnabled)
                {
                    // If converting to simplified changes the text, the source was traditional;
                    // the dictionary lookup below then runs on the simplified form.
                    string simplified = Microsoft.VisualBasic.Strings.StrConv(
                        cur.Value.Word, Microsoft.VisualBasic.VbStrConv.SimplifiedChinese, 0);

                    if (simplified != cur.Value.Word)
                    {
                        originalWordType = WordType.TraditionalChinese;
                        inputText = simplified;
                    }
                }

                PanGu.Framework.AppendList<Dict.PositionLength> pls =
                    _WordDictionary.GetAllMatchs(inputText, _Options.ChineseNameIdentify);

                PanGu.Match.ChsFullTextMatch chsMatch = new PanGu.Match.ChsFullTextMatch(_WordDictionary);
                chsMatch.Options = _Options;
                chsMatch.Parameters = _Parameters;

                // Match positions were computed on inputText, but Match receives the original
                // word. NOTE(review): presumably so emitted words keep the source script - confirm.
                SuperLinkedList<WordInfo> chsMatchWords =
                    chsMatch.Match(pls.Items, cur.Value.Word, pls.Count);

                SuperLinkedListNode<WordInfo> curChsMatch = chsMatchWords.First;

                while (curChsMatch != null)
                {
                    WordInfo matched = curChsMatch.Value;

                    // Shift the position by the position of the node being replaced.
                    matched.Position += cur.Value.Position;
                    matched.OriginalWordType = originalWordType;
                    matched.WordType = originalWordType;

                    if (_Options.OutputSimplifiedTraditional && _Options.TraditionalChineseEnabled)
                    {
                        // Produce the word in the other script (simplified <-> traditional).
                        string newWord;
                        WordType wt;

                        if (originalWordType == WordType.SimplifiedChinese)
                        {
                            newWord = Microsoft.VisualBasic.Strings.StrConv(
                                matched.Word, Microsoft.VisualBasic.VbStrConv.TraditionalChinese, 0);
                            wt = WordType.TraditionalChinese;
                        }
                        else
                        {
                            newWord = Microsoft.VisualBasic.Strings.StrConv(
                                matched.Word, Microsoft.VisualBasic.VbStrConv.SimplifiedChinese, 0);
                            wt = WordType.SimplifiedChinese;
                        }

                        // Only add the variant when the conversion actually changed the word.
                        if (newWord != matched.Word)
                        {
                            WordInfo newWordInfo = new WordInfo(matched);
                            newWordInfo.Word = newWord;
                            newWordInfo.OriginalWordType = originalWordType;
                            newWordInfo.WordType = wt;
                            newWordInfo.Rank = _Parameters.SimplifiedTraditionalRank;
                            newWordInfo.Position = matched.Position;
                            chsMatchWords.AddBefore(curChsMatch, newWordInfo);
                        }
                    }

                    curChsMatch = curChsMatch.Next;
                }

                // Splice the matched words in after the current node, continue from the node
                // following the one AddAfter returned, and remove the node that was replaced.
                SuperLinkedListNode<WordInfo> spliceEnd = result.AddAfter(cur, chsMatchWords);
                SuperLinkedListNode<WordInfo> removeItem = cur;
                cur = spliceEnd.Next;
                result.Remove(removeItem);
                break;

            case WordType.English:
                cur.Value.Rank = _Parameters.EnglishRank;
                cur.Value.Word = ConvertChineseCapitalToAsiic(cur.Value.Word);

                if (_Options.EnglishSegment)
                {
                    // Invariant lower-casing keeps the variants independent of the current culture.
                    string lower = cur.Value.Word.ToLowerInvariant();

                    if (lower != cur.Value.Word)
                    {
                        result.AddBefore(cur, new WordInfo(lower, cur.Value.Position, POS.POS_A_NX, 1,
                            _Parameters.EnglishLowerRank, WordType.English, WordType.English));
                    }

                    string stem = GetStem(lower);

                    // Add the stem only when it exists and differs from the lower-cased word.
                    if (!string.IsNullOrEmpty(stem) && lower != stem)
                    {
                        result.AddBefore(cur, new WordInfo(stem, cur.Value.Position, POS.POS_A_NX, 1,
                            _Parameters.EnglishStemRank, WordType.English, WordType.English));
                    }
                }
                else if (_Options.IgnoreCapital)
                {
                    cur.Value.Word = cur.Value.Word.ToLowerInvariant();
                }

                if (_Options.EnglishMultiDimensionality)
                {
                    // Cheap pre-check: only words containing a digit or '_' go through the regex.
                    bool needSplit = false;

                    foreach (char c in cur.Value.Word)
                    {
                        if ((c >= '0' && c <= '9') || (c == '_'))
                        {
                            needSplit = true;
                            break;
                        }
                    }

                    List<string> output;

                    if (needSplit
                        && Framework.Regex.GetMatchStrings(cur.Value.Word, PATTERNS, true, out output))
                    {
                        // Count non-empty fragments, stopping as soon as a second one is seen.
                        int outputCount = 0;

                        foreach (string str in output)
                        {
                            if (!string.IsNullOrEmpty(str))
                            {
                                outputCount++;

                                if (outputCount > 1)
                                {
                                    break;
                                }
                            }
                        }

                        // A single fragment would just duplicate the word, so require at least two.
                        if (outputCount > 1)
                        {
                            int position = cur.Value.Position;

                            foreach (string splitWord in output)
                            {
                                if (string.IsNullOrEmpty(splitWord))
                                {
                                    continue;
                                }

                                WordInfo part;

                                // Fragments starting with a digit are emitted as numeric tokens.
                                if (splitWord[0] >= '0' && splitWord[0] <= '9')
                                {
                                    part = new WordInfo(splitWord, POS.POS_A_M, 1);
                                    part.Position = position;
                                    part.Rank = _Parameters.NumericRank;
                                    part.OriginalWordType = WordType.English;
                                    part.WordType = WordType.Numeric;
                                }
                                else
                                {
                                    part = new WordInfo(splitWord, POS.POS_A_NX, 1);
                                    part.Position = position;
                                    part.Rank = _Parameters.EnglishRank;
                                    part.OriginalWordType = WordType.English;
                                    part.WordType = WordType.English;
                                }

                                result.AddBefore(cur, part);

                                // The next fragment's position advances by this fragment's length.
                                position += splitWord.Length;
                            }
                        }
                    }
                }

                // MergeEnglishSpecialWord may update cur itself; advance manually only if it reports false.
                if (!MergeEnglishSpecialWord(text, result, ref cur))
                {
                    cur = cur.Next;
                }

                break;

            case WordType.Numeric:
                cur.Value.Word = ConvertChineseCapitalToAsiic(cur.Value.Word);
                cur.Value.Rank = _Parameters.NumericRank;

                // Same cursor contract as in the English case.
                if (!MergeEnglishSpecialWord(text, result, ref cur))
                {
                    cur = cur.Next;
                }

                break;

            case WordType.Symbol:
                cur.Value.Rank = _Parameters.SymbolRank;
                cur = cur.Next;
                break;

            default:
                cur = cur.Next;
                break;
        }
    }

    return result;
}