/// <summary>
/// Searches the article text for tags named in <c>MathSourceCodeNowikiPreTagList</c> that appear to be unclosed.
/// A cheap comparison of opening versus closing tag-name counts is tried first; the expensive clear-down and
/// regex search only runs when that comparison (or a leftover '&lt;' character) suggests a problem.
/// </summary>
/// <param name="articleText">The wiki text of the article to check</param>
/// <returns>
/// Dictionary keyed by the index of each <c>MathSourceCodeNowikiPreTag</c> match remaining after the clear-down,
/// with the match length as the value. Empty when the quick checks find nothing to report, or when the input is null/empty.
/// </returns>
public static Dictionary<int, int> UnclosedTags(string articleText)
{
    Dictionary<int, int> back = new Dictionary<int, int>();

    // Nothing to scan: return the empty result rather than letting Regex.Matches throw on null
    if (string.IsNullOrEmpty(articleText))
    {
        return back;
    }

    // Tag names are markup, not linguistic text, so every comparison below is ordinal.
    // Fully qualified because the file's using directives are not visible from this method.
    const System.StringComparison ordinal = System.StringComparison.Ordinal;

    // Performance: get all tags, filter to the ones we're checking, compare the count of matched tags of same name.
    // Then do the full tag search only if unmatched tags are found.

    // get all tags in format <tag...> in article
    MatchCollection anyTagMatchCollection = AnyTag.Matches(articleText);

    // ToLowerInvariant (not ToLower): a culture-sensitive lower-casing turns 'I' into dotless 'ı' under
    // Turkish/Azeri cultures, so an upper-case tag name containing 'I' would no longer match the tag list
    List<string> checkedTags = (from Match m in anyTagMatchCollection
                                select m.Groups[1].Value.Trim().ToLowerInvariant())
        // discard self-closing tags in <tag/> format, discard wiki comments
        .Where(s => !s.EndsWith("/", ordinal) && !s.StartsWith("!--", ordinal))
        // remove any text after first space, so we're left with tag name only
        .Select(s =>
        {
            int firstSpace = s.IndexOf(' ');
            return firstSpace >= 0 ? s.Substring(0, firstSpace).Trim() : s;
        })
        // filter to only the tags we're checking (closing tags are looked up without their leading slash)
        .Where(s => MathSourceCodeNowikiPreTagList.Contains(s.TrimStart('/')))
        .ToList();

    // Count the tag names in use; tags are unmatched when any name's count differs from its counterpart's count
    Dictionary<string, int> tagCounts = checkedTags.GroupBy(x => x).ToDictionary(g => g.Key, g => g.Count());
    bool unmatched = false;

    foreach (KeyValuePair<string, int> kvp in tagCounts)
    {
        // counterpart of "/name" is "name" and vice versa
        string otherTag = kvp.Key.StartsWith("/", ordinal) ? kvp.Key.TrimStart('/') : "/" + kvp.Key;
        int otherCount;

        if (!tagCounts.TryGetValue(otherTag, out otherCount) || otherCount != kvp.Value)
        {
            unmatched = true;
            break;
        }
    }

    if (!unmatched)
    {
        // Counts balance, so now check for an unclosed part tag: blank out every complete tag and look for a leftover '<'
        string noTags = Tools.ReplaceWithSpaces(articleText, anyTagMatchCollection);
        int tagOpen = noTags.IndexOf('<');

        // NOTE(review): a leftover '<' at index 0 never takes this early exit, even when a '>' follows - confirm that is intended
        if (tagOpen == -1 || (tagOpen > 0 && noTags.IndexOf('>', tagOpen) != -1))
        {
            return back;
        }
    }

    // if here then have some unmatched tags, so do full clear down and search
    // performance of Refs/SourceCode is better if IgnoreCase avoided, hence lower-casing the text once up front;
    // invariant lower-casing keeps tag names recognisable regardless of the current culture
    articleText = articleText.ToLowerInvariant();
    articleText = Tools.ReplaceWithSpaces(articleText, WikiRegexes.UnformattedText);
    articleText = Tools.ReplaceWithSpaces(articleText, WikiRegexes.GalleryTag, 2);
    articleText = Tools.ReplaceWithSpaces(articleText, new Regex(WikiRegexes.Refs.ToString(), RegexOptions.Singleline));

    // some (badly done) List of pages can have hundreds of unclosed small or center tags, causes regex backtracking when using <DEPTH>
    // so workaround solution: if > 10 unclosed tags, only remove tags without other tags embedded in them
    // Workaround constraint: we might incorrectly report some valid tags with < or > in them as unclosed
    int closingCount = checkedTags.Count(s => s.StartsWith("/", ordinal));
    int openingCount = checkedTags.Count - closingCount;

    if (openingCount > closingCount + 10)
    {
        // repeat until no simple pair is left, so pairs exposed by an earlier pass are removed too
        while (SimpleTagPair.IsMatch(articleText))
        {
            articleText = Tools.ReplaceWithSpaces(articleText, SimpleTagPair);
        }
    }
    else
    {
        articleText = Tools.ReplaceWithSpaces(articleText, new Regex(WikiRegexes.SourceCode.ToString(), RegexOptions.Singleline));
        articleText = Tools.ReplaceWithSpaces(articleText, CenterTag, 2);
        articleText = Tools.ReplaceWithSpaces(articleText, WikiRegexes.Small);
        articleText = Tools.ReplaceWithSpaces(articleText, SupTag, 2);
        articleText = Tools.ReplaceWithSpaces(articleText, SubTag, 2);
    }

    // whatever checked tags survive the clear-down are reported by position and length
    foreach (Match m in MathSourceCodeNowikiPreTag.Matches(articleText))
    {
        back.Add(m.Index, m.Length);
    }

    return back;
}