WikiFunctions.Parse.MetaDataSorter.RemoveCats C# (CSharp) Method

RemoveCats() public method

Extracts DEFAULTSORT + categories from the article text; removes duplicate categories, cleans whitespace and underscores
public RemoveCats ( string &articleText, string articleTitle ) : string
articleText string The wiki text of the article.
articleTitle string Title of the article
return string
        /// <summary>
        /// Extracts any {{uncategorized}} template, the DEFAULTSORT and the category links from the article text.
        /// The extracted items are removed from <paramref name="articleText"/> and returned as a single string.
        /// Returns an empty string without extracting anything when the page redirects to a category,
        /// when it has more than one DEFAULTSORT, or when removing the categories altered comment/nowiki content.
        /// </summary>
        /// <param name="articleText">The wiki text of the article; updated in place with the extracted items removed.</param>
        /// <param name="articleTitle">Title of the article</param>
        /// <returns>The uncategorized template(s), then the DEFAULTSORT, then the categories (as formatted by ListToString); or "" on the bail-out paths</returns>
        public string RemoveCats(ref string articleText, string articleTitle)
        {
            // don't pull category from redirects to a category e.g. page Hello is #REDIRECT[[Category:Hello]]
            string rt = Tools.RedirectTarget(articleText);
            if (rt.Length > 0 && WikiRegexes.Category.IsMatch(@"[[" + rt + @"]]"))
                return "";

            List<string> categoryList = new List<string>();
            // kept so the text can be restored, and compared against, if category removal has to be abandoned
            string originalArticleText = articleText;
            // same length as articleText (comments blanked with spaces), so match indexes line up with articleText
            // NOTE(review): equal length is inferred from the helper's name and from L41-42 reusing its index on articleText – confirm
            string articleTextNoComments = Tools.ReplaceWithSpaces(articleText, WikiRegexes.Comments.Matches(articleText));

            // don't operate on pages with (incorrectly) multiple defaultsorts
            // ignore commented out DEFAULTSORT – https://en.wikipedia.org/wiki/Wikipedia_talk:AutoWikiBrowser/Bugs/Archive_12#Moving_DEFAULTSORT_in_HTML_comments
            MatchCollection mc = WikiRegexes.Defaultsort.Matches(articleTextNoComments);
            if (mc.Count > 1)
            {
                Tools.WriteDebug("RemoveCats", "Page " + articleTitle + " has multiple DEFAULTSORTs");
                return "";
            }

            string defaultSort = "";
            // set when the DEFAULTSORT has already been stripped from the cut portion, so it is not removed a second time below
            bool defaultSortRemoved = false;

            // allow comments between categories, and keep them in the same place, only grab any comment after the last category if on same line
            // whitespace: remove all whitespace after, but leave a blank newline before a heading (rare case where category not in last section)

            // performance: apply regex on portion of article containing category links rather than whole text
            Match cq = WikiRegexes.CategoryQuick.Match(articleTextNoComments);

            if (cq.Success)
            {
                // start up to 500 characters before the first category found (clamped to the start of the text)
                int cutoff = Math.Max(0, cq.Index - 500);
                string cut = articleText.Substring(cutoff);
                // note: inside the lambda, "cut" is still the pre-replacement text, as the assignment only happens once Replace returns;
                // m.Index is therefore a valid index into it
                cut = WikiRegexes.RemoveCatsAllCats.Replace(cut, m => {
                                                                       // categories matching CatsForDeletion are removed from the text but not added to the returned list
                                                                       if (!CatsForDeletion.IsMatch(m.Value))
                                                                           categoryList.Add(m.Value.Trim());

                                                                       // if category not at start of line, leave newline, otherwise text on next line moved up
                                                                       if (m.Index > 2 && !cut.Substring(m.Index - 2, 2).Trim().Equals(""))
                                                                           return "\r\n";

                                                                       return "";
                                                                      });

                // if category tidying has changed comments/nowikis return with no changes – we've pulled a cat from a comment
                if (!Tools.UnformattedTextNotChanged(originalArticleText.Substring(cutoff), cut))
                {
                    articleText = originalArticleText;
                    return "";
                }

                // presumably adds sort keys to the categories when the AddCatKey option is set – see CatKeyer
                if (AddCatKey)
                    categoryList = CatKeyer(categoryList, articleTitle);

                // now refresh defaultsort to pick up any comment on same line after it
                // NOTE(review): this re-match runs on articleText (comments included), unlike the first match on articleTextNoComments,
                // and rebuilds the regex from its pattern string, so any RegexOptions on WikiRegexes.Defaultsort are not carried over – confirm intended
                if (mc.Count > 0)
                    mc = Regex.Matches(articleText, WikiRegexes.Defaultsort.ToString() + @"(?: *<!--[^<>]*-->)?");

                // remove defaultsort now if we can, faster to remove from cut than whole articleText
                if (mc.Count > 0 && cut.Contains(mc[0].Value))
                {
                    cut = cut.Replace(mc[0].Value, "");
                    defaultSortRemoved = true;
                }

                // stitch the untouched head of the article back onto the processed tail
                articleText = articleText.Substring(0, cutoff) + cut;

                // move the first CatCommentRegex match (replacement count of 1) out of the article text and to the front of the category list
                if (CatCommentRegex.IsMatch(cut))
                    articleText = CatCommentRegex.Replace(articleText, m =>
                                                          {
                                                              categoryList.Insert(0, m.Value);
                                                              return "";
                                                          }, 1);

            }

            // on sl-wiki a LifeTime match, when present, is used in place of the DEFAULTSORT
            if (Variables.LangCode.Equals("sl") && LifeTime.IsMatch(articleText))
            {
                defaultSort = LifeTime.Match(articleText).Value;
            }
            else if (mc.Count > 0)
                    defaultSort = mc[0].Value;

            if (!string.IsNullOrEmpty(defaultSort))
            {
                // if defaultsort wasn't in the cut area before the categories, remove now
                // (string.Replace removes every occurrence of the exact defaultsort text)
                if (!defaultSortRemoved)
                    articleText = articleText.Replace(defaultSort, "");

                // only a real DEFAULTSORT is passed to the formatter; an sl-wiki LifeTime value not containing "DEFAULTSORT" is kept as matched
                if (defaultSort.ToUpper().Contains("DEFAULTSORT"))
                    defaultSort = TalkPageFixes.FormatDefaultSort(defaultSort);
                defaultSort += "\r\n";
            }

            // Extract any {{uncategorized}} template, but not uncat stub templates
            // remove exact duplicates
            string uncat = "";
            if (TemplateExists(Parsers.GetAllTemplates(originalArticleText), WikiRegexes.Uncat) && WikiRegexes.Uncat.IsMatch(articleTextNoComments))
            {
                articleText = WikiRegexes.Uncat.Replace(articleText, uncatm =>
                                                        {
                                                            // leave matches that PossiblyCommentedStub recognises in the article text
                                                            if (WikiRegexes.PossiblyCommentedStub.IsMatch(uncatm.Value))
                                                                return uncatm.Value;

                                                            // remove exact duplicates
                                                            if (!uncat.Contains(uncatm.Value))
                                                                uncat += uncatm.Value + "\r\n";

                                                            return "";
                                                        });
            }

            // output order: uncategorized template(s), DEFAULTSORT, categories
            return uncat + defaultSort + ListToString(categoryList);
        }

Usage Example

Example #1
0
        /// <summary>
        /// Determines whether the article is about a person by looking for persondata/birth death categories, bio stub etc. For en wiki only.
        /// Should only return true if the article is the principal article about the individual (not early life/career/discography etc.)
        /// </summary>
        /// <param name="articleText">The wiki text of the article.</param>
        /// <param name="articleTitle">Title of the article</param>
        /// <param name="parseTalkPage">Not used by the active code: only referenced by the commented-out talk page lookup at the end of the method, and forced to false in unit test mode</param>
        /// <returns>True if the article looks like the main article about a single person, otherwise false</returns>
        public static bool IsArticleAboutAPerson(string articleText, string articleTitle, bool parseTalkPage)
        {
            #if DEBUG || UNITTEST
            if (Globals.UnitTestMode)
                parseTalkPage = false;
            #endif

            // fix for duplicate living people categories being miscounted as article about multiple people:
            // RemoveCats pulls the categories out (articleText is a by-value parameter here, so only the local copy changes)
            // and the returned block is appended back on
            MetaDataSorter metaDataSorter = new MetaDataSorter();
            string cats = metaDataSorter.RemoveCats(ref articleText, articleTitle);

            articleText += cats;

            // Exclusions: wrong wiki, wrong namespace, title patterns and categories/templates that indicate
            // the page is not a biography of one individual. Title checks are ordinal (no culture-specific matching).
            if (!Variables.LangCode.Equals("en")
                || Namespace.Determine(articleTitle).Equals(Namespace.Category)
                || articleTitle.StartsWith(@"List of ", System.StringComparison.Ordinal)
                || articleTitle.StartsWith(@"Murder of ", System.StringComparison.Ordinal)
                || articleTitle.StartsWith(@"Deaths ", System.StringComparison.Ordinal)
                || articleTitle.EndsWith("discography", System.StringComparison.Ordinal)
                || articleTitle.EndsWith(" murders", System.StringComparison.Ordinal)
                || articleText.Contains(@"[[Category:Multiple people]]")
                || articleText.Contains(@"[[Category:Married couples")
                || articleText.Contains(@"[[Category:Fictional")
                || Regex.IsMatch(articleText, @"\[\[Category:\d{4} animal")
                || articleText.Contains(@"[[fictional character")
                || InUniverse.IsMatch(articleText)
                || articleText.Contains(@"[[Category:Presidencies")
                || articleText.Contains(@"[[Category:Military careers")
                // the opening brackets must be escaped: unescaped, "[[Category:[^\[\]\|]" is parsed as one character class
                // and the pattern would match the phrase "noble families" anywhere in the text, not just in a category link
                || Regex.IsMatch(articleText, @"\[\[Category:[^\[\]\|]*[nN]oble families")
                || CategoryCharacters.IsMatch(articleText)
                || Tools.NestedTemplateRegex("Infobox cricketer tour biography").IsMatch(articleText)
                || WikiRegexes.Disambigs.IsMatch(articleText)
                || WikiRegexes.DeathsOrLivingCategory.Matches(articleText).Count > 1
                // ordinal case-insensitive search rather than ToLower(), which is culture-dependent (e.g. Turkish dotless i)
                || WikiRegexes.InfoBox.Match(articleText).Groups[1].Value.IndexOf("organization", System.StringComparison.OrdinalIgnoreCase) >= 0
               )
                return false;

            // value of the Background parameter of the template matched by IMA: group/ensemble values mean not a single person
            string maBackground =
                Tools.GetTemplateParameterValue(IMA.Match(articleText).Value,
                                                "Background", true);

            if (maBackground.Contains("band") || maBackground.Contains("classical_ensemble") || maBackground.Contains("temporary"))
                return false;

            // a Chinese-language singer and actor infobox that lists members is for a group
            string clsaInfobox = Tools.NestedTemplateRegex(@"Infobox Chinese-language singer and actor").Match(articleText).Value;
            if (clsaInfobox.Length > 0)
            {
                if (Tools.GetTemplateParameterValue(clsaInfobox, "currentmembers").Length > 0
                    || Tools.GetTemplateParameterValue(clsaInfobox, "pastmembers").Length > 0)
                    return false;
            }

            string zerothSection = WikiRegexes.ZerothSection.Match(articleText).Value;

            // not about a person if it's not the principal article on the subject
            if (SeeAlsoOrMain.IsMatch(zerothSection))
                return false;

            // not about one person if multiple different birth or death date templates
            // (exact duplicates of the same template are tolerated)
            List<string> birthDates = new List<string>();
            foreach (Match m in BirthDate.Matches(articleText))
            {
                if (birthDates.Count > 0 && !birthDates.Contains(m.Value))
                    return false;

                birthDates.Add(m.Value);
            }

            List<string> deathDates = new List<string>();
            foreach (Match m in DeathDate.Matches(articleText))
            {
                if (deathDates.Count > 0 && !deathDates.Contains(m.Value))
                    return false;

                deathDates.Add(m.Value);
            }

            if (WikiRegexes.PeopleInfoboxTemplates.Matches(articleText).Count > 1)
                return false;

            // strong positive indicators
            if (WikiRegexes.Persondata.Matches(articleText).Count == 1
                || articleText.Contains(@"-bio-stub}}")
                || articleText.Contains(@"[[Category:Living people")
                || WikiRegexes.PeopleInfoboxTemplates.Matches(zerothSection).Count == 1)
                return true;

            // articles with bold linking to another article may be linking to the main article on the person the article is about
            // e.g. '''military career of [[Napoleon Bonaparte]]'''
            string zerothSectionNoTemplates = WikiRegexes.Template.Replace(zerothSection, "");
            foreach (Match m in WikiRegexes.Bold.Matches(zerothSectionNoTemplates))
            {
                if (WikiRegexes.WikiLink.IsMatch(m.Value))
                    return false;
            }

            // exactly one birth date or one death date template in the lead section
            int dateBirthAndAgeCount = BirthDate.Matches(zerothSection).Count;
            int dateDeathCount = DeathDate.Matches(zerothSection).Count;

            if (dateBirthAndAgeCount == 1 || dateDeathCount == 1)
                return true;

            // weaker indicators: birth/death/living categories and BLP maintenance templates
            return WikiRegexes.DeathsOrLivingCategory.IsMatch(articleText)
                || WikiRegexes.LivingPeopleRegex2.IsMatch(articleText)
                || WikiRegexes.BirthsCategory.IsMatch(articleText)
                || WikiRegexes.BLPSources.IsMatch(BLPUnsourcedSection.Replace(articleText, ""))
                || RefImproveBLP.IsMatch(articleText);
            /*    || (!string.IsNullOrEmpty(articleTitle) && articleText.Length < 10000 && parseTalkPage &&
                    TryGetArticleText(Variables.Namespaces[Namespace.Talk] + articleTitle).Contains(@"{{WPBiography"))*/
        }