Coon.Compass.ProteinHoarder.ProteinHoarder.GroupProteins C# (CSharp) Method

GroupProteins() private method

Groups proteins into protein groups based on the peptides they contain. Combines proteins that contain exactly the same set of peptide sequences (Indistinguishable) and removes groups whose peptides can be explained entirely by other groups (Subsumable). It then removes groups that have too few peptides and, lastly, applies false discovery rate filtering.
private GroupProteins ( List proteins, bool printMessages = true ) : List
proteins List A list of unique proteins to group together
printMessages bool
return List
        /// <summary>
        /// Groups proteins into protein groups based on the peptides they contain. The work is done in four stages:
        /// 1) proteins with exactly the same set of peptides (indistinguishable) are merged into one group,
        /// 2) groups whose peptides are all shared (subsumable) are removed,
        /// 3) groups with fewer than MinPeptidesPerGroup peptides are removed (only when MinPeptidesPerGroup is greater than 1),
        /// 4) the groups that pass false discovery rate filtering are marked with PassesFDR = true.
        /// </summary>
        /// <param name="proteins">
        /// A list of unique proteins to group together. This list is modified: every protein that is merged
        /// into an earlier protein's group is removed from it.
        /// </param>
        /// <param name="printMessages">When true, a progress message is written through Log after each stage.</param>
        /// <returns>
        /// The protein groups that survive stages 1-3, sorted with the group's default ordering. Groups that do
        /// not pass the FDR filter are still part of the returned list; they simply do not get PassesFDR set here.
        /// </returns>
        private List<ProteinGroup> GroupProteins(List<Protein> proteins, bool printMessages = true)
        {
            if (printMessages)
            {
                Log("Grouping proteins into protein groups...");
                Log("{0:N0} original proteins (maximum proteins identified)", proteins.Count);
            }

            // 1) Indistinguishable proteins become a single protein group
            List<ProteinGroup> proteinGroups = CombineIndistinguishableProteins(proteins);
            if (printMessages)
            {
                Log("{0:N0} protein groups are left after combining indistinguishable proteins (having the exact same set of peptides)", proteinGroups.Count);
            }

            // 2) Subsumable groups are removed, worst scoring first
            RemoveSubsumableGroups(proteinGroups);
            if (printMessages)
            {
                Log("{0:N0} protein groups are left after removing subsumable groups (peptides can be explain by other groups)", proteinGroups.Count);
            }

            // 3) Groups without enough peptides are removed. No need to filter if the minimum is one or less
            if (MinPeptidesPerGroup > 1)
            {
                RemoveUndersizedGroups(proteinGroups);
                if (printMessages)
                {
                    Log("{0:N0} protein groups are left after removing groups with < {1:N0} peptides [parsimonious proteins]", proteinGroups.Count, MinPeptidesPerGroup);
                }
            }

            // 4) False discovery filtering at the protein level
            int passingCount = MarkGroupsPassingFdr(proteinGroups);
            if (printMessages)
            {
                Log("{0:N0} protein groups are left after applying FDR of {1:N2}% [parsimonious proteins filtered]", passingCount, MaxFdr);
            }

            return proteinGroups;
        }

        /// <summary>
        /// Turns every protein into a protein group, merging proteins that have the same set of peptides.
        /// </summary>
        /// <remarks>
        /// Example:
        ///   A 1 2 3 4
        ///   B 1 2 3 4
        ///   C 1   3 4
        /// Proteins A and B are indistinguishable (same set of peptides 1,2,3,4) and become one group (PG1 [a,b]).
        /// C is distinguishable and becomes its own group (PG2 [c]).
        /// </remarks>
        /// <param name="proteins">The proteins to group. Proteins merged into an earlier group are removed from this list.</param>
        /// <returns>One protein group per distinct peptide set, in order of the first protein seen with that set.</returns>
        private List<ProteinGroup> CombineIndistinguishableProteins(List<Protein> proteins)
        {
            List<ProteinGroup> proteinGroups = new List<ProteinGroup>();

            // The list shrinks while we walk it, so the bound has to be re-read on every pass
            for (int seedIndex = 0; seedIndex < proteins.Count; seedIndex++)
            {
                // Start off making the protein into a protein group with its associated peptides
                Protein seed = proteins[seedIndex];
                HashSet<Peptide> seedPeptides = seed.Peptides;
                ProteinGroup group = new ProteinGroup(seed, seedPeptides);

                // Compare against every protein after the seed
                int candidateIndex = seedIndex + 1;
                while (candidateIndex < proteins.Count)
                {
                    Protein candidate = proteins[candidateIndex];
                    if (candidate.Peptides.SetEquals(seedPeptides))
                    {
                        // Indistinguishable from the seed: add it to the group and take it out of the list so it
                        // does not become its own group later. The index is not advanced because the next
                        // protein has just moved into this position.
                        group.Add(candidate);
                        proteins.RemoveAt(candidateIndex);
                    }
                    else
                    {
                        candidateIndex++;
                    }
                }

                // Every remaining protein has been checked, so this group is complete
                proteinGroups.Add(group);
            }

            return proteinGroups;
        }

        /// <summary>
        /// Updates the p-value of every group, sorts the groups with ProteinGroup.CompareDecreasing and removes
        /// each group whose peptides all report IsShared.
        /// </summary>
        /// <remarks>
        /// Groups are visited from worst to best so the worst scoring groups are removed first (note well, lower
        /// p-values mean better scores). Example (p-value, group, peptides):
        ///   0.1  A 1 2
        ///   0.05 B 1   3
        ///   0.01 C   2 3
        /// A is visited first and removed because its peptides can be explained by B and C. That leaves B and C
        /// as distinct groups that share peptide 3, while peptides 1 and 2 are no longer shared.
        /// NOTE(review): this relies on Peptide.IsShared and Peptide.ProteinGroups already reflecting group
        /// membership. Nothing in this method populates them (presumably the ProteinGroup constructor and Add
        /// do) - confirm.
        /// </remarks>
        /// <param name="proteinGroups">The groups to filter. The list is re-sorted and modified in place.</param>
        private void RemoveSubsumableGroups(List<ProteinGroup> proteinGroups)
        {
            // First update each group's p-value, then sort the groups on decreasing p-values
            foreach (ProteinGroup group in proteinGroups)
            {
                group.UpdatePValue(PScoreCalculationMethod, UseConservativePScore);
            }

            proteinGroups.Sort(ProteinGroup.CompareDecreasing);

            int index = 0;
            while (index < proteinGroups.Count)
            {
                ProteinGroup group = proteinGroups[index];
                HashSet<Peptide> groupPeptides = group.Peptides;

                // If all the peptides are shared, the group is subsumable and should be removed
                if (groupPeptides.All(p => p.IsShared))
                {
                    // Unlink before moving on so later groups are tested against the updated peptide state
                    DetachGroupFromPeptides(group, groupPeptides);
                    proteinGroups.RemoveAt(index);
                }
                else
                {
                    index++;
                }
            }
        }

        /// <summary>
        /// Removes every group that has fewer than MinPeptidesPerGroup peptides.
        /// </summary>
        /// <param name="proteinGroups">The groups to filter. The list is modified in place.</param>
        private void RemoveUndersizedGroups(List<ProteinGroup> proteinGroups)
        {
            int index = 0;
            while (index < proteinGroups.Count)
            {
                ProteinGroup group = proteinGroups[index];
                HashSet<Peptide> groupPeptides = group.Peptides;

                // Check to see if this group has enough peptides to be considered identified
                if (groupPeptides.Count < MinPeptidesPerGroup)
                {
                    DetachGroupFromPeptides(group, groupPeptides);
                    proteinGroups.RemoveAt(index);
                }
                else
                {
                    index++;
                }
            }
        }

        /// <summary>
        /// Sorts the groups with their default ordering and sets PassesFDR to true on every group returned by the
        /// false discovery rate filter.
        /// </summary>
        /// <param name="proteinGroups">The groups to evaluate. The list is re-sorted in place; no group is removed.</param>
        /// <returns>The number of groups that were marked as passing.</returns>
        private int MarkGroupsPassingFdr(List<ProteinGroup> proteinGroups)
        {
            proteinGroups.Sort();

            int passingCount = 0;

            // MaxFdr is divided by 100 before being handed to the filter (it is logged by the caller with a '%' suffix)
            foreach (ProteinGroup group in FalseDiscoveryRate<ProteinGroup, double>.Filter(proteinGroups, MaxFdr / 100, true))
            {
                group.PassesFDR = true;
                passingCount++;
            }

            return passingCount;
        }

        /// <summary>
        /// Removes the reference to a protein group that is being eliminated from each of the given peptides.
        /// </summary>
        /// <param name="group">The protein group that is being eliminated.</param>
        /// <param name="peptides">The peptides of that group.</param>
        private void DetachGroupFromPeptides(ProteinGroup group, HashSet<Peptide> peptides)
        {
            foreach (Peptide peptide in peptides)
            {
                peptide.ProteinGroups.Remove(group);
            }
        }