/// <summary>
/// Builds protein groups from a list of proteins in four sequential phases:
/// (1) proteins with identical peptide sets are merged into one group,
/// (2) groups whose peptides are all shared are removed,
/// (3) groups with fewer than <c>MinPeptidesPerGroup</c> peptides are removed
/// (only when that setting is greater than one), and
/// (4) the remaining groups are sorted and those returned by the FDR filter are flagged.
/// </summary>
/// <param name="proteins">
/// The proteins to group. This list is modified in place: every protein that is merged
/// into an earlier protein's group is removed from it.
/// </param>
/// <param name="printMessages">When true, a progress message is logged after each phase.</param>
/// <returns>
/// All groups that survive phases 1-3. Groups yielded by the FDR filter have
/// <c>PassesFDR</c> set to true; the others are still part of the returned list.
/// </returns>
private List<ProteinGroup> GroupProteins(List<Protein> proteins, bool printMessages = true)
{
    // Accumulates the groups; every phase below prunes or annotates this list.
    List<ProteinGroup> groups = new List<ProteinGroup>();

    if (printMessages)
    {
        Log("Grouping proteins into protein groups...");
        Log("{0:N0} original proteins (maximum proteins identified)", proteins.Count);
    }

    // Phase 1: merge indistinguishable proteins.
    // Every protein ends up in exactly one group; proteins that share the exact same
    // peptide set share a group. Example (protein: peptides):
    //   A: 1 2 3 4
    //   B: 1 2 3 4
    //   C: 1 3 4
    // A and B are indistinguishable and form PG1 [A, B]; C forms PG2 [C] on its own.
    #region Indistinguishable
    for (int anchorIndex = 0; anchorIndex < proteins.Count; anchorIndex++)
    {
        // The anchor protein seeds a new group with its own peptide set.
        Protein anchor = proteins[anchorIndex];
        HashSet<Peptide> anchorPeptides = anchor.Peptides;
        ProteinGroup currentGroup = new ProteinGroup(anchor, anchorPeptides);

        // Scan everything after the anchor for proteins with the same peptide set.
        int scanIndex = anchorIndex + 1;
        while (scanIndex < proteins.Count)
        {
            Protein other = proteins[scanIndex];
            if (!other.Peptides.SetEquals(anchorPeptides))
            {
                // Distinguishable: leave it in the list and move on.
                scanIndex++;
                continue;
            }

            // Indistinguishable: fold it into the group and drop it from the list so it
            // never seeds a group of its own. The index is not advanced because the next
            // protein has shifted into this slot.
            currentGroup.Add(other);
            proteins.RemoveAt(scanIndex);
        }

        groups.Add(currentGroup);
    }

    if (printMessages)
    {
        Log("{0:N0} protein groups are left after combining indistinguishable proteins (having the exact same set of peptides)", groups.Count);
    }
    #endregion Indistinguishable

    // Phase 2: remove subsumable groups.
    // Groups are visited in the order produced by ProteinGroup.CompareDecreasing so that
    // the worst-scoring groups are considered for removal first (lower p-values are
    // better scores). Example (p-value, group: peptides):
    //   0.1   A: 1 2
    //   0.05  B: 1 3
    //   0.01  C: 2 3
    // A is visited first; all of its peptides are shared, so it is removed. That leaves
    // B and C, which still share peptide 3 but now each own an unshared peptide (1 and 2).
    // NOTE: the call to MappedPeptidesToProteinGroups that once preceded this phase is
    // disabled, so Peptide.IsShared is read as-is here.
    #region Subsumable
    foreach (ProteinGroup groupToScore in groups)
    {
        groupToScore.UpdatePValue(PScoreCalculationMethod, UseConservativePScore);
    }

    groups.Sort(ProteinGroup.CompareDecreasing);

    int subsumableIndex = 0;
    while (subsumableIndex < groups.Count)
    {
        ProteinGroup candidate = groups[subsumableIndex];
        HashSet<Peptide> candidatePeptides = candidate.Peptides;

        // A group is kept as soon as one of its peptides is not shared.
        bool ownsUnsharedPeptide = candidatePeptides.Any(peptide => !peptide.IsShared);
        if (ownsUnsharedPeptide)
        {
            subsumableIndex++;
            continue;
        }

        // Every peptide is shared: detach the group from its peptides, then drop it.
        // The index stays put because the next group shifts into this position.
        foreach (Peptide sharedPeptide in candidatePeptides)
        {
            sharedPeptide.ProteinGroups.Remove(candidate);
        }
        groups.RemoveAt(subsumableIndex);
    }

    if (printMessages)
    {
        Log("{0:N0} protein groups are left after removing subsumable groups (peptides can be explain by other groups)", groups.Count);
    }
    #endregion Subsumable

    // Phase 3: remove groups that do not contain enough peptides.
    #region MinimumGroupSize
    // With a minimum of one or less there is nothing to filter.
    if (MinPeptidesPerGroup > 1)
    {
        int sizeIndex = 0;
        while (sizeIndex < groups.Count)
        {
            ProteinGroup candidate = groups[sizeIndex];

            bool tooFewPeptides = candidate.Peptides.Count < MinPeptidesPerGroup;
            if (!tooFewPeptides)
            {
                sizeIndex++;
                continue;
            }

            // Detach the undersized group from each of its peptides before dropping it.
            foreach (Peptide orphanedPeptide in candidate.Peptides)
            {
                orphanedPeptide.ProteinGroups.Remove(candidate);
            }
            groups.RemoveAt(sizeIndex);
        }

        if (printMessages)
        {
            Log("{0:N0} protein groups are left after removing groups with < {1:N0} peptides [parsimonious proteins]", groups.Count, MinPeptidesPerGroup);
        }
    }
    #endregion

    // Phase 4: false discovery filtering at the protein-group level.
    #region FDR filtering
    groups.Sort();

    // Flag (but do not remove) each group the FDR filter yields, counting them for the log.
    int passingCount = 0;
    var passingGroups = FalseDiscoveryRate<ProteinGroup, double>.Filter(groups, MaxFdr / 100, true);
    foreach (ProteinGroup passingGroup in passingGroups)
    {
        passingGroup.PassesFDR = true;
        passingCount++;
    }

    if (printMessages)
    {
        Log("{0:N0} protein groups are left after applying FDR of {1:N2}% [parsimonious proteins filtered]", passingCount, MaxFdr);
    }
    #endregion FDR filtering

    return groups;
}