/// <summary>
/// Reads every supplied csv file and collects the distinct peptide sequences found in them,
/// attaching each peptide spectral match (PSM) to the peptide it identifies.
/// </summary>
/// <remarks>
/// Sequences are upper-cased and every 'I' is replaced with 'L' before being used as the
/// dictionary key, so peptides that differ only by isoleucine/leucine are merged.
/// Side effects: <c>Proteases</c> is replaced with a fresh set containing the protease of each
/// file, and <c>_smallestPeptide</c> / <c>_largestPeptide</c> are tightened/widened whenever a
/// new unique sequence falls outside the current bounds.
/// All numeric fields are parsed with the invariant culture so the result does not depend on
/// the regional settings of the machine running the analysis.
/// </remarks>
/// <param name="csvFiles">The input files to read; each supplies a file path and a protease.</param>
/// <returns>A dictionary keyed by the I/L-collapsed sequence, mapping to its <c>Peptide</c>.</returns>
private Dictionary<string, Peptide> GetAllUniquePeptides(IEnumerable<CsvFile> csvFiles)
{
    Log("Reading in unique peptides sequences from all .csv files...");

    // The csv contents are machine-generated data, so never parse them with the user's locale.
    System.Globalization.CultureInfo invariant = System.Globalization.CultureInfo.InvariantCulture;

    Dictionary<string, Peptide> peptides = new Dictionary<string, Peptide>(1 << 16);
    Proteases = new HashSet<Protease>();
    int psmCount = 0;

    // Loop over each input file and read its contents
    foreach (CsvFile csvfile in csvFiles)
    {
        // Keep a list of all the proteases used
        Proteases.Add(Protease.GetProtease(csvfile.Protease));

        // Counter for the number of PSMs loaded in this csvfile
        int csvPsmCount = 0;

        // Default column names; switched below when the alternate layout is detected.
        string sequenceString = "Peptide";
        string pvalueString = "P-value";
        bool proteomeDiscover = false;

        // Open up the csvfile and read its contents, skipping the header.
        // The StreamReader gets its own using so the file handle is released even if
        // constructing the CsvReader throws.
        using (StreamReader stream = new StreamReader(csvfile.FilePath))
        using (CsvReader reader = new CsvReader(stream, true))
        {
            // A header named "XCorr" marks the alternate layout, which uses different
            // column names for the sequence, the score and the retention time.
            if (reader.GetFieldHeaders().Contains("XCorr"))
            {
                sequenceString = "Sequence";
                pvalueString = "PEP";
                proteomeDiscover = true;
            }

            // Read each line of the csv
            while (reader.ReadNextRecord())
            {
                // Remove leucine / isoleucine ambiguity. ToUpperInvariant is required here:
                // a culture-sensitive ToUpper can map 'i' to a character other than 'I'
                // (e.g. under Turkish cultures), which the Replace below would then miss.
                string leuSeq = reader[sequenceString].ToUpperInvariant().Replace('I', 'L');

                double rt = 0;
                int specNum = 0;
                if (proteomeDiscover)
                {
                    // This layout has no spectrum number column read here; specNum stays 0.
                    if (ProteinsPerMinute)
                    {
                        rt = double.Parse(reader["RT [min]"], invariant);
                    }
                }
                else
                {
                    specNum = int.Parse(reader["Spectrum number"], invariant);
                    if (ProteinsPerMinute)
                    {
                        // The retention time is taken from the first capture group of
                        // omssaRTRegex applied to the "Filename/id" field.
                        rt = double.Parse(omssaRTRegex.Match(reader["Filename/id"]).Groups[1].Value, invariant);
                    }
                }

                double pvalue = double.Parse(reader[pvalueString], invariant);

                // Create a new peptide spectral match
                PSM psm = new PSM(csvfile, specNum, rt, pvalue);

                // Add to the list of all the unique peptides
                Peptide realPep;
                if (peptides.TryGetValue(leuSeq, out realPep)) // Faster than contains key since you only try to hash once
                {
                    realPep.PSMs.Add(psm);
                }
                else
                {
                    realPep = new Peptide(leuSeq);
                    realPep.PSMs.Add(psm);
                    peptides.Add(leuSeq, realPep);

                    // Check to see if the peptide was the biggest or smallest
                    if (leuSeq.Length < _smallestPeptide)
                    {
                        _smallestPeptide = leuSeq.Length;
                    }
                    if (leuSeq.Length > _largestPeptide)
                    {
                        _largestPeptide = leuSeq.Length;
                    }
                }

                // General psm counters
                csvPsmCount++;
            }
        }

        // Total psms loaded
        psmCount += csvPsmCount;
        Log("{0:N0} PSMs were loaded from {1}", csvPsmCount, csvfile);
    }

    // Guard the 0/0 case so an empty input logs "0.0%" instead of "NaN%".
    double uniquePercent = psmCount > 0 ? 100.0 * ((double)peptides.Count / psmCount) : 0.0;
    Log("{0:N0} unique peptides were found from the {1:N0} PSMs loaded ({2:F1}%). (I/L ambiguity removed)", peptides.Count, psmCount, uniquePercent);
    return peptides;
}