/// <summary>
/// Writes the sequence coverage reports for every non-decoy protein group.
/// </summary>
/// <remarks>
/// Four files are created in <paramref name="outputDirectory"/>:
/// "Sequence Coverage Map.txt" (human-readable map), "data.csv" (one row per
/// contiguous covered run), "identifiedSequences.fasta" (covered residues only,
/// gaps written as spaces) and "observedProteins.fasta" (full representative
/// sequences). Groups are processed in ascending order of LongestProteinLen.
/// </remarks>
/// <param name="proteinGroups">The protein groups to report; decoy groups are skipped.</param>
/// <param name="outputDirectory">Directory in which the four output files are created.</param>
private void WriteSequenceMaps(List<ProteinGroup> proteinGroups, string outputDirectory)
{
    string fileName = Path.Combine(outputDirectory, "Sequence Coverage Map.txt");
    Log("Writing file " + fileName);
    string csvFile = Path.Combine(outputDirectory, "data.csv");
    string identifiedFile = Path.Combine(outputDirectory, "identifiedSequences.fasta");
    string observedFile = Path.Combine(outputDirectory, "observedProteins.fasta");
    using (StreamWriter csvWriter = new StreamWriter(csvFile))
    using (FastaWriter identifiedWriter = new FastaWriter(identifiedFile))
    using (FastaWriter observedWriter = new FastaWriter(observedFile))
    using (StreamWriter writer = new StreamWriter(fileName))
    {
        csvWriter.WriteLine("protein,position,count,coverage,protein name,# peptides");
        int proteinID = 0;
        foreach (ProteinGroup proteinGroup in proteinGroups.Where(g => !g.IsDecoy).OrderBy(g => g.LongestProteinLen))
        {
            proteinID++;
            string sequence = proteinGroup.RepresentativeProtein.Sequence;
            string leusequence = proteinGroup.RepresentativeProtein.LeucineSequence;
            // bits[i] > 0 is treated below as "residue i is covered"
            // (presumably a per-residue peptide count; confirm against GetSequenceCoverage).
            int[] bits = proteinGroup.RepresentativeProtein.GetSequenceCoverage(proteinGroup.Peptides);
            ISet<Peptide> peptides = proteinGroup.Peptides;

            WriteGroupSummary(writer, proteinGroup, peptides, bits);
            if (proteinGroup.Count > 1)
            {
                WriteGroupSequenceComparison(writer, proteinGroup, sequence);
            }

            // Every sequence line starts with one marker column, so residue N sits in column N.
            WritePositionRuler(writer, sequence.Length);
            writer.WriteLine(" " + sequence);
            observedWriter.Write(sequence, proteinGroup.RepresentativeProtein.Description);

            string coveredSequence = WriteCoverageTrack(writer, csvWriter, proteinGroup, proteinID, sequence, bits, peptides.Count);
            identifiedWriter.Write(coveredSequence, proteinGroup.RepresentativeProtein.Description);
            writer.WriteLine();
            writer.WriteLine();

            WritePeptideAlignments(writer, peptides, leusequence);

            // Give some room between proteins
            writer.WriteLine();
        }
    }
}

/// <summary>
/// Writes the "========" delimited summary block for one protein group:
/// member descriptions, length, redundancy, coverage and peptide counts.
/// </summary>
private static void WriteGroupSummary(StreamWriter writer, ProteinGroup proteinGroup, ISet<Peptide> peptides, int[] bits)
{
    writer.WriteLine("========");
    writer.WriteLine("Proteins = {0}", proteinGroup.Count);
    // The representative protein is always listed first, followed by the other members.
    writer.WriteLine(proteinGroup.RepresentativeProtein.Description);
    foreach (Protein prot in proteinGroup)
    {
        if (prot != proteinGroup.RepresentativeProtein)
        {
            writer.WriteLine(prot.Description);
        }
    }
    writer.WriteLine("Length = {0}", proteinGroup.RepresentativeProtein.Length);
    writer.WriteLine("Redundacy = {0:g3}%", proteinGroup.SequenceRedundacy);
    writer.WriteLine("Coverage = {0:g3}%, {1} AA", proteinGroup.SequenceCoverage, bits.Count(bit => bit > 0));

    int shared = peptides.Count(pep => pep.IsShared);
    if (shared > 0)
    {
        // Recompute coverage using only the peptides that are not shared.
        int[] unsharedBits = proteinGroup.RepresentativeProtein.GetSequenceCoverage(peptides.Where(pep => !pep.IsShared));
        int observedAminoAcids = unsharedBits.Count(bit => bit > 0);
        // Guard the division so an empty coverage array reports 0% instead of NaN.
        double coverage = unsharedBits.Length == 0 ? 0.0 : (double)observedAminoAcids / unsharedBits.Length * 100.0;
        writer.WriteLine("Coverage = {0:g3}%, {1} AA (unshared only)", coverage, observedAminoAcids);
        writer.WriteLine("Peptides = {0}, {1} are shared (marked by *)", peptides.Count, shared);
    }
    else
    {
        writer.WriteLine("Peptides = {0}", peptides.Count);
    }
    writer.WriteLine("========");
}

/// <summary>
/// Writes every member sequence of a multi-protein group beneath a marker line
/// that carries '*' at each position where any member differs from
/// <paramref name="sequence"/> (or is too short to have that position).
/// </summary>
private static void WriteGroupSequenceComparison(StreamWriter writer, ProteinGroup proteinGroup, string sequence)
{
    writer.WriteLine("Protein Sequences (Differences marked by *)");
    writer.Write(' ');
    for (int i = 0; i < sequence.Length; i++)
    {
        bool same = true;
        char c = sequence[i];
        foreach (Protein prot in proteinGroup)
        {
            if (i >= prot.Length || !prot.Sequence[i].Equals(c))
            {
                same = false;
                break;
            }
        }
        writer.Write(same ? ' ' : '*');
    }
    writer.WriteLine();
    foreach (Protein prot in proteinGroup)
    {
        writer.Write(' ');
        writer.WriteLine(prot.Sequence);
    }
    writer.WriteLine("========");
}

/// <summary>
/// Writes the residue-number ruler: "1" above the first residue and then a label
/// at every tenth position, each label starting in the column of the residue it names.
/// </summary>
private static void WritePositionRuler(StreamWriter writer, int length)
{
    writer.Write(" 1");
    int column = 2; // number of characters already written on this line
    // "<=" so that a sequence whose length is an exact multiple of ten still gets its last label.
    for (int position = 10; position <= length; position += 10)
    {
        // Derive the padding from the real label width instead of a fixed digit table,
        // so labels of any magnitude stay aligned.
        string label = position.ToString(System.Globalization.CultureInfo.InvariantCulture);
        writer.Write(new string(' ', position - column));
        writer.Write(label);
        column = position + label.Length;
    }
    writer.WriteLine();
}

/// <summary>
/// Writes the coverage track (covered residues shown, uncovered ones blanked) to the map
/// and emits one CSV row per contiguous covered run.
/// </summary>
/// <returns>The track text: the sequence with every uncovered residue replaced by a space.</returns>
private static string WriteCoverageTrack(StreamWriter writer, StreamWriter csvWriter, ProteinGroup proteinGroup, int proteinID, string sequence, int[] bits, int peptideCount)
{
    StringBuilder track = new StringBuilder(bits.Length);
    int runStart = -1; // index where the current covered run began, or -1 when outside a run
    for (int i = 0; i < bits.Length; i++)
    {
        if (bits[i] > 0)
        {
            track.Append(sequence[i]);
            if (runStart < 0)
            {
                runStart = i;
            }
        }
        else
        {
            if (runStart >= 0)
            {
                WriteCoverageRunRow(csvWriter, proteinGroup, proteinID, runStart, i - runStart, peptideCount);
                runStart = -1;
            }
            track.Append(' ');
        }
    }
    if (runStart >= 0)
    {
        // The final run extends to the end of the sequence.
        WriteCoverageRunRow(csvWriter, proteinGroup, proteinID, runStart, bits.Length - runStart, peptideCount);
    }

    string text = track.ToString();
    writer.Write(" "); // marker column, keeps the track aligned with the sequence line
    writer.Write(text);
    return text;
}

/// <summary>
/// Writes one "protein,position,count,coverage,protein name,# peptides" CSV row.
/// </summary>
private static void WriteCoverageRunRow(StreamWriter csvWriter, ProteinGroup proteinGroup, int proteinID, int start, int count, int peptideCount)
{
    // Invariant culture keeps numeric fields free of culture-specific decimal commas,
    // which would otherwise be read as extra column separators.
    csvWriter.WriteLine(string.Format(
        System.Globalization.CultureInfo.InvariantCulture,
        "{0},{1},{2},{3},{4},{5}",
        proteinID,
        start,
        count,
        proteinGroup.SequenceCoverage,
        ToCsvField(proteinGroup.Description),
        peptideCount));
}

/// <summary>
/// Converts a value to CSV field text, quoting it (and doubling embedded quotes)
/// when it contains a comma, a double quote or a line break.
/// </summary>
private static string ToCsvField(object value)
{
    string text = System.Convert.ToString(value, System.Globalization.CultureInfo.InvariantCulture) ?? string.Empty;
    if (text.IndexOfAny(new[] { ',', '"', '\r', '\n' }) < 0)
    {
        return text;
    }
    return "\"" + text.Replace("\"", "\"\"") + "\"";
}

/// <summary>
/// Writes one line per occurrence of each peptide's LeucineSequence within
/// <paramref name="leusequence"/>, indented to the position where it matches.
/// Peptides are ordered by first match position, then by descending length.
/// </summary>
private static void WritePeptideAlignments(StreamWriter writer, ISet<Peptide> peptides, string leusequence)
{
    // Residue strings are plain symbols, so match them ordinally rather than linguistically.
    foreach (Peptide peptide in peptides
        .OrderBy(pep => leusequence.IndexOf(pep.LeucineSequence, 0, System.StringComparison.Ordinal))
        .ThenByDescending(pep => pep.Length))
    {
        string marker = peptide.IsShared ? "*" : " ";
        int searchFrom = 0;
        while (true)
        {
            int index = leusequence.IndexOf(peptide.LeucineSequence, searchFrom, System.StringComparison.Ordinal);
            if (index < 0)
            {
                break;
            }
            // The marker column is written for every occurrence so repeated matches stay
            // aligned with the sequence line, and a peptide without any match emits nothing
            // (rather than a dangling marker that would shift the next peptide's line).
            writer.Write(marker);
            writer.Write(new string(' ', index));
            writer.WriteLine(peptide.LeucineSequence);
            // Advance by one so overlapping occurrences are also reported.
            searchFrom = index + 1;
        }
    }
}