/// <summary>
/// Writes a tab-delimited table of per-window coverage and allele-frequency summaries, one row per
/// fixed-size window (100,000 positions) of every non-mitochondrial sequence listed in the
/// reference's GenomeSize.xml.
/// </summary>
/// <remarks>
/// Every row carries the chromosome name, window start and window end. The remaining columns
/// (copy number, major chromosome count, median hits, normalized coverage, median minor allele
/// frequency, reference ploidy and the variant-frequency histogram) are only written when the
/// window collected at least 30 count values; the median-MAF cell and the histogram cells are left
/// empty when fewer than 10 variant frequencies were collected. All values are formatted with the
/// invariant culture so the file content does not depend on the machine's regional settings.
/// </remarks>
/// <param name="segments">Segments to summarise; grouped by chromosome via GetSegmentsByChromosome.</param>
/// <param name="normalDiploidCoverage">
/// Denominator of the NormalizedCoverage column (2 * median hits / this value). Must have a value
/// whenever <paramref name="segments"/> is non-empty.
/// </param>
/// <param name="referencePloidy">
/// Optional reference ploidy intervals. When null, or when it has no entry for a chromosome,
/// a ploidy of 2 is reported.
/// </param>
/// <param name="filePath">Path of the output file (opened with <c>new StreamWriter(filePath)</c>).</param>
/// <param name="referenceFolder">Folder that contains GenomeSize.xml.</param>
/// <exception cref="ApplicationException">
/// <paramref name="segments"/> is non-empty and <paramref name="normalDiploidCoverage"/> has no value.
/// </exception>
public static void WriteCoveragePlotData(List<CanvasSegment> segments, double? normalDiploidCoverage, PloidyInfo referencePloidy,
    string filePath, string referenceFolder)
{
    if (segments.Any() && !normalDiploidCoverage.HasValue)
    {
        throw new ApplicationException("normal diploid coverage must be specified");
    }

    // This file is machine-readable: always use '.' as the decimal separator, whatever the
    // current thread culture is.
    IFormatProvider invariant = System.Globalization.CultureInfo.InvariantCulture;

    Dictionary<string, List<CanvasSegment>> segmentsByChromosome = GetSegmentsByChromosome(segments);
    GenomeMetadata genome = new GenomeMetadata();
    genome.Deserialize(Path.Combine(referenceFolder, "GenomeSize.xml"));
    int pointLength = 100000;

    // Scratch lists, cleared and reused for every window.
    List<float> counts = new List<float>();
    List<float> minorAlleleFrequencies = new List<float>();
    List<float> variantFrequencies = new List<float>();

    using (StreamWriter writer = new StreamWriter(filePath))
    {
        writer.NewLine = "\n";
        writer.Write("#Chromosome\tStart\tEnd\tCopyNumber\tMajorChromosomeCount\tMedianHits\tNormalizedCoverage\tMedianMinorAlleleFrequency\tReferencePloidy\t");
        for (int i = 0; i < NumberVariantFrequencyBins; i++)
        {
            writer.Write(string.Format(invariant, "VariantFrequencyBin{0}\t", i));
        }
        writer.WriteLine();

        foreach (GenomeMetadata.SequenceMetadata chromosome in genome.Sequences)
        {
            if (chromosome.IsMito())
            {
                continue;
            }

            // The segment list only depends on the chromosome, so look it up once per chromosome
            // rather than once per window.
            List<CanvasSegment> chrSegments;
            if (!segmentsByChromosome.TryGetValue(chromosome.Name, out chrSegments))
            {
                chrSegments = new List<CanvasSegment>();
            }

            int pointStartPos = 0; // 0-based start
            while (pointStartPos < chromosome.Length)
            {
                int pointEndPos = (int)Math.Min(chromosome.Length, pointStartPos + pointLength); // 1-based end
                counts.Clear();
                minorAlleleFrequencies.Clear();
                variantFrequencies.Clear();

                // Overlap (in positions) per "copyNumber majorChromosomeCount" pair and per copy number.
                Dictionary<string, long> basesByCopyNumberAndChromCount = new Dictionary<string, long>();
                Dictionary<int, long> basesByCopyNumber = new Dictionary<int, long>();

                // Accumulate overlap weights from the segments touching this window:
                List<CanvasSegment> overlapSegments = new List<CanvasSegment>();
                foreach (CanvasSegment segment in chrSegments)
                {
                    if (segment.Begin > pointEndPos)
                    {
                        continue;
                    }
                    if (segment.End < pointStartPos)
                    {
                        continue;
                    }

                    int weight = Math.Min(segment.End, pointEndPos) - Math.Max(segment.Begin, pointStartPos);

                    // A missing major chromosome count formats as an empty string, giving a key
                    // such as "2 "; the empty second field is skipped when the key is parsed below.
                    string key = string.Format(invariant, "{0} {1}", segment.copyNumber, segment.MajorChromosomeCount);
                    long pairBases;
                    basesByCopyNumberAndChromCount.TryGetValue(key, out pairBases);
                    basesByCopyNumberAndChromCount[key] = pairBases + weight;

                    long copyNumberBases;
                    basesByCopyNumber.TryGetValue(segment.copyNumber, out copyNumberBases);
                    basesByCopyNumber[segment.copyNumber] = copyNumberBases + weight;

                    overlapSegments.Add(segment);
                }

                // Note the most common copy number (first one wins on ties; stays 0 if nothing overlaps):
                long bestCount = 0;
                int majorCopyNumber = 0;
                foreach (KeyValuePair<int, long> entry in basesByCopyNumber)
                {
                    if (entry.Value > bestCount)
                    {
                        bestCount = entry.Value;
                        majorCopyNumber = entry.Key;
                    }
                }

                // Find the most common major chromosome count, for the most common copy number
                // (last one wins on ties):
                int? majorChromosomeCount = null;
                bestCount = 0;
                foreach (KeyValuePair<string, long> entry in basesByCopyNumberAndChromCount)
                {
                    string[] bits = entry.Key.Split();
                    if (bits[1].Length == 0)
                    {
                        continue;
                    }
                    if (int.Parse(bits[0], invariant) != majorCopyNumber)
                    {
                        continue;
                    }
                    if (entry.Value < bestCount)
                    {
                        continue;
                    }
                    bestCount = entry.Value;
                    majorChromosomeCount = int.Parse(bits[1], invariant);
                }

                // Note allele frequency and coverage info, for all overlap segments that match (more or less)
                // the most common copy number, i.e. that fall in the same class (loss / diploid / gain):
                foreach (CanvasSegment segment in overlapSegments)
                {
                    if ((majorCopyNumber == 2 && segment.copyNumber != 2) ||
                        (majorCopyNumber < 2 && segment.copyNumber >= 2) ||
                        (majorCopyNumber > 2 && segment.copyNumber <= 2))
                    {
                        continue;
                    }

                    float segLength = segment.End - segment.Begin;

                    // Add counts to the overall list. The per-segment values are treated as evenly
                    // spread along the segment, so the window bounds are scaled into list indices.
                    int firstIndex = 0;
                    if (pointStartPos > segment.Begin)
                    {
                        firstIndex = (int)((float)segment.Counts.Count * (pointStartPos - segment.Begin) / segLength);
                    }
                    int lastIndex = segment.Counts.Count;
                    if (pointEndPos < segment.End)
                    {
                        lastIndex = (int)((float)segment.Counts.Count * (pointEndPos - segment.Begin) / segLength);
                    }
                    for (int index = firstIndex; index < lastIndex; index++)
                    {
                        counts.Add(segment.Counts[index]);
                    }

                    // Add variant frequencies (and the derived MAF) to the overall lists, same scaling:
                    firstIndex = 0;
                    if (pointStartPos > segment.Begin)
                    {
                        firstIndex = (int)((float)segment.VariantFrequencies.Count * (pointStartPos - segment.Begin) / segLength);
                    }
                    lastIndex = segment.VariantFrequencies.Count;
                    if (pointEndPos < segment.End)
                    {
                        lastIndex = (int)((float)segment.VariantFrequencies.Count * (pointEndPos - segment.Begin) / segLength);
                    }
                    for (int index = firstIndex; index < lastIndex; index++)
                    {
                        float tempMAF = segment.VariantFrequencies[index];
                        variantFrequencies.Add(tempMAF);
                        // Fold frequencies above 0.5 so the value is the minor allele frequency.
                        if (tempMAF > 0.5)
                        {
                            tempMAF = 1 - tempMAF;
                        }
                        minorAlleleFrequencies.Add(tempMAF);
                    }
                }

                // Write output for this point:
                writer.Write(string.Format(invariant, "{0}\t{1}\t{2}\t", chromosome.Name, pointStartPos, pointEndPos));

                // Write counts if we have reasonable amounts of data; write MAF if we have reasonable amounts of data.
                // (Note: Observed that for germline data on chrY we often had well under 100 counts given the new, smaller bin size)
                if (counts.Count >= 30)
                {
                    writer.Write(string.Format(invariant, "{0}\t", majorCopyNumber));
                    // A null major chromosome count is written as an empty cell.
                    writer.Write(string.Format(invariant, "{0}\t", majorChromosomeCount));

                    counts.Sort();
                    double medianHits = counts[counts.Count / 2];
                    writer.Write(string.Format(invariant, "{0:F2}\t", medianHits));
                    double normalizedCount = 2 * medianHits / normalDiploidCoverage.Value;
                    writer.Write(string.Format(invariant, "{0:F2}\t", normalizedCount));

                    if (minorAlleleFrequencies.Count >= 10)
                    {
                        minorAlleleFrequencies.Sort();
                        writer.Write(string.Format(invariant, "{0}\t", minorAlleleFrequencies[minorAlleleFrequencies.Count / 2]));
                    }
                    else
                    {
                        writer.Write("\t");
                    }

                    // Default to diploid; the last reference interval touching the window wins.
                    int refPloidy = 2;
                    if (referencePloidy != null && referencePloidy.PloidyByChromosome.ContainsKey(chromosome.Name))
                    {
                        foreach (var interval in referencePloidy.PloidyByChromosome[chromosome.Name])
                        {
                            if (interval.Start <= pointEndPos && interval.End >= pointStartPos)
                            {
                                refPloidy = interval.Ploidy;
                            }
                        }
                    }
                    writer.Write(string.Format(invariant, "{0}\t", refPloidy));

                    if (variantFrequencies.Count >= 10)
                    {
                        // Bin the variant frequencies in steps of 0.01 (overflow goes into the last bin)
                        // and report each bin as a percentage of all values in the window.
                        float[] vfDistribution = new float[NumberVariantFrequencyBins];
                        foreach (float vf in variantFrequencies)
                        {
                            int binNumber = Math.Min(vfDistribution.Length - 1, (int)Math.Floor(vf / 0.01));
                            vfDistribution[binNumber]++;
                        }
                        for (int i = 0; i < vfDistribution.Length; i++)
                        {
                            vfDistribution[i] = vfDistribution[i] / (float)variantFrequencies.Count * 100.0f;
                            writer.Write(string.Format(invariant, "{0:F2}\t", vfDistribution[i]));
                        }
                    }
                    else
                    {
                        for (int i = 0; i < NumberVariantFrequencyBins; i++)
                        {
                            writer.Write("\t");
                        }
                    }
                }

                writer.WriteLine();
                pointStartPos += pointLength;
            }
        }
    }
}