/// <summary>
/// Computes, for every read-GC bin, the ratio of observed to expected read counts,
/// scaled by (total expected / total observed) across all bins.
/// </summary>
/// <param name="observedAlignments">
/// Per-chromosome hit arrays. <c>Data[i]</c> is added to the observed count of the GC bin
/// found at position <c>i</c>. Chromosomes missing from this dictionary are skipped.
/// NOTE(review): <c>Data</c> is indexed with the same positions as the GC array, so it is
/// assumed to be at least as long as the positions visited - confirm against the producer.
/// </param>
/// <param name="readGCContent">
/// Per-chromosome array holding, for each position, the GC bin index of that position.
/// NOTE(review): values are used directly as indices into arrays of length
/// <c>numberOfGCbins</c>, so they are assumed to be smaller than that - confirm.
/// </param>
/// <param name="manifest">
/// Optional manifest. When null every position of every chromosome is counted; otherwise
/// only positions covered by the manifest's regions are counted (overlapping regions are
/// counted once), and chromosomes without regions are skipped.
/// </param>
/// <param name="debugGC">When true, a per-bin table is written to <paramref name="outFile"/> + ".gcstat".</param>
/// <param name="outFile">Path prefix used for the debug output file.</param>
/// <returns>
/// An array of length <c>numberOfGCbins</c> with the normalized observed/expected ratio per bin.
/// Bins with a zero expected or observed count are treated as having a count of 1.
/// If nothing was accumulated at all (total observed or total expected is zero) every ratio
/// is left at the neutral value 1 instead of becoming Infinity or NaN.
/// </returns>
static float[] ComputeObservedVsExpectedGC(Dictionary<string, HitArray> observedAlignments,
    Dictionary<string, byte[]> readGCContent, NexteraManifest manifest,
    bool debugGC, string outFile)
{
    Dictionary<string, List<NexteraManifest.ManifestRegion>> regionsByChrom = null;
    if (manifest != null)
    {
        regionsByChrom = manifest.GetManifestRegionsByChromosome();
    }

    long[] expectedReadCountsByGC = new long[numberOfGCbins];
    long[] observedReadCountsByGC = new long[numberOfGCbins];
    foreach (KeyValuePair<string, byte[]> chromosomeReadGCContent in readGCContent)
    {
        string chr = chromosomeReadGCContent.Key;
        byte[] gcBins = chromosomeReadGCContent.Value;

        // Resolve the hit array once per chromosome instead of once per position.
        HitArray chromosomeHits;
        if (!observedAlignments.TryGetValue(chr, out chromosomeHits)) { continue; }

        if (manifest == null) // look at the entire genome
        {
            for (int position = 0; position < gcBins.Length; position++)
            {
                byte gcBin = gcBins[position];
                expectedReadCountsByGC[gcBin]++;
                observedReadCountsByGC[gcBin] += chromosomeHits.Data[position];
            }
        }
        else // look at only the targeted regions
        {
            List<NexteraManifest.ManifestRegion> regions;
            if (!regionsByChrom.TryGetValue(chr, out regions)) { continue; }

            // 0-based index of the next position that has not been counted yet.
            // Starting at 0 keeps the index non-negative even if a region reports Start <= 0.
            int position = 0;
            foreach (var region in regions)
            {
                // Manifest coordinates are 1-based; position is 0-based.
                int regionFirstIndex = region.Start - 1;
                if (position < regionFirstIndex) // only move forward: avoids recounting overlapping regions
                {
                    position = regionFirstIndex;
                }
                for (; position < gcBins.Length && position < region.End; position++)
                {
                    byte gcBin = gcBins[position];
                    expectedReadCountsByGC[gcBin]++;
                    observedReadCountsByGC[gcBin] += chromosomeHits.Data[position];
                }
            }
        }
    }

    // calculate ratio of observed to expected read counts for each read GC bin
    float[] observedVsExpectedGC = new float[numberOfGCbins];
    for (int binIndex = 0; binIndex < numberOfGCbins; binIndex++)
    {
        observedVsExpectedGC[binIndex] = 1;
    }

    long sumObserved = 0;
    long sumExpected = 0;
    foreach (long binCount in observedReadCountsByGC)
    {
        sumObserved += binCount;
    }
    foreach (long binCount in expectedReadCountsByGC)
    {
        sumExpected += binCount;
    }

    // Without any observed hits (or any counted positions) the global scale factor would be
    // a division by zero; in that case the ratios stay at their neutral default of 1.
    bool canNormalize = sumObserved > 0 && sumExpected > 0;
    float globalScale = canNormalize ? (float)sumExpected / (float)sumObserved : 1f;

    for (int binIndex = 0; binIndex < numberOfGCbins; binIndex++)
    {
        // Empty bins are clamped to 1 so the per-bin ratio is always defined.
        // The clamped values are also what the debug table below reports.
        if (expectedReadCountsByGC[binIndex] == 0)
        {
            expectedReadCountsByGC[binIndex] = 1;
        }
        if (observedReadCountsByGC[binIndex] == 0)
        {
            observedReadCountsByGC[binIndex] = 1;
        }
        if (canNormalize)
        {
            observedVsExpectedGC[binIndex] = ((float)observedReadCountsByGC[binIndex] / (float)expectedReadCountsByGC[binIndex]) * globalScale;
        }
    }

    if (debugGC)
    {
        // One tab-separated line per bin: expected count, observed count, normalized ratio.
        using (GzipWriter writer = new GzipWriter(outFile + ".gcstat"))
        {
            for (int binIndex = 0; binIndex < numberOfGCbins; binIndex++)
            {
                writer.WriteLine(string.Format("{0}\t{1}\t{2}", expectedReadCountsByGC[binIndex], observedReadCountsByGC[binIndex], observedVsExpectedGC[binIndex]));
            }
        }
    }
    return observedVsExpectedGC;
}