CanvasBin.CanvasBin.ComputeObservedVsExpectedGC C# (CSharp) Method

ComputeObservedVsExpectedGC() static private method

Computes fragment-based GC normalization correction factor
static private ComputeObservedVsExpectedGC ( HitArray>.Dictionary observedAlignments, byte[]>.Dictionary readGCContent, NexteraManifest manifest, bool debugGC, string outFile ) : float[]
observedAlignments HitArray>.Dictionary
readGCContent byte[]>.Dictionary
manifest NexteraManifest
debugGC bool
outFile string
return float[]
        static float[] ComputeObservedVsExpectedGC(Dictionary<string, HitArray> observedAlignments,
            Dictionary<string, byte[]> readGCContent, NexteraManifest manifest,
            bool debugGC, string outFile)
        {
            Dictionary<string, List<NexteraManifest.ManifestRegion>> regionsByChrom = null;
            if (manifest != null)
            {
                regionsByChrom = manifest.GetManifestRegionsByChromosome();
            }

            long[] expectedReadCountsByGC = new long[numberOfGCbins];
            long[] observedReadCountsByGC = new long[numberOfGCbins];
            foreach (KeyValuePair<string, byte[]> chromosomeReadGCContent in readGCContent)
            {
                string chr = chromosomeReadGCContent.Key;
                if (!observedAlignments.ContainsKey(chr)) { continue; }

                if (manifest == null) // look at the entire genome
                {
                    for (int i = 0; i < chromosomeReadGCContent.Value.Length; i++)
                    {
                        expectedReadCountsByGC[chromosomeReadGCContent.Value[i]]++;
                        observedReadCountsByGC[chromosomeReadGCContent.Value[i]] += observedAlignments[chr].Data[i];
                    }
                }
                else // look at only the targeted regions
                {
                    if (!regionsByChrom.ContainsKey(chr)) { continue; }
                    int i = -1;
                    foreach (var region in regionsByChrom[chr])
                    {
                        if (i < region.Start) // avoid overlapping targeted regions
                        {
                            i = region.Start - 1; // i is 0-based; manifest coordinates are 1-based.
                        }
                        for (; i < chromosomeReadGCContent.Value.Length && i < region.End; i++)
                        {
                            expectedReadCountsByGC[chromosomeReadGCContent.Value[i]]++;
                            observedReadCountsByGC[chromosomeReadGCContent.Value[i]] += observedAlignments[chr].Data[i];
                        }
                    }
                }
            }

            // calculate ratio of observed to expected read counts for each read GC bin
            float[] observedVsExpectedGC = new float[numberOfGCbins];
            for (int i = 0; i < numberOfGCbins; i++)
                observedVsExpectedGC[i] = 1;
            long sumObserved = 0;
            long sumExpected = 0;
            foreach (long gcContent in observedReadCountsByGC)
                sumObserved += gcContent;
            foreach (long gcContent in expectedReadCountsByGC)
                sumExpected += gcContent;
            for (int binIndex = 0; binIndex < numberOfGCbins; binIndex++)
            {
                if (expectedReadCountsByGC[binIndex] == 0)
                    expectedReadCountsByGC[binIndex] = 1;
                if (observedReadCountsByGC[binIndex] == 0)
                    observedReadCountsByGC[binIndex] = 1;
                observedVsExpectedGC[binIndex] = ((float)observedReadCountsByGC[binIndex] / (float)expectedReadCountsByGC[binIndex]) * ((float)sumExpected / (float)sumObserved);
            }

            if (debugGC)
            {
                using (GzipWriter writer = new GzipWriter(outFile + ".gcstat"))
                {
                    for (int binIndex = 0; binIndex < numberOfGCbins; binIndex++)
                    {
                        writer.WriteLine(string.Format("{0}\t{1}\t{2}", expectedReadCountsByGC[binIndex], observedReadCountsByGC[binIndex], observedVsExpectedGC[binIndex]));
                    }
                }
            }
            return observedVsExpectedGC;
        }