/// <summary>
/// Reduces the spread of bin counts in GC strata whose interquartile range (IQR)
/// is large compared with the IQR of all counts returned by
/// <c>EnrichmentUtilities.GetCountsByGC</c>. Affected bins have their
/// <c>Count</c> pulled towards the median of their GC stratum, in place.
/// </summary>
/// <param name="bins">Bins to inspect; <c>Count</c> of a bin may be overwritten.</param>
/// <param name="manifest">Optional manifest, passed through unchanged to <c>GetCountsByGC</c>.</param>
/// <returns>
/// <c>false</c> when no GC index in 10..89 satisfies <c>globalIQR &lt; 2 * localIQR</c>
/// (in which case no count is rescaled here); <c>true</c> otherwise.
/// </returns>
static bool NormalizeVarianceByGC(List<GenomicBin> bins, NexteraManifest manifest = null)
{
    // Debug hook: DebugPrintCountsByGC(bins, "CountsByGCVariance-Before.txt");

    // perGcCounts[g] lists the counts of bins whose GC index is g; allCounts is the pooled list.
    // (The original note describes indices 0-100 and autosomal counts; that contract lives in
    // GetCountsByGC, which is not visible here.)
    List<float>[] perGcCounts;
    List<float> allCounts;
    EnrichmentUtilities.GetCountsByGC(bins, manifest, out perGcCounts, out allCounts);

    // Pooled quartiles and the pooled IQR that every stratum is compared against.
    var pooledQuartiles = Utilities.Quartiles(allCounts);
    float pooledIqr = pooledQuartiles.Item3 - pooledQuartiles.Item1;

    int strataCount = perGcCounts.Length;
    List<Tuple<float, float, float>> quartilesByGc = new List<Tuple<float, float, float>>(strataCount);
    List<float> iqrByGc = new List<float>(strataCount);

    // Per-stratum quartiles and IQR.
    for (int gc = 0; gc < strataCount; gc++)
    {
        List<float> stratum = perGcCounts[gc];
        Tuple<float, float, float> stratumQuartiles;
        float stratumIqr;

        if (stratum.Count == 0)
        {
            // Empty stratum: -1 sentinels, which never pass the IQR comparisons below.
            stratumQuartiles = new Tuple<float, float, float>(-1f, -1f, -1f);
            stratumIqr = -1f;
        }
        else if (stratum.Count >= defaultMinNumberOfBinsPerGC)
        {
            // Enough bins: plain quartiles of this stratum alone.
            stratumQuartiles = Utilities.Quartiles(stratum);
            stratumIqr = stratumQuartiles.Item3 - stratumQuartiles.Item1;
        }
        else
        {
            // Too few bins: fall back to weighted quantiles built by GetWeightedCounts.
            List<Tuple<float, float>> weighted = GetWeightedCounts(perGcCounts, gc);
            double[] q = Utilities.WeightedQuantiles(weighted, new List<float>() { 0.25f, 0.5f, 0.75f });
            stratumQuartiles = new Tuple<float, float, float>((float)q[0], (float)q[1], (float)q[2]);
            // The difference is taken in double precision before narrowing to float.
            stratumIqr = (float)(q[2] - q[0]);
        }

        quartilesByGc.Add(stratumQuartiles);
        iqrByGc.Add(stratumIqr);
    }

    // Is there any GC index in 10..89 whose doubled IQR exceeds the pooled IQR?
    // NOTE(review): an earlier comment described this as "local IQR twice as large as the
    // genome-wide IQR", but the comparison actually fires when the local IQR is more than
    // half the pooled IQR. Confirm which was intended.
    bool anyWideStratum = false;
    for (int gc = 10; gc < 90; gc++)
    {
        if (pooledIqr < iqrByGc[gc] * 2f)
        {
            anyWideStratum = true;
        }
    }

    if (!anyWideStratum)
    {
        return false;
    }

    // Shrink each bin's deviation from its stratum median by the ratio
    // (0.8 * stratum IQR) / pooled IQR, but only where that ratio would exceed 1.
    foreach (GenomicBin bin in bins)
    {
        float dampedStratumIqr = iqrByGc[bin.GC] * 0.8f;
        bool alreadyTightEnough = pooledIqr >= dampedStratumIqr;
        if (!alreadyTightEnough)
        {
            float shrinkFactor = dampedStratumIqr / pooledIqr;
            float stratumMedian = quartilesByGc[bin.GC].Item2;
            bin.Count = stratumMedian + (bin.Count - stratumMedian) / shrinkFactor;
        }
    }

    // Debug hook: DebugPrintCountsByGC(bins, "CountsByGCVariance-After.txt");
    return true;
}