/// <summary>
/// Writes the given segments to <paramref name="outVcfPath"/> as a VCF 4.1 file:
/// the header lines first, then one record per segment. Records are emitted in the
/// order of the sequences listed in GenomeSize.xml, and within a sequence in the
/// order the segments appear in <paramref name="segments"/>.
/// </summary>
/// <param name="outVcfPath">Output path handed to <c>BgzipOrStreamWriter</c>.</param>
/// <param name="segments">
/// Segments to write. A segment is only written when its <c>Chr</c> matches (ignoring case)
/// the name of a sequence in the genome metadata. <c>SanityCheckChromosomeNames</c> is
/// called on the list before any record is written; its behavior is not visible here.
/// </param>
/// <param name="wholeGenomeFastaDirectory">
/// Directory that contains GenomeSize.xml; also used to build the "genome.fa" path
/// reported in the ##reference header line.
/// </param>
/// <param name="sampleName">Name used for the sample column of the #CHROM header line.</param>
/// <param name="extraHeaders">Additional header lines written verbatim; may be null.</param>
/// <param name="ploidy">
/// Optional source of the per-segment reference copy number; when null, a reference
/// copy number of 2 is used.
/// </param>
/// <param name="qualityThreshold">
/// Used only to build the ID and description of the quality ##FILTER header line.
/// No filtering happens here: each record's FILTER column is <c>segment.Filter</c> as given.
/// </param>
public static void WriteSegments(string outVcfPath, List<CanvasSegment> segments, string wholeGenomeFastaDirectory, string sampleName,
    List<string> extraHeaders, PloidyInfo ploidy, int qualityThreshold = 10)
{
    using (BgzipOrStreamWriter writer = new BgzipOrStreamWriter(outVcfPath))
    {
        // Write the VCF header:
        writer.WriteLine("##fileformat=VCFv4.1");
        writer.WriteLine($"##source={CanvasVersionInfo.NameString} {CanvasVersionInfo.VersionString}");
        writer.WriteLine($"##reference={Path.Combine(wholeGenomeFastaDirectory, "genome.fa")}");
        foreach (string header in extraHeaders ?? new List<string>())
        {
            writer.WriteLine(header);
        }
        GenomeMetadata genome = new GenomeMetadata();
        genome.Deserialize(Path.Combine(wholeGenomeFastaDirectory, "GenomeSize.xml"));
        foreach (GenomeMetadata.SequenceMetadata chromosome in genome.Sequences)
        {
            writer.WriteLine($"##contig=<ID={chromosome.Name},length={chromosome.Length}>");
        }
        string qualityFilter = $"q{qualityThreshold}";
        writer.WriteLine("##ALT=<ID=CNV,Description=\"Copy number variable region\">");
        writer.WriteLine($"##FILTER=<ID={qualityFilter},Description=\"Quality below {qualityThreshold}\">");
        writer.WriteLine("##FILTER=<ID=L10kb,Description=\"Length shorter than 10kb\">");
        writer.WriteLine("##INFO=<ID=SVTYPE,Number=1,Type=String,Description=\"Type of structural variant\">");
        writer.WriteLine("##INFO=<ID=END,Number=1,Type=Integer,Description=\"End position of the variant described in this record\">");
        writer.WriteLine("##INFO=<ID=CNVLEN,Number=1,Type=Integer,Description=\"Number of reference positions spanned by this CNV\">");
        writer.WriteLine("##FORMAT=<ID=RC,Number=1,Type=Float,Description=\"Mean counts per bin in the region\">");
        writer.WriteLine("##FORMAT=<ID=BC,Number=1,Type=Float,Description=\"Number of bins in the region\">");
        writer.WriteLine("##FORMAT=<ID=CN,Number=1,Type=Integer,Description=\"Copy number genotype for imprecise events\">");
        writer.WriteLine("##FORMAT=<ID=MCC,Number=1,Type=Integer,Description=\"Major chromosome count (equal to copy number for LOH regions)\">");
        writer.WriteLine("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t" + sampleName);

        SanityCheckChromosomeNames(genome, segments);

        // Iterate genome sequences on the outside so records come out in genome order,
        // regardless of how the segment list itself is ordered.
        foreach (GenomeMetadata.SequenceMetadata chromosome in genome.Sequences)
        {
            foreach (CanvasSegment segment in segments)
            {
                if (!segment.Chr.Equals(chromosome.Name, StringComparison.OrdinalIgnoreCase))
                {
                    continue;
                }

                int referenceCopyNumber = ploidy?.GetReferenceCopyNumber(segment) ?? 2;
                CnvType cnvType = segment.GetCnvType(referenceCopyNumber);
                bool isReference = cnvType == CnvType.Reference;

                // From the VCF 4.1 spec: if any of the ALT alleles is a symbolic allele
                // (an angle-bracketed ID string "<ID>") then the padding base is required
                // and POS denotes the coordinate of the base preceding the polymorphism.
                string alternateAllele = cnvType.ToAltId();
                bool isSymbolicAllele = alternateAllele.StartsWith("<", StringComparison.Ordinal)
                                        && alternateAllele.EndsWith(">", StringComparison.Ordinal);
                int position = isSymbolicAllele ? segment.Begin : segment.Begin + 1;

                // CHROM, POS, ID
                writer.Write($"{segment.Chr}\t{position}\tCanvas:{cnvType.ToVcfId()}:{segment.Chr}:{segment.Begin + 1}-{segment.End}\t");

                // REF, ALT, QUAL, FILTER. The interpolated text is written as-is; it must not be
                // passed on as a composite format string, or braces in the data would be re-parsed.
                writer.Write($"N\t{alternateAllele}\t{segment.QScore}\t{segment.Filter}\t");

                // INFO: SVTYPE and CNVLEN are only reported for non-reference calls.
                if (!isReference)
                {
                    writer.Write($"SVTYPE={cnvType.ToSvType()};");
                }
                writer.Write($"END={segment.End}");
                if (!isReference)
                {
                    writer.Write($";CNVLEN={segment.End - segment.Begin}");
                }

                // FORMAT keys; MCC is appended only when the segment carries a major chromosome count.
                bool hasMajorChromosomeCount = segment.MajorChromosomeCount.HasValue;
                writer.Write("\tRC:BC:CN");
                if (hasMajorChromosomeCount)
                {
                    writer.Write(":MCC");
                }

                // Sample values, in the same order as the FORMAT keys above.
                writer.Write("\t{0}:{1}:{2}", Math.Round(segment.MeanCount, 0, MidpointRounding.AwayFromZero), segment.BinCount, segment.CopyNumber);
                if (hasMajorChromosomeCount)
                {
                    writer.Write(":{0}", segment.MajorChromosomeCount);
                }
                writer.WriteLine();
            }
        }
    }
}