PageRenderTime 53ms CodeModel.GetById 23ms RepoModel.GetById 1ms app.codeStats 0ms

/itmi_vcfqc_optimized_with_globus_transfer/pymodules/python2.7/lib/python/cnvlib/vary.py

https://gitlab.com/pooja043/Globus_Docker_4
Python | 339 lines | 292 code | 17 blank | 30 comment | 21 complexity | 7090855986ee1c851a1f67ceddefd792 MD5 | raw file
  1. """An array of genomic intervals, treated as variant loci."""
  2. from __future__ import absolute_import, division, print_function
  3. import logging
  4. import numpy as np
  5. import pandas as pd
  6. import vcf
  7. from . import gary
  8. class VariantArray(gary.GenomicArray):
  9. """An array of genomic intervals, treated as variant loci.
  10. Required columns: chromosome, start, end, ref, alt
  11. """
  12. _required_columns = ("chromosome", "start", "end", "ref", "alt")
  13. _required_dtypes = ("string", "int", "int", "string", "string")
  14. # Extra: somatic, zygosity, depth, alt_count, alt_freq
  15. def __init__(self, data_table, meta_dict=None):
  16. gary.GenomicArray.__init__(self, data_table, meta_dict)
  17. def mirrored_baf(self, above_half=True, tumor_boost=True):
  18. """Mirrored B-allele frequencies (BAFs).
  19. Flip BAFs to be all above 0.5 (if `above_half`) or below 0.5, for
  20. consistency.
  21. With `tumor_boost`, normalize tumor-sample allele frequencies to the
  22. matched normal allele frequencies.
  23. """
  24. if tumor_boost and "n_alt_freq" in self:
  25. alt_freqs = self.tumor_boost()
  26. else:
  27. alt_freqs = self["alt_freq"]
  28. fromhalf = np.abs(alt_freqs - 0.5)
  29. if above_half:
  30. return fromhalf + 0.5
  31. else:
  32. return 0.5 - fromhalf
  33. def tumor_boost(self):
  34. """TumorBoost normalization of tumor-sample allele frequencies.
  35. De-noises the signal for detecting LOH.
  36. """
  37. if "n_alt_freq" in self:
  38. n_freqs = self["n_alt_freq"]
  39. else:
  40. raise ValueError("TumorBoost requires a paired normal sample in "
  41. "the VCF.")
  42. t_freqs = self["alt_freq"]
  43. return _tumor_boost(t_freqs, n_freqs)
  44. # I/O
  45. @classmethod
  46. def read_vcf(cls, infile, sample_id=None, normal_id=None, min_depth=None,
  47. skip_hom=False, skip_reject=False, skip_somatic=False):
  48. """Parse SNV coordinates from a VCF file into a VariantArray."""
  49. if isinstance(infile, basestring):
  50. vcf_reader = vcf.Reader(filename=infile)
  51. else:
  52. vcf_reader = vcf.Reader(infile)
  53. if not vcf_reader.samples:
  54. logging.warn("VCF file %s has no samples; parsing minimal info",
  55. infile)
  56. return cls._read_vcf_nosample(infile, sample_id, skip_reject)
  57. columns = [
  58. "chromosome", "start", "end", "ref", "alt",
  59. "somatic", "zygosity", "depth", "alt_count"]
  60. sample_id, normal_id = _select_sample(vcf_reader, sample_id, normal_id)
  61. if normal_id:
  62. columns.extend(["n_zygosity", "n_depth", "n_alt_count"])
  63. rows = _parse_records(vcf_reader, sample_id, normal_id, skip_reject)
  64. table = pd.DataFrame.from_records(rows, columns=columns)
  65. table["alt_freq"] = table["alt_count"] / table["depth"]
  66. if normal_id:
  67. table["n_alt_freq"] = table["n_alt_count"] / table["n_depth"]
  68. # Filter out records as requested
  69. if min_depth:
  70. dkey = "n_depth" if "n_depth" in table else "depth"
  71. idx_depth = table[dkey] >= min_depth
  72. cnt_depth = (~idx_depth).sum()
  73. table = table[idx_depth]
  74. if skip_hom:
  75. zkey = "n_zygosity" if "n_zygosity" in table else "zygosity"
  76. idx_hom = (table[zkey] != 0.0) & (table[zkey] != 1.0)
  77. cnt_hom = (~idx_hom).sum()
  78. table = table[idx_hom]
  79. if skip_somatic:
  80. idx_som = table["somatic"]
  81. cnt_som = (~idx_som).sum()
  82. table = table[idx_som]
  83. logging.info("Skipped records: %d somatic, %d depth, %d homozygous",
  84. cnt_som, cnt_depth, cnt_hom)
  85. return cls(table, {"sample_id": sample_id})
  86. @classmethod
  87. def _read_vcf_nosample(cls, vcf_file, sample_id=None, skip_reject=False):
  88. table = pd.read_table(vcf_file,
  89. comment="#",
  90. header=None,
  91. na_filter=False,
  92. names=["chromosome", "start", "_ID", "ref", "alt",
  93. "_QUAL", "filter", "info"],
  94. usecols=cls._required_columns,
  95. # usecols=["chromosome", "start", "ref", "alt",
  96. # # "filter", "info",
  97. # ],
  98. # ENH: converters=func -> to parse each col
  99. dtype=dict(zip(cls._required_columns,
  100. cls._required_dtypes)),
  101. )
  102. # ENH: do things with filter, info
  103. # if skip_reject and record.FILTER and len(record.FILTER) > 0:
  104. table['end'] = table['start'] # TODO: _get_end
  105. table['start'] -= 1
  106. table = table.loc[:, cls._required_columns]
  107. return cls(table, {"sample_id": sample_id})
  108. # def write_vcf(self, outfile=sys.stdout):
  109. # """Write data to a file or handle in VCF format."""
  110. # # grab from export.export_vcf()
  111. def _select_sample(vcf_reader, sample_id, normal_id):
  112. """Select a sample ID in the VCF; ensure it's valid."""
  113. peds = list(_parse_pedigrees(vcf_reader))
  114. if sample_id is None and normal_id is None and peds:
  115. # Use the VCF header to select the tumor sample
  116. # Take the first entry, if any
  117. sample_id, normal_id = peds[0]
  118. elif sample_id and normal_id:
  119. # Take the given IDs at face value, just verify below
  120. pass
  121. elif sample_id:
  122. if peds:
  123. for sid, nid in peds:
  124. if sid == sample_id:
  125. normal_id = nid
  126. break
  127. # if normal_id is None and len(vcf_reader.samples == 2):
  128. # normal_id = next(s for s in vcf_reader.samples if s != sample_id)
  129. elif normal_id:
  130. if peds:
  131. for sid, nid in peds:
  132. if nid == normal_id:
  133. sample_id = sid
  134. break
  135. else:
  136. try:
  137. sample_id = next(s for s in vcf_reader.samples if s != normal_id)
  138. except StopIteration:
  139. raise ValueError(
  140. "No other sample in VCF besides the specified normal " +
  141. normal_id + "; did you mean to use this as the sample_id "
  142. "instead?")
  143. else:
  144. sample_id = vcf_reader.samples[0]
  145. _confirm_unique(sample_id, vcf_reader.samples)
  146. if normal_id:
  147. _confirm_unique(normal_id, vcf_reader.samples)
  148. logging.info("Selected tumor sample " + sample_id +
  149. (" and normal sample %s" % normal_id if normal_id else ''))
  150. return sample_id, normal_id
  151. def _parse_pedigrees(vcf_reader):
  152. """Extract tumor/normal pair sample IDs from the VCF header.
  153. Return an iterable of (tumor sample ID, normal sample ID).
  154. """
  155. if "PEDIGREE" in vcf_reader.metadata:
  156. for tag in vcf_reader.metadata["PEDIGREE"]:
  157. if "Derived" in tag:
  158. sample_id = tag["Derived"]
  159. normal_id = tag["Original"]
  160. logging.debug("Found tumor sample %s and normal sample %s "
  161. "in the VCF header PEDIGREE tag",
  162. sample_id, normal_id)
  163. yield sample_id, normal_id
  164. elif "GATKCommandLine" in vcf_reader.metadata:
  165. for tag in vcf_reader.metadata["GATKCommandLine"]:
  166. if tag.get("ID") == "MuTect": # any others OK?
  167. options = dict(kv.split("=", 1)
  168. for kv in tag["CommandLineOptions"].split()
  169. if '=' in kv)
  170. sample_id = options.get('tumor_sample_name')
  171. normal_id = options['normal_sample_name']
  172. logging.debug("Found tumor sample %s and normal sample "
  173. "%s in the MuTect VCF header",
  174. sample_id, normal_id)
  175. yield sample_id, normal_id
  176. def _confirm_unique(sample_id, samples):
  177. occurrences = [s for s in samples if s == sample_id]
  178. if len(occurrences) != 1:
  179. raise ValueError(
  180. "Did not find a single sample ID '%s' in: %s"
  181. % (sample_id, samples))
  182. def _parse_records(vcf_reader, sample_id, normal_id, skip_reject):
  183. """Parse VCF records into DataFrame rows.
  184. Apply filters to skip records with low depth, homozygosity, the REJECT
  185. flag, or the SOMATIC info field.
  186. """
  187. cnt_reject = 0 # For logging
  188. for record in vcf_reader:
  189. is_som = False
  190. if skip_reject and record.FILTER and len(record.FILTER) > 0:
  191. cnt_reject += 1
  192. continue
  193. if record.INFO.get("SOMATIC"):
  194. is_som = True
  195. sample = record.genotype(sample_id)
  196. depth, zygosity, alt_count = _extract_genotype(sample)
  197. if normal_id:
  198. normal = record.genotype(normal_id)
  199. n_depth, n_zygosity, n_alt_count = _extract_genotype(normal)
  200. if n_zygosity == 0:
  201. is_som = True
  202. # Split multiallelics?
  203. # XXX Ensure sample genotypes are handled properly
  204. for alt in record.ALT:
  205. posn = record.POS - 1
  206. end = _get_end(posn, alt, record.INFO)
  207. row = (record.CHROM, posn, end, record.REF, str(alt),
  208. is_som,
  209. zygosity,
  210. depth,
  211. alt_count,
  212. )
  213. if normal_id:
  214. row += (n_zygosity, n_depth, n_alt_count)
  215. yield row
  216. if cnt_reject:
  217. logging.info('Filtered out %d records', cnt_reject)
  218. def _extract_genotype(sample):
  219. if "DP" in sample.data._fields:
  220. depth = sample.data.DP
  221. else:
  222. # SV, probably
  223. depth = alt_count = None
  224. if sample.is_het:
  225. zygosity = 0.5
  226. elif sample.gt_type == 0:
  227. zygosity = 0.0
  228. else:
  229. zygosity = 1.0
  230. alt_count = (_get_alt_count(sample) if sample.gt_type else 0.0)
  231. return depth, zygosity, alt_count
  232. def _get_alt_count(sample):
  233. """Get the alternative allele count from a sample in a VCF record."""
  234. if "AD" in sample.data._fields and sample.data.AD is not None:
  235. # GATK and other callers
  236. if isinstance(sample.data.AD, (list, tuple)):
  237. alt_count = float(sample.data.AD[1])
  238. # VarScan
  239. else:
  240. alt_count = float(sample.data.AD)
  241. elif "CLCAD2" in sample.data._fields and sample.data.CLCAD2 is not None:
  242. # Qiagen CLC Genomics Server -- similar to GATK's AD
  243. alt_count = float(sample.data.CLCAD2[1])
  244. elif "AO" in sample.data._fields:
  245. if sample.data.AO:
  246. if isinstance(sample.data.AO, (list, tuple)):
  247. alt_count = sum(map(float, sample.data.AO))
  248. else:
  249. alt_count = float(sample.data.AO)
  250. else:
  251. alt_count = 0.0
  252. else:
  253. logging.warn("Skipping %s:%d %s; "
  254. "unsure how to get alternative allele count: %s",
  255. sample.site.CHROM, sample.site.POS, sample.site.REF,
  256. sample.data)
  257. alt_count = None # or 0 or "missing data"?
  258. return alt_count
  259. def _get_end(posn, alt, info):
  260. """Get record end position."""
  261. if "END" in info:
  262. # Structural variant
  263. return posn + info['END']
  264. return posn + len(alt)
  265. def _tumor_boost(t_freqs, n_freqs):
  266. """Normalize tumor-sample allele frequencies.
  267. boosted = { 0.5 (t/n) if t < n
  268. 1 - 0.5(1-t)/(1-n) otherwise
  269. See: TumorBoost, Bengtsson et al. 2010
  270. """
  271. lt_mask = (t_freqs < n_freqs)
  272. lt_idx = np.nonzero(lt_mask)[0]
  273. gt_idx = np.nonzero(~lt_mask)[0]
  274. out = np.zeros_like(t_freqs)
  275. out[lt_idx] = 0.5 * t_freqs.take(lt_idx) / n_freqs.take(lt_idx)
  276. out[gt_idx] = 1 - 0.5 * (1 - t_freqs.take(gt_idx)
  277. ) / (1 - n_freqs.take(gt_idx))
  278. return out
  279. def _allele_specific_copy_numbers(segarr, varr, ploidy=2):
  280. """Split total copy number between alleles based on BAF.
  281. See: PSCBS, Bentsson et al. 2011
  282. """
  283. seg_depths = ploidy * np.exp2(segarr["log2"])
  284. seg_bafs = np.asarray([(np.median(subvarr.mirrored_baf())
  285. if len(subvarr) else np.nan)
  286. for _seg, subvarr in varr.by_ranges(segarr)])
  287. cn1 = 0.5 * (1 - seg_bafs) * seg_depths
  288. cn2 = seg_depths - cn1
  289. # segout = segarr.copy()
  290. # segout.update({"baf": seg_bafs, "CN1": cn1, "CN2": cn2})
  291. # return segout
  292. return pd.DataFrame({"baf": seg_bafs, "CN1": cn1, "CN2": cn2})