From cf046e8361a29812987744aa25c9b1e1b12e56c7 Mon Sep 17 00:00:00 2001 From: Brian Clarke Date: Wed, 9 Sep 2020 15:51:29 +0200 Subject: [PATCH 1/3] implement SampleSeqExtractor --- kipoiseq/extractors/vcf_seq.py | 87 +++++++++++++++++++++++++++++++++- 1 file changed, 85 insertions(+), 2 deletions(-) diff --git a/kipoiseq/extractors/vcf_seq.py b/kipoiseq/extractors/vcf_seq.py index 5879aa8..adf0946 100644 --- a/kipoiseq/extractors/vcf_seq.py +++ b/kipoiseq/extractors/vcf_seq.py @@ -2,7 +2,8 @@ from typing import Union from pyfaidx import Sequence, complement -from kipoiseq.dataclasses import Interval +from cyvcf2 import VCF +from kipoiseq.dataclasses import Interval, Variant from kipoiseq.extractors import ( BaseExtractor, FastaStringExtractor, @@ -15,7 +16,8 @@ __all__ = [ 'VariantSeqExtractor', 'SingleVariantVCFSeqExtractor', - 'SingleSeqVCFSeqExtractor' + 'SingleSeqVCFSeqExtractor', + 'SampleSeqExtractor' ] @@ -320,3 +322,84 @@ def extract(self, interval, anchor=None, sample_id=None, fixed_len=True): anchor=anchor, fixed_len=fixed_len ) + + +class SampleSeqExtractor(VariantSeqExtractor): + def __init__(self, fasta_file, vcf_file): + """Sequence extractor which can extract an alternate sequence for a + given interval and the variants corresponding to a given + sample and phase. + + Args: + fasta_file: Path to the fasta file containing the reference + sequence (can be gzipped) + vcf_file: Path to the VCF file containing phased genotype information + + """ + self.vcf = VCF(vcf_file) + self._sample_indices = dict(zip(self.vcf.samples, + range(len(self.vcf.samples)))) + + super().__init__(fasta_file) + + def extract(self, interval, sample, phase, anchor, + fixed_len=True, **kwargs): + """Extracts an alternate sequence for a given interval and the + variants corresponding to a given sample. + + Args: + interval: `kipoiseq.dataclasses.Interval`, Region of + interest from which to query the sequence. 0-based. + sample: `str`, Sample from the VCF file for which variants should be + extracted. + phase: `0` or `1`, Phase for which sequence should be extracted + anchor: `int`, Absolution position w.r.t. the interval + start. (0-based). E.g. for an interval of `chr1:10-20` + the anchor of 10 denotes the point chr1:10 in the 0-based + coordinate system. + fixed_len: `bool`, If True, the return sequence will have the + same length as the `interval` (e.g. `interval.end - + interval.start`) + kwargs: Additional keyword arguments to pass to + `SampleSeqExtractor.extract` + + Returns: + A single sequence (`str`) with all the variants applied. + """ + variants = [] + if sample is not None: + if phase not in (0, 1): + raise ValueError('phase argument must be in (0, 1)' + ' if sample is not None') + + # Interval is 0-based, cyvcf2 positions are 1-based: need to add 1 + variants = self._get_sample_variants( + self.vcf( + f'{interval.chrom}:' + + f'{interval.start + 1}-{interval.end + 1}' + ), + sample, + phase + ) + + return super(SampleSeqExtractor, self).extract( + interval, variants, anchor, fixed_len, **kwargs) + + def _get_sample_variants(self, variants, sample, phase): + """Given a list of `cyvcf2.Variant`, returns all those present for a + given sample and phase and converts them to + `kipoiseq.dataclasses.Variant` + + Args: + variants: List of `cyvcf2.Variant`, Variants of interest + sample: `str`, Sample for which to filter genotypes + phase: `0` or `1`, Phase for which to filter genotypes + + Returns: + List of `kipoiseq.dataclasses.Variant` + """ + sample_index = self._sample_indices[sample] + return [ + Variant.from_cyvcf(v) for v in variants + if v.genotypes[sample_index][phase] + ] From 59ec5122bb370242506ab23b0f3b84da9a68f568 Mon Sep 17 00:00:00 2001 From: Brian Clarke Date: Wed, 9 Sep 2020 15:58:27 +0200 Subject: [PATCH 2/3] add error check for sample argument to SampleSeqExtractor.extract() --- kipoiseq/extractors/vcf_seq.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kipoiseq/extractors/vcf_seq.py b/kipoiseq/extractors/vcf_seq.py index adf0946..e0bb799 100644 --- a/kipoiseq/extractors/vcf_seq.py +++ b/kipoiseq/extractors/vcf_seq.py @@ -368,6 +368,10 @@ def extract(self, interval, sample, phase, anchor, """ variants = [] if sample is not None: + if sample not in self.vcf.samples: + raise ValueError(f'Sample \'{sample}\'' + 'not present in VCF file') + if phase not in (0, 1): raise ValueError('phase argument must be in (0, 1)' ' if sample is not None') From 02cd1c3b206a34b34748a2523335fa6d15dc53f4 Mon Sep 17 00:00:00 2001 From: Brian Clarke Date: Wed, 9 Sep 2020 16:05:21 +0200 Subject: [PATCH 3/3] correct typo in error message --- kipoiseq/extractors/vcf_seq.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kipoiseq/extractors/vcf_seq.py b/kipoiseq/extractors/vcf_seq.py index e0bb799..5e6981c 100644 --- a/kipoiseq/extractors/vcf_seq.py +++ b/kipoiseq/extractors/vcf_seq.py @@ -369,7 +369,7 @@ def extract(self, interval, sample, phase, anchor, variants = [] if sample is not None: if sample not in self.vcf.samples: - raise ValueError(f'Sample \'{sample}\'' + raise ValueError(f'Sample \'{sample}\' ' 'not present in VCF file') if phase not in (0, 1):