Source code for mavis.annotate.genomic

from copy import copy
import itertools

from .base import BioInterval, ReferenceName
from .constants import SPLICE_SITE_TYPE
from .splicing import SpliceSite, SplicingPattern
from ..constants import reverse_complement, STRAND
from ..error import NotSpecifiedError
from ..interval import Interval


[docs]class Template(BioInterval): def __init__(self, name, start, end, seq=None, bands=None): bands = [] if bands is None else bands name = ReferenceName(name) BioInterval.__init__(self, None, start, end, name=name, seq=seq) self.bands = bands for i in range(0, len(bands)): bands[i].reference_object = self self.bands.sort() def __str__(self): return str(self.name) def __eq__(self, other): return str(self) == str(other) def __hash__(self): return hash(self.name)
[docs]class IntergenicRegion(BioInterval): def __init__(self, chr, start, end, strand): """ Args: chr (str): the reference object/chromosome for this region start (int): the start of the IntergenicRegion end (int): the end of the IntergenicRegion strand (STRAND): the strand the region is defined on Example: >>> IntergenicRegion('1', 1, 100, '+') """ BioInterval.__init__(self, chr, start, end) self.strand = STRAND.enforce(strand)
[docs] def key(self): """see :func:`structural_variant.annotate.base.BioInterval.key`""" return BioInterval.key(self), self.strand
@property def chr(self): """returns the name of the chromosome that this region resides on""" return self.reference_object def __repr__(self): return 'IntergenicRegion({}:{}_{}{})'.format(self.chr, self.start, self.end, self.strand)
[docs] def to_dict(self): """see :func:`structural_variant.annotate.base.BioInterval.to_dict`""" d = BioInterval.to_dict(self) d['strand'] = self.strand return d
[docs]class Gene(BioInterval): """ """ def __init__(self, chr, start, end, name=None, strand=STRAND.NS, aliases=None, seq=None): """ Args: chr (str): the chromosome name (str): the gene name/id i.e. ENSG0001 strand (STRAND): the genomic strand '+' or '-' aliases (:class:`list` of :class:`str`): a list of aliases. For example the hugo name could go here seq (str): genomic seq of the gene Example: >>> Gene('X', 1, 1000, 'ENG0001', '+', ['KRAS']) """ aliases = [] if aliases is None else aliases chr = ReferenceName(chr) BioInterval.__init__(self, name=name, reference_object=chr, start=start, end=end, seq=seq) self.unspliced_transcripts = [] self.strand = STRAND.enforce(strand) self.aliases = aliases
[docs] def transcript_priority(self, transcript): """ prioritizes transcripts from 0 to n-1 based on best transcript flag and then alphanumeric name sort Warning: Lower number means higher priority. This is to make sort work by default """ def sort_key(t): return ( 0 if t.is_best_transcript else 1, t.name, t.start - t.end, t.start, t.end ) priority = sorted(self.transcripts, key=sort_key) for i, curr_transcript in enumerate(priority): if curr_transcript == transcript: return i raise ValueError('input transcript is not associated with this gene', transcript)
@property def transcripts(self): """:any:`list` of :class:`UsTranscript`: list of unspliced transcripts""" return self.unspliced_transcripts @property def translations(self): """:any:`list` of :class:`~mavis.annotate.protein.Translation`: list of translations""" translations = [] for ust in self.unspliced_transcripts: for tx in ust.transcripts: for tl in tx.translations: translations.append(tl) return translations @property def chr(self): """returns the name of the chromosome that this gene resides on""" return self.reference_object
[docs] def key(self): """see :func:`structural_variant.annotate.base.BioInterval.key`""" return BioInterval.key(self), self.strand
[docs] def get_seq(self, reference_genome, ignore_cache=False): """ gene sequence is always given wrt to the positive forward strand regardless of gene strand Args: reference_genome (:class:`dict` of :class:`Bio.SeqRecord` by :class:`str`): dict of reference sequence by template/chr name ignore_cache (bool): if True then stored sequences will be ignored and the function will attempt to retrieve the sequence using the positions and the input reference_genome Returns: str: the sequence of the gene """ if self.seq and not ignore_cache: return self.seq elif reference_genome is None: raise NotSpecifiedError('reference genome is required to retrieve the gene sequence') else: return str(reference_genome[self.chr].seq[self.start - 1:self.end]).upper()
@property def spliced_transcripts(self): """:any:`list` of :class:`Transcript`: list of transcripts""" spl = [] for t in self.unspliced_transcripts: spl.extend(t.spliced_transcripts) return spl
[docs] def to_dict(self): """see :func:`structural_variant.annotate.base.BioInterval.to_dict`""" d = BioInterval.to_dict(self) d['strand'] = self.strand return d
[docs]class Exon(BioInterval): """ """ def __init__( self, start, end, transcript=None, name=None, intact_start_splice=True, intact_end_splice=True, seq=None, strand=None): """ Args: start (int): the genomic start position end (int): the genomic end position name (str): the name of the exon transcript (UsTranscript): the 'parent' transcript this exon belongs to intact_start_splice (bool): if the starting splice site has been abrogated intact_end_splice (bool): if the end splice site has been abrogated Raises: AttributeError: if the exon start > the exon end Example: >>> Exon(15, 78) """ BioInterval.__init__(self, name=name, reference_object=transcript, start=start, end=end, seq=seq, strand=strand) if self.is_reverse: self.start_splice_site = SpliceSite( self.transcript, self.start, site_type=SPLICE_SITE_TYPE.DONOR, strand=STRAND.NEG, intact=intact_start_splice) self.end_splice_site = SpliceSite( self.transcript, self.end, site_type=SPLICE_SITE_TYPE.ACCEPTOR, strand=STRAND.NEG, intact=intact_end_splice) else: self.start_splice_site = SpliceSite( self.transcript, self.start, site_type=SPLICE_SITE_TYPE.ACCEPTOR, strand=STRAND.POS, intact=intact_start_splice) self.end_splice_site = SpliceSite( self.transcript, self.end, site_type=SPLICE_SITE_TYPE.DONOR, strand=STRAND.POS, intact=intact_end_splice) @property def transcript(self): """:class:`UsTranscript`: the transcript this exon belongs to""" return self.reference_object @property def donor_splice_site(self): """:class:`~mavis.interval.Interval`: the genomic range describing the splice site""" if self.is_reverse: return self.start_splice_site else: return self.end_splice_site @property def acceptor_splice_site(self): """:class:`~mavis.interval.Interval`: the genomic range describing the splice site""" if self.is_reverse: return self.end_splice_site else: return self.start_splice_site @property def donor(self): """`int`: returns the genomic exonic position of the donor splice site""" if self.is_reverse: return self.start else: return self.end @property def acceptor(self): """`int`: returns the genomic exonic position of the acceptor splice site""" if self.is_reverse: return self.end else: return self.start def __repr__(self): return 'Exon({}, {})'.format(self.start, self.end)
[docs]class UsTranscript(BioInterval): """ """ def __init__( self, exons, gene=None, name=None, strand=None, spliced_transcripts=None, seq=None, is_best_transcript=False ): """ creates a new transcript object Args: exons (:class:`list` of :any:`Exon`): list of Exon that make up the transcript genomic_start (int): genomic start position of the transcript genomic_end (int): genomic end position of the transcript gene (Gene): the gene this transcript belongs to name (str): name of the transcript strand (STRAND): strand the transcript is on, defaults to the strand of the Gene if not specified seq (str): unspliced cDNA seq """ # cannot use mutable default args in the function decl self.exons = exons self.spliced_transcripts = [] if spliced_transcripts is None else spliced_transcripts self.is_best_transcript = is_best_transcript if len(exons) == 0: raise AttributeError('exons must be given') start = min([e[0] for e in self.exons]) end = max([e[1] for e in self.exons]) BioInterval.__init__(self, gene, start, end, name=name, seq=seq, strand=strand) for i, curr in enumerate(self.exons): if isinstance(curr, Exon): curr.reference_object = self else: self.exons[i] = Exon(curr[0], curr[1], self) self.exons = sorted(self.exons, key=lambda x: x.start) for ex in self.exons: if ex.end > self.end or ex.start < self.start: raise AssertionError('exon is outside transcript', self, ex) for e1, e2 in itertools.combinations(self.exons, 2): if Interval.overlaps(e1, e2): raise AttributeError('exons cannot overlap') for s in self.spliced_transcripts: s.reference_object = self try: if self.get_strand() != self.gene.get_strand(): raise AssertionError('gene strand and transcript strand conflict') except AttributeError: pass
[docs] def generate_splicing_patterns(self): """ returns a list of splice sites to be connected as a splicing pattern Returns: :class:`list` of :class:`SplicingPattern`: List of positions to be spliced together see :ref:`theory - predicting splicing patterns <theory-predicting-splicing-patterns>` """ exons = sorted(self.exons, key=lambda x: x[0]) if len(exons) < 2: return [SplicingPattern()] donor = 3 acceptor = 5 reverse = True if self.get_strand() == STRAND.NEG else False pattern = [(exons[0].end, exons[0].end_splice_site.intact, acceptor if reverse else donor)] # always start with a donor for i in range(1, len(exons) - 1): pattern.append((exons[i].start, exons[i].start_splice_site.intact, donor if reverse else acceptor)) pattern.append((exons[i].end, exons[i].end_splice_site.intact, acceptor if reverse else donor)) pattern.append((exons[-1].start, exons[-1].start_splice_site.intact, donor if reverse else acceptor)) # always end with acceptor original_sites = [p for p, s, t in pattern] pattern = [(p, t) for p, s, t in pattern if s] # filter out abrogated splice sites if reverse: pattern.reverse() def get_cons_sites(index, splice_type): # get the next 'n' of any given type temp = [] for i in range(index, len(pattern)): if pattern[i][1] == splice_type: temp.append(pattern[i]) else: break return temp splice_site_sets = [SplicingPattern()] # i should start at the first donor site i = len(get_cons_sites(0, acceptor)) while i < len(pattern): if pattern[i][1] == donor: donors = get_cons_sites(i, donor) acceptors = get_cons_sites(i + len(donors), acceptor) temp = [] for donor_site, acceptor_site in sorted(itertools.product(donors, acceptors)): for curr_splss in splice_site_sets: splss = copy(curr_splss) splss.extend([donor_site[0], acceptor_site[0]]) temp.append(splss) if temp: splice_site_sets = temp i += len(donors) + len(acceptors) else: raise AssertionError('should always be positioned at a donor splice site') # now need to decide the type for each set for splss in splice_site_sets: classification = SplicingPattern.classify(splss, original_sites) splss.splice_type = classification if reverse: splss.reverse() return splice_site_sets
@property def gene(self): """:any:`Gene`: the gene this transcript belongs to""" return self.reference_object def _genomic_to_cdna_mapping(self, splicing_pattern): """ Args: splicing_pattern (SplicingPattern): list of genomic splice sites 3'5' repeating """ mapping = {} length = 1 pos = sorted(splicing_pattern + [self.start, self.end]) genome_intervals = [Interval(s, t) for s, t in zip(pos[::2], pos[1::2])] if self.get_strand() == STRAND.POS: pass elif self.get_strand() == STRAND.NEG: genome_intervals.reverse() else: raise NotSpecifiedError('cannot convert without strand information') for exon in genome_intervals: mapping[exon] = Interval(length, length + len(exon) - 1) length += len(exon) return mapping def _cdna_to_genomic_mapping(self, splicing_pattern): """ Args: splicing_pattern (SplicingPattern): list of genomic splice sites 3'5' repeating """ mapping = {v: k for k, v in self._genomic_to_cdna_mapping(splicing_pattern).items()} return mapping
[docs] def convert_genomic_to_cdna(self, pos, splicing_pattern): """ Args: pos (int): the genomic position to be converted splicing_pattern (SplicingPattern): list of genomic splice sites 3'5' repeating Returns: int: the cdna equivalent Raises: :class:`~mavis.error.IndexError`: when a genomic position not present in the cdna is attempted to be converted """ cdna_pos, shift = self.convert_genomic_to_nearest_cdna(pos, splicing_pattern) if shift != 0: raise IndexError('outside of exonic regions', pos, splicing_pattern, cdna_pos, shift) return cdna_pos
[docs] def convert_genomic_to_nearest_cdna(self, pos, splicing_pattern): """ converts a genomic position to its cdna equivalent or (if intronic) the nearest cdna and shift Args: pos (int): the genomic position splicing_pattern (SplicingPattern): the splicing pattern Returns: tuple of int and int: * *int* - the exonic cdna position * *int* - the intronic shift """ mapping = self._genomic_to_cdna_mapping(splicing_pattern) exons = sorted(list(mapping.keys())) # exonic for ex in exons: if pos <= ex.end and pos >= ex.start: # in the current exon cdna_pos = Interval.convert_pos(mapping, pos, True if self.get_strand() == STRAND.NEG else False) return cdna_pos, 0 # intronic for ex1, ex2 in zip(exons, exons[1::]): if pos > ex1.end and pos < ex2.start: # in the current intron if abs(pos - ex1.end) <= abs(pos - ex2.start): # closest to the first exon cdna_pos = Interval.convert_pos(mapping, ex1.end, True if self.get_strand() == STRAND.NEG else False) return cdna_pos, pos - ex1.end if self.get_strand() == STRAND.POS else ex1.end - pos cdna_pos = Interval.convert_pos(mapping, ex2.start, True if self.get_strand() == STRAND.NEG else False) return cdna_pos, pos - ex2.start if self.get_strand() == STRAND.POS else ex2.start - pos raise IndexError('position does not fall within the current transcript', pos, mapping)
[docs] def convert_cdna_to_genomic(self, pos, splicing_pattern): """ Args: pos (int): cdna position splicing_pattern (SplicingPattern): list of genomic splice sites 3'5' repeating Returns: int: the genomic equivalent """ mapping = self._cdna_to_genomic_mapping(splicing_pattern) return Interval.convert_pos(mapping, pos, True if self.get_strand() == STRAND.NEG else False)
[docs] def exon_number(self, exon): """ exon numbering is based on the direction of translation Args: exon (Exon): the exon to be numbered Returns: int: the exon number (1 based) Raises: AttributeError: if the strand is not given or the exon does not belong to the transcript """ for i, current_exon in enumerate(self.exons): if exon != current_exon: continue if self.get_strand() == STRAND.POS: return i + 1 elif self.get_strand() == STRAND.NEG: return len(self.exons) - i else: raise NotSpecifiedError('strand must be pos or neg to calculate the exon number') raise AttributeError('can only calculate phase on associated exons')
[docs] def get_seq(self, reference_genome=None, ignore_cache=False): """ Args: reference_genome (:class:`dict` of :class:`Bio.SeqRecord` by :class:`str`): dict of reference sequence by template/chr name ignore_cache (bool): if True then stored sequences will be ignored and the function will attempt to retrieve the sequence using the positions and the input reference_genome Returns: str: the sequence of the transcript including introns (but relative to strand) """ if self.seq and not ignore_cache: return self.seq elif self.gene and self.gene.seq and not ignore_cache: # gene has a seq set start = self.start - self.gene.start end = self.end - self.gene.end + len(self.gene.seq) if self.get_strand() == STRAND.NEG: return reverse_complement(self.gene.seq[start:end]) return self.gene.seq[start:end] elif reference_genome is None: raise NotSpecifiedError('reference genome is required to retrieve the gene sequence') if self.get_strand() == STRAND.NEG: return reverse_complement(reference_genome[self.gene.chr].seq[self.start - 1:self.end]).upper() return str(reference_genome[self.gene.chr].seq[self.start - 1:self.end]).upper()
[docs] def get_cdna_seq(self, splicing_pattern, reference_genome=None, ignore_cache=False): """ Args: splicing_pattern (SplicingPattern): the list of splicing positions reference_genome (:class:`dict` of :class:`Bio.SeqRecord` by :class:`str`): dict of reference sequence by template/chr name ignore_cache (bool): if True then stored sequences will be ignored and the function will attempt to retrieve the sequence using the positions and the input reference_genome Returns: str: the spliced cDNA sequence """ temp = sorted([self.start] + splicing_pattern + [self.end]) cdna_start = min(temp) conti = [] for i in range(0, len(temp) - 1, 2): conti.append(Interval(temp[i] - cdna_start, temp[i + 1] - cdna_start)) seq = self.get_seq(reference_genome, ignore_cache) if self.get_strand() == STRAND.NEG: # adjust the continuous intervals for the min and flip if revcomp seq = reverse_complement(seq) spliced_seq = ''.join([str(seq[i.start:i.end + 1]) for i in conti]) spliced_seq = spliced_seq.upper() return spliced_seq if self.get_strand() == STRAND.POS else reverse_complement(spliced_seq)
@property def translations(self): """:class:`list` of :class:`~mavis.annotate.protein.Translation`: list of translations associated with this transcript""" translations = [] for spl_tx in self.spliced_transcripts: for translation in spl_tx.translations: translations.append(translation) return translations @property def transcripts(self): """:class:`list` of :class:`Transcript`: list of spliced transcripts""" return self.spliced_transcripts
[docs]class Transcript(BioInterval): def __init__(self, ust, splicing_patt, seq=None, translations=None): """ splicing pattern is given in genomic coordinates Args: us_transcript (UsTranscript): the unspliced transcript splicing_patt (:class:`list` of :class:`int`): the list of splicing positions seq (str): the cdna sequence translations (:class:`list` of :class:`~mavis.annotate.protein.Translation`): the list of translations of this transcript """ pos = sorted([ust.start, ust.end] + splicing_patt) splicing_patt.sort() self.splicing_pattern = splicing_patt length = sum([t - s + 1 for s, t in zip(pos[::2], pos[1::2])]) BioInterval.__init__(self, ust, 1, length, seq=None) self.exons = [Exon(s, t, self) for s, t in zip(pos[::2], pos[1::2])] self.translations = [] if translations is None else [tx for tx in translations] for translation in self.translations: translation.reference_object = self if splicing_patt and (min(splicing_patt) < ust.start or max(splicing_patt) > ust.end): raise AssertionError('splicing pattern must be contained within the unspliced transcript') elif len(splicing_patt) % 2 != 0: raise AssertionError('splicing pattern must be a list of 3\'5\' splicing positions')
[docs] def convert_genomic_to_cdna(self, pos): """ Args: pos (int): the genomic position to be converted Returns: int: the cdna equivalent Raises: IndexError: when a genomic position not present in the cdna is attempted to be converted """ return self.unspliced_transcript.convert_genomic_to_cdna(pos, self.splicing_pattern)
[docs] def convert_genomic_to_nearest_cdna(self, pos): return self.reference_object.convert_genomic_to_nearest_cdna(pos, self.splicing_pattern)
[docs] def convert_cdna_to_genomic(self, pos): """ Args: pos (int): cdna position Returns: int: the genomic equivalent """ return self.unspliced_transcript.convert_cdna_to_genomic(pos, self.splicing_pattern)
[docs] def get_seq(self, reference_genome=None, ignore_cache=False): """ Args: reference_genome (:class:`dict` of :class:`Bio.SeqRecord` by :class:`str`): dict of reference sequence by template/chr name ignore_cache (bool): if True then stored sequences will be ignored and the function will attempt to retrieve the sequence using the positions and the input reference_genome Returns: str: the sequence corresponding to the spliced cdna """ if self.seq and not ignore_cache: return self.seq seq = self.unspliced_transcript.get_cdna_seq(self.splicing_pattern, reference_genome, ignore_cache) return seq[self.start - 1:self.end]
@property def unspliced_transcript(self): """:class:`UsTranscript`: the unspliced transcript this splice variant belongs to""" return self.reference_object