from copy import copy
import itertools
from .base import BioInterval, ReferenceName
from .constants import SPLICE_SITE_TYPE
from .splicing import SpliceSite, SplicingPattern
from ..constants import reverse_complement, STRAND
from ..error import NotSpecifiedError
from ..interval import Interval
[docs]class Template(BioInterval):
def __init__(self, name, start, end, seq=None, bands=None):
bands = [] if bands is None else bands
name = ReferenceName(name)
BioInterval.__init__(self, None, start, end, name=name, seq=seq)
self.bands = bands
for i in range(0, len(bands)):
bands[i].reference_object = self
self.bands.sort()
def __str__(self):
return str(self.name)
def __eq__(self, other):
return str(self) == str(other)
def __hash__(self):
return hash(self.name)
[docs]class IntergenicRegion(BioInterval):
def __init__(self, chr, start, end, strand):
"""
Args:
chr (str): the reference object/chromosome for this region
start (int): the start of the IntergenicRegion
end (int): the end of the IntergenicRegion
strand (STRAND): the strand the region is defined on
Example:
>>> IntergenicRegion('1', 1, 100, '+')
"""
BioInterval.__init__(self, chr, start, end)
self.strand = STRAND.enforce(strand)
[docs] def key(self):
"""see :func:`structural_variant.annotate.base.BioInterval.key`"""
return BioInterval.key(self), self.strand
@property
def chr(self):
"""returns the name of the chromosome that this region resides on"""
return self.reference_object
def __repr__(self):
return 'IntergenicRegion({}:{}_{}{})'.format(self.chr, self.start, self.end, self.strand)
[docs] def to_dict(self):
"""see :func:`structural_variant.annotate.base.BioInterval.to_dict`"""
d = BioInterval.to_dict(self)
d['strand'] = self.strand
return d
[docs]class Gene(BioInterval):
"""
"""
def __init__(self, chr, start, end, name=None, strand=STRAND.NS, aliases=None, seq=None):
"""
Args:
chr (str): the chromosome
name (str): the gene name/id i.e. ENSG0001
strand (STRAND): the genomic strand '+' or '-'
aliases (:class:`list` of :class:`str`): a list of aliases. For example the hugo name could go here
seq (str): genomic seq of the gene
Example:
>>> Gene('X', 1, 1000, 'ENG0001', '+', ['KRAS'])
"""
aliases = [] if aliases is None else aliases
chr = ReferenceName(chr)
BioInterval.__init__(self, name=name, reference_object=chr, start=start, end=end, seq=seq)
self.unspliced_transcripts = []
self.strand = STRAND.enforce(strand)
self.aliases = aliases
[docs] def transcript_priority(self, transcript):
"""
prioritizes transcripts from 0 to n-1 based on best transcript flag
and then alphanumeric name sort
Warning:
Lower number means higher priority. This is to make sort work by default
"""
def sort_key(t):
return (
0 if t.is_best_transcript else 1,
t.name, t.start - t.end, t.start, t.end
)
priority = sorted(self.transcripts, key=sort_key)
for i, curr_transcript in enumerate(priority):
if curr_transcript == transcript:
return i
raise ValueError('input transcript is not associated with this gene', transcript)
@property
def transcripts(self):
""":any:`list` of :class:`UsTranscript`: list of unspliced transcripts"""
return self.unspliced_transcripts
@property
def translations(self):
""":any:`list` of :class:`~mavis.annotate.protein.Translation`: list of translations"""
translations = []
for ust in self.unspliced_transcripts:
for tx in ust.transcripts:
for tl in tx.translations:
translations.append(tl)
return translations
@property
def chr(self):
"""returns the name of the chromosome that this gene resides on"""
return self.reference_object
[docs] def key(self):
"""see :func:`structural_variant.annotate.base.BioInterval.key`"""
return BioInterval.key(self), self.strand
[docs] def get_seq(self, reference_genome, ignore_cache=False):
"""
gene sequence is always given wrt to the positive forward strand regardless of gene strand
Args:
reference_genome (:class:`dict` of :class:`Bio.SeqRecord` by :class:`str`): dict of reference sequence by
template/chr name
ignore_cache (bool): if True then stored sequences will be ignored and the function will attempt to retrieve the sequence using the positions and the input reference_genome
Returns:
str: the sequence of the gene
"""
if self.seq and not ignore_cache:
return self.seq
elif reference_genome is None:
raise NotSpecifiedError('reference genome is required to retrieve the gene sequence')
else:
return str(reference_genome[self.chr].seq[self.start - 1:self.end]).upper()
@property
def spliced_transcripts(self):
""":any:`list` of :class:`Transcript`: list of transcripts"""
spl = []
for t in self.unspliced_transcripts:
spl.extend(t.spliced_transcripts)
return spl
[docs] def to_dict(self):
"""see :func:`structural_variant.annotate.base.BioInterval.to_dict`"""
d = BioInterval.to_dict(self)
d['strand'] = self.strand
return d
[docs]class Exon(BioInterval):
"""
"""
def __init__(
self, start, end,
transcript=None,
name=None,
intact_start_splice=True,
intact_end_splice=True,
seq=None,
strand=None):
"""
Args:
start (int): the genomic start position
end (int): the genomic end position
name (str): the name of the exon
transcript (UsTranscript): the 'parent' transcript this exon belongs to
intact_start_splice (bool): if the starting splice site has been abrogated
intact_end_splice (bool): if the end splice site has been abrogated
Raises:
AttributeError: if the exon start > the exon end
Example:
>>> Exon(15, 78)
"""
BioInterval.__init__(self, name=name, reference_object=transcript, start=start, end=end, seq=seq, strand=strand)
if self.is_reverse:
self.start_splice_site = SpliceSite(
self.transcript, self.start, site_type=SPLICE_SITE_TYPE.DONOR, strand=STRAND.NEG, intact=intact_start_splice)
self.end_splice_site = SpliceSite(
self.transcript, self.end, site_type=SPLICE_SITE_TYPE.ACCEPTOR, strand=STRAND.NEG, intact=intact_end_splice)
else:
self.start_splice_site = SpliceSite(
self.transcript, self.start, site_type=SPLICE_SITE_TYPE.ACCEPTOR, strand=STRAND.POS, intact=intact_start_splice)
self.end_splice_site = SpliceSite(
self.transcript, self.end, site_type=SPLICE_SITE_TYPE.DONOR, strand=STRAND.POS, intact=intact_end_splice)
@property
def transcript(self):
""":class:`UsTranscript`: the transcript this exon belongs to"""
return self.reference_object
@property
def donor_splice_site(self):
""":class:`~mavis.interval.Interval`: the genomic range describing the splice site"""
if self.is_reverse:
return self.start_splice_site
else:
return self.end_splice_site
@property
def acceptor_splice_site(self):
""":class:`~mavis.interval.Interval`: the genomic range describing the splice site"""
if self.is_reverse:
return self.end_splice_site
else:
return self.start_splice_site
@property
def donor(self):
"""`int`: returns the genomic exonic position of the donor splice site"""
if self.is_reverse:
return self.start
else:
return self.end
@property
def acceptor(self):
"""`int`: returns the genomic exonic position of the acceptor splice site"""
if self.is_reverse:
return self.end
else:
return self.start
def __repr__(self):
return 'Exon({}, {})'.format(self.start, self.end)
[docs]class UsTranscript(BioInterval):
"""
"""
def __init__(
self,
exons,
gene=None,
name=None,
strand=None,
spliced_transcripts=None,
seq=None,
is_best_transcript=False
):
""" creates a new transcript object
Args:
exons (:class:`list` of :any:`Exon`): list of Exon that make up the transcript
genomic_start (int): genomic start position of the transcript
genomic_end (int): genomic end position of the transcript
gene (Gene): the gene this transcript belongs to
name (str): name of the transcript
strand (STRAND): strand the transcript is on, defaults to the strand of the Gene if not specified
seq (str): unspliced cDNA seq
"""
# cannot use mutable default args in the function decl
self.exons = exons
self.spliced_transcripts = [] if spliced_transcripts is None else spliced_transcripts
self.is_best_transcript = is_best_transcript
if len(exons) == 0:
raise AttributeError('exons must be given')
start = min([e[0] for e in self.exons])
end = max([e[1] for e in self.exons])
BioInterval.__init__(self, gene, start, end, name=name, seq=seq, strand=strand)
for i, curr in enumerate(self.exons):
if isinstance(curr, Exon):
curr.reference_object = self
else:
self.exons[i] = Exon(curr[0], curr[1], self)
self.exons = sorted(self.exons, key=lambda x: x.start)
for ex in self.exons:
if ex.end > self.end or ex.start < self.start:
raise AssertionError('exon is outside transcript', self, ex)
for e1, e2 in itertools.combinations(self.exons, 2):
if Interval.overlaps(e1, e2):
raise AttributeError('exons cannot overlap')
for s in self.spliced_transcripts:
s.reference_object = self
try:
if self.get_strand() != self.gene.get_strand():
raise AssertionError('gene strand and transcript strand conflict')
except AttributeError:
pass
[docs] def generate_splicing_patterns(self):
"""
returns a list of splice sites to be connected as a splicing pattern
Returns:
:class:`list` of :class:`SplicingPattern`: List of positions to be spliced together
see :ref:`theory - predicting splicing patterns <theory-predicting-splicing-patterns>`
"""
exons = sorted(self.exons, key=lambda x: x[0])
if len(exons) < 2:
return [SplicingPattern()]
donor = 3
acceptor = 5
reverse = True if self.get_strand() == STRAND.NEG else False
pattern = [(exons[0].end, exons[0].end_splice_site.intact, acceptor if reverse else donor)] # always start with a donor
for i in range(1, len(exons) - 1):
pattern.append((exons[i].start, exons[i].start_splice_site.intact, donor if reverse else acceptor))
pattern.append((exons[i].end, exons[i].end_splice_site.intact, acceptor if reverse else donor))
pattern.append((exons[-1].start, exons[-1].start_splice_site.intact, donor if reverse else acceptor)) # always end with acceptor
original_sites = [p for p, s, t in pattern]
pattern = [(p, t) for p, s, t in pattern if s] # filter out abrogated splice sites
if reverse:
pattern.reverse()
def get_cons_sites(index, splice_type):
# get the next 'n' of any given type
temp = []
for i in range(index, len(pattern)):
if pattern[i][1] == splice_type:
temp.append(pattern[i])
else:
break
return temp
splice_site_sets = [SplicingPattern()]
# i should start at the first donor site
i = len(get_cons_sites(0, acceptor))
while i < len(pattern):
if pattern[i][1] == donor:
donors = get_cons_sites(i, donor)
acceptors = get_cons_sites(i + len(donors), acceptor)
temp = []
for donor_site, acceptor_site in sorted(itertools.product(donors, acceptors)):
for curr_splss in splice_site_sets:
splss = copy(curr_splss)
splss.extend([donor_site[0], acceptor_site[0]])
temp.append(splss)
if temp:
splice_site_sets = temp
i += len(donors) + len(acceptors)
else:
raise AssertionError('should always be positioned at a donor splice site')
# now need to decide the type for each set
for splss in splice_site_sets:
classification = SplicingPattern.classify(splss, original_sites)
splss.splice_type = classification
if reverse:
splss.reverse()
return splice_site_sets
@property
def gene(self):
""":any:`Gene`: the gene this transcript belongs to"""
return self.reference_object
def _genomic_to_cdna_mapping(self, splicing_pattern):
"""
Args:
splicing_pattern (SplicingPattern): list of genomic splice sites 3'5' repeating
"""
mapping = {}
length = 1
pos = sorted(splicing_pattern + [self.start, self.end])
genome_intervals = [Interval(s, t) for s, t in zip(pos[::2], pos[1::2])]
if self.get_strand() == STRAND.POS:
pass
elif self.get_strand() == STRAND.NEG:
genome_intervals.reverse()
else:
raise NotSpecifiedError('cannot convert without strand information')
for exon in genome_intervals:
mapping[exon] = Interval(length, length + len(exon) - 1)
length += len(exon)
return mapping
def _cdna_to_genomic_mapping(self, splicing_pattern):
"""
Args:
splicing_pattern (SplicingPattern): list of genomic splice sites 3'5' repeating
"""
mapping = {v: k for k, v in self._genomic_to_cdna_mapping(splicing_pattern).items()}
return mapping
[docs] def convert_genomic_to_cdna(self, pos, splicing_pattern):
"""
Args:
pos (int): the genomic position to be converted
splicing_pattern (SplicingPattern): list of genomic splice sites 3'5' repeating
Returns:
int: the cdna equivalent
Raises:
:class:`~mavis.error.IndexError`: when a genomic position not present in the
cdna is attempted to be converted
"""
cdna_pos, shift = self.convert_genomic_to_nearest_cdna(pos, splicing_pattern)
if shift != 0:
raise IndexError('outside of exonic regions', pos, splicing_pattern, cdna_pos, shift)
return cdna_pos
[docs] def convert_genomic_to_nearest_cdna(self, pos, splicing_pattern):
"""
converts a genomic position to its cdna equivalent or (if intronic) the nearest cdna and shift
Args:
pos (int): the genomic position
splicing_pattern (SplicingPattern): the splicing pattern
Returns:
tuple of int and int:
* *int* - the exonic cdna position
* *int* - the intronic shift
"""
mapping = self._genomic_to_cdna_mapping(splicing_pattern)
exons = sorted(list(mapping.keys()))
# exonic
for ex in exons:
if pos <= ex.end and pos >= ex.start:
# in the current exon
cdna_pos = Interval.convert_pos(mapping, pos, True if self.get_strand() == STRAND.NEG else False)
return cdna_pos, 0
# intronic
for ex1, ex2 in zip(exons, exons[1::]):
if pos > ex1.end and pos < ex2.start:
# in the current intron
if abs(pos - ex1.end) <= abs(pos - ex2.start):
# closest to the first exon
cdna_pos = Interval.convert_pos(mapping, ex1.end, True if self.get_strand() == STRAND.NEG else False)
return cdna_pos, pos - ex1.end if self.get_strand() == STRAND.POS else ex1.end - pos
cdna_pos = Interval.convert_pos(mapping, ex2.start, True if self.get_strand() == STRAND.NEG else False)
return cdna_pos, pos - ex2.start if self.get_strand() == STRAND.POS else ex2.start - pos
raise IndexError('position does not fall within the current transcript', pos, mapping)
[docs] def convert_cdna_to_genomic(self, pos, splicing_pattern):
"""
Args:
pos (int): cdna position
splicing_pattern (SplicingPattern): list of genomic splice sites 3'5' repeating
Returns:
int: the genomic equivalent
"""
mapping = self._cdna_to_genomic_mapping(splicing_pattern)
return Interval.convert_pos(mapping, pos, True if self.get_strand() == STRAND.NEG else False)
[docs] def exon_number(self, exon):
"""
exon numbering is based on the direction of translation
Args:
exon (Exon): the exon to be numbered
Returns:
int: the exon number (1 based)
Raises:
AttributeError: if the strand is not given or the exon does not belong to the transcript
"""
for i, current_exon in enumerate(self.exons):
if exon != current_exon:
continue
if self.get_strand() == STRAND.POS:
return i + 1
elif self.get_strand() == STRAND.NEG:
return len(self.exons) - i
else:
raise NotSpecifiedError('strand must be pos or neg to calculate the exon number')
raise AttributeError('can only calculate phase on associated exons')
[docs] def get_seq(self, reference_genome=None, ignore_cache=False):
"""
Args:
reference_genome (:class:`dict` of :class:`Bio.SeqRecord` by :class:`str`): dict of reference sequence
by template/chr name
ignore_cache (bool): if True then stored sequences will be ignored and the function will attempt to retrieve the sequence using the positions and the input reference_genome
Returns:
str: the sequence of the transcript including introns (but relative to strand)
"""
if self.seq and not ignore_cache:
return self.seq
elif self.gene and self.gene.seq and not ignore_cache:
# gene has a seq set
start = self.start - self.gene.start
end = self.end - self.gene.end + len(self.gene.seq)
if self.get_strand() == STRAND.NEG:
return reverse_complement(self.gene.seq[start:end])
return self.gene.seq[start:end]
elif reference_genome is None:
raise NotSpecifiedError('reference genome is required to retrieve the gene sequence')
if self.get_strand() == STRAND.NEG:
return reverse_complement(reference_genome[self.gene.chr].seq[self.start - 1:self.end]).upper()
return str(reference_genome[self.gene.chr].seq[self.start - 1:self.end]).upper()
[docs] def get_cdna_seq(self, splicing_pattern, reference_genome=None, ignore_cache=False):
"""
Args:
splicing_pattern (SplicingPattern): the list of splicing positions
reference_genome (:class:`dict` of :class:`Bio.SeqRecord` by :class:`str`): dict of reference sequence
by template/chr name
ignore_cache (bool): if True then stored sequences will be ignored and the function will attempt to retrieve the sequence using the positions and the input reference_genome
Returns:
str: the spliced cDNA sequence
"""
temp = sorted([self.start] + splicing_pattern + [self.end])
cdna_start = min(temp)
conti = []
for i in range(0, len(temp) - 1, 2):
conti.append(Interval(temp[i] - cdna_start, temp[i + 1] - cdna_start))
seq = self.get_seq(reference_genome, ignore_cache)
if self.get_strand() == STRAND.NEG:
# adjust the continuous intervals for the min and flip if revcomp
seq = reverse_complement(seq)
spliced_seq = ''.join([str(seq[i.start:i.end + 1]) for i in conti])
spliced_seq = spliced_seq.upper()
return spliced_seq if self.get_strand() == STRAND.POS else reverse_complement(spliced_seq)
@property
def translations(self):
""":class:`list` of :class:`~mavis.annotate.protein.Translation`: list of translations associated with this transcript"""
translations = []
for spl_tx in self.spliced_transcripts:
for translation in spl_tx.translations:
translations.append(translation)
return translations
@property
def transcripts(self):
""":class:`list` of :class:`Transcript`: list of spliced transcripts"""
return self.spliced_transcripts
[docs]class Transcript(BioInterval):
def __init__(self, ust, splicing_patt, seq=None, translations=None):
"""
splicing pattern is given in genomic coordinates
Args:
us_transcript (UsTranscript): the unspliced transcript
splicing_patt (:class:`list` of :class:`int`): the list of splicing positions
seq (str): the cdna sequence
translations (:class:`list` of :class:`~mavis.annotate.protein.Translation`):
the list of translations of this transcript
"""
pos = sorted([ust.start, ust.end] + splicing_patt)
splicing_patt.sort()
self.splicing_pattern = splicing_patt
length = sum([t - s + 1 for s, t in zip(pos[::2], pos[1::2])])
BioInterval.__init__(self, ust, 1, length, seq=None)
self.exons = [Exon(s, t, self) for s, t in zip(pos[::2], pos[1::2])]
self.translations = [] if translations is None else [tx for tx in translations]
for translation in self.translations:
translation.reference_object = self
if splicing_patt and (min(splicing_patt) < ust.start or max(splicing_patt) > ust.end):
raise AssertionError('splicing pattern must be contained within the unspliced transcript')
elif len(splicing_patt) % 2 != 0:
raise AssertionError('splicing pattern must be a list of 3\'5\' splicing positions')
[docs] def convert_genomic_to_cdna(self, pos):
"""
Args:
pos (int): the genomic position to be converted
Returns:
int: the cdna equivalent
Raises:
IndexError: when a genomic position not present in the cdna is attempted to be converted
"""
return self.unspliced_transcript.convert_genomic_to_cdna(pos, self.splicing_pattern)
[docs] def convert_genomic_to_nearest_cdna(self, pos):
return self.reference_object.convert_genomic_to_nearest_cdna(pos, self.splicing_pattern)
[docs] def convert_cdna_to_genomic(self, pos):
"""
Args:
pos (int): cdna position
Returns:
int: the genomic equivalent
"""
return self.unspliced_transcript.convert_cdna_to_genomic(pos, self.splicing_pattern)
[docs] def get_seq(self, reference_genome=None, ignore_cache=False):
"""
Args:
reference_genome (:class:`dict` of :class:`Bio.SeqRecord` by :class:`str`): dict of reference sequence by
template/chr name
ignore_cache (bool): if True then stored sequences will be ignored and the function will attempt to retrieve the sequence using the positions and the input reference_genome
Returns:
str: the sequence corresponding to the spliced cdna
"""
if self.seq and not ignore_cache:
return self.seq
seq = self.unspliced_transcript.get_cdna_seq(self.splicing_pattern, reference_genome, ignore_cache)
return seq[self.start - 1:self.end]
@property
def unspliced_transcript(self):
""":class:`UsTranscript`: the unspliced transcript this splice variant belongs to"""
return self.reference_object