from ..interval import Interval
from .base import BioInterval
from ..constants import translate, START_AA, STOP_AA, CODON_SIZE
import itertools
from ..error import NotSpecifiedError
[docs]def calculate_ORF(spliced_cdna_sequence, min_orf_size=None):
"""
calculate all possible open reading frames given a spliced cdna sequence (no introns)
Args:
spliced_cdna_sequence (str): the sequence
Returns:
:any:`list` of :any:`Interval`: list of open reading frame positions on the input sequence
"""
# do not revcomp
assert(START_AA != STOP_AA)
cds_orfs = [] # (cds_start, cds_end)
for offset in range(0, CODON_SIZE):
aa_sequence = translate(spliced_cdna_sequence, offset)
# now calc the open reading frames
starts = []
stops = []
for i, aa in enumerate(aa_sequence):
if aa == START_AA:
starts.append(i * CODON_SIZE + 1 + offset)
elif aa == STOP_AA:
stops.append((i + 1) * CODON_SIZE + offset)
orfs = []
for s in starts:
for t in sorted(stops):
if t > s:
i = Interval(s, t)
if not min_orf_size or len(i) >= min_orf_size:
orfs.append(Interval(s, t))
break
temp = {}
for orf in orfs:
if orf.end not in temp:
temp[orf.end] = orf
elif len(orf) > len(temp[orf.end]):
temp[orf.end] = orf
cds_orfs.extend(temp.values())
return cds_orfs
[docs]class DomainRegion(BioInterval):
def __init__(self, start, end, seq=None, domain=None, name=None):
BioInterval.__init__(self, domain, start, end, seq=seq, name=name)
if seq and len(seq) != len(self):
raise AssertionError('domain region seq must be of equal length', self, seq)
[docs]class Domain:
"""
"""
def __init__(self, name, regions, translation=None, data=None):
"""
Args:
name (str): the name of the domain i.e. PF00876
regions (:any:`list` of :any:`DomainRegion`): the amino acid ranges that are part of the domain
transcript (Transcript): the 'parent' transcript this domain belongs to
Raises:
AttributeError: if the end of any region is less than the start
Example:
>>> Domain('DNA binding domain', [(1, 4), (10, 24)], transcript)
"""
self.reference_object = translation
self.name = name
self.regions = sorted(list(set(regions))) # remove duplicates
self.data = dict()
if data is not None:
self.data.update(data)
if len(regions) == 0:
raise AttributeError('at least one region must be given')
for r1, r2 in itertools.combinations(self.regions, 2):
if Interval.overlaps(r1, r2):
raise AttributeError('regions cannot overlap')
for i in range(0, len(self.regions)):
curr = self.regions[i]
if not hasattr(curr, 'seq'):
self.regions[i] = DomainRegion(curr[0], curr[1])
@property
def translation(self):
""":class:`~structural_variant.annotate.Translation`: the Translation this domain belongs to"""
return self.reference_object
[docs] def key(self):
""":class:`tuple`: a tuple representing the items expected to be unique. for hashing and comparing"""
return tuple([self.name, self.translation])
[docs] def score_region_mapping(self, REFERENCE_GENOME=None):
"""
compares the sequence in each DomainRegion to the sequence collected for that domain region from the
translation object
Args:
REFERENCE_GENOME (:class:`dict` of :class:`Bio.SeqRecord` by :class:`str`): dict of reference sequence
by template/chr name
Returns:
tuple of int and int: tuple contains
- int: the number of matching amino acids
- int: the total number of amino acids
"""
if self.translation:
aa = self.translation.get_AA_seq(REFERENCE_GENOME)
total = 0
matches = 0
for region in self.regions:
if not region.seq:
raise NotSpecifiedError('insufficient sequence information')
ref = aa[region.start - 1:region.end]
for c1, c2 in zip(ref, region.seq):
if c1 == c2:
matches += 1
total += 1
return matches, total
else:
raise NotSpecifiedError('insufficient sequence information')
[docs] def get_seqs(self, REFERENCE_GENOME=None, ignore_cache=False):
"""
returns the amino acid sequences for each of the domain regions associated with
this domain in the order of the regions (sorted by start)
Args:
REFERENCE_GENOME (:class:`dict` of :class:`Bio.SeqRecord` by :class:`str`): dict of reference sequence
by template/chr name
Returns:
:class:`list` of :class:`str`: list of amino acid sequences for each DomainRegion
Raises:
AttributeError: if there is not enough sequence information given to determine this
"""
sequences = {}
if not ignore_cache:
for region in self.regions:
if region.seq:
sequences[region] = region.seq
if any([r not in sequences for r in self.regions]):
if self.translation:
aa = self.translation.get_AA_seq(REFERENCE_GENOME)
for region in self.regions:
s = aa[region.start - 1:region.end]
if region not in sequences:
sequences[region] = s
else:
raise NotSpecifiedError('insufficient sequence information')
return [sequences[r] for r in self.regions]
[docs] def align_seq(self, input_sequence, REFERENCE_GENOME=None):
"""
align each region to the input sequence starting with the last one.
then take the subset of sequence that remains to align the second last and so on
return a list of intervals for the alignment. If multiple alignments are found,
then raise an error
Args:
input_sequence (str): the sequence to be aligned to
REFERENCE_GENOME (:class:`dict` of :class:`Bio.SeqRecord` by :class:`str`): dict of reference sequence
by template/chr name
Returns:
tuple: tuple contains
- int: the number of matches
- int: the total number of amino acids to be aligned
- :any:`list` of :any:`DomainRegion`: the list of domain regions on the new input sequence
Raises:
AttributeError: if sequence information is not available
UserWarning: if a valid alignment could not be found or no best alignment was found
"""
seq_list = self.get_seqs(REFERENCE_GENOME)
total = sum([len(s) for s in seq_list])
results = {}
for seq in set(seq_list):
# align the current sequence to find the best matches
scores = []
for p in range(0, len(input_sequence) - len(seq) + 1):
score = 0
for i in range(0, len(seq)):
if input_sequence[p + i].upper() == seq[i].upper():
score += 1
if score > 0:
scores.append((Interval(p + 1, p + len(seq)), score))
if len(scores) == 0:
raise UserWarning('could not align a given region')
results.setdefault(seq, []).extend(scores)
# take the best score for each region and see if they work in sequence
best = []
for seq in seq_list:
temp = max([s for i, s in results[seq]])
curr = [(i, s) for i, s in results[seq] if s == temp]
best.append(curr)
combinations = []
for combo in itertools.product(*best):
total_score = sum([s for i, s in combo])
valid = True
for i in range(1, len(combo)):
if combo[i][0].start <= combo[i - 1][0].end:
valid = False
break
if valid:
combinations.append((total_score, [i for i, s in combo]))
if len(combinations) == 0:
raise UserWarning('could not map the sequences to the input')
else:
high = max([s for s, pl in combinations])
temp = []
for score, pl in combinations:
if score == high:
temp.append(pl)
if len(temp) > 1:
raise UserWarning('multiple mappings of equal score')
else:
regions = []
for pos, seq in zip(temp[0], seq_list):
regions.append(DomainRegion(pos.start, pos.end, seq))
return high, total, regions
[docs]class Translation(BioInterval):
def __init__(self, start, end, transcript=None, domains=None, seq=None, name=None):
"""
describes the splicing pattern and cds start and end with reference to a particular transcript
Args:
start (int): start of the coding sequence (cds) relative to the start of the first exon in the transcript
end (int): end of the coding sequence (cds) relative to the start of the first exon in the transcript
transcript (Transcript): the transcript this is a Translation of
domains (:class:`list` of :any:`Domain`): a list of the domains on this translation
sequence (str): the cds sequence
"""
domains = [] if domains is None else domains
BioInterval.__init__(self, reference_object=transcript, name=name, start=start, end=end, seq=seq)
self.domains = [d for d in domains]
if start <= 0:
raise AttributeError('start must be a positive integer')
if transcript and end > len(transcript):
raise AttributeError('translation cannot be outside of related transcript range', end, len(transcript))
for d in domains:
d.reference_object = self
@property
def transcript(self):
""":class:`~structural_variant.annotate.genomic.Transcript`: the spliced transcript this translation belongs to
"""
return self.reference_object
[docs] def convert_aa_to_cdna(self, pos):
"""
Args:
pos (int): the amino acid position
Returns:
Interval: the cdna equivalent (with CODON_SIZE uncertainty)
"""
return Interval(self.start - 1 + (pos - 1) * 3 + 1, self.start - 1 + pos * 3)
[docs] def convert_cdna_to_aa(self, pos):
"""
Args:
pos (int): the cdna position
Returns:
int: the protein/amino-acid position
Raises:
AttributeError: the cdna position is not translated
"""
if pos < self.start or pos > self.end:
raise IndexError('position is out of bounds')
pos = pos - self.start + 1
aa = pos // CODON_SIZE
if pos % CODON_SIZE != 0:
aa += 1
return aa
[docs] def convert_genomic_to_cds(self, pos):
"""
converts a genomic position to its cds (coding sequence) equivalent
Args:
pos (int): the genomic position
Returns:
int: the cds position (negative if before the initiation start site)
"""
cds, shift = self.convert_genomic_to_nearest_cds(pos)
if shift != 0:
raise IndexError('conversion failed. position is outside the exonic region')
return cds
[docs] def convert_genomic_to_nearest_cds(self, pos):
"""
converts a genomic position to its cds equivalent or (if intronic) the nearest cds and shift
Args:
pos (int): the genomic position
Returns:
tuple of int and int:
* *int* - the cds position
* *int* - the intronic shift
"""
c, shift = self.transcript.convert_genomic_to_nearest_cdna(pos)
if c >= self.start:
c = c - self.start + 1
else:
c -= self.start
return c, shift
[docs] def convert_genomic_to_cds_notation(self, pos):
"""
converts a genomic position to its cds (coding sequence) equivalent using
`hgvs <http://www.hgvs.org/mutnomen/recs-DNA.html>`_ cds notation
Args:
pos (int): the genomic position
Returns:
str: the cds position notation
Example:
>>> tl = Translation(...)
# a position before the translation start
>>> tl.convert_genomic_to_cds_notation(1010)
'-50'
# a position after the translation end
>>> tl.convert_genomic_to_cds_notation(2031)
'*72'
# an intronic position
>>> tl.convert_genomic_to_cds_notation(1542)
'50+10'
>>> tl.convert_genomic_to_cds_notation(1589)
'51-14'
"""
c, shift = self.convert_genomic_to_nearest_cds(pos)
sc = ''
if shift > 0:
sc = '+{}'.format(shift)
elif shift < 0:
sc = str(shift)
if c > len(self):
return '*{}{}'.format(c - len(self), sc)
return '{}{}'.format(c, sc)
[docs] def get_cds_seq(self, REFERENCE_GENOME=None, ignore_cache=False):
"""
Args:
REFERENCE_GENOME (:class:`dict` of :class:`Bio.SeqRecord` by :class:`str`): dict of reference sequence
by template/chr name
Returns:
str: the cds sequence
Raises:
AttributeError: if the reference sequence has not been given and is not set
"""
if self.seq and not ignore_cache:
return self.seq
elif self.transcript and self.transcript.get_strand():
seq = self.transcript.get_seq(REFERENCE_GENOME, ignore_cache)
return seq[self.start - 1:self.end]
raise NotSpecifiedError('insufficient seq information')
[docs] def get_seq(self, REFERENCE_GENOME=None, ignore_cache=False):
"""
wrapper for the sequence method
Args:
REFERENCE_GENOME (:class:`dict` of :class:`Bio.SeqRecord` by :class:`str`): dict of reference sequence
by template/chr name
"""
return self.get_cds_seq(REFERENCE_GENOME, ignore_cache)
[docs] def get_AA_seq(self, REFERENCE_GENOME=None, ignore_cache=False):
"""
Args:
REFERENCE_GENOME (:class:`dict` of :class:`Bio.SeqRecord` by :class:`str`): dict of reference sequence
by template/chr name
Returns:
str: the amino acid sequence
Raises:
AttributeError: if the reference sequence has not been given and is not set
"""
cds = self.get_cds_seq(REFERENCE_GENOME, ignore_cache)
return translate(cds)
[docs] def key(self):
"""see :func:`structural_variant.annotate.base.BioInterval.key`"""
return BioInterval.key(self), self.splicing_pattern