Source code for mavis.annotate.protein

from ..interval import Interval
from .base import BioInterval
from ..constants import translate, START_AA, STOP_AA, CODON_SIZE
import itertools
from ..error import NotSpecifiedError


[docs]def calculate_ORF(spliced_cdna_sequence, min_orf_size=None): """ calculate all possible open reading frames given a spliced cdna sequence (no introns) Args: spliced_cdna_sequence (str): the sequence Returns: :any:`list` of :any:`Interval`: list of open reading frame positions on the input sequence """ # do not revcomp assert(START_AA != STOP_AA) cds_orfs = [] # (cds_start, cds_end) for offset in range(0, CODON_SIZE): aa_sequence = translate(spliced_cdna_sequence, offset) # now calc the open reading frames orf_intervals = [] current_start = None for i, aa in enumerate(aa_sequence): if aa == START_AA: p = i * CODON_SIZE + 1 + offset if current_start is None: current_start = p elif aa == STOP_AA: p = (i + 1) * CODON_SIZE + offset if current_start is not None: # close the current interval itvl = Interval(current_start, p) if min_orf_size is None or len(itvl) >= min_orf_size: orf_intervals.append(itvl) current_start = None cds_orfs.extend(orf_intervals) return cds_orfs
[docs]class DomainRegion(BioInterval): def __init__(self, start, end, seq=None, domain=None, name=None): BioInterval.__init__(self, domain, start, end, seq=seq, name=name) if seq and len(seq) != len(self): raise AssertionError('domain region seq must be of equal length', self, seq)
[docs]class Domain: """ """ def __init__(self, name, regions, translation=None, data=None): """ Args: name (str): the name of the domain i.e. PF00876 regions (:any:`list` of :any:`DomainRegion`): the amino acid ranges that are part of the domain transcript (Transcript): the 'parent' transcript this domain belongs to Raises: AttributeError: if the end of any region is less than the start Example: >>> Domain('DNA binding domain', [(1, 4), (10, 24)], transcript) """ self.reference_object = translation self.name = name self.regions = sorted(list(set(regions))) # remove duplicates self.data = dict() if data is not None: self.data.update(data) if len(regions) == 0: raise AttributeError('at least one region must be given') for r1, r2 in itertools.combinations(self.regions, 2): if Interval.overlaps(r1, r2): raise AttributeError('regions cannot overlap') for i in range(0, len(self.regions)): curr = self.regions[i] if not hasattr(curr, 'seq'): self.regions[i] = DomainRegion(curr[0], curr[1]) @property def translation(self): """:class:`~mavis.annotate.Translation`: the Translation this domain belongs to""" return self.reference_object
[docs] def key(self): """:class:`tuple`: a tuple representing the items expected to be unique. for hashing and comparing""" return tuple([self.name, self.translation])
[docs] def score_region_mapping(self, REFERENCE_GENOME=None): """ compares the sequence in each DomainRegion to the sequence collected for that domain region from the translation object Args: REFERENCE_GENOME (:class:`dict` of :class:`Bio.SeqRecord` by :class:`str`): dict of reference sequence by template/chr name Returns: tuple of int and int: tuple contains - int: the number of matching amino acids - int: the total number of amino acids """ if self.translation: aa = self.translation.get_AA_seq(REFERENCE_GENOME) total = 0 matches = 0 for region in self.regions: if not region.seq: raise NotSpecifiedError('insufficient sequence information') ref = aa[region.start - 1:region.end] for c1, c2 in zip(ref, region.seq): if c1 == c2: matches += 1 total += 1 return matches, total else: raise NotSpecifiedError('insufficient sequence information')
[docs] def get_seqs(self, REFERENCE_GENOME=None, ignore_cache=False): """ returns the amino acid sequences for each of the domain regions associated with this domain in the order of the regions (sorted by start) Args: REFERENCE_GENOME (:class:`dict` of :class:`Bio.SeqRecord` by :class:`str`): dict of reference sequence by template/chr name Returns: :class:`list` of :class:`str`: list of amino acid sequences for each DomainRegion Raises: AttributeError: if there is not enough sequence information given to determine this """ sequences = {} if not ignore_cache: for region in self.regions: if region.seq: sequences[region] = region.seq if any([r not in sequences for r in self.regions]): if self.translation: aa = self.translation.get_AA_seq(REFERENCE_GENOME) for region in self.regions: s = aa[region.start - 1:region.end] if region not in sequences: sequences[region] = s else: raise NotSpecifiedError('insufficient sequence information') return [sequences[r] for r in self.regions]
[docs] def align_seq(self, input_sequence, REFERENCE_GENOME=None, min_region_match=0.5): """ align each region to the input sequence starting with the last one. then take the subset of sequence that remains to align the second last and so on return a list of intervals for the alignment. If multiple alignments are found, then raise an error Args: input_sequence (str): the sequence to be aligned to REFERENCE_GENOME (:class:`dict` of :class:`Bio.SeqRecord` by :class:`str`): dict of reference sequence by template/chr name min_region_match (float): percent between 0 and 1. Each region must have a score len(seq) * min_region_match Returns: tuple: tuple contains - int: the number of matches - int: the total number of amino acids to be aligned - :any:`list` of :any:`DomainRegion`: the list of domain regions on the new input sequence Raises: AttributeError: if sequence information is not available UserWarning: if a valid alignment could not be found or no best alignment was found """ seq_list = self.get_seqs(REFERENCE_GENOME) dr_by_seq = {s: d for s, d in zip(seq_list, self.regions)} seq_list = sorted(seq_list, key=lambda x: dr_by_seq[x].start) total = sum([len(s) for s in seq_list]) if total > len(input_sequence): raise UserWarning('could not map the sequences to the input') results = [] last_min_end = 0 for seq in seq_list: # align the current sequence to find the best matches scores = [] min_match = max(1, int(round(len(seq) * min_region_match, 0))) for p in range(last_min_end, len(input_sequence) - len(seq) + 1): score = 0 for i in range(0, len(seq)): if input_sequence[p + i].upper() == seq[i].upper(): score += 1 if score > min_match: scores.append((Interval(p + 1, p + len(seq)), score)) if len(scores) == 0: raise UserWarning('could not align a given region', seq) best_score = max([s[1] for s in scores]) best = [s for s in scores if s[1] == best_score] results.append(best) last_min_end = min([s[0].end for s in best]) # take the best score for each region and see if they work in sequence combinations = [[p] for p in results[0]] for scores in results[1:]: temp_combos = [] for pos, score in scores: for curr in combinations: if pos.start > curr[-1][0].end: temp_combos.append(curr[:] + [(pos, score)]) combinations = temp_combos if len(combinations) == 0: break # compute the cumulative scores for the putative alignments best_score = None best_scoring_alignments = [] for alignment in combinations: cumu_score = sum([s[1] for s in alignment]) if len(best_scoring_alignments) == 0 or cumu_score > best_score: best_scoring_alignments = [alignment] best_score = cumu_score elif cumu_score == best_score: best_scoring_alignments.append(alignment) if len(best_scoring_alignments) == 0: raise UserWarning('could not map the sequences to the input') elif len(best_scoring_alignments) > 1: raise UserWarning('multiple mappings of equal score') else: alignment = [s[0] for s in best_scoring_alignments[0]] regions = [] for itvl, seq in zip(alignment, seq_list): regions.append(DomainRegion(itvl.start, itvl.end, seq)) return best_score, total, regions
[docs]class Translation(BioInterval): def __init__(self, start, end, transcript=None, domains=None, seq=None, name=None): """ describes the splicing pattern and cds start and end with reference to a particular transcript Args: start (int): start of the coding sequence (cds) relative to the start of the first exon in the transcript end (int): end of the coding sequence (cds) relative to the start of the first exon in the transcript transcript (Transcript): the transcript this is a Translation of domains (:class:`list` of :any:`Domain`): a list of the domains on this translation sequence (str): the cds sequence """ domains = [] if domains is None else domains BioInterval.__init__(self, reference_object=transcript, name=name, start=start, end=end, seq=seq) self.domains = [d for d in domains] if start <= 0: raise AttributeError('start must be a positive integer') if transcript and end > len(transcript): raise AttributeError('translation cannot be outside of related transcript range', end, len(transcript)) for d in domains: d.reference_object = self @property def transcript(self): """:class:`~mavis.annotate.genomic.Transcript`: the spliced transcript this translation belongs to """ return self.reference_object
[docs] def convert_aa_to_cdna(self, pos): """ Args: pos (int): the amino acid position Returns: Interval: the cdna equivalent (with CODON_SIZE uncertainty) """ return Interval(self.start - 1 + (pos - 1) * 3 + 1, self.start - 1 + pos * 3)
[docs] def convert_cdna_to_aa(self, pos): """ Args: pos (int): the cdna position Returns: int: the protein/amino-acid position Raises: AttributeError: the cdna position is not translated """ if pos < self.start or pos > self.end: raise IndexError('position is out of bounds') pos = pos - self.start + 1 aa = pos // CODON_SIZE if pos % CODON_SIZE != 0: aa += 1 return aa
[docs] def convert_genomic_to_cds(self, pos): """ converts a genomic position to its cds (coding sequence) equivalent Args: pos (int): the genomic position Returns: int: the cds position (negative if before the initiation start site) """ cds, shift = self.convert_genomic_to_nearest_cds(pos) if shift != 0: raise IndexError('conversion failed. position is outside the exonic region') return cds
[docs] def convert_genomic_to_nearest_cds(self, pos): """ converts a genomic position to its cds equivalent or (if intronic) the nearest cds and shift Args: pos (int): the genomic position Returns: tuple of int and int: * *int* - the cds position * *int* - the intronic shift """ c, shift = self.transcript.convert_genomic_to_nearest_cdna(pos) if c >= self.start: c = c - self.start + 1 else: c -= self.start return c, shift
[docs] def convert_genomic_to_cds_notation(self, pos): """ converts a genomic position to its cds (coding sequence) equivalent using `hgvs <http://www.hgvs.org/mutnomen/recs-DNA.html>`_ cds notation Args: pos (int): the genomic position Returns: str: the cds position notation Example: >>> tl = Translation(...) # a position before the translation start >>> tl.convert_genomic_to_cds_notation(1010) '-50' # a position after the translation end >>> tl.convert_genomic_to_cds_notation(2031) '*72' # an intronic position >>> tl.convert_genomic_to_cds_notation(1542) '50+10' >>> tl.convert_genomic_to_cds_notation(1589) '51-14' """ c, shift = self.convert_genomic_to_nearest_cds(pos) sc = '' if shift > 0: sc = '+{}'.format(shift) elif shift < 0: sc = str(shift) if c > len(self): return '*{}{}'.format(c - len(self), sc) return '{}{}'.format(c, sc)
[docs] def get_cds_seq(self, REFERENCE_GENOME=None, ignore_cache=False): """ Args: REFERENCE_GENOME (:class:`dict` of :class:`Bio.SeqRecord` by :class:`str`): dict of reference sequence by template/chr name Returns: str: the cds sequence Raises: AttributeError: if the reference sequence has not been given and is not set """ if self.seq and not ignore_cache: return self.seq elif self.transcript and self.transcript.get_strand(): seq = self.transcript.get_seq(REFERENCE_GENOME, ignore_cache) return seq[self.start - 1:self.end] raise NotSpecifiedError('insufficient seq information')
[docs] def get_seq(self, REFERENCE_GENOME=None, ignore_cache=False): """ wrapper for the sequence method Args: REFERENCE_GENOME (:class:`dict` of :class:`Bio.SeqRecord` by :class:`str`): dict of reference sequence by template/chr name """ return self.get_cds_seq(REFERENCE_GENOME, ignore_cache)
[docs] def get_AA_seq(self, REFERENCE_GENOME=None, ignore_cache=False): """ Args: REFERENCE_GENOME (:class:`dict` of :class:`Bio.SeqRecord` by :class:`str`): dict of reference sequence by template/chr name Returns: str: the amino acid sequence Raises: AttributeError: if the reference sequence has not been given and is not set """ cds = self.get_cds_seq(REFERENCE_GENOME, ignore_cache) return translate(cds)
[docs] def key(self): """see :func:`structural_variant.annotate.base.BioInterval.key`""" return BioInterval.key(self), self.splicing_pattern