Source code for mavis.annotate.protein

from ..interval import Interval
from .base import BioInterval
from ..constants import translate, START_AA, STOP_AA, CODON_SIZE
import itertools
from ..error import NotSpecifiedError


[docs]def calculate_ORF(spliced_cdna_sequence, min_orf_size=None): """ calculate all possible open reading frames given a spliced cdna sequence (no introns) Args: spliced_cdna_sequence (str): the sequence Returns: :any:`list` of :any:`Interval`: list of open reading frame positions on the input sequence """ # do not revcomp assert(START_AA != STOP_AA) cds_orfs = [] # (cds_start, cds_end) for offset in range(0, CODON_SIZE): aa_sequence = translate(spliced_cdna_sequence, offset) # now calc the open reading frames starts = [] stops = [] for i, aa in enumerate(aa_sequence): if aa == START_AA: starts.append(i * CODON_SIZE + 1 + offset) elif aa == STOP_AA: stops.append((i + 1) * CODON_SIZE + offset) orfs = [] for s in starts: for t in sorted(stops): if t > s: i = Interval(s, t) if not min_orf_size or len(i) >= min_orf_size: orfs.append(Interval(s, t)) break temp = {} for orf in orfs: if orf.end not in temp: temp[orf.end] = orf elif len(orf) > len(temp[orf.end]): temp[orf.end] = orf cds_orfs.extend(temp.values()) return cds_orfs
[docs]class DomainRegion(BioInterval): def __init__(self, start, end, seq=None, domain=None, name=None): BioInterval.__init__(self, domain, start, end, seq=seq, name=name) if seq and len(seq) != len(self): raise AssertionError('domain region seq must be of equal length', self, seq)
[docs]class Domain: """ """ def __init__(self, name, regions, translation=None, data=None): """ Args: name (str): the name of the domain i.e. PF00876 regions (:any:`list` of :any:`DomainRegion`): the amino acid ranges that are part of the domain transcript (Transcript): the 'parent' transcript this domain belongs to Raises: AttributeError: if the end of any region is less than the start Example: >>> Domain('DNA binding domain', [(1, 4), (10, 24)], transcript) """ self.reference_object = translation self.name = name self.regions = sorted(list(set(regions))) # remove duplicates self.data = dict() if data is not None: self.data.update(data) if len(regions) == 0: raise AttributeError('at least one region must be given') for r1, r2 in itertools.combinations(self.regions, 2): if Interval.overlaps(r1, r2): raise AttributeError('regions cannot overlap') for i in range(0, len(self.regions)): curr = self.regions[i] if not hasattr(curr, 'seq'): self.regions[i] = DomainRegion(curr[0], curr[1]) @property def translation(self): """:class:`~structural_variant.annotate.Translation`: the Translation this domain belongs to""" return self.reference_object
[docs] def key(self): """:class:`tuple`: a tuple representing the items expected to be unique. for hashing and comparing""" return tuple([self.name, self.translation])
[docs] def score_region_mapping(self, REFERENCE_GENOME=None): """ compares the sequence in each DomainRegion to the sequence collected for that domain region from the translation object Args: REFERENCE_GENOME (:class:`dict` of :class:`Bio.SeqRecord` by :class:`str`): dict of reference sequence by template/chr name Returns: tuple of int and int: tuple contains - int: the number of matching amino acids - int: the total number of amino acids """ if self.translation: aa = self.translation.get_AA_seq(REFERENCE_GENOME) total = 0 matches = 0 for region in self.regions: if not region.seq: raise NotSpecifiedError('insufficient sequence information') ref = aa[region.start - 1:region.end] for c1, c2 in zip(ref, region.seq): if c1 == c2: matches += 1 total += 1 return matches, total else: raise NotSpecifiedError('insufficient sequence information')
[docs] def get_seqs(self, REFERENCE_GENOME=None, ignore_cache=False): """ returns the amino acid sequences for each of the domain regions associated with this domain in the order of the regions (sorted by start) Args: REFERENCE_GENOME (:class:`dict` of :class:`Bio.SeqRecord` by :class:`str`): dict of reference sequence by template/chr name Returns: :class:`list` of :class:`str`: list of amino acid sequences for each DomainRegion Raises: AttributeError: if there is not enough sequence information given to determine this """ sequences = {} if not ignore_cache: for region in self.regions: if region.seq: sequences[region] = region.seq if any([r not in sequences for r in self.regions]): if self.translation: aa = self.translation.get_AA_seq(REFERENCE_GENOME) for region in self.regions: s = aa[region.start - 1:region.end] if region not in sequences: sequences[region] = s else: raise NotSpecifiedError('insufficient sequence information') return [sequences[r] for r in self.regions]
[docs] def align_seq(self, input_sequence, REFERENCE_GENOME=None): """ align each region to the input sequence starting with the last one. then take the subset of sequence that remains to align the second last and so on return a list of intervals for the alignment. If multiple alignments are found, then raise an error Args: input_sequence (str): the sequence to be aligned to REFERENCE_GENOME (:class:`dict` of :class:`Bio.SeqRecord` by :class:`str`): dict of reference sequence by template/chr name Returns: tuple: tuple contains - int: the number of matches - int: the total number of amino acids to be aligned - :any:`list` of :any:`DomainRegion`: the list of domain regions on the new input sequence Raises: AttributeError: if sequence information is not available UserWarning: if a valid alignment could not be found or no best alignment was found """ seq_list = self.get_seqs(REFERENCE_GENOME) total = sum([len(s) for s in seq_list]) results = {} for seq in set(seq_list): # align the current sequence to find the best matches scores = [] for p in range(0, len(input_sequence) - len(seq) + 1): score = 0 for i in range(0, len(seq)): if input_sequence[p + i].upper() == seq[i].upper(): score += 1 if score > 0: scores.append((Interval(p + 1, p + len(seq)), score)) if len(scores) == 0: raise UserWarning('could not align a given region') results.setdefault(seq, []).extend(scores) # take the best score for each region and see if they work in sequence best = [] for seq in seq_list: temp = max([s for i, s in results[seq]]) curr = [(i, s) for i, s in results[seq] if s == temp] best.append(curr) combinations = [] for combo in itertools.product(*best): total_score = sum([s for i, s in combo]) valid = True for i in range(1, len(combo)): if combo[i][0].start <= combo[i - 1][0].end: valid = False break if valid: combinations.append((total_score, [i for i, s in combo])) if len(combinations) == 0: raise UserWarning('could not map the sequences to the input') else: high = max([s for s, pl in combinations]) temp = [] for score, pl in combinations: if score == high: temp.append(pl) if len(temp) > 1: raise UserWarning('multiple mappings of equal score') else: regions = [] for pos, seq in zip(temp[0], seq_list): regions.append(DomainRegion(pos.start, pos.end, seq)) return high, total, regions
[docs]class Translation(BioInterval): def __init__(self, start, end, transcript=None, domains=None, seq=None, name=None): """ describes the splicing pattern and cds start and end with reference to a particular transcript Args: start (int): start of the coding sequence (cds) relative to the start of the first exon in the transcript end (int): end of the coding sequence (cds) relative to the start of the first exon in the transcript transcript (Transcript): the transcript this is a Translation of domains (:class:`list` of :any:`Domain`): a list of the domains on this translation sequence (str): the cds sequence """ domains = [] if domains is None else domains BioInterval.__init__(self, reference_object=transcript, name=name, start=start, end=end, seq=seq) self.domains = [d for d in domains] if start <= 0: raise AttributeError('start must be a positive integer') if transcript and end > len(transcript): raise AttributeError('translation cannot be outside of related transcript range', end, len(transcript)) for d in domains: d.reference_object = self @property def transcript(self): """:class:`~structural_variant.annotate.genomic.Transcript`: the spliced transcript this translation belongs to """ return self.reference_object
[docs] def convert_aa_to_cdna(self, pos): """ Args: pos (int): the amino acid position Returns: Interval: the cdna equivalent (with CODON_SIZE uncertainty) """ return Interval(self.start - 1 + (pos - 1) * 3 + 1, self.start - 1 + pos * 3)
[docs] def convert_cdna_to_aa(self, pos): """ Args: pos (int): the cdna position Returns: int: the protein/amino-acid position Raises: AttributeError: the cdna position is not translated """ if pos < self.start or pos > self.end: raise IndexError('position is out of bounds') pos = pos - self.start + 1 aa = pos // CODON_SIZE if pos % CODON_SIZE != 0: aa += 1 return aa
[docs] def convert_genomic_to_cds(self, pos): """ converts a genomic position to its cds (coding sequence) equivalent Args: pos (int): the genomic position Returns: int: the cds position (negative if before the initiation start site) """ cds, shift = self.convert_genomic_to_nearest_cds(pos) if shift != 0: raise IndexError('conversion failed. position is outside the exonic region') return cds
[docs] def convert_genomic_to_nearest_cds(self, pos): """ converts a genomic position to its cds equivalent or (if intronic) the nearest cds and shift Args: pos (int): the genomic position Returns: tuple of int and int: * *int* - the cds position * *int* - the intronic shift """ c, shift = self.transcript.convert_genomic_to_nearest_cdna(pos) if c >= self.start: c = c - self.start + 1 else: c -= self.start return c, shift
[docs] def convert_genomic_to_cds_notation(self, pos): """ converts a genomic position to its cds (coding sequence) equivalent using `hgvs <http://www.hgvs.org/mutnomen/recs-DNA.html>`_ cds notation Args: pos (int): the genomic position Returns: str: the cds position notation Example: >>> tl = Translation(...) # a position before the translation start >>> tl.convert_genomic_to_cds_notation(1010) '-50' # a position after the translation end >>> tl.convert_genomic_to_cds_notation(2031) '*72' # an intronic position >>> tl.convert_genomic_to_cds_notation(1542) '50+10' >>> tl.convert_genomic_to_cds_notation(1589) '51-14' """ c, shift = self.convert_genomic_to_nearest_cds(pos) sc = '' if shift > 0: sc = '+{}'.format(shift) elif shift < 0: sc = str(shift) if c > len(self): return '*{}{}'.format(c - len(self), sc) return '{}{}'.format(c, sc)
[docs] def get_cds_seq(self, REFERENCE_GENOME=None, ignore_cache=False): """ Args: REFERENCE_GENOME (:class:`dict` of :class:`Bio.SeqRecord` by :class:`str`): dict of reference sequence by template/chr name Returns: str: the cds sequence Raises: AttributeError: if the reference sequence has not been given and is not set """ if self.seq and not ignore_cache: return self.seq elif self.transcript and self.transcript.get_strand(): seq = self.transcript.get_seq(REFERENCE_GENOME, ignore_cache) return seq[self.start - 1:self.end] raise NotSpecifiedError('insufficient seq information')
[docs] def get_seq(self, REFERENCE_GENOME=None, ignore_cache=False): """ wrapper for the sequence method Args: REFERENCE_GENOME (:class:`dict` of :class:`Bio.SeqRecord` by :class:`str`): dict of reference sequence by template/chr name """ return self.get_cds_seq(REFERENCE_GENOME, ignore_cache)
[docs] def get_AA_seq(self, REFERENCE_GENOME=None, ignore_cache=False): """ Args: REFERENCE_GENOME (:class:`dict` of :class:`Bio.SeqRecord` by :class:`str`): dict of reference sequence by template/chr name Returns: str: the amino acid sequence Raises: AttributeError: if the reference sequence has not been given and is not set """ cds = self.get_cds_seq(REFERENCE_GENOME, ignore_cache) return translate(cds)
[docs] def key(self): """see :func:`structural_variant.annotate.base.BioInterval.key`""" return BioInterval.key(self), self.splicing_pattern