Source code for mavis.bam.read

from copy import copy
import re
import subprocess

import pysam

from . import cigar as cigar_tools
from .cigar import EVENT_STATES, QUERY_ALIGNED_STATES, REFERENCE_ALIGNED_STATES
from ..constants import CIGAR, DNA_ALPHABET, ORIENT, READ_PAIR_TYPE, STRAND, SVTYPE
from ..interval import Interval


[docs]class SamRead(pysam.AlignedSegment):
    """
    Subclass to extend the pysam.AlignedSegment class adding some utility methods and convenient representations

    Allows next_reference_name and reference_name to be set directly so that is does not depend on a bam header
    """

    def __init__(self, reference_name=None, next_reference_name=None, alignment_score=None):
        pysam.AlignedSegment.__init__(self)
        self._reference_name = reference_name
        self._next_reference_name = next_reference_name
        self.alignment_score = alignment_score

    def __repr__(self):
        return '{}({}:{}, {}, {})'.format(
            self.__class__.__name__, self.reference_name, self.reference_start,
            self.cigar, self.query_sequence
        )

[docs]    @classmethod
    def copy(cls, pysamread):
        return cls.copy_onto(pysamread)

[docs]    @classmethod
    def copy_onto(cls, pysamread, copyread=None):
        cp = cls() if copyread is None else copyread
        cp._reference_name = pysamread.reference_name
        cp.query_sequence = pysamread.query_sequence
        cp.reference_start = pysamread.reference_start
        cp.reference_id = pysamread.reference_id
        cp.cigar = pysamread.cigar[:]
        cp.query_name = pysamread.query_name
        cp.mapping_quality = pysamread.mapping_quality
        cp.set_tags(pysamread.get_tags())
        cp.flag = pysamread.flag
        if pysamread.is_paired:
            cp.next_reference_id = pysamread.next_reference_id
            cp.next_reference_start = pysamread.next_reference_start
            cp._next_reference_name = pysamread.next_reference_name
        try:
            cp.alignment_score = pysamread.alignment_score
        except AttributeError:
            pass
        return cp

    def __copy__(self):
        return self.__class__.copy(self)

    @property
    def reference_name(self):
        return self._reference_name

    @property
    def next_reference_name(self):
        return self._next_reference_name


[docs]def map_ref_range_to_query_range(read, ref_range):
    """
    Args:
        ref_range (Interval): 1-based inclusive
        read (pysam.AlignedSegment): read used for the mapping
    Returns:
        Interval: 1-based inclusive range
    """
    rpos = read.reference_start
    qpos = 0
    qstart = None
    qend = None
    for state, value in read.cigar:
        for i in range(0, value):
            if state in QUERY_ALIGNED_STATES:
                qpos += 1
            if state in REFERENCE_ALIGNED_STATES:
                rpos += 1
            if qstart is None and ref_range.start <= rpos:
                qstart = qpos
            if ref_range.end >= rpos:
                qend = qpos
    if qstart is None or qend is None:
        raise ValueError('reference range is not mapped by input read', ref_range, read.reference_start, read.cigar)
    return Interval(qstart, qend)


[docs]def get_samtools_version():
    """
    executes a subprocess to try and run samtools and parse the version number from the output

    Example:
        >>> get_samtools_version()
        (1, 2, 1)
    """
    proc = subprocess.getoutput(['samtools'])
    for line in proc.split('\n'):
        match = re.search(r'Version: (?P<major>\d+)(\.(?P<mid>\d+)(\.(?P<minor>\d+))?)?', line)
        if match:
            major = int(match.group('major'))
            mid = int(match.group('mid')) if match.group('mid') else 0
            minor = int(match.group('minor')) if match.group('minor') else 0
            return major, mid, minor
    raise ValueError('unable to parse samtools version number')


[docs]def samtools_v0_sort(input_bam, output_bam):
    prefix = re.sub('\.bam$', '', output_bam)
    return 'samtools sort {} {}'.format(input_bam, prefix)


[docs]def samtools_v1_sort(input_bam, output_bam):
    return 'samtools sort {} -o {}'.format(input_bam, output_bam)


[docs]def breakpoint_pos(read, orient=ORIENT.NS):
    """
    assumes the breakpoint is the position following softclipping on the side with more
    softclipping (unless and orientation has been specified)

    Args:
        read (:class:`~pysam.AlignedSegment`): the read object
        orient (ORIENT): the orientation

    Returns:
        int: the position of the breakpoint in the input read
    """
    typ, freq = read.cigar[0]
    end_typ, end_freq = read.cigar[-1]
    ORIENT.enforce(orient)

    if typ != CIGAR.S and end_typ != CIGAR.S:
        raise AttributeError('cannot compute breakpoint for a read without soft-clipping', read.cigar)

    if orient == ORIENT.NS:
        if (typ == CIGAR.S and end_typ == CIGAR.S and freq > end_freq) \
                or typ == CIGAR.S and end_typ != CIGAR.S:
            orient = ORIENT.RIGHT
            # soft clipped to the left
        else:
            # soft clipped to the right
            orient = ORIENT.LEFT

    if orient == ORIENT.RIGHT:
        if typ != CIGAR.S:
            raise AttributeError('soft clipping doesn\'t support input orientation for a breakpoint', repr(orient), read.cigar, read.get_tags())
        return read.reference_start
    else:
        if end_typ != CIGAR.S:
            raise AttributeError('soft clipping doesn\'t support input orientation for a breakpoint', orient, read.cigar, read.get_tags())
        return read.reference_end - 1


[docs]def calculate_alignment_score(read, consec_bonus=1):
    """
    calculates a score for comparing alignments

    Args:
        read (pysam.AlignedSegment): the input read

    Returns:
        float: the score
    """
    score = 0
    qlen = read.reference_end - read.reference_start
    max_score = qlen + (qlen - 1) * consec_bonus
    for c, v in read.cigar:
        if c == CIGAR.M:
            raise ValueError('cannot calculate the alignment score if mismatch v match has not been specified')
        elif c == CIGAR.EQ:
            score += v + (v - 1) * consec_bonus
    return score / max_score


[docs]def nsb_align(
        ref, seq,
        weight_of_score=0.5,
        min_overlap_percent=1,
        min_match=0,
        min_consecutive_match=1,
        scoring_function=calculate_alignment_score):
    """
    given some reference string and a smaller sequence string computes the best non-space-breaking alignment
    i.e. an alignment that does not allow for indels (straight-match). Positions in the aligned segments are
    given relative to the length of the reference sequence (1-based)

    Args:
        ref (str): the reference sequence
        seq (str): the sequence being aligned
        weight_of_score (float): when scoring alignments this determines the amount
            of weight to place on the cigar match. Should be a number between 0 and 1
        min_overlap_percent (float): the minimum amount of overlap of the input sequence to the reference
            should be a number between 0 and 1
        min_match (float): the minimum number of matches compared to total
        scoring_function (callable): any function that will take a read as input and return a float
          used in comparing alignments to choose the best alignment

    Returns:
        :class:`list` of :class:`~pysam.AlignedSegment`: list of aligned segments

    Note:
        using a higher min_match may improve performance as low quality alignments are rejected more quickly. However
        this may also result in no match being returned when there is no high quality match to be found.
    """
    ref = str(ref)
    if len(ref) < 1 or len(seq) < 1:
        raise AttributeError('cannot overlap on an empty sequence')
    if min_match < 0 or min_match > 1:
        raise AttributeError('min_match must be between 0 and 1')

    if min_overlap_percent <= 0 or min_overlap_percent > 1:
        raise AttributeError('percent must be greater than 0 and up to 1', min_overlap_percent)

    min_overlap = int(round(min_overlap_percent * len(seq), 0))

    # store to improve speed and space (don't need to store all alignments)
    best_score = (0, 0)
    results = []

    putative_start_positions = range(min_overlap - len(seq), len(ref) + len(seq) - min_overlap)
    if min_consecutive_match > 1:
        putative_start_positions = set()
        kmers_checked = {}
        for i in range(0, len(seq) - min_consecutive_match):
            current_kmer = seq[i:i + min_consecutive_match]
            if current_kmer in kmers_checked:
                putative_start_positions.update([p - i for p in kmers_checked[current_kmer]])
                continue
            rp = [m.start() for m in re.finditer(current_kmer, ref)]
            kmers_checked[current_kmer] = rp
            putative_start_positions.update([p - i for p in rp])
    for ref_start in putative_start_positions:
        score = 0
        cigar = []
        mismatches = 0
        length = len(seq)
        for i in range(0, len(seq)):
            if length == 0:
                break
            r = ref_start + i
            if r < 0 or r >= len(ref):  # outside the length of the reference seq
                cigar.append((CIGAR.S, 1))
                length -= 1
                continue
            if DNA_ALPHABET.match(ref[r], seq[i]):
                cigar.append((CIGAR.EQ, 1))
            else:
                cigar.append((CIGAR.X, 1))
                mismatches += 1
                if mismatches / length > 1 - min_match:
                    break
        if length == 0 or mismatches / length > 1 - min_match:
            continue

        cigar = cigar_tools.join(cigar)
        # end mismatches we set as soft-clipped
        if cigar[0][0] == CIGAR.X:
            cigar[0] = (CIGAR.S, cigar[0][1])
        if cigar[-1][0] == CIGAR.X:
            cigar[-1] = (CIGAR.S, cigar[-1][1])

        qstart = 0 if cigar[0][0] != CIGAR.S else cigar[0][1]

        a = pysam.AlignedSegment()
        a.query_sequence = str(seq)
        a.reference_start = ref_start + qstart
        a.cigar = cigar
        qlen = a.reference_end - a.reference_start
        score = (scoring_function(a), qlen)  # this way for equal identity matches we take the longer alignment
        if qlen < min_overlap:
            continue

        if score >= best_score:
            best_score = score
            results.append((a, score))

    filtered = [x for x, y in results if y == best_score]

    return filtered


[docs]def sequenced_strand(read, strand_determining_read=2):
    """
    determines the strand that was sequenced

    Args:
        read (:class:`~pysam.AlignedSegment`): the read being used to determine the strand
        strand_determining_read (int): which read in the read pair is the same as the sequenced strand

    Returns:
        STRAND: the strand that was sequenced

    Raises:
        ValueError: if strand_determining_read is not 1 or 2

    Warning:
        if the input pair is unstranded the information will not be representative of the
        strand sequenced since the assumed convention is not followed
    """
    if read.is_unmapped or not read.is_paired:
        raise ValueError('cannot determine strand if the read is unmapped or unpaired')
    strand = None
    if strand_determining_read == 1:
        if read.is_read1:
            strand = STRAND.NEG if read.is_reverse else STRAND.POS
        else:
            strand = STRAND.NEG if not read.is_reverse else STRAND.POS
    elif strand_determining_read == 2:
        if read.is_read2:
            strand = STRAND.NEG if read.is_reverse else STRAND.POS
        else:
            strand = STRAND.NEG if not read.is_reverse else STRAND.POS
    else:
        raise ValueError('unexpected value. Expected 1 or 2, found:', strand_determining_read)
    return strand


[docs]def read_pair_type(read):
    # check if the read pair is in the expected orientation
    """
    assumptions based on illumina pairs: only 4 possible combinations

    Args:
        read (:class:`~pysam.AlignedSegment`): the input read

    Returns:
        READ_PAIR_TYPE: the type of input read pair

    Raises:
        NotImplementedError: for any read that does not fall into the four expected configurations (see below)

    ::

        ++++> <---- is LR same-strand
        ++++> ++++> is LL opposite
        <---- <---- is RR opposite
        <---- ++++> is RL same-strand
    """
    reverse = False
    if read.reference_id > read.next_reference_id or \
            (read.reference_id == read.next_reference_id and read.reference_start > read.next_reference_start):
        reverse = True

    if not read.is_reverse and read.mate_is_reverse:  # LR
        return READ_PAIR_TYPE.RL if reverse else READ_PAIR_TYPE.LR
    elif not read.is_reverse and not read.mate_is_reverse:  # LL opp
        return READ_PAIR_TYPE.LL
    elif read.is_reverse and read.mate_is_reverse:  # RR opp
        return READ_PAIR_TYPE.RR
    elif read.is_reverse and not read.mate_is_reverse:  # RL
        return READ_PAIR_TYPE.LR if reverse else READ_PAIR_TYPE.RL
    else:
        raise NotImplementedError('unexpected orientation for pair')


[docs]def orientation_supports_type(read, event_type):
    """
    checks if the orientation is compatible with the type of event

    Args:
        read (:class:`~pysam.AlignedSegment`): a read from the pair
        event_type (SVTYPE): the type of event to check

    Returns:
        bool:
            - ``True`` - the read pair is in the correct orientation for this event type
            - ``False`` - the read is not in the correct orientation
    """
    if event_type == SVTYPE.DEL or event_type == SVTYPE.INS:
        if read_pair_type(read) != READ_PAIR_TYPE.LR:
            return False
    elif event_type == SVTYPE.TRANS:
        if read_pair_type(read) != READ_PAIR_TYPE.LR and \
                read_pair_type(read) != READ_PAIR_TYPE.RL:
            return False
    elif event_type == SVTYPE.ITRANS or event_type == SVTYPE.INV:
        if read_pair_type(read) != READ_PAIR_TYPE.LL and \
                read_pair_type(read) != READ_PAIR_TYPE.RR:
            return False
    elif event_type == SVTYPE.DUP:
        if read_pair_type(read) != READ_PAIR_TYPE.RL:
            return False
    else:
        raise ValueError('unexpected event type', event_type)
    return True


[docs]def convert_events_to_softclipping(read, orientation, max_event_size, min_anchor_size=None):
    """
    given an alignment, simplifies the alignment by grouping everything past the first anchor and including the
    first event considered too large and unaligning them turning them into softclipping

    """
    if min_anchor_size is None:
        min_anchor_size = max_event_size

    if orientation == ORIENT.LEFT:
        event_size = 0
        adjusted_cigar = []
        anchor = 0
        for state, count in read.cigar:
            if state == CIGAR.M:
                raise NotImplementedError('match v mismatch must be specified')
            elif anchor < min_anchor_size:
                if state == CIGAR.EQ:
                    anchor += count
            elif state in EVENT_STATES:
                event_size += count
                if event_size > max_event_size:
                    break
            else:
                event_size = 0
            adjusted_cigar.append((state, count))
        if event_size > max_event_size:
            while adjusted_cigar[-1][0] in EVENT_STATES:
                del adjusted_cigar[-1]
            aligned = sum([y for x, y in adjusted_cigar if x in QUERY_ALIGNED_STATES] + [0])
            sc = len(read.query_sequence) - aligned
            adjusted_cigar.append((CIGAR.S, sc))
            read = copy(read)
            read.cigar = adjusted_cigar
    elif orientation == ORIENT.RIGHT:
        # more complicated than left b/c need to also adjust the start position
        event_size = 0
        anchor = 0
        adjusted_cigar = []
        for state, count in read.cigar[::-1]:  # first event from the right
            if state == CIGAR.M:
                raise NotImplementedError('match v mismatch must be specified')
            elif anchor < min_anchor_size:
                if state == CIGAR.EQ:
                    anchor += count
            elif state in EVENT_STATES:
                event_size += count
                if event_size > max_event_size:
                    break
            else:
                event_size = 0
            adjusted_cigar.append((state, count))
        if event_size > max_event_size:
            while adjusted_cigar[-1][0] in EVENT_STATES:
                del adjusted_cigar[-1]
            originally_refaligned = sum([y for x, y in read.cigar if x in REFERENCE_ALIGNED_STATES] + [0])
            refaligned = sum([y for x, y in adjusted_cigar if x in REFERENCE_ALIGNED_STATES] + [0])
            aligned = sum([y for x, y in adjusted_cigar if x in QUERY_ALIGNED_STATES] + [0])
            sc = len(read.query_sequence) - aligned
            adjusted_cigar = [(CIGAR.S, sc)] + adjusted_cigar[::-1]
            read = copy(read)
            read.cigar = adjusted_cigar
            read.reference_start += originally_refaligned - refaligned
    else:
        raise ValueError('orientation must be specified', orientation)
    return read