Source code for mavis.validate.base

import itertools
from ..constants import *
from ..error import *
from .constants import DEFAULTS
from ..assemble import assemble
from ..interval import Interval
from ..breakpoint import BreakpointPair
from ..bam import read as read_tools
from ..bam import cigar as cigar_tools
from ..bam.cache import BamCache
from ..util import devnull, log


class Evidence(BreakpointPair):
    @property
    def outer_window1(self):
        """:class:`~mavis.interval.Interval`: the window where evidence will be gathered for the first
        breakpoint

        see :ref:`theory - calculating the evidence window <theory-calculating-the-evidence-window>`
        """
        if self.collect_from_outer_window():
            try:
                return self.outer_windows[0]
            except AttributeError:
                raise NotImplementedError('abstract property must be overridden')
        else:
            return self.inner_window1

    @property
    def outer_window2(self):
        """:class:`~mavis.interval.Interval`: the window where evidence will be gathered for the second
        breakpoint

        see :ref:`theory - calculating the evidence window <theory-calculating-the-evidence-window>`
        """
        if self.collect_from_outer_window():
            try:
                return self.outer_windows[1]
            except AttributeError:
                raise NotImplementedError('abstract property must be overridden')
        else:
            return self.inner_window2

    @property
    def compatible_window1(self):
        """:class:`~mavis.interval.Interval`: the window/region where reads of a compatible flanking
        pair are expected (the mate must fall in compatible_window2)

        see :ref:`theory - calculating the evidence window <theory-calculating-the-evidence-window>`
        """
        return self.compatible_windows[0]

    @property
    def compatible_window2(self):
        """:class:`~mavis.interval.Interval`: the window/region where reads of a compatible flanking
        pair are expected (the mate must fall in compatible_window1)

        see :ref:`theory - calculating the evidence window <theory-calculating-the-evidence-window>`
        """
        return self.compatible_windows[1]

    @property
    def inner_window1(self):
        """:class:`~mavis.interval.Interval`: the window where evidence will be gathered for the first
        breakpoint
        """
        try:
            return self.inner_windows[0]
        except AttributeError:
            raise NotImplementedError('abstract property must be overridden')

    @property
    def inner_window2(self):
        """:class:`~mavis.interval.Interval`: the window where evidence will be gathered for the second
        breakpoint
        """
        try:
            return self.inner_windows[1]
        except AttributeError:
            raise NotImplementedError('abstract property must be overridden')

    @property
    def min_expected_fragment_size(self):
        # cannot be negative
        return int(round(max([self.median_fragment_size - self.stdev_fragment_size * self.stdev_count_abnormal, 0]), 0))

    @property
    def max_expected_fragment_size(self):
        return int(round(self.median_fragment_size + self.stdev_fragment_size * self.stdev_count_abnormal, 0))

    def __init__(
            self, break1, break2, bam_cache, REFERENCE_GENOME,
            read_length, stdev_fragment_size, median_fragment_size,
            stranded=False, opposing_strands=None, untemplated_seq=None,
            data={}, classification=None, **kwargs):
        """
        Args:
            break1 (Breakpoint): the first breakpoint to collect evidence for
            break2 (Breakpoint): the second breakpoint to collect evidence for
            bam_cache (BamCache): the bam cache (and associated file) to collect evidence from
            REFERENCE_GENOME (:class:`dict` of :class:`Bio.SeqRecord` by :class:`str`): dict of
                reference sequence by template/chr name
            data (dict): a dictionary of data to associate with the evidence object
            classification (SVTYPE): the event type
            protocol (PROTOCOL): genome or transcriptome
        """
        cls = self.__class__
        # initialize the breakpoint pair
        self.bam_cache = bam_cache
        self.stranded = stranded and bam_cache.stranded
        BreakpointPair.__init__(
            self, break1, break2,
            stranded=stranded,
            opposing_strands=opposing_strands,
            untemplated_seq=untemplated_seq,
            **data
        )
        d = dict()
        for arg in kwargs:
            if arg not in DEFAULTS.__dict__:
                raise AttributeError('unrecognized attribute', arg)
        d.update(DEFAULTS.__dict__)
        kwargs.setdefault('assembly_min_contig_length', int(read_length * 1.25))
        kwargs.setdefault('assembly_max_kmer_size', int(read_length * 0.7))
        d.update(kwargs)  # input arguments should override the defaults
        for arg, val in d.items():
            setattr(self, arg, val)
        self.bam_cache = bam_cache
        self.classification = classification
        self.REFERENCE_GENOME = REFERENCE_GENOME
        self.read_length = read_length
        self.stdev_fragment_size = stdev_fragment_size
        self.median_fragment_size = median_fragment_size
        self.compatible_windows = None

        if self.classification is not None and self.classification not in BreakpointPair.classify(self):
            raise AttributeError(
                'breakpoint pair improper classification', BreakpointPair.classify(self), self.classification)

        if self.break1.orient == ORIENT.NS or self.break2.orient == ORIENT.NS:
            raise NotSpecifiedError(
                'input breakpoint pair must specify strand and orientation. '
                'Cannot be \'not specified\' for evidence gathering')

        self.split_reads = (set(), set())
        self.flanking_pairs = set()
        self.compatible_flanking_pairs = set()
        self.spanning_reads = set()
        # for each breakpoint stores the number of reads that were read from the associated
        # bamfile for the window surrounding the breakpoint
        self.counts = [0, 0]  # has to be a list to assign
        self.contigs = []
        self.half_mapped = (set(), set())

        try:
            self.compute_fragment_size(None, None)
        except NotImplementedError:
            raise NotImplementedError('abstract class cannot be initialized')
        except BaseException:
            pass

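    # A worked sketch of the expected-fragment-size bounds defined above
    # (hypothetical numbers): with median_fragment_size=420, stdev_fragment_size=40
    # and stdev_count_abnormal=3,
    #     min_expected_fragment_size = max(420 - 40 * 3, 0) = 300
    #     max_expected_fragment_size = 420 + 40 * 3 = 540
    # so a flanking pair must look longer than 540 to support a deletion call, or
    # shorter than 300 to support an insertion call.
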
    def collect_from_outer_window(self):
        """
        determines if evidence should be collected from the outer window (looking for flanking evidence)
        or should be limited to the inner window (split/spanning/contig only)

        Returns:
            bool: True or False
        """
        if self.interchromosomal:
            return True
        elif len(self.break1 | self.break2) >= self.outer_window_min_event_size:
            return True
        else:
            return False

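    # Illustration of the rule above, with hypothetical coordinates: for an
    # intrachromosomal pair with break1 at position 1000 and break2 at 1450,
    # len(break1 | break2) is the length of the union interval (451 with the
    # inclusive interval lengths used here), so flanking evidence is collected from
    # the outer windows only once that size reaches outer_window_min_event_size.
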
    def standardize_read(self, read):
        # recomputing to standardize b/c split reads can be used to call breakpoints exactly
        read.set_tag(PYSAM_READ_FLAGS.RECOMPUTED_CIGAR, 1, value_type='i')
        # recalculate the read cigar string to ensure M is replaced with = or X
        c = cigar_tools.recompute_cigar_mismatch(
            read,
            self.REFERENCE_GENOME[self.bam_cache.get_read_reference_name(read)].seq
        )
        prefix = 0
        try:
            c, prefix = cigar_tools.extend_softclipping(c, self.sc_extension_stop)
        except AttributeError:
            pass
        read.cigar = cigar_tools.join(c)
        read.reference_start = read.reference_start + prefix
        # makes sure all insertions are called as far 'right' as possible
        read.cigar = cigar_tools.hgvs_standardize_cigar(
            read, self.REFERENCE_GENOME[self.bam_cache.get_read_reference_name(read)].seq)
        return read

    def putative_event_types(self):
        """
        Returns:
            list of :class:`~mavis.constants.SVTYPE`: list of the possible classifications
        """
        if self.classification:
            return [self.classification]
        else:
            return BreakpointPair.classify(self)

    def compute_fragment_size(self, read, mate):
        """
        Args:
            read (pysam.AlignedSegment): the first read in the pair
            mate (pysam.AlignedSegment): the mate of the first read

        Returns:
            Interval: interval representing the range of possible fragment sizes for this read pair
        """
        raise NotImplementedError('abstract method must be overridden')

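    # A minimal sketch of what a concrete override might look like, assuming a
    # genome library where pysam's template_length already spans the whole fragment
    # (an illustration only, not necessarily the actual subclass implementation):
    #
    #     def compute_fragment_size(self, read, mate):
    #         return Interval(abs(read.template_length))
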
    def supporting_reads(self):
        """
        convenience method to return all flanking, split and spanning reads associated with an evidence object
        """
        result = set()
        for s in self.flanking_pairs:
            result.update(s)
        for s in self.split_reads:
            result.update(s)
        result.update(self.spanning_reads)
        return result

    def collect_spanning_read(self, read):
        """
        spanning read: a read covering BOTH breakpoints

        This is only applicable to small events. Do not need to look for soft clipped reads
        here since they will have been collected already

        Args:
            read (pysam.AlignedSegment): the putative spanning read

        Returns:
            bool:
                - True: the read was collected and stored in the current evidence object
                - False: the read was not collected
        """
        if self.interchromosomal:
            return False
        elif not Interval.overlaps(self.inner_window1, self.inner_window2):
            return False  # too far apart to call spanning reads

        if self.stranded:
            strand = read_tools.sequenced_strand(read, self.strand_determining_read)
            if strand != self.break1.strand and strand != self.break2.strand:
                return False

        combined = self.inner_window1 & self.inner_window2
        read_interval = Interval(read.reference_start + 1, read.reference_end)

        if Interval.overlaps(combined, read_interval):
            if not read.has_tag(PYSAM_READ_FLAGS.RECOMPUTED_CIGAR) or not read.get_tag(PYSAM_READ_FLAGS.RECOMPUTED_CIGAR):
                read = self.standardize_read(read)
            # in the correct position, now determine if it can support the event types
            for event_type in self.putative_event_types():
                if event_type in [SVTYPE.DUP, SVTYPE.INS]:
                    if CIGAR.I in [c[0] for c in read.cigar]:
                        self.spanning_reads.add(read)
                        return True
                elif event_type == SVTYPE.DEL:
                    if CIGAR.D in [c[0] for c in read.cigar]:
                        self.spanning_reads.add(read)
                        return True
                elif event_type == SVTYPE.INV:
                    if CIGAR.X in [c[0] for c in read.cigar]:
                        self.spanning_reads.add(read)
                        return True
        return False

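    # Worked example (hypothetical read): a read aligned with cigar 40=20D60=
    # carries a CIGAR.D operation, so if SVTYPE.DEL is among the putative event
    # types and the read overlaps the combined inner window, it is stored as a
    # spanning read; insertions/duplications would instead require a CIGAR.I
    # operation in the read's cigar.
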
    def collect_compatible_flanking_pair(self, read, mate, compatible_type):
        """
        checks if a given read meets the minimum quality criteria to be counted as evidence and
        stored as support for this event

        Args:
            read (pysam.AlignedSegment): the read to add
            mate (pysam.AlignedSegment): the mate
            compatible_type (SVTYPE): the type we are collecting for

        Returns:
            bool:
                - True: the pair was collected and stored in the current evidence object
                - False: the pair was not collected

        Raises:
            ValueError: if the input reads are not a valid pair

        see :ref:`theory - types of flanking evidence <theory-compatible-flanking-pairs>`
        """
        if read.is_unmapped or mate.is_unmapped or read.query_name != mate.query_name \
                or read.is_read1 == mate.is_read1:
            raise ValueError('input reads must be a mapped and mated pair')
        if not self.compatible_windows:
            raise ValueError('compatible windows were not given')
        if self.interchromosomal:
            raise NotImplementedError('interchromosomal events do not have compatible flanking pairs')
        # check that the references are right for the pair
        if read.reference_id != read.next_reference_id:
            return False
        elif read.mapping_quality < self.min_mapping_quality or mate.mapping_quality < self.min_mapping_quality:
            return False
        if not read.has_tag(PYSAM_READ_FLAGS.RECOMPUTED_CIGAR) or not read.get_tag(PYSAM_READ_FLAGS.RECOMPUTED_CIGAR):
            read = self.standardize_read(read)
        if not mate.has_tag(PYSAM_READ_FLAGS.RECOMPUTED_CIGAR) or not mate.get_tag(PYSAM_READ_FLAGS.RECOMPUTED_CIGAR):
            mate = self.standardize_read(mate)
        # order the read pair so that it is in the same order that we expect for the breakpoints
        if read.reference_start > mate.reference_start:
            read, mate = mate, read

        if self.bam_cache.get_read_reference_name(read) != self.break1.chr or \
                self.bam_cache.get_read_reference_name(mate) != self.break2.chr:
            return False

        # check if this read falls in the first breakpoint window
        iread = Interval(read.reference_start + 1, read.reference_end)
        imate = Interval(mate.reference_start + 1, mate.reference_end)

        if self.stranded:
            strand1 = read_tools.sequenced_strand(read, self.strand_determining_read)
            strand2 = read_tools.sequenced_strand(mate, self.strand_determining_read)
            if strand1 != self.break1.strand or strand2 != self.break2.strand:
                return False

        # check that the pair orientation is correct
        if not read_tools.orientation_supports_type(read, compatible_type):
            return False

        # check that the fragment size is reasonable
        fragment_size = self.compute_fragment_size(read, mate)

        if compatible_type == SVTYPE.DEL:
            if fragment_size.end <= self.max_expected_fragment_size:
                return False
        elif compatible_type == SVTYPE.INS:
            if fragment_size.start >= self.min_expected_fragment_size:
                return False

        # check that the positions of the reads and the strands make sense
        if Interval.overlaps(iread, self.compatible_windows[0]) and \
                Interval.overlaps(imate, self.compatible_windows[1]):
            self.compatible_flanking_pairs.add((read, mate))
            return True

        return False

    def collect_flanking_pair(self, read, mate):
        """
        checks if a given read meets the minimum quality criteria to be counted as evidence and
        stored as support for this event

        Args:
            read (pysam.AlignedSegment): the read to add
            mate (pysam.AlignedSegment): the mate

        Returns:
            bool:
                - True: the pair was collected and stored in the current evidence object
                - False: the pair was not collected

        Raises:
            ValueError: if the input reads are not a valid pair

        see :ref:`theory - types of flanking evidence <theory-types-of-flanking-evidence>`
        """
        if read.is_unmapped or mate.is_unmapped:
            raise ValueError('input reads must be a mapped and mated pair. One or both of the reads is unmapped')
        elif read.query_name != mate.query_name:
            raise ValueError('input reads must be a mapped and mated pair. The query names do not match')
        elif abs(read.template_length) != abs(mate.template_length):
            raise ValueError(
                'input reads must be a mapped and mated pair. The template lengths (abs value) do not match',
                abs(read.template_length), abs(mate.template_length))
        elif read.mapping_quality < self.min_mapping_quality or mate.mapping_quality < self.min_mapping_quality:
            return False  # does not meet the minimum mapping quality requirements
        # check that the references are right for the pair
        if self.interchromosomal:
            if read.reference_id == read.next_reference_id:
                return False
        elif read.reference_id != read.next_reference_id:
            return False

        if not read.has_tag(PYSAM_READ_FLAGS.RECOMPUTED_CIGAR) or not read.get_tag(PYSAM_READ_FLAGS.RECOMPUTED_CIGAR):
            read = self.standardize_read(read)
        if not mate.has_tag(PYSAM_READ_FLAGS.RECOMPUTED_CIGAR) or not mate.get_tag(PYSAM_READ_FLAGS.RECOMPUTED_CIGAR):
            mate = self.standardize_read(mate)
        # order the read pair so that it is in the same order that we expect for the breakpoints
        if read.reference_id != mate.reference_id:
            if self.bam_cache.get_read_reference_name(read) > self.bam_cache.get_read_reference_name(mate):
                read, mate = mate, read
        elif read.reference_start > mate.reference_start:
            read, mate = mate, read

        if self.bam_cache.get_read_reference_name(read) != self.break1.chr or \
                self.bam_cache.get_read_reference_name(mate) != self.break2.chr:
            return False
        if any([
            self.break1.orient == ORIENT.LEFT and read.is_reverse,
            self.break1.orient == ORIENT.RIGHT and not read.is_reverse,
            self.break2.orient == ORIENT.LEFT and mate.is_reverse,
            self.break2.orient == ORIENT.RIGHT and not mate.is_reverse
        ]):
            return False
        # check if this read falls in the first breakpoint window
        iread = Interval(read.reference_start + 1, read.reference_end)
        imate = Interval(mate.reference_start + 1, mate.reference_end)

        if self.stranded:
            strand1 = read_tools.sequenced_strand(read, self.strand_determining_read)
            strand2 = read_tools.sequenced_strand(mate, self.strand_determining_read)
            if strand1 != self.break1.strand or strand2 != self.break2.strand:
                return False

        for event_type in self.putative_event_types():
            # check that the pair orientation is correct
            if not read_tools.orientation_supports_type(read, event_type):
                continue

            # check that the fragment size is reasonable
            fragment_size = self.compute_fragment_size(read, mate)

            if event_type == SVTYPE.DEL:
                if fragment_size.end <= self.max_expected_fragment_size:
                    continue
            elif event_type == SVTYPE.INS:
                if fragment_size.start >= self.min_expected_fragment_size:
                    continue

            # check that the positions of the reads and the strands make sense
            if Interval.overlaps(iread, self.outer_window1) and Interval.overlaps(imate, self.outer_window2):
                self.flanking_pairs.add((read, mate))
                return True

        return False

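    # Sketch of the fragment-size filter above, with hypothetical numbers: if the
    # expected range is [300, 540], a pair whose fragment-size interval ends at 520
    # cannot support a deletion (520 <= max_expected_fragment_size, i.e. it is
    # consistent with the normal library distribution), whereas one ending at 650
    # can; the insertion test mirrors this against min_expected_fragment_size
    # (support requires fragment_size.start < 300).
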
    def collect_half_mapped(self, read, mate):
        """
        Args:
            read (pysam.AlignedSegment): the read to add
            mate (pysam.AlignedSegment): the unmapped mate

        Returns:
            bool:
                - True: the read was collected and stored in the current evidence object
                - False: the read was not collected

        Raises:
            AssertionError: if the mate is not unmapped
        """
        if not mate.is_unmapped:
            raise AssertionError('expected the mate to be unmapped')
        read_itvl = Interval(read.reference_start + 1, read.reference_end)
        added = False
        if self.break1.chr == read.reference_name and Interval.overlaps(self.outer_window1, read_itvl):
            self.half_mapped[0].add(mate)
            added = True
        if self.break2.chr == read.reference_name and Interval.overlaps(self.outer_window2, read_itvl):
            self.half_mapped[1].add(mate)
            added = True
        return added

    def collect_split_read(self, read, first_breakpoint):
        """
        adds a split read if it passes the criteria filters and raises a warning if it does not

        Args:
            read (pysam.AlignedSegment): the read to add
            first_breakpoint (bool): add to the first breakpoint (or second if false)

        Returns:
            bool:
                - True: the read was collected and stored in the current evidence object
                - False: the read was not collected

        Raises:
            NotSpecifiedError: if the breakpoint orientation is not specified
        """
        breakpoint = self.break1 if first_breakpoint else self.break2
        window = self.inner_window1 if first_breakpoint else self.inner_window2
        opposite_breakpoint = self.break2 if first_breakpoint else self.break1
        opposite_window = self.inner_window2 if first_breakpoint else self.inner_window1

        if read.cigar[0][0] != CIGAR.S and read.cigar[-1][0] != CIGAR.S:
            return False  # split read is not softclipped
        elif breakpoint.orient == ORIENT.LEFT and read.cigar[-1][0] != CIGAR.S:
            return False  # split read is not softclipped
        elif breakpoint.orient == ORIENT.RIGHT and read.cigar[0][0] != CIGAR.S:
            return False  # split read is not softclipped

        # the first breakpoint of a BreakpointPair is always the lower breakpoint
        # if this is being added to the second breakpoint then we'll need to check if the
        # read soft-clipping needs to be adjusted

        # check if the read falls within the evidence collection window
        # recall that pysam is 0-indexed and the window is 1-indexed
        if read.reference_start > window.end - 1 or read.reference_end < window.start \
                or self.bam_cache.get_read_reference_name(read) != breakpoint.chr:
            return False  # read not in breakpoint evidence window
        # can only enforce strand if both the breakpoint and the bam are stranded
        if self.stranded and self.bam_cache.stranded:
            strand = read_tools.sequenced_strand(read, strand_determining_read=self.strand_determining_read)
            if strand != breakpoint.strand:
                return False  # split read not on the appropriate strand
        unused = ''
        primary = ''
        clipped = ''
        if breakpoint.orient == ORIENT.LEFT:
            unused = read.query_sequence[:read.query_alignment_start]
            primary = read.query_sequence[read.query_alignment_start:read.query_alignment_end]
            # end is exclusive in pysam
            clipped = read.query_sequence[read.query_alignment_end:]
        elif breakpoint.orient == ORIENT.RIGHT:
            clipped = read.query_sequence[:read.query_alignment_start]
            primary = read.query_sequence[read.query_alignment_start:read.query_alignment_end]
            unused = read.query_sequence[read.query_alignment_end:]
        else:
            raise NotSpecifiedError(
                'cannot assign split reads to a breakpoint where the orientation has not been specified')
        if len(primary) + len(clipped) + len(unused) != len(read.query_sequence):
            raise AssertionError(
                'unused, primary, and clipped sequences should make up the original sequence',
                unused, primary, clipped, read.query_sequence, len(read.query_sequence))

        if len(primary) < self.min_anchor_exact or len(clipped) < self.min_softclipping:
            # split read does not meet the minimum anchor criteria
            return False
        if not read.has_tag(PYSAM_READ_FLAGS.RECOMPUTED_CIGAR) or not read.get_tag(PYSAM_READ_FLAGS.RECOMPUTED_CIGAR):
            read = self.standardize_read(read)
        # data quality filters
        if cigar_tools.alignment_matches(read.cigar) >= self.min_sample_size_to_apply_percentage \
                and cigar_tools.match_percent(read.cigar) < self.min_anchor_match:
            return False  # alignment is of too poor quality
        if cigar_tools.longest_exact_match(read.cigar) < self.min_anchor_exact \
                and cigar_tools.longest_fuzzy_match(read.cigar, self.fuzzy_mismatch_number) < self.min_anchor_fuzzy:
            return False  # alignment is of too poor quality
        else:
            self.split_reads[0 if first_breakpoint else 1].add(read)

        # try mapping the soft-clipped portion to the other breakpoint
        w = (opposite_window[0], opposite_window[1])
        opposite_breakpoint_ref = self.REFERENCE_GENOME[opposite_breakpoint.chr].seq[w[0] - 1: w[1]]

        putative_alignments = None
        if not self.opposing_strands:  # same strand
            sc_align = read_tools.nsb_align(
                opposite_breakpoint_ref, read.query_sequence,
                min_consecutive_match=self.min_anchor_exact)
            for a in sc_align:
                a.flag = read.flag
            putative_alignments = sc_align
        else:
            # should align opposite the current read
            revcomp_sc_align = reverse_complement(read.query_sequence)
            revcomp_sc_align = read_tools.nsb_align(
                opposite_breakpoint_ref, revcomp_sc_align,
                min_consecutive_match=self.min_anchor_exact)
            for a in revcomp_sc_align:
                a.flag = read.flag ^ PYSAM_READ_FLAGS.REVERSE  # XOR
            putative_alignments = revcomp_sc_align

        scores = []
        for a in putative_alignments:  # loop over the alignments
            a.flag = a.flag | PYSAM_READ_FLAGS.SUPPLEMENTARY
            # set this flag so we don't recompute the cigar multiple times
            a.set_tag(PYSAM_READ_FLAGS.RECOMPUTED_CIGAR, 1, value_type='i')
            # add information from the original read
            a.reference_start = w[0] - 1 + a.reference_start
            a.reference_id = self.bam_cache.reference_id(opposite_breakpoint.chr)
            a.query_name = read.query_name
            a.set_tag(PYSAM_READ_FLAGS.TARGETED_ALIGNMENT, 1, value_type='i')
            a.next_reference_start = read.next_reference_start
            a.next_reference_id = read.next_reference_id
            a.mapping_quality = NA_MAPPING_QUALITY
            try:
                cigar, offset = cigar_tools.extend_softclipping(a.cigar, self.sc_extension_stop)
                a.cigar = cigar
                a.reference_start = a.reference_start + offset
            except AttributeError:
                # if the matches section is too small you can't extend the softclipping
                pass
            s = cigar_tools.score(a.cigar)
            a.cigar = cigar_tools.join(a.cigar)
            if a.reference_id == a.next_reference_id:
                # https://samtools.github.io/hts-specs/SAMv1.pdf
                # unsigned observed template length equals the number of bases from the leftmost
                # mapped base to the rightmost mapped base
                tlen = abs(a.reference_start - a.next_reference_start) + 1
                if a.reference_start < a.next_reference_start:
                    a.template_length = tlen
                else:
                    a.template_length = -1 * tlen
            else:
                a.template_length = 0

            if cigar_tools.alignment_matches(a.cigar) >= self.min_sample_size_to_apply_percentage \
                    and cigar_tools.match_percent(a.cigar) < self.min_anchor_match:
                continue
            if cigar_tools.longest_exact_match(a.cigar) < self.min_anchor_exact \
                    and cigar_tools.longest_fuzzy_match(a.cigar, self.fuzzy_mismatch_number) < self.min_anchor_fuzzy:
                continue
            if self.max_sc_preceeding_anchor is not None:
                if opposite_breakpoint.orient == ORIENT.LEFT:
                    if a.cigar[0][0] == CIGAR.S and a.cigar[0][1] > self.max_sc_preceeding_anchor:
                        continue
                elif opposite_breakpoint.orient == ORIENT.RIGHT:
                    if a.cigar[-1][0] == CIGAR.S and a.cigar[-1][1] > self.max_sc_preceeding_anchor:
                        continue
            scores.append((s, cigar_tools.match_percent(a.cigar), a))

        scores = sorted(scores, key=lambda x: (x[0], x[1]), reverse=True) if len(scores) > 0 else []

        if len(scores) > 1:
            if scores[0][0] != scores[1][0] and scores[0][1] != scores[1][1]:
                # not multimapped; pick the highest scoring alignment
                clipped = scores[0][2]
                self.split_reads[1 if first_breakpoint else 0].add(clipped)  # add to the opposite breakpoint
        elif len(scores) == 1:
            clipped = scores[0][2]
            self.split_reads[1 if first_breakpoint else 0].add(clipped)  # add to the opposite breakpoint
        return True

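    # Partition example for the unused/primary/clipped split above (hypothetical
    # read): for a LEFT-oriented breakpoint and the 10 bp query 'AAACCCGGTT' with
    # query_alignment_start=2 and query_alignment_end=8, unused='AA',
    # primary='ACCCGG' and clipped='TT'; the clipped portion is what gets realigned
    # against the opposite breakpoint window.
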
    def decide_sequenced_strand(self, reads):
        """
        given a set of reads, determines the sequenced strand (if possible) and then returns the
        majority strand found

        Args:
            reads (set of :class:`pysam.AlignedSegment`): set of reads

        Returns:
            STRAND: the sequenced strand

        Raises:
            ValueError: input was an empty set or the ratio was not sufficient to decide on a strand
        """
        if len(reads) == 0:
            raise ValueError('cannot determine the strand of a set of reads if the set is empty')

        strand_calls = {STRAND.POS: 0, STRAND.NEG: 0}
        for read in reads:
            try:
                strand = read_tools.sequenced_strand(read, self.strand_determining_read)
                strand_calls[strand] = strand_calls.get(strand, 0) + 1
            except ValueError:
                pass
        if sum(strand_calls.values()) == 0:
            raise ValueError('Could not determine strand. Insufficient mapped reads')
        if strand_calls[STRAND.POS] == 0:
            return STRAND.NEG
        elif strand_calls[STRAND.NEG] == 0:
            return STRAND.POS
        else:
            ratio = strand_calls[STRAND.POS] / (strand_calls[STRAND.NEG] + strand_calls[STRAND.POS])
            neg_ratio = 1 - ratio
            if ratio >= self.assembly_strand_concordance:
                return STRAND.POS
            elif neg_ratio >= self.assembly_strand_concordance:
                return STRAND.NEG
            raise ValueError('Could not determine the strand. Equivocal POS/(NEG + POS) ratio', ratio, strand_calls)

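    # Example of the concordance test above, with hypothetical counts: 18 reads
    # calling POS and 2 calling NEG gives ratio = 18 / 20 = 0.9, so with
    # assembly_strand_concordance = 0.75 the call is STRAND.POS; a 12/8 split
    # (ratio 0.6, neg_ratio 0.4) satisfies neither threshold and raises the
    # equivocal-ratio ValueError.
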
    def assemble_contig(self, log=devnull):
        """
        uses the split reads and the partners of the half mapped reads to create a contig
        representing the sequence across the breakpoints

        if it is not strand specific then sequences are sorted alphanumerically and only the
        first of a pair is kept (paired by sequence)
        """
        strand_specific = self.stranded and self.bam_cache.stranded
        # gather reads for the putative assembly
        assembly_sequences = {}
        targeted = 0
        # add split reads
        for r in list(itertools.chain.from_iterable(self.split_reads)) + list(self.spanning_reads):
            # ignore targeted realignments
            if r.has_tag(PYSAM_READ_FLAGS.TARGETED_ALIGNMENT) and r.get_tag(PYSAM_READ_FLAGS.TARGETED_ALIGNMENT):
                targeted += 1
                continue
            assembly_sequences.setdefault(r.query_sequence, set()).add(r)
            rqs_comp = reverse_complement(r.query_sequence)
            assembly_sequences.setdefault(rqs_comp, set()).add(r)

        # add half-mapped reads, unless there are already enough split reads with
        # targeted alignments to exclude them
        if targeted < self.assembly_min_tgt_to_exclude_half_map and self.assembly_include_half_mapped_reads:
            for r in itertools.chain.from_iterable(self.half_mapped):
                assembly_sequences.setdefault(r.query_sequence, set()).add(r)
                rqs_comp = reverse_complement(r.query_sequence)
                assembly_sequences.setdefault(rqs_comp, set()).add(r)

        # add flanking reads
        if self.assembly_include_flanking_pairs:
            for read, mate in self.flanking_pairs:
                assembly_sequences.setdefault(read.query_sequence, set()).add(read)
                rqs_comp = reverse_complement(read.query_sequence)
                assembly_sequences.setdefault(rqs_comp, set()).add(read)
                assembly_sequences.setdefault(mate.query_sequence, set()).add(mate)
                rqs_comp = reverse_complement(mate.query_sequence)
                assembly_sequences.setdefault(rqs_comp, set()).add(mate)

        log('assembly size of {} sequences'.format(len(assembly_sequences) // 2))

        contigs = assemble(
            assembly_sequences,
            assembly_min_edge_weight=self.assembly_min_edge_weight,
            assembly_min_nc_edge_weight=self.assembly_min_nc_edge_weight,
            assembly_max_paths=self.assembly_max_paths,
            log=log,
            assembly_min_exact_match_to_remap=self.assembly_min_exact_match_to_remap,
            assembly_min_contig_length=self.assembly_min_contig_length,
            assembly_max_kmer_size=self.assembly_max_kmer_size,
            assembly_max_kmer_strict=self.assembly_max_kmer_strict
        )

        # add the input reads
        for ctg in contigs:
            for read_seq in ctg.remapped_sequences:
                ctg.input_reads.update(assembly_sequences[read_seq.query_sequence])

        # now determine the strand from the remapped reads if possible
        if self.stranded and self.bam_cache.stranded:  # strand specific
            for contig in contigs:
                build_strand = {STRAND.POS: 0, STRAND.NEG: 0}  # if neg will have to flip
                for read_seq in contig.remapped_sequences:
                    for read in assembly_sequences[read_seq.query_sequence]:
                        if read.is_unmapped:
                            continue
                        flip = False
                        if read.query_sequence != read_seq.query_sequence:
                            flip = not flip
                        try:
                            seq_strand = read_tools.sequenced_strand(read, self.strand_determining_read)
                            if seq_strand == STRAND.NEG:
                                flip = not flip
                            build_strand[STRAND.NEG if flip else STRAND.POS] += 1
                        except ValueError:
                            pass
                if sum(build_strand.values()) == 0:
                    continue
                elif build_strand[STRAND.POS] == 0:
                    flipped_build = True
                elif build_strand[STRAND.NEG] == 0:
                    flipped_build = False
                else:
                    ratio = build_strand[STRAND.POS] / (build_strand[STRAND.NEG] + build_strand[STRAND.POS])
                    neg_ratio = 1 - ratio
                    if ratio >= self.assembly_strand_concordance:
                        flipped_build = False
                    elif neg_ratio >= self.assembly_strand_concordance:
                        flipped_build = True
                    else:
                        continue
                if flipped_build:
                    contig.seq = reverse_complement(contig.seq)
                contig.strand_specific = True

        filtered_contigs = {}
        # sort so that the function is deterministic
        for c in sorted(contigs, key=lambda x: (x.remap_score() * -1, x.seq)):
            if c.remap_score() < self.assembly_min_remapped_seq:  # filter on evidence level
                continue
            if self.stranded and self.bam_cache.stranded:
                filtered_contigs.setdefault(c.seq, c)
            else:
                rseq = reverse_complement(c.seq)
                if c.seq not in filtered_contigs and rseq not in filtered_contigs:
                    filtered_contigs[c.seq] = c
        self.contigs = list(filtered_contigs.values())

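    # Note on the bookkeeping above: each read is keyed under both its sequence and
    # its reverse complement, which is why the log message halves
    # len(assembly_sequences), and why the non-stranded branch deduplicates contigs
    # by checking both c.seq and reverse_complement(c.seq) before keeping one.
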
    @classmethod
    def load_multiple(cls, evidence, log=devnull):
        """
        loads evidence from the bam file for multiple evidence objects at once

        Args:
            evidence (list of :class:`Evidence`): list of evidence objects to collect evidence for

        Warning:
            this is not exactly equivalent to multiple calls of load_evidence because it does not
            keep a running total of reads between bins and cannot adjust dynamically. Effectively
            this means load_evidence may potentially collect more evidence
        """
        log('gathering evidence for {} events'.format(len(evidence)))
        cache = evidence[0].bam_cache
        filter_secondary_alignments = evidence[0].filter_secondary_alignments
        min_mapping_quality = evidence[0].min_mapping_quality
        max_expected_fragment_size = evidence[0].max_expected_fragment_size
        min_expected_fragment_size = evidence[0].min_expected_fragment_size
        read_length = evidence[0].read_length
        fetch_min_bin_size = evidence[0].fetch_min_bin_size
        protocol = evidence[0].protocol

        # check the inputs make sense
        for ev in evidence:
            if cache is not ev.bam_cache:
                raise UserWarning('cannot load_multiple when bam_cache is not the same object')
            if filter_secondary_alignments != ev.filter_secondary_alignments:
                raise UserWarning('cannot load_multiple when filter_secondary_alignments differs')
            if min_mapping_quality != ev.min_mapping_quality:
                raise UserWarning('cannot load_multiple when the min_mapping_quality differs')
            if read_length != ev.read_length:
                raise UserWarning('cannot load_multiple when the read_length differs')
            max_expected_fragment_size = min([ev.max_expected_fragment_size, max_expected_fragment_size])
            min_expected_fragment_size = max([ev.min_expected_fragment_size, min_expected_fragment_size])
            fetch_min_bin_size = max([ev.fetch_min_bin_size, fetch_min_bin_size])
            if ev.protocol == PROTOCOL.TRANS:
                protocol = PROTOCOL.TRANS

        def cache_if_true(read):
            if read.is_unmapped or read.mate_is_unmapped:
                return True
            elif any([
                filter_secondary_alignments and read.is_secondary,
                read.mapping_quality < min_mapping_quality
            ]):
                return False
            elif set([x[0] for x in read.cigar]) & {CIGAR.D, CIGAR.N, CIGAR.S, CIGAR.I}:
                return True
            elif read.is_proper_pair and protocol != PROTOCOL.TRANS:
                min_frag_est = abs(read.reference_start - read.next_reference_start) - read_length
                max_frag_est = min_frag_est + 3 * read_length
                if min_frag_est >= min_expected_fragment_size and max_frag_est <= max_expected_fragment_size:
                    return False
            return True

        def filter_if_true(read):
            if not cache_if_true(read):
                return True
            elif read.is_unmapped:
                return True
            return False

        # create the intervals to investigate
        read_limits = {}  # chr => interval => cap
        for ev in evidence:
            # first breakpoint window
            bins = BamCache._generate_fetch_bins(
                ev.outer_window1[0], ev.outer_window1[1], ev.fetch_reads_bins, fetch_min_bin_size)
            read_cap = ev.fetch_reads_limit // len(bins)
            for b in bins:
                read_limits.setdefault(ev.break1.chr, {})
                read_limits[ev.break1.chr][b] = max([read_limits[ev.break1.chr].get(b, 0), read_cap])
            # second breakpoint window
            bins = BamCache._generate_fetch_bins(
                ev.outer_window2[0], ev.outer_window2[1], ev.fetch_reads_bins, fetch_min_bin_size)
            read_cap = ev.fetch_reads_limit // len(bins)
            for b in bins:
                read_limits.setdefault(ev.break2.chr, {})
                read_limits[ev.break2.chr][b] = max([read_limits[ev.break2.chr].get(b, 0), read_cap])
            # compatible windows
            if ev.compatible_windows and ev.collect_from_outer_window():
                # first compatible breakpoint window
                bins = BamCache._generate_fetch_bins(
                    ev.compatible_window1[0], ev.compatible_window1[1], ev.fetch_reads_bins, fetch_min_bin_size)
                read_cap = ev.fetch_reads_limit // len(bins)
                for b in bins:
                    read_limits.setdefault(ev.break1.chr, {})
                    read_limits[ev.break1.chr][b] = max([read_limits[ev.break1.chr].get(b, 0), read_cap])
                # second compatible breakpoint window
                bins = BamCache._generate_fetch_bins(
                    ev.compatible_window2[0], ev.compatible_window2[1], ev.fetch_reads_bins, fetch_min_bin_size)
                read_cap = ev.fetch_reads_limit // len(bins)
                for b in bins:
                    read_limits.setdefault(ev.break2.chr, {})
                    read_limits[ev.break2.chr][b] = max([read_limits[ev.break2.chr].get(b, 0), read_cap])

        # get the reads for any given interval, map over each applicable evidence
        fetch_regions = []
        for chr in read_limits:
            weighted_intervals = Interval.split_overlap(*read_limits[chr], weight_mapping=read_limits[chr])
            # merge any intervals that are smaller than the minimum bin size
            temp = {}
            intervals = sorted(weighted_intervals)
            i = 0
            while i < len(intervals):
                curr = intervals[i]
                if len(curr) < fetch_min_bin_size:
                    # merge with the following interval(s)
                    merge_itvl = curr
                    merge_weight = weighted_intervals[curr]
                    i += 1
                    while len(merge_itvl) < fetch_min_bin_size and i < len(intervals):
                        merge_itvl = intervals[i] | merge_itvl
                        merge_weight += weighted_intervals[intervals[i]]
                        i += 1
                    temp[merge_itvl] = merge_weight
                else:
                    temp[curr] = weighted_intervals[curr]
                    i += 1
            weighted_intervals = temp

            for bin, limit in weighted_intervals.items():
                fetch_regions.append((chr, bin.start, bin.end, limit))

        putative_half_maps = set()
        putative_flanking = set()
        fetch_regions = sorted(fetch_regions, reverse=True)
        for i in range(0, len(fetch_regions)):
            chr, start, end, limit = fetch_regions[i]
            log('({} of {}) loading the bin {}:{}-{} (limit {}; size {})'.format(
                i + 1, len(fetch_regions), chr, start, end, limit, end - start + 1))
            for read in cache.fetch(
                    chr, start, end,
                    limit=limit,
                    cache_if=cache_if_true,
                    filter_if=filter_if_true,
                    stop_on_cached_read=True):
                read_itvl = Interval(read.reference_start + 1, read.reference_end)
                if read.mate_is_unmapped:
                    putative_half_maps.add(read)
                    continue
                # breakpoint region 1
                for ev in [e for e in evidence if e.break1.chr == chr]:
                    compatible_type = SVTYPE.INS if SVTYPE.DUP in ev.putative_event_types() else SVTYPE.DUP
                    if not Interval.overlaps(read_itvl, ev.outer_window1):
                        continue
                    ev.counts[0] += 1
                    if not ev.collect_split_read(read, first_breakpoint=True):
                        ev.collect_spanning_read(read)
                    try:
                        mates = cache.get_mate(read, allow_file_access=False)
                        for mate in mates:
                            ev.collect_flanking_pair(read, mate)
                            if ev.compatible_windows:
                                ev.collect_compatible_flanking_pair(read, mate, compatible_type)
                    except KeyError:
                        putative_flanking.add(read)
                # breakpoint region 2
                for ev in [e for e in evidence if e.break2.chr == chr]:
                    compatible_type = SVTYPE.INS if SVTYPE.DUP in ev.putative_event_types() else SVTYPE.DUP
                    if not Interval.overlaps(read_itvl, ev.outer_window2):
                        continue
                    ev.counts[1] += 1
                    ev.collect_split_read(read, first_breakpoint=False)
                    try:
                        mates = cache.get_mate(read, allow_file_access=False)
                        for mate in mates:
                            ev.collect_flanking_pair(read, mate)
                            if ev.compatible_windows:
                                ev.collect_compatible_flanking_pair(read, mate, compatible_type)
                    except KeyError:
                        putative_flanking.add(read)

        # go over the flanking/half-mapped reads whose mates were not previously cached
        # if we did not encounter the mates above, they are outside the target region and are ignored
        log('gathering cached flanking pairs', len(putative_flanking))
        for read in putative_flanking:
            try:
                mates = cache.get_mate(read, allow_file_access=False)
                for mate in mates:
                    for ev in evidence:
                        ev.collect_flanking_pair(read, mate)
                        if ev.compatible_windows:
                            compatible_type = SVTYPE.INS if SVTYPE.DUP in ev.putative_event_types() else SVTYPE.DUP
                            ev.collect_compatible_flanking_pair(read, mate, compatible_type)
            except KeyError:
                pass

        log('gathering cached half-mapped reads', len(putative_half_maps))
        mates_found = 0
        for read in putative_half_maps:
            try:
                mates = cache.get_mate(read, allow_file_access=False)
                mates_found += 1
                for mate in mates:
                    for ev in evidence:
                        ev.collect_half_mapped(read, mate)
            except KeyError:
                pass
        log(mates_found, 'half-mapped mates found')

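    # Bin-cap arithmetic used above, with hypothetical settings: a fetch_reads_limit
    # of 10000 spread over 5 bins gives a read_cap of 10000 // 5 = 2000 per bin;
    # where windows from different evidence objects share a bin, the larger cap wins
    # (max of the existing and the new cap).
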
    def load_evidence(self, log=devnull):
        """
        open the associated bam file and read and store the evidence

        does some preliminary read-quality filtering
        """
        max_dist = max(
            len(Interval.union(self.break1, self.break2)),
            len(self.untemplated_seq if self.untemplated_seq else '')
        )

        def cache_if_true(read):
            if read.is_unmapped or read.mate_is_unmapped:
                return True
            elif any([
                self.filter_secondary_alignments and read.is_secondary,
                read.mapping_quality < self.min_mapping_quality
            ]):
                return False
            elif set([x[0] for x in read.cigar]) & {CIGAR.D, CIGAR.N, CIGAR.S, CIGAR.I}:
                return True
            elif read.is_proper_pair and self.protocol != PROTOCOL.TRANS:
                min_frag_est = abs(read.reference_start - read.next_reference_start) - self.read_length
                max_frag_est = min_frag_est + 3 * self.read_length
                if min_frag_est >= self.min_expected_fragment_size and max_frag_est <= self.max_expected_fragment_size:
                    return False
            return True

        def filter_if_true(read):
            if not cache_if_true(read):
                return True
            elif read.is_unmapped:
                return True
            return False

        flanking_pairs = set()  # collect putative pairs
        half_mapped_partners1 = set()
        half_mapped_partners2 = set()

        for read in self.bam_cache.fetch_from_bins(
                '{0}'.format(self.break1.chr),
                self.outer_window1[0],
                self.outer_window1[1],
                read_limit=self.fetch_reads_limit,
                sample_bins=self.fetch_reads_bins,
                min_bin_size=self.fetch_min_bin_size,
                cache=True,
                cache_if=cache_if_true,
                filter_if=filter_if_true):
            if read.mapping_quality < self.min_mapping_quality:
                continue
            self.counts[0] += 1
            if read.is_unmapped:
                continue
            if not self.collect_split_read(read, True):
                self.collect_spanning_read(read)
            if read.mate_is_unmapped:
                half_mapped_partners1.add(read)
            elif any([read_tools.orientation_supports_type(read, et) for et in self.putative_event_types()]) and \
                    (read.reference_id != read.next_reference_id) == self.interchromosomal:
                flanking_pairs.add(read)

        for read in self.bam_cache.fetch_from_bins(
                '{0}'.format(self.break2.chr),
                self.outer_window2[0],
                self.outer_window2[1],
                read_limit=self.fetch_reads_limit,
                sample_bins=self.fetch_reads_bins,
                min_bin_size=self.fetch_min_bin_size,
                cache=True,
                cache_if=cache_if_true,
                filter_if=filter_if_true):
            if read.mapping_quality < self.min_mapping_quality:
                continue
            self.counts[1] += 1
            if read.is_unmapped:
                continue
            if not self.collect_split_read(read, False):
                self.collect_spanning_read(read)
            if read.mate_is_unmapped:
                half_mapped_partners2.add(read)
            elif any([read_tools.orientation_supports_type(read, et) for et in self.putative_event_types()]) and \
                    (read.reference_id != read.next_reference_id) == self.interchromosomal:
                flanking_pairs.add(read)

        for fl in sorted(flanking_pairs, key=lambda x: (x.query_name, x.reference_start)):
            # try and get the mate from the cache
            try:
                mates = self.bam_cache.get_mate(fl, allow_file_access=False)
                for mate in mates:
                    self.collect_flanking_pair(fl, mate)
            except KeyError:
                pass

        if self.compatible_windows:
            compatible_type = SVTYPE.DUP
            if SVTYPE.DUP in self.putative_event_types():
                compatible_type = SVTYPE.INS

            compt_flanking = set()
            for read in self.bam_cache.fetch_from_bins(
                    '{0}'.format(self.break1.chr),
                    self.compatible_windows[0][0],
                    self.compatible_windows[0][1],
                    read_limit=self.fetch_reads_limit,
                    sample_bins=self.fetch_reads_bins,
                    min_bin_size=self.fetch_min_bin_size,
                    cache=True,
                    cache_if=cache_if_true,
                    filter_if=filter_if_true):
                if read_tools.orientation_supports_type(read, compatible_type):
                    compt_flanking.add(read)

            for read in self.bam_cache.fetch_from_bins(
                    '{0}'.format(self.break2.chr),
                    self.compatible_windows[1][0],
                    self.compatible_windows[1][1],
                    read_limit=self.fetch_reads_limit,
                    sample_bins=self.fetch_reads_bins,
                    min_bin_size=self.fetch_min_bin_size,
                    cache=True,
                    cache_if=cache_if_true,
                    filter_if=filter_if_true):
                if read_tools.orientation_supports_type(read, compatible_type):
                    compt_flanking.add(read)

            for fl in compt_flanking:
                # try and get the mate from the cache
                try:
                    mates = self.bam_cache.get_mate(fl, allow_file_access=False)
                    for mate in mates:
                        try:
                            self.collect_compatible_flanking_pair(fl, mate, compatible_type)
                        except ValueError:
                            pass
                except KeyError:
                    pass

        # now collect the half mapped reads
        log(
            'collected', len(half_mapped_partners1 | half_mapped_partners2),
            'putative half mapped reads', time_stamp=False)
        mates_found = 0
        for read in half_mapped_partners1 | half_mapped_partners2:
            # try and get the mate from the cache
            try:
                mates = self.bam_cache.get_mate(read, allow_file_access=False)
                mates_found += 1
                for mate in mates:
                    self.collect_half_mapped(read, mate)
            except KeyError:
                pass
        log(mates_found, 'half-mapped mates found')

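    # The min_frag_est/max_frag_est shortcut in cache_if_true above is a deliberately
    # loose bracket (this reading is an interpretation): subtracting one read length
    # from the start-position difference underestimates the true fragment, and adding
    # three read lengths overestimates it, so only proper pairs that sit comfortably
    # inside the expected fragment-size range are skipped without computing the exact
    # fragment size.
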
    def copy(self):
        raise NotImplementedError('not appropriate for copy of evidence')

    def flatten(self):
        row = BreakpointPair.flatten(self)
        row.update({
            COLUMNS.raw_flanking_pairs: len(self.flanking_pairs),
            COLUMNS.raw_spanning_reads: len(self.spanning_reads),
            COLUMNS.raw_break1_split_reads: len(self.split_reads[0]),
            COLUMNS.raw_break2_split_reads: len(self.split_reads[1]),
            COLUMNS.raw_break1_half_mapped_reads: len(self.half_mapped[0]),
            COLUMNS.raw_break2_half_mapped_reads: len(self.half_mapped[1]),
            COLUMNS.protocol: self.protocol,
            COLUMNS.event_type: ';'.join(sorted(self.putative_event_types())),
            COLUMNS.contigs_assembled: len(self.contigs),
            COLUMNS.break1_ewindow: '{}-{}'.format(*self.outer_window1),
            COLUMNS.break2_ewindow: '{}-{}'.format(*self.outer_window2),
            COLUMNS.break1_ewindow_count: self.counts[0],
            COLUMNS.break2_ewindow_count: self.counts[1],
            COLUMNS.contigs_aligned: sum([len(c.alignments) for c in self.contigs])
        })
        return row

    def get_bed_repesentation(self):
        bed = []
        name = self.data.get(COLUMNS.cluster_id, None)
        bed.append((self.break1.chr, self.outer_window1[0] - 1, self.outer_window1[1], name))
        bed.append((self.break1.chr, self.inner_window1[0] - 1, self.inner_window1[1], name))
        bed.append((self.break2.chr, self.outer_window2[0] - 1, self.outer_window2[1], name))
        bed.append((self.break2.chr, self.inner_window2[0] - 1, self.inner_window2[1], name))
        return bed

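    # The -1 on each window start above converts the 1-based inclusive Interval
    # coordinates used internally to the 0-based half-open starts expected by the
    # BED format; the inclusive 1-based end is numerically equal to the exclusive
    # 0-based end, so it is used unchanged.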