Source code for mavis.align

"""
Should take in a sam file from a aligner like bwa aln or bwa mem and convert it into a

"""
import itertools
import pysam
import subprocess
import warnings
import os
from copy import copy
from .constants import COLUMNS, SVTYPE, CIGAR, reverse_complement, ORIENT
from .bam import cigar as cigar_tools
from .bam import read as read_tools
from .interval import Interval
from .util import devnull
from .breakpoint import BreakpointPair
from .error import InvalidRearrangement
from vocab import Vocab


SUPPORTED_ALIGNER = Vocab(BWA_MEM='bwa mem', BLAT='blat')


[docs]class SplitAlignment: def __init__(self, read1, read2=None): if read2 is not None and any([ read1.reference_name > read2.reference_name, read1.reference_name == read2.reference_name and read1.reference_start > read2.reference_start ]): read1, read2 = read2, read1 self.read1 = read1 self.read2 = read2 if self.read2 is not None: read2_seq = read2.query_sequence if not self.opposing_strands else reverse_complement(read2.query_sequence) if read1.query_sequence != read2_seq: raise ValueError('valid split alignments must share the same (or reverse complement) sequence') self.query_sequence = self.read1.query_sequence # TODO: if the reads 'share' query sequence overlap, remove this from the second alignment def __getitem__(self, index): if isinstance(index, int): if index == 0: return self.read1 elif index == 1: return self.read2 else: raise IndexError('index out of bounds', index) else: raise ValueError('index must be an integer') @property def opposing_strands(self): if self.read2 is None: return False else: return self.read1.is_reverse != self.read2.is_reverse
[docs] def query_coverage_read1(self): qc1 = query_coverage_interval(self.read1) return qc1
[docs] def query_coverage_read2(self): seqlen = len(self.read1.query_sequence) qc1 = self.query_coverage_read1() qc2 = qc1 if self.read2 is not None: qc2 = query_coverage_interval(self.read2) if self.read2.is_reverse != self.read1.is_reverse: qc2 = Interval(seqlen - qc2.end, seqlen - qc2.start) return qc2
[docs] def query_coverage(self): """ interval representing the total region of the input sequence that is covered by the combination of alignments """ if self.read2 is None: return self.query_coverage_read1() else: return self.query_coverage_read1() | self.query_coverage_read2()
[docs] def query_consumption(self): """ fraction of the query sequence which is aligned (everything not soft-clipped) in either alignment """ if self.read2 is None or Interval.overlaps(self.query_coverage_read1(), self.query_coverage_read2()): return len(self.query_coverage()) / len(self.query_sequence) else: return (len(self.query_coverage_read1()) + len(self.query_coverage_read2())) / len(self.query_sequence)
[docs] def query_overlap_extension(self): if self.read2 is not None: max_init_overlap = max(len(self.query_coverage_read1()), len(self.query_coverage_read2())) total_overlap = len(self.query_coverage()) - max_init_overlap return total_overlap else: return 0
[docs] def score(self, consec_bonus=10): def score_matches(cigar): return sum([v + (v - 1) * consec_bonus for s, v in cigar if s == CIGAR.EQ]) score = score_matches(self.read1.cigar) qlen = len(self.query_coverage()) if self.read2 is not None: score += score_matches(self.read2.cigar) if Interval.overlaps(self.query_coverage_read1(), self.query_coverage_read2()): qlen += len(self.query_coverage_read1() & self.query_coverage_read2()) return score / (qlen + (qlen - 1) * consec_bonus)
[docs] def select_supporting_alignments( bpp, alignments, min_query_consumption, min_extend_overlap, max_event_size, min_anchor_size, merge_inner_anchor, merge_outer_anchor ): """ give a breakpoint pair and a set of alignments for contigs associated with the given pair, alignments are paired (some events cannot be represented with a single bamfile alignment) and the most appropriate alignments supporting the breakpoint pair are selected and returned """ # now for each bpp assign an alignment to each contig putative_alignments = [] putative_event_types = set(bpp.putative_event_types()) if {SVTYPE.INS, SVTYPE.DUP} & putative_event_types: putative_event_types = putative_event_types | {SVTYPE.INS, SVTYPE.DUP} # for events on the same template and strand we expect to find a single contig alignment if not bpp.interchromosomal and not bpp.opposing_strands: for read in alignments: try: aln = SplitAlignment(read) except ValueError: continue # if it covers both breakpoints add to putative alignments ref_cover = Interval(read.reference_start, read.reference_end - 1) if all([ aln.read1.reference_name == bpp.break1.chr, Interval.overlaps(bpp.outer_window1, ref_cover), Interval.overlaps(bpp.outer_window2, ref_cover) ]): if aln.query_consumption() < min_query_consumption: continue # split the continuous alignment, assume ins/dup or indel ins = sum([v for c, v in read.cigar if c == CIGAR.I] + [0]) dln = sum([v for c, v in read.cigar if c in [CIGAR.D, CIGAR.N]] + [0]) aln.read1 = copy(read) aln.read1.cigar = cigar_tools.merge_internal_events(aln.read1.cigar, merge_inner_anchor, merge_outer_anchor) for event_type in putative_event_types: if event_type in {SVTYPE.INS, SVTYPE.DUP} and ins > 0 and ins > dln: putative_alignments.append(aln) elif event_type in {SVTYPE.DEL, SVTYPE.INV} and dln > 0 and dln > ins: putative_alignments.append(aln) # don't use reads in combined alignments if they have already been assigned in a single alignment combo_prohibited = [x for x, y in putative_alignments] for read1, read2 in itertools.combinations([x for x in alignments if x not in combo_prohibited], 2): # do they overlap both breakpoints if read2 is not None and any([ read1.reference_name > read2.reference_name, read1.reference_name == read2.reference_name and read1.reference_start > read2.reference_start ]): read1, read2 = read2, read1 if read1.reference_name != bpp.break1.chr or read2.reference_name != bpp.break2.chr: continue read1 = read_tools.convert_events_to_softclipping( read1, bpp.break1.orient, max_event_size=max_event_size, min_anchor_size=min_anchor_size) read2 = read_tools.convert_events_to_softclipping( read2, bpp.break2.orient, max_event_size=max_event_size, min_anchor_size=min_anchor_size) try: aln = SplitAlignment(read1, read2) except ValueError: continue # check that the coverage intervals overlap the event windows if any([ not Interval.overlaps((aln.read1.reference_start + 1, aln.read1.reference_end), bpp.outer_window1), not Interval.overlaps((aln.read2.reference_start + 1, aln.read2.reference_end), bpp.outer_window2) ]): continue # reads should have unique reference overlap if not bpp.interchromosomal and aln.read1.reference_end > aln.read2.reference_end: continue # check that the combination extends the amount of the initial query sequence we consume qc = len(aln.query_coverage()) if any([ len(aln.query_coverage_read1()) >= qc or len(aln.query_coverage_read2()) >= qc, aln.query_consumption() < min_query_consumption, aln.read2 is not None and aln.query_overlap_extension() < min_extend_overlap ]): continue try: call = BreakpointPair.call_breakpoint_pair(read1, read2) if not set(BreakpointPair.classify(call)) & putative_event_types: continue except (InvalidRearrangement, AssertionError): continue putative_alignments.append(aln) return putative_alignments
[docs] @staticmethod def breakpoint_contig_remapped_depth(breakpoint, contig, read): if breakpoint.chr != read.reference_name: raise AssertionError('breakpoint chromosome does not match read reference', breakpoint, read.reference_name) if len(breakpoint) > 1: raise NotImplementedError('only applies to exact breakpoint calls') # get the reference positions for each breakpoint interval from the breakpointpair # convert this to the query intervals using the alignment # for each query interval calculate the read coverage as a pileup over the distance s = read.reference_start + 1 t = read.reference_end if breakpoint.orient == ORIENT.LEFT: if breakpoint.start < s: return 0 t = min(breakpoint.start, t) elif breakpoint.orient == ORIENT.RIGHT: if breakpoint.start > t: return 0 s = max(s, breakpoint.start) qrange = read_tools.map_ref_range_to_query_range(read, Interval(s, t)) return contig.remap_depth(qrange)
[docs]def query_coverage_interval(read): """ Returns: :class:`~mavis.interval.Interval`: The portion of the original query sequence that is aligned by this read """ seq = read.query_sequence s = 0 t = len(seq) - 1 if read.cigar[0][0] == CIGAR.S: s += read.cigar[0][1] if read.cigar[-1][0] == CIGAR.S: t -= read.cigar[-1][1] return Interval(s, t)
[docs]def align_contigs( evidence, INPUT_BAM_CACHE, reference_genome, aligner, aligner_reference, aligner_output_file='aligner_out.temp', aligner_fa_input_file='aligner_in.fa', blat_min_identity=0.7, contig_aln_min_query_consumption=0.5, contig_aln_max_event_size=50, contig_aln_min_anchor_size=50, contig_aln_merge_inner_anchor=20, contig_aln_merge_outer_anchor=20, blat_limit_top_aln=25, is_protein=False, min_extend_overlap=10, clean_files=True, log=devnull, **kwargs): """ given a set of contigs, call the aligner from the command line and adds the results to the contigs associated with each Evidence object """ if is_protein: raise NotImplementedError('currently does not support aligning protein sequences') try: # write the input sequences to a fasta file query_id_mapping = {} sequences = set() count = 1 ev_by_seq = {} for e in evidence: for c in e.contigs: sequences.add(c.seq) ev_by_seq.setdefault(c.seq, []).append(e.data.get(COLUMNS.cluster_id, None)) with open(aligner_fa_input_file, 'w') as fh: for seq in sequences: n = 'seq{}'.format(count) log(n, [x for x in ev_by_seq[seq] if x is not None]) query_id_mapping[n] = seq fh.write('>' + n + '\n' + seq + '\n') count += 1 if len(sequences) == 0: return log('will use', aligner, 'to align', len(sequences), 'unique sequences', time_stamp=False) # call the aligner using subprocess if aligner == SUPPORTED_ALIGNER.BLAT: from .blat import process_blat_output # call the aligner using subprocess blat_min_identity *= 100 blat_options = kwargs.pop( 'blat_options', ["-stepSize=5", "-repMatch=2253", "-minScore=0", "-minIdentity={0}".format(blat_min_identity)]) # call the blat subprocess # will raise subprocess.CalledProcessError if non-zero exit status # parameters from https://genome.ucsc.edu/FAQ/FAQblat.html#blat4 log([SUPPORTED_ALIGNER.BLAT, aligner_reference, aligner_fa_input_file, aligner_output_file, '-out=pslx', '-noHead'] + blat_options) subprocess.check_output([ SUPPORTED_ALIGNER.BLAT, aligner_reference, aligner_fa_input_file, aligner_output_file, '-out=pslx', '-noHead'] + blat_options) reads_by_query = process_blat_output( INPUT_BAM_CACHE=INPUT_BAM_CACHE, query_id_mapping=query_id_mapping, reference_genome=reference_genome, aligner_output_file=aligner_output_file, blat_limit_top_aln=blat_limit_top_aln, is_protein=is_protein ) elif aligner == SUPPORTED_ALIGNER.BWA_MEM: command = '{} {} {} -Y'.format(aligner, aligner_reference, aligner_fa_input_file) log(command) # for bwa with open(aligner_output_file, 'w') as f: subprocess.call(command, stdout=f, shell=True) with pysam.AlignmentFile(aligner_output_file, 'r') as samfile: reads_by_query = {} for read in samfile.fetch(): read = read_tools.SamRead.copy(read) read.reference_id = INPUT_BAM_CACHE.reference_id(read.reference_name) if read.is_paired: read.next_reference_id = INPUT_BAM_CACHE.reference_id(read.next_reference_name) read.cigar = cigar_tools.recompute_cigar_mismatch(read, reference_genome[read.reference_name]) query_seq = query_id_mapping[read.query_name] reads_by_query.setdefault(query_seq, []).append(read) else: raise NotImplementedError('unsupported aligner', aligner) for e in evidence: for contig in e.contigs: aln = reads_by_query.get(contig.seq, []) putative_alignments = SplitAlignment.select_supporting_alignments( e, aln, min_extend_overlap=min_extend_overlap, min_query_consumption=contig_aln_min_query_consumption, min_anchor_size=contig_aln_min_anchor_size, max_event_size=contig_aln_max_event_size, merge_inner_anchor=contig_aln_merge_inner_anchor, merge_outer_anchor=contig_aln_merge_outer_anchor ) contig.alignments.extend(putative_alignments) finally: # clean up if clean_files: for f in [aligner_output_file, aligner_fa_input_file]: if os.path.exists(f): try: os.remove(f) except OSError as e: warnings.warn(repr(e))