from .base import Evidence
from ..interval import Interval
from ..constants import ORIENT, PROTOCOL, SVTYPE
from ..annotate.variant import overlapping_transcripts
from ..breakpoint import Breakpoint
[docs]class GenomeEvidence(Evidence):
def __init__(self, *pos, **kwargs):
Evidence.__init__(self, *pos, **kwargs)
self.protocol = PROTOCOL.GENOME
self.outer_windows = (
GenomeEvidence._generate_window(
self.break1,
max_expected_fragment_size=self.max_expected_fragment_size,
call_error=self.call_error,
read_length=self.read_length
),
GenomeEvidence._generate_window(
self.break2,
max_expected_fragment_size=self.max_expected_fragment_size,
call_error=self.call_error,
read_length=self.read_length
)
)
self.inner_windows = (
Interval(
max([self.break1.start - self.call_error - self.read_length + 1, 1]),
self.break1.end + self.call_error + self.read_length - 1
),
Interval(
max([self.break2.start - self.call_error - self.read_length + 1, 1]),
self.break2.end + self.call_error + self.read_length - 1
)
)
if SVTYPE.INS in self.putative_event_types():
comb = len(self.break1 | self.break2)
if comb > len(self.break1) and comb > len(self.break2):
compt_break1 = Breakpoint(
self.break1.chr, self.break1.start, self.break1.end, orient=ORIENT.RIGHT, strand=self.break1.strand)
compt_break2 = Breakpoint(
self.break2.chr, self.break2.start, self.break2.end, orient=ORIENT.LEFT, strand=self.break2.strand)
self.compatible_windows = (
GenomeEvidence._generate_window(
compt_break1,
max_expected_fragment_size=self.max_expected_fragment_size,
call_error=self.call_error,
read_length=self.read_length
),
GenomeEvidence._generate_window(
compt_break2,
max_expected_fragment_size=self.max_expected_fragment_size,
call_error=self.call_error,
read_length=self.read_length
)
)
elif SVTYPE.DUP in self.putative_event_types():
compt_break1 = Breakpoint(
self.break1.chr, self.break1.start, self.break1.end, orient=ORIENT.LEFT, strand=self.break1.strand)
compt_break2 = Breakpoint(
self.break2.chr, self.break2.start, self.break2.end, orient=ORIENT.RIGHT, strand=self.break2.strand)
self.compatible_windows = (
GenomeEvidence._generate_window(
compt_break1,
max_expected_fragment_size=self.max_expected_fragment_size,
call_error=self.call_error,
read_length=self.read_length
),
GenomeEvidence._generate_window(
compt_break2,
max_expected_fragment_size=self.max_expected_fragment_size,
call_error=self.call_error,
read_length=self.read_length
)
)
@staticmethod
def _generate_window(breakpoint, max_expected_fragment_size, call_error, read_length):
"""
given some input breakpoint uses the current evidence setting to determine an
appropriate window/range of where one should search for supporting reads
Args:
breakpoint (Breakpoint): the breakpoint we are generating the evidence window for
read_length (int): the read length
call_error (int):
adds a buffer to the calculations if confidence in the breakpoint calls is low can increase this
Returns:
Interval: the range where reads should be read from the bam looking for evidence for this event
"""
start = breakpoint.start - max_expected_fragment_size - call_error + 1
end = breakpoint.end + max_expected_fragment_size + call_error - 1
if breakpoint.orient == ORIENT.LEFT:
end = breakpoint.end + call_error + read_length - 1
elif breakpoint.orient == ORIENT.RIGHT:
start = breakpoint.start - call_error - read_length + 1
return Interval(max([1, start]), max([end, 1]))
[docs] def compute_fragment_size(self, read, mate=None):
return Interval(abs(read.template_length))
[docs]class TranscriptomeEvidence(Evidence):
def __init__(self, ANNOTATIONS, *pos, **kwargs):
Evidence.__init__(self, *pos, **kwargs)
self.protocol = PROTOCOL.TRANS
# get the list of overlapping transcripts
self.overlapping_transcripts = (
overlapping_transcripts(ANNOTATIONS, self.break1),
overlapping_transcripts(ANNOTATIONS, self.break2)
)
self.outer_windows = (
TranscriptomeEvidence._generate_window(
self.break1,
transcripts=self.overlapping_transcripts[0],
read_length=self.read_length,
call_error=self.call_error,
max_expected_fragment_size=self.max_expected_fragment_size
),
TranscriptomeEvidence._generate_window(
self.break2,
transcripts=self.overlapping_transcripts[1],
read_length=self.read_length,
call_error=self.call_error,
max_expected_fragment_size=self.max_expected_fragment_size
)
)
tgt = self.call_error + self.read_length - 1
temp = TranscriptomeEvidence.traverse_exonic_distance(
self.break1.start, tgt, ORIENT.LEFT, self.overlapping_transcripts[0])
w1 = TranscriptomeEvidence.traverse_exonic_distance(
self.break1.end, tgt, ORIENT.RIGHT, self.overlapping_transcripts[0])
w1 = w1 | temp
temp = TranscriptomeEvidence.traverse_exonic_distance(
self.break2.start, tgt, ORIENT.LEFT, self.overlapping_transcripts[1])
w2 = TranscriptomeEvidence.traverse_exonic_distance(
self.break2.end, tgt, ORIENT.RIGHT, self.overlapping_transcripts[1])
w2 = w2 | temp
self.inner_windows = (w1, w2)
if SVTYPE.INS in self.putative_event_types():
comb = len(self.break1 | self.break2)
if comb > len(self.break1) and comb > len(self.break2):
compt_break1 = Breakpoint(
self.break1.chr, self.break1.start, self.break1.end, orient=ORIENT.RIGHT, strand=self.break1.strand)
compt_break2 = Breakpoint(
self.break2.chr, self.break2.start, self.break2.end, orient=ORIENT.LEFT, strand=self.break2.strand)
self.compatible_windows = (
TranscriptomeEvidence._generate_window(
compt_break1,
max_expected_fragment_size=self.max_expected_fragment_size,
call_error=self.call_error,
read_length=self.read_length,
transcripts=self.overlapping_transcripts[0]
),
TranscriptomeEvidence._generate_window(
compt_break2,
max_expected_fragment_size=self.max_expected_fragment_size,
call_error=self.call_error,
read_length=self.read_length,
transcripts=self.overlapping_transcripts[1]
)
)
elif SVTYPE.DUP in self.putative_event_types():
compt_break1 = Breakpoint(
self.break1.chr, self.break1.start, self.break1.end, orient=ORIENT.LEFT, strand=self.break1.strand)
compt_break2 = Breakpoint(
self.break2.chr, self.break2.start, self.break2.end, orient=ORIENT.RIGHT, strand=self.break2.strand)
self.compatible_windows = (
TranscriptomeEvidence._generate_window(
compt_break1,
max_expected_fragment_size=self.max_expected_fragment_size,
call_error=self.call_error,
read_length=self.read_length,
transcripts=self.overlapping_transcripts[0]
),
TranscriptomeEvidence._generate_window(
compt_break2,
max_expected_fragment_size=self.max_expected_fragment_size,
call_error=self.call_error,
read_length=self.read_length,
transcripts=self.overlapping_transcripts[1]
)
)
[docs] @staticmethod
def traverse_exonic_distance(start, distance, direction, transcripts):
"""
given some genomic position and a distance. Uses the input transcripts to
compute all possible genomic end positions at that distance if intronic
positions are ignored
Args:
start (int): the genomic start position
distance (int): the amount of exonic/intergenic units to traverse
direction (ORIENT): the direction wrt to the positive/forward reference strand to traverse
transcripts (:class:`list` of :class:`usTranscript`): list of transcripts to use
"""
is_left = True if direction == ORIENT.LEFT else False
input_distance = distance
positions = [start - distance + 1 if is_left else start + distance - 1]
for ust in transcripts:
distance = input_distance # reset the distance between transcripts
pos = start
if is_left:
if start > ust.end:
available = start - ust.end
if available <= distance:
distance -= available
pos = ust.end + 1
else:
pos = start - distance + 1
distance = 0
elif start < ust.start:
pos = start - distance + 1
distance = 0
for i, ex in enumerate(ust.exons[::-1]):
if distance == 0:
break
if start >= ex.start and start <= ex.end: # within this exon
available = start - ex.start + 1
if available <= distance:
pos = ex.start
distance -= available
else: # more distance available than we want to travel
pos = start - distance + 1
distance = 0
elif start < ex.start: # before this exon
continue
else: # after this exon
available = len(ex)
if available <= distance:
pos = ex.start
distance -= len(ex)
else: # more distance available than we want to travel
pos = ex.end - distance + 1
distance = 0
if distance > 0: # didn't consume all the distance, keep going
pos = ust.start - distance
else:
if start > ust.end:
pos = start + distance - 1
distance = 0
elif start < ust.start:
available = ust.start - start
if available <= distance:
distance -= available
pos = ust.start - 1
else:
pos = start + distance - 1
distance = 0
for i, ex in enumerate(ust.exons):
if distance == 0:
break
if start >= ex.start and start <= ex.end: # within this exon
available = ex.end - start + 1
if available <= distance:
pos = ex.end
distance -= available
else: # more distance available than we want to travel
pos = start + distance - 1
distance = 0
elif start < ex.start: # before this exon
available = len(ex)
if available <= distance:
pos = ex.end
distance -= len(ex)
else: # more distance available than we want to travel
pos = ex.start + distance - 1
distance = 0
else: # after this exon
continue
if distance > 0: # didn't consume all the distance, keep going right
pos = ust.end + distance
positions.append(pos)
d = Interval(min(positions), max(positions))
return d
[docs] def compute_fragment_size(self, read, mate):
if read.reference_start > mate.reference_start:
read, mate = mate, read
t = self.overlapping_transcripts[0] | self.overlapping_transcripts[1]
return TranscriptomeEvidence.compute_exonic_distance(read.reference_start + 1, mate.reference_end, t)
[docs] @staticmethod
def compute_exonic_distance(start, end, transcripts):
"""
give the current list of transcripts, computes the putative exonic/intergenic distance
given two genomic positions. Intronic positions are ignored
"""
all_fragments = [end - start + 1]
for ust in transcripts:
sections = [Interval(start, end)]
for intron in [(s.end + 1, t.start - 1) for s, t in zip(ust.exons, ust.exons[1::])]:
if intron[1] < intron[0]:
continue
intron = Interval(intron[0], intron[1])
temp = []
for curr in sections:
temp.extend(curr - intron)
sections = temp
dist = sum([len(e) for e in sections])
all_fragments.append(dist)
return Interval(min(all_fragments), max(all_fragments))
@staticmethod
def _generate_window(breakpoint, transcripts, read_length, call_error, max_expected_fragment_size):
"""
given some input breakpoint uses the current evidence setting to determine an
appropriate window/range of where one should search for supporting reads
Args:
breakpoint (Breakpoint): the breakpoint we are generating the evidence window for
annotations (dict of str and list of Gene): the set of reference annotations: genes, transcripts, etc
read_length (int): the read length
median_fragment_size (int): the median insert size
call_error (int):
adds a buffer to the calculations if confidence in the breakpoint calls is low can increase this
stdev_fragment_size:
the standard deviation away from the median for regular (non STV) read pairs
Returns:
Interval: the range where reads should be read from the bam looking for evidence for this event
"""
window = GenomeEvidence._generate_window(
breakpoint,
max_expected_fragment_size=max_expected_fragment_size,
call_error=call_error,
read_length=read_length
)
tgt_left = breakpoint.start - window.start + 1 # amount to expand to the left
tgt_right = window.end - breakpoint.end + 1 # amount to expand to the right
if len(transcripts) == 0: # case 1. no overlapping transcripts
return window
w1 = TranscriptomeEvidence.traverse_exonic_distance(breakpoint.start, tgt_left, ORIENT.LEFT, transcripts)
w2 = TranscriptomeEvidence.traverse_exonic_distance(breakpoint.end, tgt_right, ORIENT.RIGHT, transcripts)
return w1 | w2