from ..constants import COLUMNS, STRAND, CALL_METHOD, SVTYPE, PROTOCOL, DISEASE_STATUS
from ..breakpoint import Breakpoint, BreakpointPair
from ..interval import Interval
from .constants import PAIRING_STATE
[docs]def alphanumeric_choice(bpp1, bpp2):
"""
Args:
bpp1 (BreakpointPair)
bpp2 (BreakpointPair)
returns the one with transcript with alphanumeric priority, with transcript1 chosen for ties
"""
if (bpp1.transcript1, bpp1.transcript2) < (bpp2.transcript1, bpp2.transcript2):
return bpp1
elif (bpp1.transcript1, bpp1.transcript2) > (bpp2.transcript1, bpp2.transcript2):
return bpp2
else:
raise AssertionError("Both transcripts are equal")
[docs]def filter_by_annotations(bpp1, bpp2, best_transcripts):
"""
Args:
bpp1 (BreakPointPair):
bpp2 (BreakpointPair):
best_transcripts (:class `dict` of :any:`Transcript` by :class:`str`): the best transcripts of the annotations
based on their names
"""
# By priority
# Case 1 an event has 2 genes and transcripts and a fusion cdna (orf)
if bpp1.data[COLUMNS.fusion_cdna_coding_start] or bpp2.data[COLUMNS.fusion_cdna_coding_start]:
# take the one with the longest cdna length
if bpp1.data[COLUMNS.fusion_cdna_coding_start] is None:
return bpp2
elif bpp2.data[COLUMNS.fusion_cdna_coding_start] is None:
return bpp1
else:
bpp1_cdna_len = int(bpp1.data[COLUMNS.fusion_cdna_coding_end]) - \
int(bpp1.data[COLUMNS.fusion_cdna_coding_start])
bpp2_cdna_len = int(bpp2.data[COLUMNS.fusion_cdna_coding_end]) - \
int(bpp2.data[COLUMNS.fusion_cdna_coding_start])
return bpp1 if bpp1_cdna_len >= bpp2_cdna_len else bpp2
# Case 2 an event has 2 genes and transcripts
elif bpp1.data[COLUMNS.gene1] and bpp1.data[COLUMNS.gene2] or bpp2.data[COLUMNS.gene1] and bpp2.data[COLUMNS.gene2]:
# take the one with transcripts that are in best transcript, or the highest alphanumeric name
if bpp1.data[COLUMNS.gene1] is None or bpp1.data[COLUMNS.gene2] is None:
return bpp2
elif bpp2.data[COLUMNS.gene1] is None or bpp2.data[COLUMNS.gene2] is None:
return bpp1
else:
bpp1_t1, bpp1_t2 = (bpp1.data[COLUMNS.transcript1], bpp1.data[COLUMNS.transcript2])
bpp2_t1, bpp2_t2 = (bpp2.data[COLUMNS.transcript1], bpp2.data[COLUMNS.transcript2])
# both in best transcripts
if bpp1_t1 in best_transcripts and bpp1_t2 in best_transcripts and bpp2_t1 in best_transcripts \
and bpp2_t2 in best_transcripts:
try:
alpha = alphanumeric_choice(bpp1, bpp2)
except AssertionError: # both transcripts are the same
alpha = group_events(bpp1, bpp2)
return alpha
elif bpp1_t1 in best_transcripts and bpp1_t2 in best_transcripts:
return bpp1
elif bpp2_t1 in best_transcripts and bpp2_t2 in best_transcripts:
return bpp2
elif bpp1_t1 in best_transcripts or bpp1_t2 in best_transcripts:
return bpp1
elif bpp2_t1 in best_transcripts or bpp2_t2 in best_transcripts:
return bpp2
else:
try:
alpha = alphanumeric_choice(bpp1, bpp2)
except AssertionError: # both transcripts are the same
alpha = group_events(bpp1, bpp2)
return alpha
# Case 3 an event has 1 gene and transcript
elif bpp1.data[COLUMNS.gene1] or bpp1.data[COLUMNS.gene2] or bpp2.data[COLUMNS.gene1] or bpp2.data[COLUMNS.gene2]:
# take the one with transcripts that are in best transcript, or the highest alphanumeric name
if bpp1.data[COLUMNS.gene1] is None and bpp1.data[COLUMNS.gene2] is None:
return bpp2
elif bpp2.data[COLUMNS.gene1] is None and bpp2.data[COLUMNS.gene2] is None:
return bpp1
else:
bpp1_t1, bpp1_t2 = (bpp1.data[COLUMNS.transcript1], bpp1.data[COLUMNS.transcript2])
bpp2_t1, bpp2_t2 = (bpp2.data[COLUMNS.transcript1], bpp2.data[COLUMNS.transcript2])
if bpp1_t1 in best_transcripts or bpp1_t2 in best_transcripts:
return bpp1
elif bpp2_t1 in best_transcripts or bpp2_t2 in best_transcripts:
return bpp2
else:
try:
alpha = alphanumeric_choice(bpp1, bpp2)
except AssertionError: # both transcripts are the same
alpha = group_events(bpp1, bpp2)
return alpha
# Case 4 both have no genes present - will keep the positive strand event
else:
if bpp1.break1.strand == STRAND.POS:
return bpp1
else:
return bpp2
[docs]def filter_by_call_method(bpp1, bpp2):
# ranking scores of the methods (more is better)
RANKING = {
CALL_METHOD.CONTIG: 4,
CALL_METHOD.SPLIT: 3,
CALL_METHOD.SPAN: 2,
CALL_METHOD.FLANK: 1
}
# This treats split flank the same as flank split
bpp_call_score = RANKING[bpp1.break1_call_method] + \
RANKING[bpp1.break2_call_method]
existing_call_score = RANKING[bpp2.break1_call_method] + \
RANKING[bpp2.break2_call_method]
if bpp_call_score < existing_call_score:
return bpp1
elif bpp_call_score > existing_call_score:
return bpp2
else:
raise AssertionError("Both call methods are equal")
[docs]def group_events(bpp1, bpp2):
# take the outer regions of the breakpoints
new_bpp = BreakpointPair(
Breakpoint(bpp1.break1.chr,
min(bpp1.break1.start, bpp2.break1.start),
max(bpp1.break1.end, bpp2.break1.end),
orient=bpp1.break1.orient,
strand=bpp1.break1.strand),
Breakpoint(bpp1.break2.chr,
min(bpp1.break2.start, bpp2.break2.start),
max(bpp1.break2.end, bpp2.break2.end),
orient=bpp1.break2.orient,
strand=bpp1.break2.strand),
opposing_strands=bpp1.opposing_strands,
stranded=bpp1.stranded)
# Note: There are some attributes that shouldn't be lost if different, currently appending the information
# The evidence could be better off as a max instead of a join
columns_to_keep = [COLUMNS.contig_seq, COLUMNS.break1_call_method, COLUMNS.break2_call_method,
COLUMNS.break1_split_reads, COLUMNS.break2_split_reads, COLUMNS.contig_alignment_score,
COLUMNS.spanning_reads, COLUMNS.flanking_pairs, COLUMNS.tools,
COLUMNS.product_id, COLUMNS.event_type, COLUMNS.annotation_id,
COLUMNS.pairing, COLUMNS.annotation_figure,
COLUMNS.contig_remapped_reads]
for i in bpp1.data.keys():
if bpp1.data[i] != bpp2.data[i]:
new_bpp.data[i] = ''
if i in columns_to_keep:
new_bpp.data[i] = ";".join(sorted(list(set(str(bpp1.data[i]).split(';') +
str(bpp2.data[i]).split(';')))))
else:
new_bpp.data[i] = bpp1.data[i]
if bpp1.untemplated_seq == bpp2.untemplated_seq:
new_bpp.untemplated_seq = bpp1.untemplated_seq
return new_bpp
[docs]def annotate_dgv(bpps, dgv_regions_by_reference_name, distance=0):
for bpp in bpps:
if bpp.break1.chr != bpp.break2.chr:
continue # assume the dgv does not have translocations
for r in dgv_regions_by_reference_name.get(bpp.break1.chr, []):
if abs(Interval.dist((r.start, r.start), bpp.break1)) <= distance and \
abs(Interval.dist((r.end, r.end), bpp.break2)) <= distance:
refname = r.reference_object
try:
refname = r.reference_object.name
except AttributeError:
pass
bpp.data['dgv'] = '{}({}:{}-{})'.format(r.name, refname, r.start, r.end)
return bpps
[docs]def get_pairing_state(current_protocol, current_disease_state, other_protocol, other_disease_state, is_matched=False):
"""
given two libraries, returns the appropriate descriptor for their matched state
Args:
current_protocol (PROTOCOL): the protocol of the current library
current_disease_state (DISEASE_STATUS): the disease status of the current library
other_protocol (PROTOCOL): protocol of the library being comparing to
other_disease_state (DISEASE_STATUS): disease status of the library being compared to
is_matched (bool): True if the libraries are paired
Returns:
(PAIRING_STATE): descriptor of the pairing of the two libraries
"""
PROTOCOL.enforce(current_protocol)
PROTOCOL.enforce(other_protocol)
DISEASE_STATUS.enforce(current_disease_state)
DISEASE_STATUS.enforce(other_disease_state)
curr = (current_protocol, current_disease_state)
other = (other_protocol, other_disease_state)
DG = (PROTOCOL.GENOME, DISEASE_STATUS.DISEASED)
DT = (PROTOCOL.TRANS, DISEASE_STATUS.DISEASED)
NG = (PROTOCOL.GENOME, DISEASE_STATUS.NORMAL)
if curr == DG and other == NG:
return PAIRING_STATE.GERMLINE if is_matched else PAIRING_STATE.SOMATIC
elif curr == DG and other == DT:
return PAIRING_STATE.EXP if is_matched else PAIRING_STATE.NO_EXP
elif curr == DT and other == DG:
return PAIRING_STATE.GENOMIC if is_matched else PAIRING_STATE.NO_GENOMIC
elif curr == DT and other == NG:
return PAIRING_STATE.GERMLINE if is_matched else PAIRING_STATE.SOMATIC
elif curr == NG and other == DT:
return PAIRING_STATE.EXP if is_matched else PAIRING_STATE.NO_EXP
else:
return PAIRING_STATE.MATCH if is_matched else PAIRING_STATE.NO_MATCH
[docs]def filter_by_evidence(
bpps,
filter_min_remapped_reads=5,
filter_min_spanning_reads=5,
filter_min_flanking_reads=5,
filter_min_flanking_only_reads=10,
filter_min_split_reads=5,
filter_min_linking_split_reads=1
):
filtered = []
removed = []
for bpp in bpps:
if bpp.break1_call_method == CALL_METHOD.CONTIG and bpp.break2_call_method == CALL_METHOD.CONTIG:
# inherently the breakpoints have been linked
if int(bpp.contig_remapped_reads) < filter_min_remapped_reads:
removed.append(bpp)
continue
elif bpp.break1_call_method == CALL_METHOD.SPAN and bpp.break2_call_method == CALL_METHOD.SPAN:
if bpp.spanning_reads < filter_min_spanning_reads:
removed.append(bpp)
continue
elif bpp.break1_call_method == CALL_METHOD.SPLIT and bpp.break2_call_method == CALL_METHOD.SPLIT:
if any([
bpp.break1_split_reads < filter_min_split_reads,
bpp.break2_split_reads < filter_min_split_reads,
all([
bpp.event_type != SVTYPE.INS,
bpp.break2_split_reads_forced + bpp.break1_split_reads_forced < filter_min_linking_split_reads
]),
all([
bpp.event_type == SVTYPE.INS,
bpp.flanking_pairs < filter_min_linking_split_reads,
bpp.break2_split_reads_forced + bpp.break1_split_reads_forced < filter_min_linking_split_reads
]),
bpp.break1_split_reads + bpp.break2_split_reads -
(bpp.break2_split_reads_forced + bpp.break1_split_reads_forced) < 1
]):
removed.append(bpp)
continue
elif bpp.break1_call_method == CALL_METHOD.SPLIT and bpp.break2_call_method == CALL_METHOD.FLANK:
if bpp.break1_split_reads < filter_min_split_reads or bpp.flanking_pairs < filter_min_flanking_reads:
removed.append(bpp)
continue
elif bpp.break1_call_method == CALL_METHOD.FLANK and bpp.break2_call_method == CALL_METHOD.SPLIT:
if bpp.break1_split_reads < filter_min_split_reads or bpp.flanking_pairs < filter_min_flanking_reads:
removed.append(bpp)
continue
elif bpp.break1_call_method == CALL_METHOD.FLANK and bpp.break2_call_method == CALL_METHOD.FLANK:
if bpp.flanking_pairs < filter_min_flanking_only_reads:
removed.append(bpp)
continue
else:
raise AssertionError('unexpected value for break1_call_method or break2_call_method: {}, {}'.format(
bpp.break1_call_method, bpp.break2_call_method))
filtered.append(bpp)
return filtered, removed