import sys
import tab
import re
from subprocess import check_output
import os
from mavis.constants import MavisNamespace
from mavis.util import bash_expands, mkdirp
import argparse
[docs]def get_bam_file(library_id):
output = check_output(['retrieve_reads_to_genome_bam_file', library_id])
bam_file = output.decode('UTF-8').split('\n')[0]
bam_file = re.split(r'\s+', bam_file)[-1]
return bam_file
[docs]def get_libraries(poglist):
all_libraries = []
ffglob = '/projects/POG/POG_data/{}/flatfile/*.tab'.format(poglist)
for flatfile in bash_expands(ffglob):
_, libraries = tab.read_file(
flatfile, rename={'library_name': ['name']},
cast={'diseased_status': lambda x: x.lower(), 'protocol': lambda x: 'transcriptome' if 'rna' in x.lower() else 'genome'}
)
match = re.match(r'.*(POG\d+)', flatfile)
pogid = match.group(1)
for lib in libraries:
lib['bam_file'] = get_bam_file(lib['name'])
all_libraries.append(MavisNamespace(id=pogid, **lib))
return all_libraries
[docs]def main():
parser = argparse.ArgumentParser()
parser.add_argument('pogs', nargs='+', help='pog ids to run in a single mavis run')
parser.add_argument('--grep', help='grep filter input files', default=None)
args = parser.parse_args()
pogs = [p.upper() for p in args.pogs]
for i, pog in enumerate(pogs):
if not pog.startswith('POG'):
pogs[i] = 'POG' + pog
pogs = sorted(pogs)
print('searching for libraries and files related to', pogs)
all_libraries = []
POGLIST = ','.join(pogs)
if len(pogs) > 1:
POGLIST = '{{{}}}'.format(POGLIST)
commands = ['--no_defaults']
libraries = get_libraries(POGLIST)
TRANS_LIBS = ' '.join([l.name for l in libraries if l.protocol == 'transcriptome'])
GENOME_LIBS = ' '.join([l.name for l in libraries if l.protocol != 'transcriptome'])
for lib in libraries:
# [--library <name> {genome,transcriptome} {diseased,normal} [strand_specific] [/path/to/bam/file]]
commands.append('--library {} {} {} {} {}'.format(lib.name, lib.protocol, lib.diseased_status, lib.protocol == 'transcriptome', lib.bam_file))
OUTPUT = '-'.join(pogs)
# --convert <alias> FILEPATH {manta,delly,transabyss,pindel,chimerascan,mavis,defuse} <stranded>
written = False
if TRANS_LIBS:
# TA
files = '/projects/POG/POG_data/{}/rna/*/trans-ABySS/fusions/*tsv'.format(POGLIST)
if bash_expands(files):
commands.append('--convert ta_trans {} transabyss True'.format(repr(files)))
commands.append('--input ta_trans {}'.format(TRANS_LIBS))
converted = '{}/converted_inputs/ta_trans.tab'.format(OUTPUT)
if not os.path.exists(converted):
command = 'mavis convert -o {} -n {} --file_type transabyss --strand_specific True'.format(converted, files)
print('\n>>>', command)
check_output(command, shell=True)
written = True
# defuse
files = '/projects/POG/POG_data/{}/sv/deFUSE/deFUSE-*/*/results/results.classify.tsv'.format(POGLIST)
if bash_expands(files):
commands.append('--convert defuse {} defuse True'.format(repr(files)))
commands.append('--input defuse {}'.format(TRANS_LIBS))
converted = '{}/converted_inputs/defuse.tab'.format(OUTPUT)
if not os.path.exists(converted):
command = 'mavis convert -o {} -n {} --file_type defuse --strand_specific False'.format(converted, files)
print('\n>>>', command)
check_output(command, shell=True)
written = True
if GENOME_LIBS:
# ta
files = '/projects/POG/POG_data/{}/wgs/*/trans-ABySS/fusions/*tsv'.format(POGLIST)
if bash_expands(files):
commands.append('--convert ta_genome {} transabyss True'.format(repr(files)))
commands.append('--input ta_genome {}'.format(GENOME_LIBS))
converted = '{}/converted_inputs/ta_genome.tab'.format(OUTPUT)
if not os.path.exists(converted):
command = 'mavis convert -o {} -n {} --file_type transabyss'.format(converted, files)
print('\n>>>', command)
check_output(command, shell=True)
written = True
# manta
files = '/projects/POG/POG_data/{}/sv/manta/manta-*/*/{{diploidSV,somaticSV}}.vcf'.format(POGLIST)
if bash_expands(files):
commands.append('--convert manta {} manta True'.format(repr(files)))
commands.append('--input manta {}'.format(GENOME_LIBS))
converted = '{}/converted_inputs/manta.tab'.format(OUTPUT)
if not os.path.exists(converted):
command = 'mavis convert -o {} -n {} --file_type manta'.format(converted, files)
print('\n>>>', command)
check_output(command, shell=True)
written = True
# delly
files = '/projects/POG/POG_data/{}/sv/delly/delly-*/*/Somatic_Germline_Quality_tagged.vcf'.format(POGLIST)
if bash_expands(files):
commands.append('--convert delly {} delly True'.format(repr(files)))
commands.append('--input delly {}'.format(GENOME_LIBS))
converted = '{}/converted_inputs/delly.tab'.format(OUTPUT)
if not os.path.exists(converted):
command = 'mavis convert -o {} -n {} --file_type delly'.format(converted, files)
print('\n>>>', command)
check_output(command, shell=True)
written = True
# filter on MYB? etc.
if args.grep and written:
command = 'cwd=$(pwd); cd {}; for x in *.tab; do head -1 $x > $x.temp; grep "{}" $x >> $x.temp; cat $x.temp > $x; rm $x.temp; done; cd $cwd'.format(os.path.join(OUTPUT, 'converted_inputs'), args.grep)
print('\n>>>', command)
check_output(command, shell=True)
command = 'mavis config ' + ' '.join(commands) + ' -w {}.cfg'.format(OUTPUT)
print('\n>>>', command)
check_output(command, shell=True)
command = 'mavis pipeline {0}.cfg -o {0}'.format(OUTPUT)
print('\n>>>', command)
check_output(command, shell=True)
if __name__ == '__main__':
main()