Source code for full_pog

import sys
import tab
import re
from subprocess import check_output
import os
from mavis.constants import MavisNamespace
from mavis.util import bash_expands, mkdirp
import argparse



[docs]def get_bam_file(library_id): output = check_output(['retrieve_reads_to_genome_bam_file', library_id]) bam_file = output.decode('UTF-8').split('\n')[0] bam_file = re.split(r'\s+', bam_file)[-1] return bam_file
[docs]def get_libraries(poglist): all_libraries = [] ffglob = '/projects/POG/POG_data/{}/flatfile/*.tab'.format(poglist) for flatfile in bash_expands(ffglob): _, libraries = tab.read_file( flatfile, rename={'library_name': ['name']}, cast={'diseased_status': lambda x: x.lower(), 'protocol': lambda x: 'transcriptome' if 'rna' in x.lower() else 'genome'} ) match = re.match(r'.*(POG\d+)', flatfile) pogid = match.group(1) for lib in libraries: lib['bam_file'] = get_bam_file(lib['name']) all_libraries.append(MavisNamespace(id=pogid, **lib)) return all_libraries
[docs]def main(): parser = argparse.ArgumentParser() parser.add_argument('pogs', nargs='+', help='pog ids to run in a single mavis run') parser.add_argument('--grep', help='grep filter input files', default=None) args = parser.parse_args() pogs = [p.upper() for p in args.pogs] for i, pog in enumerate(pogs): if not pog.startswith('POG'): pogs[i] = 'POG' + pog pogs = sorted(pogs) print('searching for libraries and files related to', pogs) all_libraries = [] POGLIST = ','.join(pogs) if len(pogs) > 1: POGLIST = '{{{}}}'.format(POGLIST) commands = ['--no_defaults'] libraries = get_libraries(POGLIST) TRANS_LIBS = ' '.join([l.name for l in libraries if l.protocol == 'transcriptome']) GENOME_LIBS = ' '.join([l.name for l in libraries if l.protocol != 'transcriptome']) for lib in libraries: # [--library <name> {genome,transcriptome} {diseased,normal} [strand_specific] [/path/to/bam/file]] commands.append('--library {} {} {} {} {}'.format(lib.name, lib.protocol, lib.diseased_status, lib.protocol == 'transcriptome', lib.bam_file)) OUTPUT = '-'.join(pogs) # --convert <alias> FILEPATH {manta,delly,transabyss,pindel,chimerascan,mavis,defuse} <stranded> written = False if TRANS_LIBS: # TA files = '/projects/POG/POG_data/{}/rna/*/trans-ABySS/fusions/*tsv'.format(POGLIST) if bash_expands(files): commands.append('--convert ta_trans {} transabyss True'.format(repr(files))) commands.append('--input ta_trans {}'.format(TRANS_LIBS)) converted = '{}/converted_inputs/ta_trans.tab'.format(OUTPUT) if not os.path.exists(converted): command = 'mavis convert -o {} -n {} --file_type transabyss --strand_specific True'.format(converted, files) print('\n>>>', command) check_output(command, shell=True) written = True # defuse files = '/projects/POG/POG_data/{}/sv/deFUSE/deFUSE-*/*/results/results.classify.tsv'.format(POGLIST) if bash_expands(files): commands.append('--convert defuse {} defuse True'.format(repr(files))) commands.append('--input defuse {}'.format(TRANS_LIBS)) converted = '{}/converted_inputs/defuse.tab'.format(OUTPUT) if not os.path.exists(converted): command = 'mavis convert -o {} -n {} --file_type defuse --strand_specific False'.format(converted, files) print('\n>>>', command) check_output(command, shell=True) written = True if GENOME_LIBS: # ta files = '/projects/POG/POG_data/{}/wgs/*/trans-ABySS/fusions/*tsv'.format(POGLIST) if bash_expands(files): commands.append('--convert ta_genome {} transabyss True'.format(repr(files))) commands.append('--input ta_genome {}'.format(GENOME_LIBS)) converted = '{}/converted_inputs/ta_genome.tab'.format(OUTPUT) if not os.path.exists(converted): command = 'mavis convert -o {} -n {} --file_type transabyss'.format(converted, files) print('\n>>>', command) check_output(command, shell=True) written = True # manta files = '/projects/POG/POG_data/{}/sv/manta/manta-*/*/{{diploidSV,somaticSV}}.vcf'.format(POGLIST) if bash_expands(files): commands.append('--convert manta {} manta True'.format(repr(files))) commands.append('--input manta {}'.format(GENOME_LIBS)) converted = '{}/converted_inputs/manta.tab'.format(OUTPUT) if not os.path.exists(converted): command = 'mavis convert -o {} -n {} --file_type manta'.format(converted, files) print('\n>>>', command) check_output(command, shell=True) written = True # delly files = '/projects/POG/POG_data/{}/sv/delly/delly-*/*/Somatic_Germline_Quality_tagged.vcf'.format(POGLIST) if bash_expands(files): commands.append('--convert delly {} delly True'.format(repr(files))) commands.append('--input delly {}'.format(GENOME_LIBS)) converted = '{}/converted_inputs/delly.tab'.format(OUTPUT) if not os.path.exists(converted): command = 'mavis convert -o {} -n {} --file_type delly'.format(converted, files) print('\n>>>', command) check_output(command, shell=True) written = True # filter on MYB? etc. if args.grep and written: command = 'cwd=$(pwd); cd {}; for x in *.tab; do head -1 $x > $x.temp; grep "{}" $x >> $x.temp; cat $x.temp > $x; rm $x.temp; done; cd $cwd'.format(os.path.join(OUTPUT, 'converted_inputs'), args.grep) print('\n>>>', command) check_output(command, shell=True) command = 'mavis config ' + ' '.join(commands) + ' -w {}.cfg'.format(OUTPUT) print('\n>>>', command) check_output(command, shell=True) command = 'mavis pipeline {0}.cfg -o {0}'.format(OUTPUT) print('\n>>>', command) check_output(command, shell=True)
if __name__ == '__main__': main()