Source code for full_pog

import sys
import tab
import re
from subprocess import check_output
import os
from mavis.constants import MavisNamespace
from mavis.util import bash_expands, mkdirp
import argparse



[docs]def get_bam_file(library_id):
    output = check_output(['retrieve_reads_to_genome_bam_file', library_id])
    bam_file = output.decode('UTF-8').split('\n')[0]
    bam_file = re.split(r'\s+', bam_file)[-1]
    return bam_file


[docs]def get_libraries(poglist):
    all_libraries = []
    ffglob = '/projects/POG/POG_data/{}/flatfile/*.tab'.format(poglist)
    for flatfile in bash_expands(ffglob):
        _, libraries = tab.read_file(
                flatfile, rename={'library_name': ['name']},
                cast={'diseased_status': lambda x: x.lower(), 'protocol': lambda x: 'transcriptome' if 'rna' in x.lower() else 'genome'}
        )
        match = re.match(r'.*(POG\d+)', flatfile)
        pogid = match.group(1)
        for lib in libraries:
            lib['bam_file'] = get_bam_file(lib['name'])
            all_libraries.append(MavisNamespace(id=pogid, **lib))
    return all_libraries


[docs]def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('pogs', nargs='+', help='pog ids to run in a single mavis run')
    parser.add_argument('--grep', help='grep filter input files', default=None)
    args = parser.parse_args()

    pogs = [p.upper() for p in args.pogs]

    for i, pog in enumerate(pogs):
        if not pog.startswith('POG'):
            pogs[i] = 'POG' + pog

    pogs = sorted(pogs)

    print('searching for libraries and files related to', pogs)
    all_libraries = []
    POGLIST = ','.join(pogs)
    if len(pogs) > 1:
        POGLIST = '{{{}}}'.format(POGLIST)

    commands = ['--no_defaults']

    libraries = get_libraries(POGLIST)

    TRANS_LIBS = ' '.join([l.name for l in libraries if l.protocol == 'transcriptome'])
    GENOME_LIBS = ' '.join([l.name for l in libraries if l.protocol != 'transcriptome'])

    for lib in libraries:
        # [--library <name> {genome,transcriptome} {diseased,normal} [strand_specific] [/path/to/bam/file]]
        commands.append('--library {} {} {} {} {}'.format(lib.name, lib.protocol, lib.diseased_status, lib.protocol == 'transcriptome', lib.bam_file))

    OUTPUT = '-'.join(pogs)
    # --convert <alias> FILEPATH {manta,delly,transabyss,pindel,chimerascan,mavis,defuse} <stranded>
    written = False
    if TRANS_LIBS:
        # TA
        files = '/projects/POG/POG_data/{}/rna/*/trans-ABySS/fusions/*tsv'.format(POGLIST)
        if bash_expands(files):
            commands.append('--convert ta_trans {} transabyss True'.format(repr(files)))
            commands.append('--input ta_trans {}'.format(TRANS_LIBS))
            converted = '{}/converted_inputs/ta_trans.tab'.format(OUTPUT)
            if not os.path.exists(converted):
                command = 'mavis convert -o {} -n {} --file_type transabyss --strand_specific True'.format(converted, files)
                print('\n>>>', command)
                check_output(command, shell=True)
                written = True
        # defuse
        files = '/projects/POG/POG_data/{}/sv/deFUSE/deFUSE-*/*/results/results.classify.tsv'.format(POGLIST)
        if bash_expands(files):
            commands.append('--convert defuse {} defuse True'.format(repr(files)))
            commands.append('--input defuse {}'.format(TRANS_LIBS))
            converted = '{}/converted_inputs/defuse.tab'.format(OUTPUT)
            if not os.path.exists(converted):
                command = 'mavis convert -o {} -n {} --file_type defuse --strand_specific False'.format(converted, files)
                print('\n>>>', command)
                check_output(command, shell=True)
                written = True

    if GENOME_LIBS:
        # ta
        files = '/projects/POG/POG_data/{}/wgs/*/trans-ABySS/fusions/*tsv'.format(POGLIST)
        if bash_expands(files):
            commands.append('--convert ta_genome {} transabyss True'.format(repr(files)))
            commands.append('--input ta_genome {}'.format(GENOME_LIBS))
            converted = '{}/converted_inputs/ta_genome.tab'.format(OUTPUT)
            if not os.path.exists(converted):
                command = 'mavis convert -o {} -n {} --file_type transabyss'.format(converted, files)
                print('\n>>>', command)
                check_output(command, shell=True)
                written = True
        # manta
        files = '/projects/POG/POG_data/{}/sv/manta/manta-*/*/{{diploidSV,somaticSV}}.vcf'.format(POGLIST)
        if bash_expands(files):
            commands.append('--convert manta {} manta True'.format(repr(files)))
            commands.append('--input manta {}'.format(GENOME_LIBS))
            converted = '{}/converted_inputs/manta.tab'.format(OUTPUT)
            if not os.path.exists(converted):
                command = 'mavis convert -o {} -n {} --file_type manta'.format(converted, files)
                print('\n>>>', command)
                check_output(command, shell=True)
                written = True
        # delly
        files = '/projects/POG/POG_data/{}/sv/delly/delly-*/*/Somatic_Germline_Quality_tagged.vcf'.format(POGLIST)
        if bash_expands(files):
            commands.append('--convert delly {} delly True'.format(repr(files)))
            commands.append('--input delly {}'.format(GENOME_LIBS))
            converted = '{}/converted_inputs/delly.tab'.format(OUTPUT)
            if not os.path.exists(converted):
                command = 'mavis convert -o {} -n {} --file_type delly'.format(converted, files)
                print('\n>>>', command)
                check_output(command, shell=True)
                written = True

    # filter on MYB? etc.
    if args.grep and written:
        command = 'cwd=$(pwd); cd {}; for x in *.tab; do head -1 $x > $x.temp; grep "{}" $x >> $x.temp; cat $x.temp > $x; rm $x.temp; done; cd $cwd'.format(os.path.join(OUTPUT, 'converted_inputs'), args.grep)
        print('\n>>>', command)
        check_output(command, shell=True)

    command = 'mavis config ' + ' '.join(commands) + ' -w {}.cfg'.format(OUTPUT)
    print('\n>>>', command)
    check_output(command, shell=True)

    command = 'mavis pipeline {0}.cfg -o {0}'.format(OUTPUT)
    print('\n>>>', command)
    check_output(command, shell=True)


if __name__ == '__main__':
    main()