Source code for mavis.pipeline.config

from argparse import Namespace
from configparser import ConfigParser, ExtendedInterpolation
import argparse
import os
import TSV
import re
from .. import __version__
from ..constants import PROTOCOL
from ..validate.constants import VALIDATION_DEFAULTS
from .util import get_blat_version, get_samtools_version, PIPELINE_STEP
from ..illustrate.constants import DEFAULTS as ILLUSTRATION_DEFAULTS

QSUB_TAGS = dict(validate_memory=12, default_memory=4, queue='transabyss.q')

LIBRARY_DEFAULT_TAGS = dict(
    min_clusters_per_file=50,
    max_files=10,
    cluster_clique_size=15,
    cluster_radius=20,
    min_orf_size=120,
    max_orf_cap=3,
    min_domain_mapping_match=0.8,
    max_proximity=5000,
    uninformative_filter=True,
    stranded_bam=False,
    domain_regex_filter='^PF\d+$'
)
LIBRARY_DEFAULT_TAGS.update(VALIDATION_DEFAULTS.__dict__)

REFERENCE_DEFAULT_TAGS = dict(
    low_memory=False
)

LIBRARY_REQUIRED_TAGS = dict(
    protocol=PROTOCOL.enforce,
    bam_file=str, 
    read_length=int, 
    median_fragment_size=int, 
    stdev_fragment_size=int, 
    inputs=lambda x: x.split(';') if x else [], 
    pairing=lambda x: x.split(';') if x else []
)

PAIRING_DEFAULTS = dict(
    split_call_distance=10,
    contig_call_distance=0,
    flanking_call_distance=0,
    max_proximity=5000,
    low_memory=False
)

REFERENCE_REQUIRED_FILES = ['template_metadata', 'reference_genome', 'annotations', 'masking', 'blat_2bit_reference']

REFERENCE_DEFAULTS = {}
for fname in REFERENCE_REQUIRED_FILES:
    env_var = 'MAVIS_' + fname.upper()
    REFERENCE_DEFAULTS[fname] = os.environ.get(env_var, None)


[docs]def write_config(filename, include_defaults=False):
    config = ConfigParser()
    
    for sec in ['DEFAULTS', 'reference', '<LIBRARY NAME>', 'qsub', 'illustrate']:
        config[sec] = {}
    
    for tag in REFERENCE_REQUIRED_FILES:
        config['reference'][tag] = '<REQUIRED>'
    for tag in LIBRARY_REQUIRED_TAGS:
        config['<LIBRARY NAME>'][tag] = '<REQUIRED>'
    
    if include_defaults:
        for tag in REFERENCE_REQUIRED_FILES:
            config['reference'][tag] = '<REQUIRED>' if REFERENCE_DEFAULTS[tag] is None else REFERENCE_DEFAULTS[tag]
        for tag, val in QSUB_TAGS.items():
            config['qsub'][tag] = str(val)
        for tag, val in LIBRARY_DEFAULT_TAGS.items():
            config['DEFAULTS'][tag] = str(val)
        for tag, val in ILLUSTRATION_DEFAULTS.__dict__.items():
            config['illustrate'][tag] = str(val)
    
    for sec in config:
        for tag in config[sec]:
            if '_regex_' in tag:
                config[sec][tag] = re.sub('\$', '$$', config[sec][tag])
    with open(filename, 'w') as configfile:
        config.write(configfile)


[docs]def cast(value, cast_func):
    if cast_func == bool:
        value = TSV.tsv_boolean(value)
    else:
        value = cast_func(value)
    return value


[docs]def validate_and_cast_section(section, defaults):
    d = {}
    for attr, value in section.items():
        if attr not in defaults:
            raise KeyError('tag not recognized', attr)
        elif defaults[attr] is None and attr == 'assembly_max_kmer_size':
            if value == 'None':
                d[attr] = None
            elif attr == 'assembly_max_kmer_size':
                d[attr] = cast(value, int)
            else:
                d[attr] = cast(value, type(defaults[attr]))
        else:
            d[attr] = cast(value, type(defaults[attr]))
    return d


[docs]def read_config(filepath):
    """
    reads the configuration settings from the configuration file

    Args:
        filepath (str): path to the input configuration file

    Returns:
        class:`list` of :class:`Namespace`: namespace arguments for each library
    """
    parser = ConfigParser(interpolation=ExtendedInterpolation())
    parser.read(filepath)

    # get the library sections and add the default settings
    library_sections = []
    for sec in parser.sections():
        if sec not in ['DEFAULTS', 'reference', 'qsub', 'illustrate']:
            library_sections.append(sec)
    
    all_libs = {}
    args = {}
    args.update(QSUB_TAGS)
    args.update(REFERENCE_DEFAULT_TAGS)
    all_libs.update(LIBRARY_DEFAULT_TAGS)
    illustration_defaults = {}
    for k, v in ILLUSTRATION_DEFAULTS.__dict__.items():
        illustration_defaults[k.lower()] = v
    args.update(illustration_defaults)
    args.update(PAIRING_DEFAULTS)
    # check that the reference files all exist
    for attr, fname in parser['reference'].items():
        if attr in REFERENCE_REQUIRED_FILES and not os.path.exists(fname):
            raise KeyError(attr, 'file at', fname, 'dose not exist')
        args[attr] = fname
    for attr in REFERENCE_REQUIRED_FILES:
        if attr not in parser['reference']:
            raise KeyError('missing required tag', attr, 'in reference section')

    # type check the qsub options
    if 'qsub' in parser:
        args.update(validate_and_cast_section(parser['qsub'], QSUB_TAGS))
    
    # cast the defaults
    if 'DEFAULTS' in parser:
        d = validate_and_cast_section(parser['DEFAULTS'], LIBRARY_DEFAULT_TAGS)
        all_libs.update(d)
    
    if 'illustrate' in parser:
        args.update(validate_and_cast_section(parser['illustrate'], illustration_defaults))
    
    if 'pairing' in parser:
        args.update(validate_and_cast_section(parser['pairing'], PAIRING_DEFAULTS))
    sections = []
    for sec in library_sections:
        d = {}
        d.update(all_libs)
        temp = {k: v for k, v in parser[sec].items() if k not in LIBRARY_REQUIRED_TAGS}
        temp = validate_and_cast_section(temp, LIBRARY_DEFAULT_TAGS)
        d.update(temp)
        for attr in LIBRARY_REQUIRED_TAGS:
            if attr not in parser[sec]:
                raise KeyError('required tag', attr, 'not found in library section', sec)
            d[attr] = LIBRARY_REQUIRED_TAGS[attr](parser[sec][attr])
        d['library'] = sec
        sections.append(Namespace(**d))
    if len(library_sections) < 1:
        raise UserWarning('configuration file must have 1 or more library sections')

    return Namespace(**args), sections


[docs]def parse_arguments(pstep):
    PIPELINE_STEP.enforce(pstep)

    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        '-v', '--version', action='version', version='%(prog)s version ' + __version__,
        help='Outputs the version number'
    )

    if pstep != PIPELINE_STEP.PIPELINE:
        g = parser.add_argument_group('reference input arguments')
        g.add_argument(
            '--annotations',
            default=REFERENCE_DEFAULTS['annotations'],
            help='path to the reference annotations of genes, transcript, exons, domains, etc.'
        )
        if pstep in [PIPELINE_STEP.ANNOTATE, PIPELINE_STEP.VALIDATE]:
            g.add_argument(
                '--reference_genome',
                default=REFERENCE_DEFAULTS['reference_genome'],
                help='path to the human reference genome in fa format'
            )
        if pstep == PIPELINE_STEP.ANNOTATE:
            g.add_argument(
                '--template_metadata', default=REFERENCE_DEFAULTS['template_metadata'],
                help='file containing the cytoband template information'
            )
        if pstep in [PIPELINE_STEP.CLUSTER, PIPELINE_STEP.VALIDATE]:
            g.add_argument(
                '--masking',
                default=REFERENCE_DEFAULTS['masking'],
            )
        g.add_argument(
            '--low_memory', default=PAIRING_DEFAULTS['low_memory'], type=TSV.tsv_boolean,
            help='when working on a machine with less memory this is sacrifice time for memory where possible'
        )
        if pstep == PIPELINE_STEP.VALIDATE:
            g.add_argument(
                '--blat_2bit_reference', default=REFERENCE_DEFAULTS['blat_2bit_reference'],
                help='path to the 2bit reference file used for blatting contig sequences'
            )
    else:
        parser.add_argument('config', help='path to the pipeline configuration file')
        parser.add_argument(
            '-f', '--force_overwrite', default=False, type=TSV.tsv_boolean,
            help='set flag to overwrite existing reviewed files'
        )
    if pstep == PIPELINE_STEP.PIPELINE:
        m = parser.add_mutually_exclusive_group(required=True)
        m.add_argument('--output', help='path to the output directory')
        m.add_argument('--write', default=False, action='store_true', help='write a config')
    else:
        parser.add_argument('--output', help='path to the output directory', required=True)

    if pstep == PIPELINE_STEP.ANNOTATE:
        parser.add_argument(
            '--output_svgs', default=True, type=TSV.tsv_boolean,
            help='set flag to suppress svg drawings of putative annotations')
        parser.add_argument(
            '--min_orf_size', default=LIBRARY_DEFAULT_TAGS['min_orf_size'], type=int, 
            help='minimum size for putative ORFs'
        )
        parser.add_argument(
            '--max_orf_cap', default=LIBRARY_DEFAULT_TAGS['max_orf_cap'], type=int, 
            help='keep the n longest orfs'
        )
        parser.add_argument(
            '--min_domain_mapping_match', default=LIBRARY_DEFAULT_TAGS['min_domain_mapping_match'], type=float,
            help='minimum percent match for the domain to be considered aligned'
        )
        g = parser.add_argument_group('visualization options')
        g.add_argument(
            '--domain_regex_filter', default='^PF\d+$',
            help='only show domains which names (external identifiers) match the given pattern'
        )

    if pstep == PIPELINE_STEP.CLUSTER or pstep == PIPELINE_STEP.ANNOTATE or pstep == PIPELINE_STEP.PAIR:
        parser.add_argument(
            '--max_proximity', default=LIBRARY_DEFAULT_TAGS['max_proximity'], type=int,
            help='The maximum distance away from breakpoints to look for proximal genes'
        )
        parser.add_argument('-n', '--inputs', required=True, nargs='+', help='1 or more input files')
    elif pstep == PIPELINE_STEP.VALIDATE:
        parser.add_argument('-n', '--input', help='path to the input file', required=True)

    if pstep == PIPELINE_STEP.CLUSTER or pstep == PIPELINE_STEP.VALIDATE:
        parser.add_argument('-l', '--library', help='library name')
        parser.add_argument('--protocol', help='the library protocol: genome or transcriptome', choices=PROTOCOL.values())

    if pstep == PIPELINE_STEP.CLUSTER:
        g = parser.add_argument_group('output arguments')
        g.add_argument(
            '--max_files', default=LIBRARY_DEFAULT_TAGS['max_files'], type=int, dest='max_files',
            help='defines the maximum number of files that can be created')
        g.add_argument(
            '--min_clusters_per_file', default=LIBRARY_DEFAULT_TAGS['min_clusters_per_file'], type=int,
            help='defines the minimum number of clusters per file')
        parser.add_argument(
            '-r', '--cluster_radius', help='radius to use in clustering', 
            default=LIBRARY_DEFAULT_TAGS['cluster_radius'], type=int)
        parser.add_argument(
            '-k', '--cluster_clique_size',
            help='parameter used for computing cliques, smaller is faster, above 20 will be slow',
            default=LIBRARY_DEFAULT_TAGS['cluster_clique_size'], type=int
        )
        parser.add_argument(
            '--uninformative_filter', default=LIBRARY_DEFAULT_TAGS['uninformative_filter'], 
            help='If flag is False then the clusters will not be filtered '
            'based on lack of annotation', type=TSV.tsv_boolean
        )

    if pstep == PIPELINE_STEP.PAIR:
        parser.add_argument(
            '--split_call_distance', default=10, type=int,
            help='distance allowed between breakpoint calls when pairing from split read (and higher) resolution calls'
        )
        parser.add_argument(
            '--contig_call_distance', default=0, type=int,
            help='distance allowed between breakpoint calls when pairing from contig (and higher) resolution calls'
        )
        parser.add_argument(
            '--flanking_call_distance', default=0, type=int,
            help='distance allowed between breakpoint calls when pairing from contig (and higher) resolution calls'
        )

    if pstep == PIPELINE_STEP.VALIDATE:
        g = parser.add_argument_group('evidence arguments')
        for attr, value in VALIDATION_DEFAULTS.__dict__.items():
            vtype = type(value)
            if type(value) == bool:
                vtype = TSV.tsv_boolean
            g.add_argument('--{}'.format(attr), default=value, type=vtype, help='see user manual for desc')
        parser.add_argument(
            '-b', '--bam_file',
            help='path to the input bam file', required=True
        )
        parser.add_argument(
            '--stranded_bam', default=False, type=TSV.tsv_boolean,
            help='indicates that the input bam file is strand specific'
        )
        g.add_argument('--read_length', type=int, help='the length of the reads in the bam file', required=True)
        g.add_argument(
            '--stdev_fragment_size', type=int, help='expected standard deviation in insert sizes', required=True
        )
        g.add_argument(
            '--median_fragment_size', type=int, help='median inset size for pairs in the bam file', required=True
        )

    args = parser.parse_args()
    if pstep == PIPELINE_STEP.VALIDATE:
        args.samtools_version = get_samtools_version()
        args.blat_version = get_blat_version()
    try:
        args.output = os.path.abspath(args.output)
        if os.path.exists(args.output) and not args.force_overwrite:
            parser.print_help()
            print(
                '\nerror: output directory {} exists, --force_overwrite must be specified or the directory removed'.format(
                    repr(args.output)))
            exit(1)
    except (AttributeError, TypeError):
        pass

    return args