Source code for mavis.pipeline.config

from argparse import Namespace
from configparser import ConfigParser, ExtendedInterpolation
import argparse
import os
import TSV
import re
from .. import __version__
from ..constants import PROTOCOL
from ..validate.constants import VALIDATION_DEFAULTS
from .util import get_blat_version, get_samtools_version, PIPELINE_STEP
from ..illustrate.constants import DEFAULTS as ILLUSTRATION_DEFAULTS

QSUB_TAGS = dict(validate_memory=12, default_memory=4, queue='transabyss.q')

LIBRARY_DEFAULT_TAGS = dict(
    min_clusters_per_file=50,
    max_files=10,
    cluster_clique_size=15,
    cluster_radius=20,
    min_orf_size=120,
    max_orf_cap=3,
    min_domain_mapping_match=0.8,
    max_proximity=5000,
    uninformative_filter=True,
    stranded_bam=False,
    domain_regex_filter='^PF\d+$'
)
LIBRARY_DEFAULT_TAGS.update(VALIDATION_DEFAULTS.__dict__)

REFERENCE_DEFAULT_TAGS = dict(
    low_memory=False
)

LIBRARY_REQUIRED_TAGS = dict(
    protocol=PROTOCOL.enforce,
    bam_file=str, 
    read_length=int, 
    median_fragment_size=int, 
    stdev_fragment_size=int, 
    inputs=lambda x: x.split(';') if x else [], 
    pairing=lambda x: x.split(';') if x else []
)

PAIRING_DEFAULTS = dict(
    split_call_distance=10,
    contig_call_distance=0,
    flanking_call_distance=0,
    max_proximity=5000,
    low_memory=False
)

REFERENCE_REQUIRED_FILES = ['template_metadata', 'reference_genome', 'annotations', 'masking', 'blat_2bit_reference']

REFERENCE_DEFAULTS = {}
for fname in REFERENCE_REQUIRED_FILES:
    env_var = 'MAVIS_' + fname.upper()
    REFERENCE_DEFAULTS[fname] = os.environ.get(env_var, None)


[docs]def write_config(filename, include_defaults=False): config = ConfigParser() for sec in ['DEFAULTS', 'reference', '<LIBRARY NAME>', 'qsub', 'illustrate']: config[sec] = {} for tag in REFERENCE_REQUIRED_FILES: config['reference'][tag] = '<REQUIRED>' for tag in LIBRARY_REQUIRED_TAGS: config['<LIBRARY NAME>'][tag] = '<REQUIRED>' if include_defaults: for tag in REFERENCE_REQUIRED_FILES: config['reference'][tag] = '<REQUIRED>' if REFERENCE_DEFAULTS[tag] is None else REFERENCE_DEFAULTS[tag] for tag, val in QSUB_TAGS.items(): config['qsub'][tag] = str(val) for tag, val in LIBRARY_DEFAULT_TAGS.items(): config['DEFAULTS'][tag] = str(val) for tag, val in ILLUSTRATION_DEFAULTS.__dict__.items(): config['illustrate'][tag] = str(val) for sec in config: for tag in config[sec]: if '_regex_' in tag: config[sec][tag] = re.sub('\$', '$$', config[sec][tag]) with open(filename, 'w') as configfile: config.write(configfile)
[docs]def cast(value, cast_func): if cast_func == bool: value = TSV.tsv_boolean(value) else: value = cast_func(value) return value
[docs]def validate_and_cast_section(section, defaults): d = {} for attr, value in section.items(): if attr not in defaults: raise KeyError('tag not recognized', attr) elif defaults[attr] is None and attr == 'assembly_max_kmer_size': if value == 'None': d[attr] = None elif attr == 'assembly_max_kmer_size': d[attr] = cast(value, int) else: d[attr] = cast(value, type(defaults[attr])) else: d[attr] = cast(value, type(defaults[attr])) return d
[docs]def read_config(filepath): """ reads the configuration settings from the configuration file Args: filepath (str): path to the input configuration file Returns: class:`list` of :class:`Namespace`: namespace arguments for each library """ parser = ConfigParser(interpolation=ExtendedInterpolation()) parser.read(filepath) # get the library sections and add the default settings library_sections = [] for sec in parser.sections(): if sec not in ['DEFAULTS', 'reference', 'qsub', 'illustrate']: library_sections.append(sec) all_libs = {} args = {} args.update(QSUB_TAGS) args.update(REFERENCE_DEFAULT_TAGS) all_libs.update(LIBRARY_DEFAULT_TAGS) illustration_defaults = {} for k, v in ILLUSTRATION_DEFAULTS.__dict__.items(): illustration_defaults[k.lower()] = v args.update(illustration_defaults) args.update(PAIRING_DEFAULTS) # check that the reference files all exist for attr, fname in parser['reference'].items(): if attr in REFERENCE_REQUIRED_FILES and not os.path.exists(fname): raise KeyError(attr, 'file at', fname, 'dose not exist') args[attr] = fname for attr in REFERENCE_REQUIRED_FILES: if attr not in parser['reference']: raise KeyError('missing required tag', attr, 'in reference section') # type check the qsub options if 'qsub' in parser: args.update(validate_and_cast_section(parser['qsub'], QSUB_TAGS)) # cast the defaults if 'DEFAULTS' in parser: d = validate_and_cast_section(parser['DEFAULTS'], LIBRARY_DEFAULT_TAGS) all_libs.update(d) if 'illustrate' in parser: args.update(validate_and_cast_section(parser['illustrate'], illustration_defaults)) if 'pairing' in parser: args.update(validate_and_cast_section(parser['pairing'], PAIRING_DEFAULTS)) sections = [] for sec in library_sections: d = {} d.update(all_libs) temp = {k: v for k, v in parser[sec].items() if k not in LIBRARY_REQUIRED_TAGS} temp = validate_and_cast_section(temp, LIBRARY_DEFAULT_TAGS) d.update(temp) for attr in LIBRARY_REQUIRED_TAGS: if attr not in parser[sec]: raise KeyError('required tag', attr, 'not found in library section', sec) d[attr] = LIBRARY_REQUIRED_TAGS[attr](parser[sec][attr]) d['library'] = sec sections.append(Namespace(**d)) if len(library_sections) < 1: raise UserWarning('configuration file must have 1 or more library sections') return Namespace(**args), sections
[docs]def parse_arguments(pstep): PIPELINE_STEP.enforce(pstep) parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument( '-v', '--version', action='version', version='%(prog)s version ' + __version__, help='Outputs the version number' ) if pstep != PIPELINE_STEP.PIPELINE: g = parser.add_argument_group('reference input arguments') g.add_argument( '--annotations', default=REFERENCE_DEFAULTS['annotations'], help='path to the reference annotations of genes, transcript, exons, domains, etc.' ) if pstep in [PIPELINE_STEP.ANNOTATE, PIPELINE_STEP.VALIDATE]: g.add_argument( '--reference_genome', default=REFERENCE_DEFAULTS['reference_genome'], help='path to the human reference genome in fa format' ) if pstep == PIPELINE_STEP.ANNOTATE: g.add_argument( '--template_metadata', default=REFERENCE_DEFAULTS['template_metadata'], help='file containing the cytoband template information' ) if pstep in [PIPELINE_STEP.CLUSTER, PIPELINE_STEP.VALIDATE]: g.add_argument( '--masking', default=REFERENCE_DEFAULTS['masking'], ) g.add_argument( '--low_memory', default=PAIRING_DEFAULTS['low_memory'], type=TSV.tsv_boolean, help='when working on a machine with less memory this is sacrifice time for memory where possible' ) if pstep == PIPELINE_STEP.VALIDATE: g.add_argument( '--blat_2bit_reference', default=REFERENCE_DEFAULTS['blat_2bit_reference'], help='path to the 2bit reference file used for blatting contig sequences' ) else: parser.add_argument('config', help='path to the pipeline configuration file') parser.add_argument( '-f', '--force_overwrite', default=False, type=TSV.tsv_boolean, help='set flag to overwrite existing reviewed files' ) if pstep == PIPELINE_STEP.PIPELINE: m = parser.add_mutually_exclusive_group(required=True) m.add_argument('--output', help='path to the output directory') m.add_argument('--write', default=False, action='store_true', help='write a config') else: parser.add_argument('--output', help='path to the output directory', required=True) if pstep == PIPELINE_STEP.ANNOTATE: parser.add_argument( '--output_svgs', default=True, type=TSV.tsv_boolean, help='set flag to suppress svg drawings of putative annotations') parser.add_argument( '--min_orf_size', default=LIBRARY_DEFAULT_TAGS['min_orf_size'], type=int, help='minimum size for putative ORFs' ) parser.add_argument( '--max_orf_cap', default=LIBRARY_DEFAULT_TAGS['max_orf_cap'], type=int, help='keep the n longest orfs' ) parser.add_argument( '--min_domain_mapping_match', default=LIBRARY_DEFAULT_TAGS['min_domain_mapping_match'], type=float, help='minimum percent match for the domain to be considered aligned' ) g = parser.add_argument_group('visualization options') g.add_argument( '--domain_regex_filter', default='^PF\d+$', help='only show domains which names (external identifiers) match the given pattern' ) if pstep == PIPELINE_STEP.CLUSTER or pstep == PIPELINE_STEP.ANNOTATE or pstep == PIPELINE_STEP.PAIR: parser.add_argument( '--max_proximity', default=LIBRARY_DEFAULT_TAGS['max_proximity'], type=int, help='The maximum distance away from breakpoints to look for proximal genes' ) parser.add_argument('-n', '--inputs', required=True, nargs='+', help='1 or more input files') elif pstep == PIPELINE_STEP.VALIDATE: parser.add_argument('-n', '--input', help='path to the input file', required=True) if pstep == PIPELINE_STEP.CLUSTER or pstep == PIPELINE_STEP.VALIDATE: parser.add_argument('-l', '--library', help='library name') parser.add_argument('--protocol', help='the library protocol: genome or transcriptome', choices=PROTOCOL.values()) if pstep == PIPELINE_STEP.CLUSTER: g = parser.add_argument_group('output arguments') g.add_argument( '--max_files', default=LIBRARY_DEFAULT_TAGS['max_files'], type=int, dest='max_files', help='defines the maximum number of files that can be created') g.add_argument( '--min_clusters_per_file', default=LIBRARY_DEFAULT_TAGS['min_clusters_per_file'], type=int, help='defines the minimum number of clusters per file') parser.add_argument( '-r', '--cluster_radius', help='radius to use in clustering', default=LIBRARY_DEFAULT_TAGS['cluster_radius'], type=int) parser.add_argument( '-k', '--cluster_clique_size', help='parameter used for computing cliques, smaller is faster, above 20 will be slow', default=LIBRARY_DEFAULT_TAGS['cluster_clique_size'], type=int ) parser.add_argument( '--uninformative_filter', default=LIBRARY_DEFAULT_TAGS['uninformative_filter'], help='If flag is False then the clusters will not be filtered ' 'based on lack of annotation', type=TSV.tsv_boolean ) if pstep == PIPELINE_STEP.PAIR: parser.add_argument( '--split_call_distance', default=10, type=int, help='distance allowed between breakpoint calls when pairing from split read (and higher) resolution calls' ) parser.add_argument( '--contig_call_distance', default=0, type=int, help='distance allowed between breakpoint calls when pairing from contig (and higher) resolution calls' ) parser.add_argument( '--flanking_call_distance', default=0, type=int, help='distance allowed between breakpoint calls when pairing from contig (and higher) resolution calls' ) if pstep == PIPELINE_STEP.VALIDATE: g = parser.add_argument_group('evidence arguments') for attr, value in VALIDATION_DEFAULTS.__dict__.items(): vtype = type(value) if type(value) == bool: vtype = TSV.tsv_boolean g.add_argument('--{}'.format(attr), default=value, type=vtype, help='see user manual for desc') parser.add_argument( '-b', '--bam_file', help='path to the input bam file', required=True ) parser.add_argument( '--stranded_bam', default=False, type=TSV.tsv_boolean, help='indicates that the input bam file is strand specific' ) g.add_argument('--read_length', type=int, help='the length of the reads in the bam file', required=True) g.add_argument( '--stdev_fragment_size', type=int, help='expected standard deviation in insert sizes', required=True ) g.add_argument( '--median_fragment_size', type=int, help='median inset size for pairs in the bam file', required=True ) args = parser.parse_args() if pstep == PIPELINE_STEP.VALIDATE: args.samtools_version = get_samtools_version() args.blat_version = get_blat_version() try: args.output = os.path.abspath(args.output) if os.path.exists(args.output) and not args.force_overwrite: parser.print_help() print( '\nerror: output directory {} exists, --force_overwrite must be specified or the directory removed'.format( repr(args.output))) exit(1) except (AttributeError, TypeError): pass return args