Source code for mavis.bam.cache

import pysam
import warnings
import atexit


[docs]class BamCache: """ caches reads by name to facilitate getting read mates without jumping around the file if we've already read that section """ def __init__(self, bamfile, stranded=False): """ Args: bamfile (str): path to the input bam file """ self.cache = {} self.stranded = stranded self.fh = bamfile if not hasattr(bamfile, 'fetch'): self.fh = pysam.AlignmentFile(bamfile, 'rb') else: try: self.fh = bamfile.fh except AttributeError: pass atexit.register(self.close) # makes the file 'auto close' on normal python exit
[docs] def add_read(self, read): """ Args: read (pysam.AlignedSegment): the read to add to the cache """ self.cache.setdefault(read.query_name, set()).add(read)
[docs] def reference_id(self, chrom): """ Args: chrom (str): the chromosome/reference name Returns: int: the reference id corresponding to input chromosome name """ tid = self.fh.get_tid(chrom) if tid == -1: raise KeyError('invalid reference name not present in bam file') return tid
[docs] def get_read_reference_name(self, read): """ Args: read (pysam.AlignedSegment): the read we want the chromosome name for Returns: str: the name of the chromosome """ return self.fh.get_reference_name(read.reference_id)
@classmethod def _generate_fetch_bins(cls, start, stop, sample_bins, bin_gap_size): """ Args: start (int): the start if the area to fetch reads from stop (int): the end of the region sample_bins (int): the number of bins to split the region into bin_gap_size (int): the space to skip between bins """ bin_size = int(((stop - start + 1) - bin_gap_size * (sample_bins - 1)) / sample_bins) fetch_regions = [(start, start + bin_size - 1)] # exclusive ranges for fetch for i in range(0, sample_bins - 1): st = fetch_regions[-1][1] + bin_gap_size + 1 end = st + bin_size fetch_regions.append((st, end)) fetch_regions[-1] = fetch_regions[-1][0], stop return fetch_regions
[docs] def fetch( self, chrom, start, stop, read_limit=10000, cache=False, sample_bins=3, cache_if=lambda x: True, bin_gap_size=0, filter_if=lambda x: False ): """ wrapper around the fetch method, returns a list to avoid errors with changing the file pointer position from within the loop. Also caches reads if requested and can return a limited read number Args: chrom (str): the chromosome start (int): the start position stop (int): the end position read_limit (int): the maximum number of reads to parse cache (bool): flag to store reads sample_bins (int): number of bins to split the region into cache_if (callable): function to check to against a read to determine if it should be cached bin_gap_size (int): gap between the bins for the fetch area Returns: :class:`set` of :class:`pysam.AlignedSegment`: set of reads gathered from the region """ # try using the cache to avoid fetching regions more than once result = [] bin_limit = int(read_limit / sample_bins) if read_limit else None # split into multiple fetches based on the 'sample_bins' for fstart, fend in self.__class__._generate_fetch_bins(start, stop, sample_bins, bin_gap_size): count = 0 for read in self.fh.fetch(chrom, fstart, fend): if bin_limit is not None and count >= bin_limit: warnings.warn( 'hit read limit. Fetch will not cache all reads for this bin: {}-{}'.format(fstart, fend)) break if not filter_if(read): result.append(read) if cache and cache_if(read): self.add_read(read) count += 1 return set(result)
[docs] def get_mate(self, read, primary_only=True, allow_file_access=True): """ Args: read (pysam.AlignedSegment): the read primary_only (bool): ignore secondary alignments allow_file_access (bool): determines if the bam can be accessed to try to find the mate Returns: :class:`list` of :class:`pysam.AlignedSegment`: list of mates of the input read """ # NOTE: will return all mate alignments that have been cached putative_mates = self.cache.get(read.query_name, set()) mates = [] for mate in putative_mates: if not mate.is_unmapped: if any([ read.is_read1 == mate.is_read1, read.next_reference_start != mate.reference_start, read.next_reference_id != mate.reference_id, primary_only and (mate.is_secondary or mate.is_supplementary), abs(read.template_length) != abs(mate.template_length) ]): continue mates.append(mate) if len(mates) == 0: if not allow_file_access or read.mate_is_unmapped: raise KeyError('mate is not found in the cache') else: warnings.warn( 'looking for uncached mate of {0}. This requires file access and'.format(read.query_name) + ' requests may be slow. This should also not be using in a loop iterating using the file pointer ' + ' as it will change the file pointer position') m = self.fh.mate(read) self.add_read(m) return [m] return mates
[docs] def close(self): """ close the bam file handle """ try: self.fh.close() except AttributeError: pass