Source code for mavis.bam.cache

import pysam
import warnings
import atexit


[docs]class BamCache:
    """
    caches reads by name to facilitate getting read mates without jumping around
    the file if we've already read that section
    """
    def __init__(self, bamfile, stranded=False):
        """
        Args:
            bamfile (str): path to the input bam file
        """
        self.cache = {}
        self.stranded = stranded
        self.fh = bamfile
        if not hasattr(bamfile, 'fetch'):
            self.fh = pysam.AlignmentFile(bamfile, 'rb')
        else:
            try:
                self.fh = bamfile.fh
            except AttributeError:
                pass
        atexit.register(self.close)  # makes the file 'auto close' on normal python exit

[docs]    def add_read(self, read):
        """
        Args:
            read (pysam.AlignedSegment): the read to add to the cache
        """
        self.cache.setdefault(read.query_name, set()).add(read)

[docs]    def reference_id(self, chrom):
        """
        Args:
            chrom (str): the chromosome/reference name
        Returns:
            int: the reference id corresponding to input chromosome name
        """
        tid = self.fh.get_tid(chrom)
        if tid == -1:
            raise KeyError('invalid reference name not present in bam file')
        return tid

[docs]    def get_read_reference_name(self, read):
        """
        Args:
            read (pysam.AlignedSegment): the read we want the chromosome name for
        Returns:
            str: the name of the chromosome
        """
        return self.fh.get_reference_name(read.reference_id)

    @classmethod
    def _generate_fetch_bins(cls, start, stop, sample_bins, bin_gap_size):
        """
        Args:
            start (int): the start if the area to fetch reads from
            stop (int): the end of the region
            sample_bins (int): the number of bins to split the region into
            bin_gap_size (int): the space to skip between bins
        """
        bin_size = int(((stop - start + 1) - bin_gap_size * (sample_bins - 1)) / sample_bins)

        fetch_regions = [(start, start + bin_size - 1)]  # exclusive ranges for fetch
        for i in range(0, sample_bins - 1):
            st = fetch_regions[-1][1] + bin_gap_size + 1
            end = st + bin_size
            fetch_regions.append((st, end))
        fetch_regions[-1] = fetch_regions[-1][0], stop
        return fetch_regions

[docs]    def fetch(
        self, chrom, start, stop, read_limit=10000, cache=False, sample_bins=3,
        cache_if=lambda x: True, bin_gap_size=0, filter_if=lambda x: False
    ):
        """
        wrapper around the fetch method, returns a list to avoid errors with changing the file pointer
        position from within the loop. Also caches reads if requested and can return a limited read number

        Args:
            chrom (str): the chromosome
            start (int): the start position
            stop (int): the end position
            read_limit (int): the maximum number of reads to parse
            cache (bool): flag to store reads
            sample_bins (int): number of bins to split the region into
            cache_if (callable): function to check to against a read to determine if it should be cached
            bin_gap_size (int): gap between the bins for the fetch area

        Returns:
            :class:`set` of :class:`pysam.AlignedSegment`: set of reads gathered from the region
        """
        # try using the cache to avoid fetching regions more than once
        result = []
        bin_limit = int(read_limit / sample_bins) if read_limit else None
        # split into multiple fetches based on the 'sample_bins'
        for fstart, fend in self.__class__._generate_fetch_bins(start, stop, sample_bins, bin_gap_size):
            count = 0
            for read in self.fh.fetch(chrom, fstart, fend):
                if bin_limit is not None and count >= bin_limit:
                    warnings.warn(
                        'hit read limit. Fetch will not cache all reads for this bin: {}-{}'.format(fstart, fend))
                    break
                if not filter_if(read):
                    result.append(read)
                if cache and cache_if(read):
                    self.add_read(read)
                count += 1
        return set(result)

[docs]    def get_mate(self, read, primary_only=True, allow_file_access=True):
        """
        Args:
            read (pysam.AlignedSegment): the read
            primary_only (bool): ignore secondary alignments
            allow_file_access (bool): determines if the bam can be accessed to try to find the mate
        Returns:
            :class:`list` of :class:`pysam.AlignedSegment`: list of mates of the input read
        """
        # NOTE: will return all mate alignments that have been cached
        putative_mates = self.cache.get(read.query_name, set())
        mates = []
        for mate in putative_mates:
            if not mate.is_unmapped:
                if any([
                    read.is_read1 == mate.is_read1,
                    read.next_reference_start != mate.reference_start,
                    read.next_reference_id != mate.reference_id,
                    primary_only and (mate.is_secondary or mate.is_supplementary),
                    abs(read.template_length) != abs(mate.template_length)
                ]):
                    continue
            mates.append(mate)
        if len(mates) == 0:
            if not allow_file_access or read.mate_is_unmapped:
                raise KeyError('mate is not found in the cache')
            else:
                warnings.warn(
                    'looking for uncached mate of {0}. This requires file access and'.format(read.query_name) +
                    ' requests may be slow. This should also not be using in a loop iterating using the file pointer ' +
                    ' as it will change the file pointer position')
                m = self.fh.mate(read)
                self.add_read(m)
                return [m]
        return mates

[docs]    def close(self):
        """
        close the bam file handle
        """
        try:
            self.fh.close()
        except AttributeError:
            pass