Source code for deeptools.sumCoveragePerBin

import numpy as np
import multiprocessing
import time

from deeptools import countReadsPerBin
from deeptools.utilities import getTLen
from deeptoolsintervals import GTF


[docs]class SumCoveragePerBin(countReadsPerBin.CountReadsPerBin): r"""This is an extension of CountReadsPerBin for use with plotFingerprint. There, we need to sum the per-base coverage. """
[docs] def get_coverage_of_region(self, bamHandle, chrom, regions, fragmentFromRead_func=None): """ Returns a numpy array that corresponds to the number of reads that overlap with each tile. >>> test = Tester() >>> import pysam >>> c = SumCoveragePerBin([], stepSize=1, extendReads=300) For this case the reads are length 36. The number of overlapping read fragments is 4 and 5 for the positions tested. Note that reads are NOT extended, due to there being a 0 length input list of BAM files! >>> c.get_coverage_of_region(pysam.AlignmentFile(test.bamFile_PE), 'chr2', ... [(5000833, 5000834), (5000834, 5000835)]) array([4., 5.]) In the following case the reads length is 50. Reads are not extended. >>> c.extendReads=False >>> c.get_coverage_of_region(pysam.AlignmentFile(test.bamFile2), '3R', [(148, 150), (150, 152), (152, 154)]) array([2., 4., 4.]) """ if not fragmentFromRead_func: fragmentFromRead_func = self.get_fragment_from_read nbins = len(regions) if len(regions[0]) == 3: nbins = 0 for reg in regions: nbins += (reg[1] - reg[0]) // reg[2] coverages = np.zeros(nbins, dtype='float64') if self.defaultFragmentLength == 'read length': extension = 0 else: extension = self.maxPairedFragmentLength blackList = None if self.blackListFileName is not None: blackList = GTF(self.blackListFileName) vector_start = 0 for idx, reg in enumerate(regions): if len(reg) == 3: tileSize = int(reg[2]) nRegBins = (reg[1] - reg[0]) // tileSize else: nRegBins = 1 tileSize = int(reg[1] - reg[0]) # Blacklisted regions have a coverage of 0 if blackList and blackList.findOverlaps(chrom, reg[0], reg[1]): continue regStart = int(max(0, reg[0] - extension)) regEnd = reg[1] + int(extension) # If alignments are extended and there's a blacklist, ensure that no # reads originating in a blacklist are fetched if blackList and reg[0] > 0 and extension > 0: o = blackList.findOverlaps(chrom, regStart, reg[0]) if o is not None and len(o) > 0: regStart = o[-1][1] o = blackList.findOverlaps(chrom, reg[1], regEnd) if o is not None and len(o) > 0: regEnd = o[0][0] start_time = time.time() # caching seems faster. TODO: profile the function c = 0 try: # BAM input if chrom not in bamHandle.references: raise NameError("chromosome {} not found in bam file".format(chrom)) except: # bigWig input, as used by plotFingerprint if bamHandle.chroms(chrom): _ = np.array(bamHandle.stats(chrom, regStart, regEnd, type="mean", nBins=nRegBins), dtype=np.float) _[np.isnan(_)] = 0.0 _ = _ * tileSize coverages += _ continue else: raise NameError("chromosome {} not found in bigWig file with chroms {}".format(chrom, bamHandle.chroms())) prev_pos = set() lpos = None # of previous processed read pair for read in bamHandle.fetch(chrom, regStart, regEnd): if read.is_unmapped: continue if self.minMappingQuality and read.mapq < self.minMappingQuality: continue # filter reads based on SAM flag if self.samFlag_include and read.flag & self.samFlag_include != self.samFlag_include: continue if self.samFlag_exclude and read.flag & self.samFlag_exclude != 0: continue # Fragment lengths tLen = getTLen(read) if self.minFragmentLength > 0 and tLen < self.minFragmentLength: continue if self.maxFragmentLength > 0 and tLen > self.maxFragmentLength: continue # get rid of duplicate reads that have same position on each of the # pairs if self.ignoreDuplicates: # Assuming more or less concordant reads, use the fragment bounds, otherwise the start positions if tLen >= 0: s = read.pos e = s + tLen else: s = read.pnext e = s - tLen if read.reference_id != read.next_reference_id: e = read.pnext if lpos is not None and lpos == read.reference_start \ and (s, e, read.next_reference_id, read.is_reverse) in prev_pos: continue if lpos != read.reference_start: prev_pos.clear() lpos = read.reference_start prev_pos.add((s, e, read.next_reference_id, read.is_reverse)) # since reads can be split (e.g. RNA-seq reads) each part of the # read that maps is called a position block. try: position_blocks = fragmentFromRead_func(read) except TypeError: # the get_fragment_from_read functions returns None in some cases. # Those cases are to be skipped, hence the continue line. continue last_eIdx = None for fragmentStart, fragmentEnd in position_blocks: if fragmentEnd is None or fragmentStart is None: continue fragmentLength = fragmentEnd - fragmentStart if fragmentLength == 0: continue # skip reads that are not in the region being # evaluated. if fragmentEnd <= reg[0] or fragmentStart >= reg[1]: continue if fragmentStart < reg[0]: fragmentStart = reg[0] if fragmentEnd > reg[0] + len(coverages) * tileSize: fragmentEnd = reg[0] + len(coverages) * tileSize sIdx = vector_start + max((fragmentStart - reg[0]) // tileSize, 0) eIdx = vector_start + min(np.ceil(float(fragmentEnd - reg[0]) / tileSize).astype('int'), nRegBins) if eIdx >= len(coverages): eIdx = len(coverages) - 1 if last_eIdx is not None: sIdx = max(last_eIdx, sIdx) if sIdx >= eIdx: continue # First bin if fragmentEnd < reg[0] + (sIdx + 1) * tileSize: _ = fragmentEnd - fragmentStart else: _ = reg[0] + (sIdx + 1) * tileSize - fragmentStart if _ > tileSize: _ = tileSize coverages[sIdx] += _ _ = sIdx + 1 while _ < eIdx: coverages[_] += tileSize _ += 1 while eIdx - sIdx >= nRegBins: eIdx -= 1 if eIdx > sIdx: _ = fragmentEnd - (reg[0] + eIdx * tileSize) if _ > tileSize: _ = tileSize elif _ < 0: _ = 0 coverages[eIdx] += _ last_eIdx = eIdx c += 1 if self.verbose: endTime = time.time() print("%s, processing %s (%.1f per sec) reads @ %s:%s-%s" % ( multiprocessing.current_process().name, c, c / (endTime - start_time), chrom, reg[0], reg[1])) vector_start += nRegBins # change zeros to NAN if self.zerosToNans: coverages[coverages == 0] = np.nan return coverages
[docs]class Tester(object): def __init__(self): """ The distribution of reads between the two bam files is as follows. They cover 200 bp 0 100 200 |------------------------------------------------------------| A =============== =============== B =============== =============== =============== =============== """ import os self.root = os.path.dirname(os.path.abspath(__file__)) + "/test/test_data/" self.bamFile1 = self.root + "testA.bam" self.bamFile2 = self.root + "testB.bam" self.bamFile_PE = self.root + "test_paired2.bam" self.chrom = '3R'