Source code for deeptools.getFragmentAndReadSize

import numpy as np

# own tools
from deeptools import bamHandler
from deeptools import mapReduce

old_settings = np.seterr(all='ignore')  # silence numpy floating-point warnings


def getFragmentLength_wrapper(args):
    # unpack the single argument tuple that each parallel task receives
    return getFragmentLength_worker(*args)
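# Note: map-style multiprocessing calls hand each task a single argument, so
# the wrapper above just unpacks the tuple; e.g. (with a hypothetical BAM path)
#
#     getFragmentLength_wrapper(('chr1', 0, 50000, 'sample.bam', 0))
#
# is equivalent to getFragmentLength_worker('chr1', 0, 50000, 'sample.bam', 0).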
def getFragmentLength_worker(chrom, start, end, bamFile, distanceBetweenBins):
    """
    Queries the reads at the given region for the distance between
    reads and the read length

    Parameters
    ----------
    chrom : str
        chromosome name
    start : int
        region start
    end : int
        region end
    bamFile : str
        BAM file name
    distanceBetweenBins : int
        the number of bases at the end of each bin to ignore

    Returns
    -------
    np.array
        an np.array whose first column is the fragment length and whose
        second column is the read length
    """
    bam = bamHandler.openBam(bamFile)
    end = max(start + 1, end - distanceBetweenBins)
    if chrom in bam.references:
        reads = np.array([(abs(r.template_length), r.infer_query_length(always=False))
                          for r in bam.fetch(chrom, start, end)
                          if r.is_proper_pair and r.is_read1 and not r.is_unmapped])
        if not len(reads):
            # if the previous operation produces an empty list
            # it could be that the data is not paired, so
            # we try again without filtering
            reads = np.array([(abs(r.template_length), r.infer_query_length(always=False))
                              for r in bam.fetch(chrom, start, end)
                              if not r.is_unmapped])
    else:
        raise NameError("chromosome {} not found in bam file".format(chrom))

    if not len(reads):
        reads = np.array([]).reshape(0, 2)

    return reads
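# A minimal usage sketch (the BAM path is hypothetical): the worker can be
# called directly on a single region of an indexed, coordinate-sorted BAM
# file; it returns an (n, 2) array of (fragment length, read length) pairs.
#
#     lengths = getFragmentLength_worker('chr1', 0, 50000, 'sample.bam', 0)
#     if len(lengths):
#         print(np.median(lengths, axis=0))  # median fragment and read length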
def get_read_and_fragment_length(bamFile, return_lengths=False, blackListFileName=None,
                                 binSize=50000, distanceBetweenBins=1000000,
                                 numberOfProcessors=None, verbose=False):
    """
    Estimates the fragment length and read length through sampling

    Parameters
    ----------
    bamFile : str
        BAM file name
    return_lengths : bool
        whether to also return the sampled lengths themselves under the
        key 'lengths' of each dictionary
    blackListFileName : str
        name of a BED file containing regions to exclude from sampling
    binSize : int
        size of the sampled bins
    distanceBetweenBins : int
        number of bases skipped between sampled bins
    numberOfProcessors : int
    verbose : bool

    Returns
    -------
    tuple
        tuple of two dictionaries, one for the fragment length and the other
        for the read length. The dictionaries summarise the mean, median etc.
        values; either can be None if nothing could be sampled.
    """
    bam_handle = bamHandler.openBam(bamFile)
    chrom_sizes = list(zip(bam_handle.references, bam_handle.lengths))

    distanceBetweenBins *= 2
    fl = []

    # Fix issue #522: allow distanceBetweenBins == 0
    if distanceBetweenBins == 0:
        imap_res = mapReduce.mapReduce((bam_handle.filename, distanceBetweenBins),
                                       getFragmentLength_wrapper,
                                       chrom_sizes,
                                       genomeChunkLength=binSize,
                                       blackListFileName=blackListFileName,
                                       numberOfProcessors=numberOfProcessors,
                                       verbose=verbose)
        fl = np.concatenate(imap_res)

    # Try to ensure we have at least 1000 sampled reads from which to compute
    # statistics, halving the intra-bin distance as needed
    while len(fl) < 1000 and distanceBetweenBins > 1:
        distanceBetweenBins /= 2
        stepsize = binSize + distanceBetweenBins
        imap_res = mapReduce.mapReduce((bam_handle.filename, distanceBetweenBins),
                                       getFragmentLength_wrapper,
                                       chrom_sizes,
                                       genomeChunkLength=stepsize,
                                       blackListFileName=blackListFileName,
                                       numberOfProcessors=numberOfProcessors,
                                       verbose=verbose)
        fl = np.concatenate(imap_res)

    if len(fl):
        fragment_length = fl[:, 0]
        read_length = fl[:, 1]
        if fragment_length.mean() > 0:
            fragment_len_dict = {'sample_size': len(fragment_length),
                                 'min': fragment_length.min(),
                                 'qtile25': np.percentile(fragment_length, 25),
                                 'mean': np.mean(fragment_length),
                                 'median': np.median(fragment_length),
                                 'qtile75': np.percentile(fragment_length, 75),
                                 'max': fragment_length.max(),
                                 'std': np.std(fragment_length),
                                 'mad': np.median(np.abs(fragment_length - np.median(fragment_length))),
                                 'qtile10': np.percentile(fragment_length, 10),
                                 'qtile20': np.percentile(fragment_length, 20),
                                 'qtile30': np.percentile(fragment_length, 30),
                                 'qtile40': np.percentile(fragment_length, 40),
                                 'qtile60': np.percentile(fragment_length, 60),
                                 'qtile70': np.percentile(fragment_length, 70),
                                 'qtile80': np.percentile(fragment_length, 80),
                                 'qtile90': np.percentile(fragment_length, 90),
                                 'qtile99': np.percentile(fragment_length, 99)}
        else:
            fragment_len_dict = None

        if return_lengths and fragment_len_dict is not None:
            fragment_len_dict['lengths'] = fragment_length

        read_len_dict = {'sample_size': len(read_length),
                         'min': read_length.min(),
                         'qtile25': np.percentile(read_length, 25),
                         'mean': np.mean(read_length),
                         'median': np.median(read_length),
                         'qtile75': np.percentile(read_length, 75),
                         'max': read_length.max(),
                         'std': np.std(read_length),
                         'mad': np.median(np.abs(read_length - np.median(read_length))),
                         'qtile10': np.percentile(read_length, 10),
                         'qtile20': np.percentile(read_length, 20),
                         'qtile30': np.percentile(read_length, 30),
                         'qtile40': np.percentile(read_length, 40),
                         'qtile60': np.percentile(read_length, 60),
                         'qtile70': np.percentile(read_length, 70),
                         'qtile80': np.percentile(read_length, 80),
                         'qtile90': np.percentile(read_length, 90),
                         'qtile99': np.percentile(read_length, 99)}
        if return_lengths:
            read_len_dict['lengths'] = read_length
    else:
        fragment_len_dict = None
        read_len_dict = None

    return fragment_len_dict, read_len_dict
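# A minimal sketch of how the estimator might be used ("sample.bam" is a
# hypothetical indexed BAM file; the printed keys come from the dictionaries
# built above):
if __name__ == "__main__":
    frag_dict, read_dict = get_read_and_fragment_length("sample.bam",
                                                        return_lengths=False,
                                                        numberOfProcessors=4,
                                                        verbose=True)
    if frag_dict is not None:
        print("fragment length: median={:.1f}, mean={:.1f}".format(
            frag_dict['median'], frag_dict['mean']))
    if read_dict is not None:
        print("read length: median={:.1f}".format(read_dict['median']))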