Source code for deeptools.getScorePerBigWigBin

import pyBigWig
import numpy as np
import os
import sys
import shutil
import warnings

# deepTools packages
import deeptools.mapReduce as mapReduce
import deeptools.utilities
# debug = 0

old_settings = np.seterr(all='ignore')


[docs] def countReadsInRegions_wrapper(args): # Using arguments unpacking! return countFragmentsInRegions_worker(*args)
[docs] def countFragmentsInRegions_worker(chrom, start, end, bigWigFiles, stepSize, binLength, save_data, bedRegions=None ): """ returns the average score in each bigwig file at each 'stepSize' position within the interval start, end for a 'binLength' window. Because the idea is to get counts for window positions at different positions for sampling the bins are equally spaced and *not adjacent*. If a list of bedRegions is given, then the number of reads that overlaps with each region is counted. Test dataset with two samples covering 200 bp. >>> test = Tester() Fragment coverage. >>> np.transpose(countFragmentsInRegions_worker(test.chrom, 0, 200, [test.bwFile1, test.bwFile2], 50, 25, False)[0]) array([[1., 1., 2., 2.], [1., 1., 1., 3.]]) >>> np.transpose(countFragmentsInRegions_worker(test.chrom, 0, 200, [test.bwFile1, test.bwFile2], 200, 200, False)[0]) array([[1.5], [1.5]]) BED regions: >>> bedRegions = [[test.chrom, [(45, 55)]], [test.chrom, [(95, 105)]], [test.chrom, [(145, 155)]]] >>> np.transpose(countFragmentsInRegions_worker(test.chrom, 0, 200,[test.bwFile1, test.bwFile2], 200, 200, False, ... bedRegions=bedRegions)[0]) array([[1. , 1.5, 2. ], [1. , 1. , 2. ]]) """ assert start < end, "start {} bigger that end {}".format(start, end) # array to keep the scores for the regions sub_score_per_bin = [] rows = 0 bigwig_handles = [] for foo in bigWigFiles: bigwig_handles.append(pyBigWig.open(foo)) regions_to_consider = [] if bedRegions: for reg in bedRegions: regs = [] for exon in reg[1]: regs.append((exon[0], exon[1])) regions_to_consider.append(regs) else: for i in range(start, end, stepSize): if (i + binLength) > end: regions_to_consider.append([(i, end)]) # last bin (may be smaller) else: regions_to_consider.append([(i, i + binLength)]) if save_data: _file = open(deeptools.utilities.getTempFileName(suffix='.bed'), 'w+t') _file_name = _file.name else: _file_name = '' warnings.simplefilter("default") i = 0 for reg in regions_to_consider: avgReadsArray = [] i += 1 for idx, bwh in enumerate(bigwig_handles): if chrom not in bwh.chroms(): unmod_name = chrom if chrom.startswith('chr'): # remove the chr part from chromosome name chrom = chrom[3:] else: # prefix with 'chr' the chromosome name chrom = 'chr' + chrom if chrom not in bwh.chroms(): exit('Chromosome name {} not found in bigwig file\n {}\n'.format(unmod_name, bigWigFiles[idx])) weights = [] scores = [] for exon in reg: weights.append(exon[1] - exon[0]) score = bwh.stats(chrom, exon[0], exon[1]) if score is None or score == [None] or np.isnan(score[0]): score = [np.nan] scores.extend(score) avgReadsArray.append(np.average(scores, weights=weights)) # mean of fragment coverage for region sub_score_per_bin.extend(avgReadsArray) rows += 1 if save_data: starts = [] ends = [] for exon in reg: starts.append(str(exon[0])) ends.append(str(exon[1])) starts = ",".join(starts) ends = ",".join(ends) _file.write("\t".join(map(str, [chrom, starts, ends])) + "\t") _file.write("\t".join(["{}".format(x) for x in avgReadsArray]) + "\n") if save_data: _file.close() warnings.resetwarnings() # the output is a matrix having as many rows as the variable 'row' # and as many columns as bigwig files. The rows correspond to # each of the regions processed by the worker. # np.array([[score1_1, score1_2], # [score2_1, score2_2]] return np.array(sub_score_per_bin).reshape(rows, len(bigWigFiles)), _file_name
[docs] def getChromSizes(bigwigFilesList): """ Get chromosome sizes from bigWig file with pyBigWig Test dataset with two samples covering 200 bp. >>> test = Tester() Chromosome name(s) and size(s). >>> assert getChromSizes([test.bwFile1, test.bwFile2]) == ([('3R', 200)], set([])) """ def print_chr_names_and_size(chr_set): sys.stderr.write("chromosome\tlength\n") for name, size in chr_set: sys.stderr.write("{0:>15}\t{1:>10}\n".format(name, size)) bigwigFilesList = bigwigFilesList[:] common_chr = set() for fname in bigwigFilesList: fh = pyBigWig.open(fname) common_chr = common_chr.union(set(fh.chroms().items())) fh.close() non_common_chr = set() for bw in bigwigFilesList: _names_and_size = set(pyBigWig.open(bw).chroms().items()) if len(common_chr & _names_and_size) == 0: # try to add remove 'chr' from the chromosme name _corr_names_size = set() for chrom_name, size in _names_and_size: if chrom_name.startswith('chr'): _corr_names_size.add((chrom_name[3:], size)) else: _corr_names_size.add(('chr' + chrom_name, size)) if len(common_chr & _corr_names_size) == 0: message = "No common chromosomes found. Are the bigwig files " \ "from the same species and same assemblies?\n" sys.stderr.write(message) print_chr_names_and_size(common_chr) sys.stderr.write("\nand the following is the list of the unmatched chromosome and chromosome\n" "lengths from file\n{}\n".format(bw)) print_chr_names_and_size(_names_and_size) exit(1) else: _names_and_size = _corr_names_size non_common_chr |= common_chr ^ _names_and_size common_chr = common_chr & _names_and_size if len(non_common_chr) > 0: sys.stderr.write("\nThe following chromosome names did not match between the bigwig files\n") print_chr_names_and_size(non_common_chr) # get the list of common chromosome names and sizes return sorted(common_chr), non_common_chr
[docs] def getScorePerBin(bigWigFiles, binLength, numberOfProcessors=1, verbose=False, region=None, bedFile=None, blackListFileName=None, stepSize=None, chrsToSkip=[], out_file_for_raw_data=None, allArgs=None): """ This function returns a matrix containing scores (median) for the coverage of fragments within a region. Each row corresponds to a sampled region. Likewise, each column corresponds to a bigwig file. Test dataset with two samples covering 200 bp. >>> test = Tester() >>> np.transpose(getScorePerBin([test.bwFile1, test.bwFile2], 50, 3)) array([[1., 1., 2., 2.], [1., 1., 1., 3.]]) """ # Try to determine an optimal fraction of the genome (chunkSize) # that is sent to workers for analysis. If too short, too much time # is spent loading the files # if too long, some processors end up free. # the following is a heuristic # get list of common chromosome names and sizes chrom_sizes, non_common = getChromSizes(bigWigFiles) # skip chromosome in the list. This is usually for the # X chromosome which may have either one copy in a male sample # or a mixture of male/female and is unreliable. # Also the skip may contain heterochromatic regions and # mitochondrial DNA if chrsToSkip and len(chrsToSkip): chrom_sizes = [x for x in chrom_sizes if x[0] not in chrsToSkip] chrnames, chrlengths = list(zip(*chrom_sizes)) if stepSize is None: stepSize = binLength # for adjacent bins # set chunksize based on number of processors used chunkSize = max(sum(chrlengths) / numberOfProcessors, int(1e6)) # make chunkSize multiple of binLength chunkSize -= chunkSize % binLength if verbose: print("step size is {}".format(stepSize)) if region: # in case a region is used, append the tilesize region += ":{}".format(binLength) # mapReduce( (staticArgs), func, chromSize, etc. ) if out_file_for_raw_data: save_file = True else: save_file = False # Handle GTF options transcriptID, exonID, transcript_id_designator, keepExons = deeptools.utilities.gtfOptions(allArgs) imap_res = mapReduce.mapReduce((bigWigFiles, stepSize, binLength, save_file), countReadsInRegions_wrapper, chrom_sizes, genomeChunkLength=chunkSize, bedFile=bedFile, blackListFileName=blackListFileName, region=region, numberOfProcessors=numberOfProcessors, transcriptID=transcriptID, exonID=exonID, keepExons=keepExons, transcript_id_designator=transcript_id_designator) if out_file_for_raw_data: if len(non_common): sys.stderr.write("*Warning*\nThe resulting bed file does not contain information for " "the chromosomes that were not common between the bigwig files\n") # concatenate intermediary bedgraph files ofile = open(out_file_for_raw_data, "w") for _values, tempFileName in imap_res: if tempFileName: # concatenate all intermediate tempfiles into one f = open(tempFileName, 'r') shutil.copyfileobj(f, ofile) f.close() os.remove(tempFileName) ofile.close() # the matrix scores are in the first element of each of the entries in imap_res score_per_bin = np.concatenate([x[0] for x in imap_res], axis=0) return score_per_bin
[docs] class Tester(object): def __init__(self): """ The the two bigWig files are as follows: $ cat /tmp/testA.bg 3R 0 100 1 3R 100 200 2 $ cat /tmp/testB.bg 3R 0 150 1 3R 150 200 3 They cover 200 bp: 0 50 100 150 200 |------------------------------------------------------------| A 111111111111111111111111111111122222222222222222222222222222 B 111111111111111111111111111111111111111111111333333333333333 """ self.root = os.path.dirname(os.path.abspath(__file__)) + "/test/test_data/" self.bwFile1 = self.root + "testA.bw" self.bwFile2 = self.root + "testB.bw" self.bwFile_PE = self.root + "test_paired2.bw" self.chrom = '3R'
# global debug # debug = 0