Source code for deeptools.getScorePerBigWigBin

import pyBigWig
import numpy as np
import os
import sys
import shutil
import warnings

# deepTools packages
import deeptools.mapReduce as mapReduce
import deeptools.utilities
# debug = 0

old_settings = np.seterr(all='ignore')


def countReadsInRegions_wrapper(args):
    # Using arguments unpacking!
    return countFragmentsInRegions_worker(*args)


def countFragmentsInRegions_worker(chrom, start, end,
                                   bigWigFiles,
                                   stepSize, binLength,
                                   save_data,
                                   bedRegions=None):
    """ returns the average score in each bigwig file at each 'stepSize'
    position within the interval (start, end) for a 'binLength' window.
    Because the idea is to get counts for window positions at
    different positions for sampling, the bins are equally spaced
    and *not adjacent*.

    If a list of bedRegions is given, then the average score over each
    region is computed instead.

    Test dataset with two samples covering 200 bp.
    >>> test = Tester()

    Fragment coverage.
    >>> np.transpose(countFragmentsInRegions_worker(test.chrom, 0, 200, [test.bwFile1, test.bwFile2], 50, 25, False)[0])
    array([[ 1.,  1.,  2.,  2.],
           [ 1.,  1.,  1.,  3.]])

    >>> np.transpose(countFragmentsInRegions_worker(test.chrom, 0, 200, [test.bwFile1, test.bwFile2], 200, 200, False)[0])
    array([[ 1.5],
           [ 1.5]])

    BED regions:
    >>> bedRegions = [(test.chrom, 45, 55), (test.chrom, 95, 105), (test.chrom, 145, 155)]
    >>> np.transpose(countFragmentsInRegions_worker(test.chrom, 0, 200, [test.bwFile1, test.bwFile2], 200, 200, False,
    ... bedRegions=bedRegions)[0])
    array([[ 1. ,  1.5,  2. ],
           [ 1. ,  1. ,  2. ]])
    """
    assert start < end, "start {} is bigger than end {}".format(start, end)

    # array to keep the scores for the regions
    sub_score_per_bin = []
    rows = 0

    bigwig_handlers = [pyBigWig.open(bw) for bw in bigWigFiles]

    regions_to_consider = []
    if bedRegions:
        for chrom, start, end in bedRegions:
            regions_to_consider.append((chrom, start, end, end - start))
    else:
        for i in xrange(start, end, stepSize):
            if (i + binLength) > end:
                # last bin (may be smaller)
                regions_to_consider.append((chrom, i, end, end - i))
            else:
                regions_to_consider.append((chrom, i, i + binLength, binLength))

    if save_data:
        _file = open(deeptools.utilities.getTempFileName(suffix='.bed'), 'w+t')
        _file_name = _file.name
    else:
        _file_name = ''

    warnings.simplefilter("default")
    i = 0

    for chrom, start, end, binLength in regions_to_consider:
        avgReadsArray = []
        i += 1

        for idx, bwh in enumerate(bigwig_handlers):
            if chrom not in bwh.chroms().keys():
                unmod_name = chrom
                if chrom.startswith('chr'):
                    # remove the 'chr' prefix from the chromosome name
                    chrom = chrom[3:]
                else:
                    # prefix the chromosome name with 'chr'
                    chrom = 'chr' + chrom
                if chrom not in bwh.chroms().keys():
                    exit('Chromosome name {} not found in bigwig file\n {}\n'.format(unmod_name, bigWigFiles[idx]))

            score = bwh.stats(chrom, start, end)

            if score is None or score == [None] or np.isnan(score[0]):
                score = [np.nan]
            avgReadsArray.append(score[0])  # mean of fragment coverage for region
        # print "{} Region: {}:{:,}-{:,} {} {} {}".format(i, chrom, start, end, binLength, avgReadsArray[0], avgReadsArray[1])

        sub_score_per_bin.extend(avgReadsArray)
        rows += 1
        if save_data:
            _file.write("\t".join(map(str, [chrom, start, end])) + "\t")
            _file.write("\t".join(["{}".format(x) for x in avgReadsArray]) + "\n")

    if save_data:
        _file.close()
    warnings.resetwarnings()

    # the output is a matrix having as many rows as the variable 'rows'
    # and as many columns as bigwig files. The rows correspond to
    # each of the regions processed by the worker:
    # np.array([[score1_1, score1_2],
    #           [score2_1, score2_2]])
    return np.array(sub_score_per_bin).reshape(rows, len(bigWigFiles)), _file_name
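

# A minimal usage sketch for the worker above (not part of the deepTools API):
# it scores the two bundled test bigWig files in 25 bp windows sampled every
# 50 bp. The helper name and the direct use of Tester() are illustrative only.
def _example_worker_usage():
    test = Tester()
    scores, _ = countFragmentsInRegions_worker(test.chrom, 0, 200,
                                               [test.bwFile1, test.bwFile2],
                                               stepSize=50, binLength=25,
                                               save_data=False)
    # one row per sampled window, one column per bigWig file
    return scores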


def getChromSizes(bigwigFilesList):
    """
    Get chromosome sizes from a bigWig file with pyBigWig.

    Test dataset with two samples covering 200 bp.
    >>> test = Tester()

    Chromosome name(s) and size(s).
    >>> getChromSizes([test.bwFile1, test.bwFile2])
    ([('3R', 200L)], set([]))
    """
    def print_chr_names_and_size(chr_set):
        sys.stderr.write("chromosome\tlength\n")
        for name, size in chr_set:
            sys.stderr.write("{0:>15}\t{1:>10}\n".format(name, size))

    bigwigFilesList = bigwigFilesList[:]

    common_chr = set(pyBigWig.open(bigwigFilesList.pop()).chroms().items())
    non_common_chr = set()
    for bw in bigwigFilesList:
        _names_and_size = set(pyBigWig.open(bw).chroms().items())

        if len(common_chr & _names_and_size) == 0:
            # try to add or remove 'chr' from the chromosome names
            _corr_names_size = set()
            for chrom_name, size in _names_and_size:
                if chrom_name.startswith('chr'):
                    _corr_names_size.add((chrom_name[3:], size))
                else:
                    _corr_names_size.add(('chr' + chrom_name, size))
            if len(common_chr & _corr_names_size) == 0:
                message = "No common chromosomes found. Are the bigwig files " \
                          "from the same species and the same assembly?\n"
                sys.stderr.write(message)
                print_chr_names_and_size(common_chr)

                sys.stderr.write("\nand the following is the list of unmatched chromosome names and\n"
                                 "lengths from file\n{}\n".format(bw))
                print_chr_names_and_size(_names_and_size)
                exit(1)
            else:
                _names_and_size = _corr_names_size

        non_common_chr |= common_chr ^ _names_and_size
        common_chr = common_chr & _names_and_size

    if len(non_common_chr) > 0:
        sys.stderr.write("\nThe following chromosome names did not match between the bigwig files\n")
        print_chr_names_and_size(non_common_chr)

    # return the sorted list of common chromosome names and sizes,
    # plus the set of names/sizes that did not match
    return sorted(common_chr), non_common_chr
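

# Sketch of how getChromSizes() is typically consumed; the helper and the loop
# below only echo the shared chromosomes and are illustrative, not deepTools
# behaviour.
def _example_chrom_sizes():
    test = Tester()
    common, non_common = getChromSizes([test.bwFile1, test.bwFile2])
    # 'common' is a sorted list of (name, length) tuples present in all files;
    # 'non_common' holds the (name, length) pairs that did not match
    for name, size in common:
        sys.stderr.write("{}\t{}\n".format(name, size))
    return common, non_common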


def getScorePerBin(bigWigFiles, binLength,
                   numberOfProcessors=1,
                   verbose=False, region=None,
                   bedFile=None,
                   blackListFileName=None,
                   stepSize=None,
                   chrsToSkip=[],
                   out_file_for_raw_data=None):
    """
    This function returns a matrix containing scores (mean coverage of
    fragments) for sampled regions. Each row corresponds to a sampled
    region; each column corresponds to a bigwig file.

    Test dataset with two samples covering 200 bp.
    >>> test = Tester()
    >>> np.transpose(getScorePerBin([test.bwFile1, test.bwFile2], 50, 3))
    array([[ 1.,  1.,  2.,  2.],
           [ 1.,  1.,  1.,  3.]])
    """
    # Try to determine an optimal fraction of the genome (chunkSize)
    # that is sent to workers for analysis. If too short, too much time
    # is spent loading the files; if too long, some processors end up idle.
    # The following is a heuristic.

    # get the list of common chromosome names and sizes
    chrom_sizes, non_common = getChromSizes(bigWigFiles)

    # skip chromosomes in the list. This is usually done for the
    # X chromosome, which may have either one copy in a male sample
    # or a mixture of male/female samples and is therefore unreliable.
    # The skip list may also contain heterochromatic regions and
    # mitochondrial DNA.
    if chrsToSkip and len(chrsToSkip):
        chrom_sizes = [x for x in chrom_sizes if x[0] not in chrsToSkip]

    chrnames, chrlengths = zip(*chrom_sizes)
    if stepSize is None:
        stepSize = binLength  # for adjacent bins

    # set chunkSize based on the number of processors used
    chunkSize = max(sum(chrlengths) / numberOfProcessors, int(1e6))
    # make chunkSize a multiple of binLength
    chunkSize -= chunkSize % binLength
    if verbose:
        print "step size is {}".format(stepSize)

    if region:
        # in case a region is used, append the tile size
        region += ":{}".format(binLength)

    # mapReduce( (staticArgs), func, chromSize, etc. )
    if out_file_for_raw_data:
        save_file = True
    else:
        save_file = False

    imap_res = mapReduce.mapReduce((bigWigFiles, stepSize, binLength, save_file),
                                   countReadsInRegions_wrapper,
                                   chrom_sizes,
                                   genomeChunkLength=chunkSize,
                                   bedFile=bedFile,
                                   blackListFileName=blackListFileName,
                                   region=region,
                                   numberOfProcessors=numberOfProcessors)

    if out_file_for_raw_data:
        if len(non_common):
            sys.stderr.write("*Warning*\nThe resulting bed file does not contain information for "
                             "the chromosomes that were not common between the bigwig files\n")

        # concatenate the intermediate bedgraph files into one
        for _values, tempFileName in imap_res:
            if tempFileName:
                shutil.copyfileobj(open(tempFileName, 'r'), out_file_for_raw_data)
                os.remove(tempFileName)

        out_file_for_raw_data.close()

    # the matrix scores are in the first element of each entry in imap_res
    score_per_bin = np.concatenate([x[0] for x in imap_res], axis=0)
    return score_per_bin
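

# Sketch of a typical top-level call. The bin size, the skipped chromosome
# name and the helper itself are illustrative assumptions, not values taken
# from deepTools.
def _example_score_per_bin():
    test = Tester()
    scores = getScorePerBin([test.bwFile1, test.bwFile2],
                            binLength=50,
                            numberOfProcessors=1,
                            chrsToSkip=['chrM'])
    # rows are adjacent 50 bp bins (stepSize defaults to binLength),
    # columns are the input bigWig files
    return scores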


class Tester(object):

    def __init__(self):
        """
        The two bigWig files are as follows:

        $ cat /tmp/testA.bg
        3R      0       100     1
        3R      100     200     2

        $ cat /tmp/testB.bg
        3R      0       150     1
        3R      150     200     3

        They cover 200 bp:

              0              50             100            150            200
              |------------------------------------------------------------|
            A 111111111111111111111111111111122222222222222222222222222222
            B 111111111111111111111111111111111111111111111333333333333333
        """
        self.root = os.path.dirname(os.path.abspath(__file__)) + "/test/test_data/"
        self.bwFile1 = self.root + "testA.bw"
        self.bwFile2 = self.root + "testB.bw"
        self.bwFile_PE = self.root + "test_paired2.bw"
        self.chrom = '3R'
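

# The test bigWig files above ship with deepTools, but an equivalent file can
# be written from the bedGraph intervals in the docstring with pyBigWig's
# writer API. The helper and output path below are only an example.
def _example_build_test_bigwig(path="/tmp/testA.bw"):
    bw = pyBigWig.open(path, "w")
    bw.addHeader([("3R", 200)])             # one 200 bp chromosome
    bw.addEntries(["3R", "3R"], [0, 100],   # 3R:0-100 -> 1, 3R:100-200 -> 2
                  ends=[100, 200], values=[1.0, 2.0])
    bw.close()
    return path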
# global debug
# debug = 0