Source code for deeptools.correctReadCounts

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import numpy as np
from scipy.stats import poisson
from deeptools import writeBedGraph
from deeptools.SES_scaleFactor import estimateScaleFactor

debug = 1
old_settings = np.seterr(all='ignore')


[docs]def computeLambda(tileCoverage, args):
    """
    This function is called by the writeBedGraph workers for every
    tile in the genome that is considered
    """

    treatmentWindowTags = tileCoverage[0]
    controlWindowTags = tileCoverage[1]
    treatmentExtraSignalTags = treatmentWindowTags - args['treatmentMean']

    controlLambda = args['controlMean'] + (treatmentExtraSignalTags * args['controlSignalRatio'])

    log10pvalue = -1 * poisson.logsf(controlWindowTags, controlLambda) / np.log(10)

    return log10pvalue


[docs]def computePvalue(tileCoverage, args):
    """
    This function is called by the writeBedGraph workers for every
    tile in the genome that is considered

    It computes a pvalue based on an expected lambda comming from
    the correction of treatment when the input is considered.
    """

    treatmentWindowTags = tileCoverage[0]
    controlWindowTags = tileCoverage[1]

    treatmentLambda = controlWindowTags * args['treatmentControlRatio']

    log10pvalue = min(300, -1 * poisson.logsf(treatmentWindowTags, treatmentLambda) / np.log(10))

    return log10pvalue


[docs]def computeCorrectedReadcounts(tileCoverage, args):
    """
    This function is called by the writeBedGraph workers for every
    tile in the genome that is considered

    It computes a pvalue based on an expected lambda comming from
    the correction of treatment when the input is considered.
    """

    treatmentWindowTags = tileCoverage[0]
    controlWindowTags = tileCoverage[1]
    treatmentCorrectedTags = treatmentWindowTags - args['treatmentControlRatio'] * (controlWindowTags - args['controlMean'])

    return treatmentCorrectedTags


[docs]def correctReadCounts(bamFilesList, binLength, numberOfSamples, defaultFragmentLength,
                      outFileName, outFileFormat, outFileNameCorr=None, region=None,
                      extendPairedEnds=True,
                      numberOfProcessors=1, Nsigmas=2, maxSignalRatio=10, blackListFileName=None, verbose=False):

    bam1 = writeBedGraph.openBam(bamFilesList[0])
    bam2 = writeBedGraph.openBam(bamFilesList[1])

    treatmentMapped = bam1.mapped
    controlMapped = bam2.mapped
    treatmentControlRatioMapped = float(treatmentMapped) / controlMapped

    # 1. Get a table containing number of reads in a sample from the genome.
    #    Only regions for which both samples have more than zero counts are considered

    scaleFactorsDict = estimateScaleFactor(bamFilesList,
                                           binLength,
                                           numberOfSamples,
                                           defaultFragmentLength,
                                           1,
                                           blackListFileName=blackListFileName,
                                           numberOfProcessors=numberOfProcessors,
                                           verbose=verbose)

    """
    num_reads_per_region = getNumReadsPerBin(bamFilesList, binLength, numberOfSamples, defaultFragmentLength, numberOfProcessors, skipZeros=True, verbose=verbose)
    if verbose:
        print "number of non-zero regions sampled: {}".format(num_reads_per_region.shape[0])

    # 2. get Mean and std of treatment (col1) and control (col2)

    treatmentMean, controlMean = np.mean(num_reads_per_region, axis=0) # axis=0: that measn by column
    treatmentStd, controlStd   = np.std(num_reads_per_region, axis=0)
    treatmentTotal, controlTotal   = np.sum(num_reads_per_region, axis=0)

    # 3. Calculate residual in treatment & control data, at regions for which treatment
    #    signal exceeds mean + std * Nsigmas
    #    (these are expected to be the regions at which the signal > mean-signal,
    #    so the residual signal is positive)

    overRows = np.where(num_reads_per_region[:,0].copy() >= treatmentMean + treatmentStd*Nsigmas )[0]
    over_Nsigma_regions = num_reads_per_region[overRows, :]

    treatmentSigMean, controlSigMean = np.mean(over_Nsigma_regions, axis=0)

    treatmentExtraSignal = treatmentSigMean - treatmentMean
    controlExtraSignal   = controlSigMean - controlMean

    treatmentControlRatio = float(treatmentTotal) / controlTotal
    adjSignalRatio = maxSignalRatio * treatmentControlRatio;
    treatmentSignalRatio = float(treatmentExtraSignal) / controlExtraSignal

    if treatmentSignalRatio < adjSignalRatio and treatmentSignalRatio > 0:
        treatmentSignalRatio = adjSignalRatio

    if treatmentSignalRatio < 1:
        raise NameError("estimated signal in control file {} is greater than estimated signal in treatmant file {}. Perhaps the file names are swapped?".format(bamFilesList[0], bamFilesList[1]))

    else:
        controlSignalRatio = 1.0/treatmentSignalRatio

    controlRatio = 1.0 / treatmentControlRatio

    """

#    scaleFactors = scaleFactorsDict['size_factors']

    treatmentMean, controlMean = scaleFactorsDict['meanSES']
    treatmentControlRatio = scaleFactorsDict['size_factors'][1] / scaleFactorsDict['size_factors'][0]
    treatmentSignalRatio = treatmentControlRatio
    controlRatio = controlSignalRatio = 1.0 / treatmentControlRatio
    treatmentTotal = treatmentMapped
    controlTotal = controlMapped

    print "Treatment mean: {:.2f}, Treatment total:{:.2f}".format(treatmentMean, treatmentTotal)
    print "Control mean: {:.2f}, Control total:{}".format(controlMean, controlTotal)
    print "the ratio of treatment vs. control for enriched regions is: {:.2f}".format(treatmentSignalRatio)
    print "the ratio of treatment vs. control ratio: {:.2f} (if based on mapped reads: {:.2f})".format(treatmentControlRatio, treatmentControlRatioMapped)

    funcArgs = {'controlMean': controlMean,
                'treatmentMean': treatmentMean,
                'controlSignalRatio': controlSignalRatio,
                'controlRatio': controlRatio,
                'treatmentControlRatio': treatmentControlRatio
                }

    writeBedGraph.writeBedGraph(bamFilesList,
                                outFileName,
                                defaultFragmentLength, computePvalue,
                                funcArgs, tileSize=binLength, region=region,
                                format=outFileFormat,
                                zerosToNans=False,
                                blackListFileName=blackListFileName,
                                numberOfProcessors=numberOfProcessors,
                                extendPairedEnds=extendPairedEnds)

    if outFileNameCorr:
        writeBedGraph.writeBedGraph(bamFilesList,
                                    outFileNameCorr,
                                    defaultFragmentLength, computeCorrectedReadcounts,
                                    funcArgs, tileSize=binLength, region=region,
                                    format=outFileFormat,
                                    zerosToNans=False,
                                    blackListFileName=blackListFileName,
                                    numberOfProcessors=numberOfProcessors,
                                    extendPairedEnds=extendPairedEnds)


[docs]def controlLambda(tileCoverage, args):

    controlLambda = args['controlMean'] + (tileCoverage[0] * args['controlSignalRatio'])
    return controlLambda