Source code for deeptools.writeBedGraph_bam_and_bw

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import os
import shutil
import tempfile
import numpy as np
import sys

# NGS packages
import pyBigWig

# own module
from deeptools import mapReduce
from deeptools.utilities import getCommonChrNames, toBytes
from deeptools.writeBedGraph import *
from deeptools import bamHandler

old_settings = np.seterr(all='ignore')


[docs] def getCoverageFromBigwig(bigwigHandle, chrom, start, end, tileSize, missingDataAsZero=False): try: coverage = np.asarray(bigwigHandle.values(chrom, start, end)) except RuntimeError: # this error happens when chromosome # is not into the bigwig file return [] if coverage is None: return [] if missingDataAsZero is True: coverage[np.isnan(coverage)] = 0 # average the values per bin cov = np.array( [np.mean(coverage[x:x + tileSize]) for x in range(0, len(coverage), tileSize)]) return cov
[docs] def writeBedGraph_wrapper(args): return writeBedGraph_worker(*args)
[docs] def writeBedGraph_worker( chrom, start, end, tileSize, defaultFragmentLength, bamOrBwFileList, func, funcArgs, extendPairedEnds=True, smoothLength=0, skipZeroOverZero=False, missingDataAsZero=False, fixedStep=False): r""" Writes a bedgraph having as base a number of bam files. The given func is called to compute the desired bedgraph value using the funcArgs tileSize """ if start > end: raise NameError("start position ({0}) bigger than " "end position ({1})".format(start, end)) coverage = [] for indexFile, fileFormat in bamOrBwFileList: if fileFormat == 'bam': bamHandle = bamHandler.openBam(indexFile) coverage.append(getCoverageFromBam( bamHandle, chrom, start, end, tileSize, defaultFragmentLength, extendPairedEnds, True)) bamHandle.close() elif fileFormat == 'bigwig': bigwigHandle = pyBigWig.open(indexFile) coverage.append( getCoverageFromBigwig( bigwigHandle, chrom, start, end, tileSize, missingDataAsZero)) bigwigHandle.close() _file = tempfile.NamedTemporaryFile(delete=False) previousValue = None lengthCoverage = len(coverage[0]) for tileIndex in range(lengthCoverage): tileCoverage = [] for index in range(len(bamOrBwFileList)): if smoothLength > 0: vectorStart, vectorEnd = getSmoothRange( tileIndex, tileSize, smoothLength, lengthCoverage) tileCoverage.append( np.mean(coverage[index][vectorStart:vectorEnd])) else: try: tileCoverage.append(coverage[index][tileIndex]) except IndexError: sys.exit("Chromosome {} probably not in one of the bigwig " "files. Remove this chromosome from the bigwig file " "to continue".format(chrom)) if skipZeroOverZero and np.sum(tileCoverage) == 0: previousValue = None continue value = func(tileCoverage, funcArgs) if fixedStep: writeStart = start + tileIndex * tileSize writeEnd = min(writeStart + tileSize, end) try: _file.write(toBytes("{0}\t{1}\t{2}\t{3:g}\n".format(chrom, writeStart, writeEnd, value))) except TypeError: _file.write(toBytes("{}\t{}\t{}\t{}\n".format(chrom, writeStart, writeEnd, value))) else: if previousValue is None: writeStart = start + tileIndex * tileSize writeEnd = min(writeStart + tileSize, end) previousValue = value elif previousValue == value: writeEnd = min(writeEnd + tileSize, end) elif previousValue != value: if not np.isnan(previousValue): _file.write( toBytes("{0}\t{1}\t{2}\t{3:g}\n".format(chrom, writeStart, writeEnd, previousValue))) previousValue = value writeStart = writeEnd writeEnd = min(writeStart + tileSize, end) if not fixedStep: # write remaining value if not a nan if previousValue and writeStart != end and \ not np.isnan(previousValue): _file.write(toBytes("{0}\t{1}\t{2}\t{3:g}\n".format(chrom, writeStart, end, previousValue))) tempFileName = _file.name _file.close() return chrom, start, end, tempFileName
[docs] def writeBedGraph( bamOrBwFileList, outputFileName, fragmentLength, func, funcArgs, tileSize=25, region=None, blackListFileName=None, numberOfProcessors=1, format="bedgraph", extendPairedEnds=True, missingDataAsZero=False, skipZeroOverZero=False, smoothLength=0, fixedStep=False, verbose=False): r""" Given a list of bamfiles, a function and a function arguments, this method writes a bedgraph file (or bigwig) file for a partition of the genome into tiles of given size and a value for each tile that corresponds to the given function and that is related to the coverage underlying the tile. """ bamHandles = [] mappedList = [] for indexedFile, fileFormat in bamOrBwFileList: if fileFormat == 'bam': bam, mapped, unmapped, stats = bamHandler.openBam(indexedFile, returnStats=True, nThreads=numberOfProcessors) bamHandles.append(bam) mappedList.append(mapped) if len(bamHandles): genomeChunkLength = getGenomeChunkLength(bamHandles, tileSize, mappedList) # check if both bam files correspond to the same species # by comparing the chromosome names: chromNamesAndSize, __ = getCommonChrNames(bamHandles, verbose=verbose) else: genomeChunkLength = int(10e6) cCommon_number = {} chromNamesAndSize = {} for fileName, fileFormat in bamOrBwFileList: if fileFormat == 'bigwig': fh = pyBigWig.open(fileName) else: continue for chromName, size in list(fh.chroms().items()): if chromName in chromNamesAndSize: cCommon_number[chromName] += 1 if chromNamesAndSize[chromName] != size: print("\nWARNING\n" "Chromosome {} length reported in the " "input files differ.\n{} for {}\n" "{} for {}.\n\nThe smallest " "length will be used".format( chromName, chromNamesAndSize[chromName], bamOrBwFileList[0][0], size, fileName)) chromNamesAndSize[chromName] = min( chromNamesAndSize[chromName], size) else: chromNamesAndSize[chromName] = size cCommon_number[chromName] = 1 fh.close() # get the list of common chromosome names and sizes if len(bamOrBwFileList) == 1: chromNamesAndSize = [(k, v) for k, v in chromNamesAndSize.items()] else: chromNamesAndSize = [(k, v) for k, v in chromNamesAndSize.items() if k in cCommon_number and cCommon_number[k] == len(bamOrBwFileList)] if region: # in case a region is used, append the tilesize region += ":{}".format(tileSize) res = mapReduce.mapReduce((tileSize, fragmentLength, bamOrBwFileList, func, funcArgs, extendPairedEnds, smoothLength, skipZeroOverZero, missingDataAsZero, fixedStep), writeBedGraph_wrapper, chromNamesAndSize, genomeChunkLength=genomeChunkLength, region=region, blackListFileName=blackListFileName, numberOfProcessors=numberOfProcessors, verbose=verbose) # Determine the sorted order of the temp files chrom_order = dict() for i, _ in enumerate(chromNamesAndSize): chrom_order[_[0]] = i res = [[chrom_order[x[0]], x[1], x[2], x[3]] for x in res] res.sort() if format == 'bedgraph': of = open(outputFileName, 'wb') for r in res: if r is not None: _ = open(r[3], 'rb') shutil.copyfileobj(_, of) _.close() os.remove(r[3]) of.close() else: bedGraphToBigWig(chromNamesAndSize, [x[3] for x in res], outputFileName)