Source code for deeptools.correlation

import sys
import itertools
import numpy as np
import scipy.cluster.hierarchy as sch
import scipy.stats
import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import matplotlib.ticker
import matplotlib.mlab
import matplotlib.markers

old_settings = np.seterr(all='ignore')


class Correlation:
    """
    class to work with matrices having sample data
    to compute correlations, plot them and make scatter plots
    """

    def __init__(self, matrix_file,
                 corr_method=None,
                 labels=None,
                 remove_outliers=False,
                 skip_zeros=False,
                 log1p=False):

        self.load_matrix(matrix_file)
        self.skip_zeros = skip_zeros
        self.corr_method = corr_method
        self.corr_matrix = None  # correlation matrix

        if labels is not None:
            # test that the length of labels
            # corresponds to the length of
            # samples
            self.labels = labels

        if self.matrix.shape[1] == 1:
            # There's nothing that can be done with a single sample
            sys.exit("\nPlease use a matrix with more than one sample\n")

        if skip_zeros is True:
            # remove rows containing only nans or zeros
            # that could be unmappable regions.
            self.remove_rows_of_zeros()

        if remove_outliers is True:
            # remove outliers, otherwise outliers will produce a very
            # high pearson correlation. Unnecessary for spearman correlation
            self.remove_outliers()

        if log1p is True:
            self.matrix = np.log1p(self.matrix)

        if corr_method:
            self.compute_correlation()
    def load_matrix(self, matrix_file):
        """
        loads a matrix file saved using the numpy savez method. Two
        keys are expected: 'matrix' and 'labels'. The matrix should
        contain one sample per column.
        """
        _ma = np.load(matrix_file)

        # matrix: cols correspond to samples
        self.matrix = np.asarray(_ma['matrix'].tolist())
        if np.any(np.isnan(self.matrix)):
            num_nan = len(np.flatnonzero(np.isnan(self.matrix.flatten())))
            sys.stderr.write("*Warning*. {} NaN values were found. They will be removed along with the "
                             "corresponding bins in other samples for the computation "
                             "and plotting\n".format(num_nan))

            # remove the rows that contain NaN values
            self.matrix = np.ma.compress_rows(np.ma.masked_invalid(self.matrix))

        self.labels = _ma['labels']

        assert len(self.labels) == self.matrix.shape[1], "ERROR, length of labels is not equal " \
            "to length of matrix samples"
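    # Illustrative sketch (not part of deepTools): building an input file that
    # load_matrix() can read. The path and labels below are hypothetical; any
    # .npz file written with np.savez_compressed using the keys 'matrix' and
    # 'labels', with one sample per column, should work.
    #
    #   >>> values = np.random.rand(100, 3)  # 100 bins, 3 samples
    #   >>> np.savez_compressed("/tmp/example_matrix.npz",
    #   ...                     matrix=values, labels=['s1', 's2', 's3'])
    #   >>> c = Correlation("/tmp/example_matrix.npz", corr_method='spearman')
    #   >>> c.matrix.shape
    #   (100, 3)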
    @staticmethod
    def get_outlier_indices(data, max_deviation=200):
        """
        The method is based on the median absolute deviation. See
        Boris Iglewicz and David Hoaglin (1993),
        "Volume 16: How to Detect and Handle Outliers",
        The ASQC Basic References in Quality Control:
        Statistical Techniques, Edward F. Mykytka, Ph.D., Editor.

        returns the list, without the outliers

        The max_deviation=200 is like selecting a z-score
        larger than 200, just that it is based on the median
        and the median absolute deviation instead of the
        mean and the standard deviation.
        """
        median = np.median(data)
        b_value = 1.4826  # scaling factor for a normal distribution
        # median absolute deviation: median of the absolute deviations
        # from the median, scaled to approximate the standard deviation
        mad = b_value * np.median(np.abs(data - median))
        outliers = []
        if mad > 0:
            deviation = abs(data - median) / mad
            """
            outliers = data[deviation > max_deviation]
            print "outliers removed {}".format(len(outliers))
            print outliers
            """
            outliers = np.flatnonzero(deviation > max_deviation)
        return outliers
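    # Illustrative sketch (not part of deepTools): how the MAD-based cutoff
    # behaves on a toy vector. With the default max_deviation=200 only very
    # extreme values are flagged; a much smaller cutoff is used here so the
    # effect is visible on a short example.
    #
    #   >>> data = np.array([1., 1., 2., 2., 3., 1000.])
    #   >>> Correlation.get_outlier_indices(data, max_deviation=5)
    #   array([5])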
    def remove_outliers(self, verbose=True):
        """
        get the outliers *per column* using the median absolute
        deviation method

        Returns the filtered matrix
        """
        unfiltered = len(self.matrix)
        to_remove = None
        for col in self.matrix.T:
            outliers = self.get_outlier_indices(col)
            if to_remove is None:
                to_remove = set(outliers)
            else:
                # only set to remove those bins in which
                # the outliers are present in all cases (columns)
                # that's why the intersection is used
                to_remove = to_remove.intersection(outliers)
        if len(to_remove):
            to_keep = [x for x in range(self.matrix.shape[0])
                       if x not in to_remove]
            self.matrix = self.matrix[to_keep, :]
            if verbose:
                sys.stderr.write(
                    "total/filtered/left: "
                    "{}/{}/{}\n".format(unfiltered,
                                        unfiltered - len(to_keep),
                                        len(to_keep)))

        return self.matrix
    def remove_rows_of_zeros(self):
        # remove rows containing all zeros or all nans
        _mat = np.nan_to_num(self.matrix)
        to_keep = _mat.sum(1) != 0
        self.matrix = self.matrix[to_keep, :]
    def save_corr_matrix(self, file_handle):
        """
        saves the correlation matrix
        """
        file_handle.write("\t'" + "'\t'".join(self.labels) + "'\n")
        fmt = "\t".join(np.repeat('%.4f', self.corr_matrix.shape[1])) + "\n"
        i = 0
        for row in self.corr_matrix:
            file_handle.write(
                "'%s'\t" % self.labels[i] + fmt % tuple(row))
            i += 1
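    # Illustrative sketch (not part of deepTools): writing the correlation
    # matrix as a tab-separated table. The file names are hypothetical and
    # build on the example .npz created in the load_matrix sketch above.
    #
    #   >>> c = Correlation("/tmp/example_matrix.npz", corr_method='spearman')
    #   >>> with open("/tmp/example_corr.tsv", "w") as fh:
    #   ...     c.save_corr_matrix(fh)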
    def compute_correlation(self):
        """
        computes spearman or pearson
        correlation for the samples in the matrix

        The matrix should contain the values of each sample per column
        that's why the transpose is used.

        >>> matrix = np.array([[1, 2, 3, np.nan],
        ...                    [1, 2, 3, 4],
        ...                    [6, 4, 3, 1]]).T
        >>> np.savez_compressed("/tmp/test_matrix.npz", matrix=matrix, labels=['a', 'b', 'c'])
        >>> c = Correlation("/tmp/test_matrix.npz", corr_method='pearson')

        the results should be as in R

        >>> c.compute_correlation().filled(np.nan)
        array([[ 1.        ,  1.        , -0.98198051],
               [ 1.        ,  1.        , -0.98198051],
               [-0.98198051, -0.98198051,  1.        ]])
        >>> c.corr_method = 'spearman'
        >>> c.corr_matrix = None
        >>> c.compute_correlation()
        array([[ 1.,  1., -1.],
               [ 1.,  1., -1.],
               [-1., -1.,  1.]])
        """
        if self.corr_matrix is not None:
            return self.corr_matrix

        num_samples = len(self.labels)
        # initialize correlation matrix
        if self.corr_method == 'pearson':
            self.corr_matrix = np.ma.corrcoef(self.matrix.T, allow_masked=True)
        else:
            corr_matrix = np.zeros((num_samples, num_samples), dtype='float')
            # do an all vs all correlation using the
            # indices of the upper triangle
            rows, cols = np.triu_indices(num_samples)

            for index in range(len(rows)):
                row = rows[index]
                col = cols[index]
                corr_matrix[row, col] = scipy.stats.spearmanr(self.matrix[:, row],
                                                              self.matrix[:, col])[0]
            # make the matrix symmetric
            self.corr_matrix = corr_matrix + np.triu(corr_matrix, 1).T

        return self.corr_matrix
    def plot_correlation(self, plot_filename, plot_title='',
                         vmax=None, vmin=None, colormap='jet',
                         image_format=None, plot_numbers=False):
        """
        plots a correlation using a symmetric heatmap
        """
        num_rows = len(self.labels)
        corr_matrix = self.compute_correlation()
        # set a font size according to figure length
        if num_rows < 6:
            font_size = 14
        elif num_rows > 40:
            font_size = 5
        else:
            font_size = int(14 - 0.25 * num_rows)
        mpl.rcParams.update({'font.size': font_size})
        # set the minimum and maximum values
        if vmax is None:
            vmax = 1
        if vmin is None:
            vmin = 0 if corr_matrix.min() >= 0 else -1

        # Compute and plot dendrogram.
        fig = plt.figure(figsize=(11, 9.5))
        plt.suptitle(plot_title)

        axdendro = fig.add_axes([0.02, 0.12, 0.1, 0.66])
        axdendro.set_axis_off()
        y_var = sch.linkage(corr_matrix, method='complete')
        z_var = sch.dendrogram(y_var, orientation='right',
                               link_color_func=lambda k: 'darkred')
        axdendro.set_xticks([])
        axdendro.set_yticks([])
        cmap = plt.get_cmap(colormap)

        # this makes a new cmap, based on the original
        # colormap, that goes from 0.0 to 0.9
        # This is done to avoid colors that
        # are too dark at the end of the range that do not offer
        # a good contrast between the correlation numbers that are
        # plotted on black.
        if plot_numbers:
            cmap = cmap.from_list(colormap + "clipped",
                                  cmap(np.linspace(0, 0.9, 10)))

        cmap.set_under((0., 0., 1.))

        # Plot distance matrix.
        axmatrix = fig.add_axes([0.13, 0.1, 0.6, 0.7])
        index = z_var['leaves']
        corr_matrix = corr_matrix[index, :]
        corr_matrix = corr_matrix[:, index]
        img_mat = axmatrix.pcolormesh(corr_matrix,
                                      edgecolors='black',
                                      cmap=cmap,
                                      vmax=vmax,
                                      vmin=vmin)
        axmatrix.set_xlim(0, num_rows)
        axmatrix.set_ylim(0, num_rows)

        axmatrix.yaxis.tick_right()
        axmatrix.set_yticks(np.arange(corr_matrix.shape[0]) + 0.5)
        axmatrix.set_yticklabels(np.array(self.labels).astype('str')[index])

        axmatrix.xaxis.set_tick_params(labeltop='on')
        axmatrix.xaxis.set_tick_params(labelbottom='off')
        axmatrix.set_xticks(np.arange(corr_matrix.shape[0]) + 0.5)
        axmatrix.set_xticklabels(np.array(self.labels).astype('str')[index],
                                 rotation=45,
                                 ha='left')

        axmatrix.tick_params(
            axis='x',
            which='both',
            bottom='off',
            top='off')

        axmatrix.tick_params(
            axis='y',
            which='both',
            left='off',
            right='off')

        # Plot colorbar
        axcolor = fig.add_axes([0.13, 0.065, 0.6, 0.02])
        cobar = plt.colorbar(img_mat, cax=axcolor, orientation='horizontal')
        cobar.solids.set_edgecolor("face")
        if plot_numbers:
            for row in range(num_rows):
                for col in range(num_rows):
                    axmatrix.text(row + 0.5, col + 0.5,
                                  "{:.2f}".format(corr_matrix[row, col]),
                                  ha='center', va='center')

        fig.savefig(plot_filename, format=image_format)
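    # Illustrative sketch (not part of deepTools): rendering the clustered
    # heatmap to a file. The output path, title and colormap below are
    # hypothetical choices, reusing the example .npz from the load_matrix
    # sketch above.
    #
    #   >>> c = Correlation("/tmp/example_matrix.npz", corr_method='spearman')
    #   >>> c.plot_correlation("/tmp/example_heatmap.png",
    #   ...                    plot_title='Spearman correlation',
    #   ...                    colormap='RdYlBu', plot_numbers=True,
    #   ...                    image_format='png')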
    def plot_scatter(self, plot_filename, plot_title='',
                     image_format=None, log1p=False):
        """
        Plot the scatter plots of a matrix
        in which each column is a sample
        """

        num_samples = self.matrix.shape[1]
        corr_matrix = self.compute_correlation()
        grids = gridspec.GridSpec(num_samples, num_samples)
        grids.update(wspace=0, hspace=0)
        fig = plt.figure(figsize=(2 * num_samples, 2 * num_samples))
        plt.rcParams['font.size'] = 8.0
        plt.suptitle(plot_title)
        min_value = self.matrix.min()
        max_value = self.matrix.max()
        if (min_value % 2 == 0 and max_value % 2 == 0) or \
                (min_value % 1 == 0 and max_value % 2 == 1):
            # make one value odd and the other even
            max_value += 1

        if log1p:
            major_locator = matplotlib.ticker.FixedLocator(range(min_value, max_value, 2))
            minor_locator = matplotlib.ticker.FixedLocator(range(min_value, max_value, 1))

        rows, cols = np.triu_indices(num_samples)

        for index in range(len(rows)):
            row = rows[index]
            col = cols[index]
            if row == col:
                # add titles as
                # empty plot in the diagonal
                ax = fig.add_subplot(grids[row, col])
                ax.text(0.5, 0.5, self.labels[row],
                        verticalalignment='center',
                        horizontalalignment='center',
                        fontsize=10, fontweight='bold',
                        transform=ax.transAxes)
                ax.set_axis_off()
                continue

            ax = fig.add_subplot(grids[row, col])
            if log1p:
                ax.xaxis.set_major_locator(major_locator)
                ax.xaxis.set_minor_locator(minor_locator)
                ax.yaxis.set_major_locator(major_locator)
                ax.yaxis.set_minor_locator(minor_locator)

            vector1 = self.matrix[:, row]
            vector2 = self.matrix[:, col]

            ax.text(0.2, 0.8, "{}={:.2f}".format(self.corr_method,
                                                 corr_matrix[row, col]),
                    horizontalalignment='left',
                    transform=ax.transAxes)

            ax.get_yaxis().set_tick_params(
                which='both',
                left='off',
                right='off',
                direction='out')

            ax.get_xaxis().set_tick_params(
                which='both',
                top='off',
                bottom='off',
                direction='out')

            for tick in ax.xaxis.get_major_ticks():
                tick.label.set_rotation('45')

            if col != num_samples - 1:
                ax.set_yticklabels([])
            else:
                ax.yaxis.tick_right()
                ax.get_yaxis().set_tick_params(
                    which='both',
                    left='off',
                    right='on',
                    direction='out')

            if col - row == 1:
                ax.xaxis.tick_bottom()
                ax.get_xaxis().set_tick_params(
                    which='both',
                    top='off',
                    bottom='on',
                    direction='out')
                for tick in ax.xaxis.get_major_ticks():
                    tick.label.set_rotation('45')
            else:
                ax.set_xticklabels([])

            ax.hist2d(vector1, vector2, bins=200, cmin=0.1)

            # downsample for plotting
            # choice_idx = np.random.randint(0, len(vector1), min(len(vector1), 500000))
            # ax.plot(vector1[choice_idx], vector2[choice_idx], '.', markersize=1,
            #         alpha=0.3, color='darkblue',
            #         markeredgecolor=None)

            # ax.set_ylim(min_value, max_value)
            # ax.set_xlim(min_value, max_value)
            ax.set_ylim(min_value, ax.get_ylim()[1])
            ax.set_xlim(min_value, ax.get_xlim()[1])

        plt.savefig(plot_filename, format=image_format)
        plt.close()
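    # Illustrative sketch (not part of deepTools): pairwise scatter plots for
    # all samples in the matrix. The output path is hypothetical, reusing the
    # example .npz from the load_matrix sketch above.
    #
    #   >>> c = Correlation("/tmp/example_matrix.npz", corr_method='pearson')
    #   >>> c.plot_scatter("/tmp/example_scatter.png",
    #   ...                plot_title='Pairwise scatter', image_format='png')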
    def plot_pca(self, plot_filename, plot_title='', image_format=None, log1p=False):
        """
        Plot the PCA of a matrix
        """
        fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(5, 10))

        # PCA
        mlab_pca = matplotlib.mlab.PCA(self.matrix)
        n = len(self.labels)
        markers = itertools.cycle(matplotlib.markers.MarkerStyle.filled_markers)
        colors = itertools.cycle(plt.cm.gist_rainbow(np.linspace(0, 1, n)))

        ax1.axhline(y=0, color="black", linestyle="dotted", zorder=1)
        ax1.axvline(x=0, color="black", linestyle="dotted", zorder=2)
        for i in range(n):
            ax1.scatter(mlab_pca.Wt[0, i], mlab_pca.Wt[1, i],
                        marker=next(markers), color=next(colors),
                        s=150, label=self.labels[i], zorder=i + 3)
        if plot_title == '':
            ax1.set_title('PCA')
        else:
            ax1.set_title(plot_title)
        ax1.set_xlabel('PC1')
        ax1.set_ylabel('PC2')
        lgd = ax1.legend(scatterpoints=1, loc='center left',
                         borderaxespad=0.5, bbox_to_anchor=(1, 0.5),
                         prop={'size': 12}, markerscale=0.9)

        # Scree plot
        eigenvalues = mlab_pca.s
        cumulative = []
        c = 0
        for x in mlab_pca.fracs:
            c += x
            cumulative.append(c)

        ind = np.arange(n)  # the x locations for the groups
        width = 0.35        # the width of the bars
        ax2.bar(width + ind, eigenvalues, width * 2)
        ax2.set_ylabel('Eigenvalue')
        ax2.set_xlabel('Factors')
        ax2.set_title('Scree plot')
        ax2.set_xticks(ind + width * 2)
        ax2.set_xticklabels(ind + 1)

        ax3 = ax2.twinx()
        ax3.axhline(y=1, color="black", linestyle="dotted")
        ax3.plot(width * 2 + ind, cumulative[0:], "r-")
        ax3.plot(width * 2 + ind, cumulative[0:], "wo")
        ax3.set_ylim([0, 1.05])
        ax3.set_ylabel('Cumulative variability')

        plt.subplots_adjust(top=3.85)
        plt.tight_layout()
        plt.savefig(plot_filename, format=image_format,
                    bbox_extra_artists=(lgd,), bbox_inches='tight')
        plt.close()
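    # Illustrative sketch (not part of deepTools): PCA plus scree plot for the
    # loaded samples. The output path is hypothetical, reusing the example
    # .npz from the load_matrix sketch above. Note that this method relies on
    # matplotlib.mlab.PCA, which is only available in older matplotlib
    # releases (it was removed from more recent versions).
    #
    #   >>> c = Correlation("/tmp/example_matrix.npz", corr_method='pearson')
    #   >>> c.plot_pca("/tmp/example_pca.png", plot_title='PCA of samples',
    #   ...            image_format='png')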