Source code for hic3defdr.analysis.alternatives

"""
Experimental module exposing variants of the HiC3DeFDR model for benchmarking
purposes.
"""

import numpy as np
import scipy.stats as stats

from hic3defdr.analysis import HiC3DeFDR
from hic3defdr.util.printing import eprint
from hic3defdr.util.dispersion import mme_per_pixel
from hic3defdr.util.clusters import load_clusters
from hic3defdr.util.parallelization import parallel_apply
import hic3defdr.util.dispersion as dispersion


[docs]def poisson_fit_mu_hat(raw, f): return np.average(raw / f, weights=f, axis=1)
[docs]def poisson_logpmf(x, mu): return stats.poisson(mu).logpmf(x)
[docs]def poisson_lrt(raw, f, design, refit_mu=True): if refit_mu: mu_hat_null = poisson_fit_mu_hat(raw, f) mu_hat_alt = np.array( [poisson_fit_mu_hat(raw[:, design[:, c]], f[:, design[:, c]]) for c in range(design.shape[1])] ).T else: mu_hat_null = np.mean(raw / f, axis=1) mu_hat_alt = np.array( [np.mean(raw[:, design[:, c]] / f[:, design[:, c]], axis=1) for c in range(design.shape[1])]) mu_hat_alt_wide = np.dot(mu_hat_alt, design.T) null_ll = np.sum(poisson_logpmf(raw, mu_hat_null[:, None] * f), axis=1) alt_ll = np.sum(poisson_logpmf(raw, mu_hat_alt_wide * f), axis=1) llr = null_ll - alt_ll pvalues = stats.chi2(design.shape[1] - 1).sf(-2 * llr) return pvalues, llr, mu_hat_null, mu_hat_alt
[docs]class Poisson3DeFDR(HiC3DeFDR):
[docs] def estimate_disp(self, estimator='qcml', frac=None, auto_frac_factor=15., weighted_lowess=True, n_threads=-1): # note: all kwargs are ignored eprint('estimating dispersion') estimator = dispersion.__dict__[estimator] \ if estimator in dispersion.__dict__ else estimator eprint(' loading data') disp_idx, _ = self.load_data('disp_idx', 'all') row, offsets = self.load_data('row', 'all', idx=disp_idx) col, _ = self.load_data('col', 'all', idx=disp_idx) scaled, _ = self.load_data('scaled', 'all', idx=disp_idx) eprint(' computing pixel-wise mean per condition') disp_per_dist = np.zeros((self.dist_thresh_max+1, self.design.shape[1])) disp = np.zeros((disp_idx.sum(), self.design.shape[1])) def disp_fn(mean): return np.zeros_like(mean) for c, cond in enumerate(self.design.columns): self.save_disp_fn(cond, disp_fn) eprint(' saving estimated dispersions to disk') self.save_data(disp, 'disp', offsets) self.save_data(disp_per_dist, 'disp_per_dist')
[docs] def lrt(self, chrom=None, refit_mu=True, n_threads=-1, verbose=True): if chrom is None: if n_threads: parallel_apply( self.lrt, [{'chrom': c, 'refit_mu': refit_mu, 'verbose': False} for c in self.chroms], n_threads=n_threads ) else: for chrom in self.chroms: self.lrt(chrom=chrom, refit_mu=refit_mu) return eprint('running LRT for chrom %s' % chrom) eprint(' loading data', skip=not verbose) bias = self.load_bias(chrom) size_factors = self.load_data('size_factors', chrom) row = self.load_data('row', chrom) col = self.load_data('col', chrom) raw = self.load_data('raw', chrom) disp_idx = self.load_data('disp_idx', chrom) eprint(' computing LRT results', skip=not verbose) f = bias[row, :][disp_idx, :] * bias[col, :][disp_idx, :] * \ size_factors[disp_idx, :] pvalues, llr, mu_hat_null, mu_hat_alt = poisson_lrt( raw[disp_idx, :], f, self.design.values, refit_mu=True) if self.loop_patterns: eprint(' making loop_idx', skip=not verbose) loop_pixels = set().union( *sum((load_clusters(pattern.replace('<chrom>', chrom)) for pattern in self.loop_patterns.values()), [])) loop_idx = np.array([True if pixel in loop_pixels else False for pixel in zip(row[disp_idx], col[disp_idx])]) self.save_data(loop_idx, 'loop_idx', chrom) eprint(' saving results to disk', skip=not verbose) self.save_data(pvalues, 'pvalues', chrom) self.save_data(llr, 'llr', chrom) self.save_data(mu_hat_null, 'mu_hat_null', chrom) self.save_data(mu_hat_alt, 'mu_hat_alt', chrom)
[docs]class Unsmoothed3DeFDR(HiC3DeFDR):
[docs] def estimate_disp(self, estimator='qcml', frac=None, auto_frac_factor=15., weighted_lowess=True, n_threads=-1): # note: all kwargs are ignored eprint('estimating dispersion') eprint(' loading data') disp_idx, _ = self.load_data('disp_idx', 'all') row, offsets = self.load_data('row', 'all', idx=disp_idx) col, _ = self.load_data('col', 'all', idx=disp_idx) scaled, _ = self.load_data('scaled', 'all', idx=disp_idx) eprint(' computing pixel-wise mean per condition') disp = np.zeros((disp_idx.sum(), self.design.shape[1])) for c, cond in enumerate(self.design.columns): eprint(' estimating dispersion for condition %s' % cond) disp[:, c] = np.maximum(mme_per_pixel( scaled[:, self.design[cond]]), 1e-7) eprint(' saving estimated dispersions to disk') self.save_data(disp, 'disp', offsets)
[docs]class Global3DeFDR(HiC3DeFDR):
[docs] def estimate_disp(self, estimator='qcml', frac=None, auto_frac_factor=15., weighted_lowess=True, n_threads=-1): # note: all kwargs except estimator are ignored eprint('estimating dispersion') estimator = dispersion.__dict__[estimator] \ if estimator in dispersion.__dict__ else estimator eprint(' loading data') disp_idx, disp_idx_offsets = self.load_data('disp_idx', 'all') loop_idx, _ = self.load_data('loop_idx', 'all') row, offsets = self.load_data('row', 'all', idx=disp_idx) col, _ = self.load_data('col', 'all', idx=disp_idx) raw, _ = self.load_data('raw', 'all', idx=disp_idx) f = np.ones_like(raw, dtype=float) for i, chrom in enumerate(self.chroms): chrom_slice = slice(offsets[i], offsets[i+1]) row_chrom = row[chrom_slice] col_chrom = col[chrom_slice] disp_idx_chrom = disp_idx[disp_idx_offsets[i]:disp_idx_offsets[i+1]] bias = self.load_bias(chrom) size_factors = self.load_data('size_factors', chrom)[disp_idx_chrom] f[chrom_slice] = bias[row_chrom, :] * bias[col_chrom, :] \ * size_factors disp = np.zeros((disp_idx.sum(), self.design.shape[1])) disp_per_dist = np.zeros((self.dist_thresh_max+1, self.design.shape[1])) for c, cond in enumerate(self.design.columns): eprint(' estimating dispersion for condition %s' % cond) global_disp = estimator(raw[loop_idx, :][:, self.design[cond]], f=f[loop_idx, :][:, self.design[cond]]) disp[:, c] = global_disp disp_per_dist[:, c] = global_disp def disp_fn(mean): return np.ones_like(mean) * global_disp self.save_disp_fn(cond, disp_fn) eprint(' saving estimated dispersions to disk') self.save_data(disp, 'disp', offsets) self.save_data(disp_per_dist, 'disp_per_dist')