Source code for hic3defdr.util.dispersion

import numpy as np
from scipy.optimize import minimize_scalar
from scipy.special import gammaln

from hic3defdr.util.scaled_nb import inverse_mvr, equalize
from hic3defdr.util.binning import equal_bin
from hic3defdr.util.lowess import lowess_fit


[docs]def qcml(data, f=None, max_iter=10, tol=1e-4): """ Estimates the dispersion parameter of a NB distribution given the data via quantile-adjusted conditional maximum likelihood. One common dispersion will be estimated across all pixels in the data, though the individual pixels in the data may have distinct means. Parameters ---------- data : np.ndarray Rows should correspond to pixels, columns to replicates. f : np.ndarray Matrix of scaling factors parallel to ``data``. Pass None to skip scaling. Returns ------- float The estimated dispersion. """ if f is None: f = np.ones_like(data, dtype=float) disp = 0.01 delta = np.inf it = 1 while delta > tol and it <= max_iter: pseudodata = equalize(data, f, disp) new_disp = cml(pseudodata) delta = np.abs(disp - new_disp) disp = new_disp if delta < tol: break return disp
[docs]def cml(data, f=None): """ Estimates the dispersion parameter of a NB distribution given the data via conditional maximum likelihood. One common dispersion will be estimated across all pixels in the data, though the individual pixels in the data may have distinct means. Parameters ---------- data : np.ndarray Rows should correspond to pixels, columns to replicates. f : np.ndarray Matrix of scaling factors parallel to ``data``. Pass None to skip scaling. Returns ------- float The estimated dispersion. """ if f is not None: data /= f n = data.shape[1] z = np.sum(data, axis=1) def nll(delta): r = 1./delta - 1 return -np.sum((np.sum(gammaln(data + r), axis=1) + gammaln(n*r) - gammaln(z+n*r) - n * gammaln(r))) res = minimize_scalar(nll, bounds=(1e-4, 100./(100+1)), method='bounded') assert res.success delta_hat = res.x return delta_hat / (1-delta_hat)
[docs]def mme_per_pixel(data, f=None): """ Estimates the dispersion parameter of a separate NB distribution for each pixel given the data using a method of moments approach. Parameters ---------- data : np.ndarray Rows should correspond to pixels, columns to replicates. f : np.ndarray Matrix of scaling factors parallel to ``data``. Pass None to skip scaling. Returns ------- np.ndarray The estimated dispersion for each pixel. """ if f is not None: data /= f m = np.mean(data, axis=1) v = np.var(data, axis=1, ddof=1) return inverse_mvr(m, v)
[docs]def mme(data, f=None): """ Estimates the dispersion parameter of a NB distribution given the data using a method of moments approach. One common dispersion will be estimated across all pixels in the data, though the individual pixels in the data may have distinct means. Parameters ---------- data : np.ndarray Rows should correspond to pixels, columns to replicates. f : np.ndarray Matrix of scaling factors parallel to ``data``. Pass None to skip scaling. Returns ------- float The estimated dispersion. """ if f is not None: data /= f return np.nanmean(mme_per_pixel(data))
[docs]def estimate_dispersion(data, cov, estimator='qcml', n_bins=100, logx=False): """ Estimates trended dispersion for each point in ``data`` with respect to a covariate ``cov``, using ``estimator`` to estimate the dispersion within each of ``n_bins`` equal-number bins and fitting a curve through the per-bin dispersion estimates using lowess smoothing. This function is deprecated and has no callers. Parameters ---------- data : np.ndarray Rows should correspond to pixels, columns to replicates. cov : np.ndarray A vector of covariate values per pixel. estimator : 'cml', 'qcml', 'mme', or a function Pass 'cml', 'qcml', 'mme' to use conditional maximum likelihood (CML), qnorm-CML (qCML), or method of moments estimation (MME) to estimate the dispersion within each bin. Pass a function that takes in a (pixels, replicates) shaped array of data and returns a dispersion value to use that instead. n_bins : int The number of bins to use when binning the pixels according to ``cov``. logx : bool Whether or not to perform the lowess fit in log x space. Returns ------- smoothed_disp : np.ndarray Vector of smoothed dispersion estimates for each pixel. cov_per_bin, disp_per_bin : np.ndarray Average covariate value and estimated dispersion value, respectively, per bin. disp_smooth_func : function Vectorized function that returns a smoothed dispersion given the value of the covariate. """ if type(estimator) == str and estimator not in {'cml', 'qcml', 'mme'}: raise ValueError('estimator must be cml, qcml, mme, or a function') disp_func = globals()[estimator] if estimator in globals().keys() \ else estimator bins = equal_bin(cov, n_bins) cov_per_bin = np.array([np.mean(cov[bins == b]) for b in range(n_bins)]) disp_per_bin = np.array([disp_func(data[bins == b, :]) for b in range(n_bins)]) cov_idx = cov_per_bin > 0 disp_smooth_func = lowess_fit(cov_per_bin[cov_idx], disp_per_bin[cov_idx], left_boundary=None, logx=logx, logy=True) smoothed_disp = disp_smooth_func(cov) return smoothed_disp, cov_per_bin, disp_per_bin, disp_smooth_func