Source code for hic3defdr.util.scaled_nb

import numpy as np
import scipy.stats as stats
from scipy.special import gammaln
from scipy.optimize import newton, brentq

from lib5c.util.mathematics import gmean

from hic3defdr.util.printing import eprint
from hic3defdr.util.progress import tqdm_maybe as tqdm


def logpmf(k, m, phi):
    """
    Log of the PMF of the negative binomial distribution, parameterized by
    its mean ``m`` and dispersion ``phi``. Vectorized.

    Parameters
    ----------
    k : int
        The number of counts observed.
    m : float
        The mean parameter.
    phi : float
        The dispersion parameter.

    Returns
    -------
    float
        The log of the probability of observing ``k`` counts.
    """
    r = 1. / phi
    return gammaln(r+k) - gammaln(k+1) - gammaln(r) + \
        r*np.log(r) - r*np.log(r+m) + k*np.log(m) - k*np.log(r+m)

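# Illustrative cross-check (the helper below is hypothetical, not part of the
# upstream module): the formula above is the standard NB log-PMF under the
# reparameterization n = 1/phi, p = n / (n + m), so it can be verified
# against scipy.stats.nbinom.
def _example_logpmf_vs_scipy():
    """Sanity-check ``logpmf`` against ``scipy.stats.nbinom.logpmf``."""
    k = np.array([0, 1, 5, 10])
    m, phi = 4.0, 0.1
    r = 1.0 / phi
    # nbinom(n, p) with n = 1/phi and p = r/(r+m) has mean m and variance
    # m + phi*m**2, matching this module's parameterization
    expected = stats.nbinom(r, r / (r + m)).logpmf(k)
    assert np.allclose(logpmf(k, m, phi), expected)
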
def mvr(mean, disp):
    """
    Negative binomial fixed-dispersion mean-variance relationship. Vectorized.

    Parameters
    ----------
    mean, disp : float
        The mean and dispersion of a NB distribution, respectively.

    Returns
    -------
    float
        The variance of that NB distribution.
    """
    return mean + mean**2 * disp

def inverse_mvr(mean, var):
    """
    Inverse function of the negative binomial fixed-dispersion mean-variance
    relationship. Vectorized.

    Parameters
    ----------
    mean, var : float
        The mean and variance of a NB distribution, respectively.

    Returns
    -------
    float
        The dispersion of that NB distribution.
    """
    return (var - mean) / mean**2

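# Illustrative round-trip check (hypothetical helper, not part of the
# upstream module): ``inverse_mvr`` inverts ``mvr`` in its second argument,
# since var = mean + mean**2 * disp implies disp = (var - mean) / mean**2.
def _example_mvr_round_trip():
    """Check that inverse_mvr(mean, mvr(mean, disp)) recovers disp."""
    mean = np.array([1.0, 5.0, 25.0])
    disp = np.array([0.05, 0.1, 0.2])
    assert np.allclose(inverse_mvr(mean, mvr(mean, disp)), disp)
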
def fit_mu_hat(x, b, alpha, verbose=True):
    """
    Numerical MLE fitter for the mean parameter of the scaled NB model under
    fixed dispersion. Vectorized.

    See the following colab notebook for background and derivation:
    https://colab.research.google.com/drive/1SgMMvc3XhfIXoBx8tsyJt-yyBDlRFQCJ

    Parameters
    ----------
    x : np.ndarray
        The vector of observed counts.
    b : np.ndarray
        The vector of scaling factors, parallel to ``x``.
    alpha : np.ndarray
        The vector of dispersions, parallel to ``x``.
    verbose : bool
        Pass False to silence reporting of progress to stderr.

    Returns
    -------
    np.ndarray
        The MLE of the mean parameter for each pixel.

    Examples
    --------
    >>> import numpy as np
    >>> from hic3defdr.util.scaled_nb import fit_mu_hat

    3 pixels, 2 reps (matrices):

    >>> x = np.array([[1, 2],
    ...               [3, 4],
    ...               [5, 6]])
    >>> b = np.array([[0.9, 1.1],
    ...               [0.8, 1.2],
    ...               [0.7, 1.3]])
    >>> alpha = np.array([[0.1, 0.2],
    ...                   [0.3, 0.4],
    ...                   [0.5, 0.6]])
    >>> fit_mu_hat(x, b, alpha)
    array([1.47251127, 3.53879843, 5.86853465])

    broadcast dispersion down the pixels:

    >>> fit_mu_hat(x, b, np.array([0.1, 0.2]))
    array([1.47251127, 3.53749833, 5.85554075])

    broadcast dispersion across the reps:

    >>> fit_mu_hat(x, b, np.array([0.1, 0.2, 0.3])[:, None])
    array([1.49544092, 3.51679438, 5.73129492])

    1 pixel, 2 reps (vectors):

    >>> fit_mu_hat(np.array([1, 2]), np.array([0.9, 1.1]), np.array([0.1, 0.2]))
    array([1.47251127])

    broadcast dispersion across reps:

    >>> fit_mu_hat(np.array([1, 2]), np.array([0.9, 1.1]), 0.1)
    array([1.49544092])

    one pixel is fitted with newton, the second is fitted with brentq:

    >>> x = np.array([[2, 3, 4, 2],
    ...               [6, 9, 3, 1]])
    >>> b = np.array([[0.45, 0.53, 0.088, 0.091],
    ...               [0.70, 0.83, 0.14, 0.15]])
    >>> alpha = np.array([[0.0071, 0.0071, 0.0073, 0.0073],
    ...                   [0.0070, 0.0070, 0.0072, 0.0072]])
    >>> fit_mu_hat(x, b, alpha)
    array([ 9.5900971 , 10.45962955])
    """
    assert np.all((alpha > 0) & np.isfinite(alpha))
    assert np.all((x >= 0) & np.isfinite(x))
    assert np.all((b > 0) & np.isfinite(b))

    def f(mu_hat):
        # score function of the scaled NB log-likelihood; its root is the MLE
        if hasattr(mu_hat, 'ndim') and mu_hat.ndim < b.ndim \
                and mu_hat.ndim > 0:
            mu_hat = mu_hat[:, None]
        return np.sum((x - mu_hat*b) / (mu_hat + alpha * mu_hat**2 * b),
                      axis=-1)

    if not x.ndim == 2:
        # only one pixel: mark it failed so it falls through to brentq below
        root = np.array([-1.0])
        failed = np.array([True])
    else:
        # multiple pixels: fit them all at once with vectorized newton
        root, converged, zero_der = newton(
            f, np.mean(x / b, axis=1), maxiter=100, full_output=True)
        failed = ~converged | zero_der
        failed[root <= 0] = True  # fail points with negative mu
        failed[root >= np.sqrt(np.finfo(float).max) / 1e10] = True  # overflow
        failed[~np.isclose(f(root), 0, atol=1e-5)] = True  # not close to zero

    if np.any(failed):
        eprint('some points failed, fixing with brentq',
               skip=x.ndim != 2 or not verbose)
        for idx in tqdm(np.where(failed)[0],
                        disable=x.ndim != 2 or not verbose):
            lower = 10 * np.finfo(float).eps
            upper = np.mean(x[idx] / b[idx])
            counter = 0
            while True:
                try:
                    root[idx] = brentq(
                        f if np.isscalar(f(lower)) else lambda y: f(y)[idx],
                        lower, upper)
                    break
                except ValueError:
                    # no sign change in [lower, upper] yet; widen the bracket
                    upper *= 2
                    counter += 1
                    if counter > 100:
                        raise ValueError('bracketing interval not found '
                                         'within 100 doublings')
    assert np.allclose(f(root), 0, atol=1e-5)
    return root

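# Illustrative limiting-case check (hypothetical helper, not part of the
# upstream module): as alpha approaches zero, the score equation
# sum((x - mu*b) / (mu + alpha*mu**2*b)) = 0 reduces to
# sum(x)/mu - sum(b) = 0, so mu_hat should approach sum(x) / sum(b).
def _example_fit_mu_hat_small_alpha():
    """Check fit_mu_hat against the Poisson-limit estimate sum(x)/sum(b)."""
    x = np.array([1.0, 2.0])
    b = np.array([0.9, 1.1])
    # a tiny but positive dispersion (alpha must be > 0)
    mu_hat = fit_mu_hat(x, b, 1e-8)
    assert np.allclose(mu_hat, x.sum() / b.sum(), rtol=1e-4)
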
def equalize(data, f, alpha):
    """
    Given known scaling factors ``f`` and a known dispersion ``alpha``,
    creates common-scale pseudodata from raw values ``data``.

    See https://rdrr.io/bioc/edgeR/src/R/equalizeLibSizes.R

    Parameters
    ----------
    data : np.ndarray
        Matrix of raw data to equalize. Rows are pixels, columns are
        replicates.
    f : np.ndarray
        Matrix of combined scaling factors for each pixel.
    alpha : float
        Single fixed dispersion to use during equalization.

    Returns
    -------
    np.ndarray
        Matrix of equalized data.
    """
    # the common scale is the per-pixel geometric mean of the scaling factors
    f_mean = gmean(f, pseudocount=0, axis=1)
    mu_hat = fit_mu_hat(data, f, alpha)
    mu_in = mu_hat[:, None] * f
    mu_out = mu_hat * f_mean
    pseudodata = np.zeros_like(data, dtype=float)
    # quantile-map each replicate from its own scale to the common scale
    for i in range(data.shape[1]):
        pseudodata[:, i] = q2qnbinom(data[:, i], mu_in[:, i], mu_out, alpha)
    return pseudodata

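# Illustrative property check (hypothetical helper, not part of the upstream
# module): a replicate whose scaling factors already equal the per-pixel
# geometric mean of ``f`` should pass through ``equalize`` unchanged, since
# its ``mu_in`` column coincides with ``mu_out``.
def _example_equalize_identity_column():
    """Check that an already-common-scale replicate is left unchanged."""
    data = np.array([[4.0, 14.0, 9.0],
                     [10.0, 40.0, 20.0]])
    # each row of f has geometric mean 1.0, and the last column is exactly 1.0
    f = np.array([[0.5, 2.0, 1.0],
                  [0.5, 2.0, 1.0]])
    pseudodata = equalize(data, f, 0.01)
    assert np.allclose(pseudodata[:, 2], data[:, 2])
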
def q2qnbinom(x, mu_in, mu_out, alpha):
    """
    Converts values between two NB distributions with different means but the
    same dispersion.

    See https://rdrr.io/bioc/edgeR/src/R/q2qnbinom.R

    Parameters
    ----------
    x : np.ndarray
        Vector of values to convert.
    mu_in, mu_out : np.ndarray
        Vectors of means to convert between.
    alpha : np.ndarray or float
        Single dispersion (to use for all ``x``) or vector of dispersions
        (one per ``x``) to hold constant during conversion.

    Returns
    -------
    np.ndarray
        ``x`` converted from ``mu_in`` to ``mu_out``.
    """
    # force minimum mu of 0.25 for stability
    # note: this clamp modifies mu_in and mu_out in place
    high_idx = (mu_in >= 0.25) & (mu_out >= 0.25)
    mu_in[~high_idx] = 0.25
    mu_out[~high_idx] = 0.25

    # compute fano factors
    r_in = 1 + alpha * mu_in
    r_out = 1 + alpha * mu_out

    # compute variances
    v_in = mu_in * r_in
    v_out = mu_out * r_out

    # mark points as left or right tail
    right_idx = x >= mu_in

    # construct normal and gamma distributions
    norm_in = stats.norm(mu_in, np.sqrt(v_in))
    norm_out = stats.norm(mu_out, np.sqrt(v_out))
    gamma_in = stats.gamma(mu_in/r_in, scale=r_in)
    gamma_out = stats.gamma(mu_out/r_out, scale=r_out)

    # convert left and right tail separately
    q_norm = np.zeros_like(mu_in)
    q_gamma = np.zeros_like(mu_in)
    q_norm[right_idx] = norm_out.isf(norm_in.sf(x))[right_idx]
    q_norm[~right_idx] = norm_out.ppf(norm_in.cdf(x))[~right_idx]
    q_gamma[right_idx] = gamma_out.isf(gamma_in.sf(x))[right_idx]
    q_gamma[~right_idx] = gamma_out.ppf(gamma_in.cdf(x))[~right_idx]

    # compute pseudocounts as average of the two approximations
    pseudocounts = (q_norm + q_gamma) / 2

    # clip to zero
    pseudocounts[~(pseudocounts >= 0)] = 0

    return pseudocounts
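
# Illustrative behavior check (hypothetical helper, not part of the upstream
# module): ``q2qnbinom`` is a quantile-matching transform, so a value sitting
# at the input mean should land near the output mean. The match is only
# approximate because the gamma approximation's median differs slightly from
# its mean.
def _example_q2qnbinom_maps_means():
    """Check that values at mu_in map to approximately mu_out."""
    mu_in = np.array([10.0, 50.0])
    mu_out = np.array([20.0, 25.0])
    x = mu_in.copy()
    # pass copies because q2qnbinom clamps its mean arguments in place
    converted = q2qnbinom(x, mu_in.copy(), mu_out.copy(), 0.05)
    assert np.allclose(converted, mu_out, rtol=0.05)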