Source code for hic3defdr.util.dispersion

import numpy as np
from scipy.optimize import minimize_scalar
from scipy.special import gammaln

from hic3defdr.util.scaled_nb import inverse_mvr, equalize
from hic3defdr.util.binning import equal_bin
from hic3defdr.util.lowess import lowess_fit


[docs]def qcml(data, f=None, max_iter=10, tol=1e-4):
    """
    Estimates the dispersion parameter of a NB distribution given the data via
    quantile-adjusted conditional maximum likelihood.

    One common dispersion will be estimated across all pixels in the data,
    though the individual pixels in the data may have distinct means.

    Parameters
    ----------
    data : np.ndarray
        Rows should correspond to pixels, columns to replicates.
    f : np.ndarray
        Matrix of scaling factors parallel to ``data``. Pass None to skip
        scaling.

    Returns
    -------
    float
        The estimated dispersion.
    """
    if f is None:
        f = np.ones_like(data, dtype=float)
    disp = 0.01
    delta = np.inf
    it = 1
    while delta > tol and it <= max_iter:
        pseudodata = equalize(data, f, disp)
        new_disp = cml(pseudodata)
        delta = np.abs(disp - new_disp)
        disp = new_disp
        if delta < tol:
            break
    return disp


[docs]def cml(data, f=None):
    """
    Estimates the dispersion parameter of a NB distribution given the data via
    conditional maximum likelihood.

    One common dispersion will be estimated across all pixels in the data,
    though the individual pixels in the data may have distinct means.

    Parameters
    ----------
    data : np.ndarray
        Rows should correspond to pixels, columns to replicates.
    f : np.ndarray
        Matrix of scaling factors parallel to ``data``. Pass None to skip
        scaling.

    Returns
    -------
    float
        The estimated dispersion.
    """
    if f is not None:
        data /= f
    n = data.shape[1]
    z = np.sum(data, axis=1)

    def nll(delta):
        r = 1./delta - 1
        return -np.sum((np.sum(gammaln(data + r), axis=1) + gammaln(n*r) -
                        gammaln(z+n*r) - n * gammaln(r)))

    res = minimize_scalar(nll, bounds=(1e-4, 100./(100+1)), method='bounded')
    assert res.success
    delta_hat = res.x
    return delta_hat / (1-delta_hat)


[docs]def mme_per_pixel(data, f=None):
    """
    Estimates the dispersion parameter of a separate NB distribution for each
    pixel given the data using a method of moments approach.

    Parameters
    ----------
    data : np.ndarray
        Rows should correspond to pixels, columns to replicates.
    f : np.ndarray
        Matrix of scaling factors parallel to ``data``. Pass None to skip
        scaling.

    Returns
    -------
    np.ndarray
        The estimated dispersion for each pixel.
    """
    if f is not None:
        data /= f
    m = np.mean(data, axis=1)
    v = np.var(data, axis=1, ddof=1)
    return inverse_mvr(m, v)


[docs]def mme(data, f=None):
    """
    Estimates the dispersion parameter of a NB distribution given the data using
    a method of moments approach.

    One common dispersion will be estimated across all pixels in the data,
    though the individual pixels in the data may have distinct means.

    Parameters
    ----------
    data : np.ndarray
        Rows should correspond to pixels, columns to replicates.
    f : np.ndarray
        Matrix of scaling factors parallel to ``data``. Pass None to skip
        scaling.

    Returns
    -------
    float
        The estimated dispersion.
    """
    if f is not None:
        data /= f
    return np.nanmean(mme_per_pixel(data))


[docs]def estimate_dispersion(data, cov, estimator='qcml', n_bins=100, logx=False):
    """
    Estimates trended dispersion for each point in ``data`` with respect to a
    covariate ``cov``, using ``estimator`` to estimate the dispersion within
    each of ``n_bins`` equal-number bins and fitting a curve through the per-bin
    dispersion estimates using lowess smoothing.

    This function is deprecated and has no callers.

    Parameters
    ----------
    data : np.ndarray
        Rows should correspond to pixels, columns to replicates.
    cov : np.ndarray
        A vector of covariate values per pixel.
    estimator : 'cml', 'qcml', 'mme', or a function
        Pass 'cml', 'qcml', 'mme' to use conditional maximum likelihood (CML),
        qnorm-CML (qCML), or method of moments estimation (MME) to estimate the
        dispersion within each bin. Pass a function that takes in a (pixels,
        replicates) shaped array of data and returns a dispersion value to use
        that instead.
    n_bins : int
        The number of bins to use when binning the pixels according to ``cov``.
    logx : bool
        Whether or not to perform the lowess fit in log x space.

    Returns
    -------
    smoothed_disp : np.ndarray
        Vector of smoothed dispersion estimates for each pixel.
    cov_per_bin, disp_per_bin : np.ndarray
        Average covariate value and estimated dispersion value, respectively,
        per bin.
    disp_smooth_func : function
        Vectorized function that returns a smoothed dispersion given the value
        of the covariate.
    """
    if type(estimator) == str and estimator not in {'cml', 'qcml', 'mme'}:
        raise ValueError('estimator must be cml, qcml, mme, or a function')
    disp_func = globals()[estimator] if estimator in globals().keys() \
        else estimator
    bins = equal_bin(cov, n_bins)
    cov_per_bin = np.array([np.mean(cov[bins == b]) for b in range(n_bins)])
    disp_per_bin = np.array([disp_func(data[bins == b, :])
                             for b in range(n_bins)])
    cov_idx = cov_per_bin > 0
    disp_smooth_func = lowess_fit(cov_per_bin[cov_idx], disp_per_bin[cov_idx],
                                  left_boundary=None, logx=logx, logy=True)
    smoothed_disp = disp_smooth_func(cov)
    return smoothed_disp, cov_per_bin, disp_per_bin, disp_smooth_func