Source code for hic3defdr.analysis.plotting

import numpy as np

from lib5c.algorithms.correlation import \
    make_pairwise_correlation_matrix_from_counts_matrix
from lib5c.plotters.correlation import plot_correlation_matrix

from hic3defdr.plotting.distance_dependence import plot_dd_curves
from hic3defdr.plotting.histograms import plot_pvalue_histogram
from hic3defdr.plotting.dispersion import plot_mvr, plot_ddr
from hic3defdr.plotting.ma import plot_ma
from hic3defdr.plotting.grid import plot_grid
from hic3defdr.plotting.heatmap import plot_heatmap


class PlottingHiC3DeFDR(object):
    """
    Mixin class containing plotting functions for HiC3DeFDR.

    The methods here rely on the host class to provide ``load_data()``,
    ``load_bias()``, ``load_disp_fn()``, ``get_matrix()`` and the attributes
    ``design``, ``dist_thresh_max`` and ``mean_thresh``.
    """

    def plot_dd_curves(self, chrom, log=True, **kwargs):
        """
        Plots the distance dependence curve before and after size factor
        adjustment.

        Parameters
        ----------
        chrom : str
            The name of the chromosome to plot the curve for.
        log : bool
            Whether or not to log the axes of the plot.
        kwargs : kwargs
            Typical plotter kwargs.

        Returns
        -------
        pyplot axis
            The axis plotted on.
        """
        # load everything
        bias = self.load_bias(chrom)
        row = self.load_data('row', chrom)
        col = self.load_data('col', chrom)
        raw = self.load_data('raw', chrom)
        scaled = self.load_data('scaled', chrom)

        # compute balanced: divide each replicate's raw values by the product
        # of that replicate's bias at the pixel's row and column
        balanced = np.zeros_like(raw, dtype=float)
        for r in range(self.design.shape[0]):
            balanced[:, r] = raw[:, r] / (bias[row, r] * bias[col, r])

        return plot_dd_curves(row, col, balanced, scaled,
                              repnames=self.design.index, log=log, **kwargs)

    def plot_dispersion_fit(self, cond, xaxis='dist', yaxis='disp',
                            dist_max=None, scatter_fit=-1, scatter_size=36,
                            distance=None, hexbin=False, logx=False, logy=False,
                            **kwargs):
        """
        Plots a hexbin plot of pixel-wise distance vs either dispersion or
        variance, overlaying the estimated and fitted dispersions.

        When called with only the default plot options (distance vs
        dispersion, fit drawn as a curve, no hexbin, no log axes, no specific
        ``distance``), this delegates to the faster ``plot_ddr()``.

        Parameters
        ----------
        cond : str
            The name of the condition to plot the fit for.
        xaxis : 'mean' or 'dist'
            What to plot on the x-axis.
        yaxis : 'disp' or 'var'
            What to plot on the y-axis.
        dist_max : int
            If ``xaxis`` is 'dist', the maximum distance to include on the plot
            in bin units. Pass None to use ``self.dist_thresh_max``.
        scatter_fit : int
            Pass a nonzero integer to draw the fitted dispersions passed in
            ``disp`` as a scatterplot of ``scatter_fit`` selected points. Pass
            -1 to plot the fitted dispersions passed in ``disp`` as a curve.
            Pass 0 to omit plotting the dispersion estimates altogether.
        scatter_size : int
            The marker size when plotting scatterplots.
        distance : int, optional
            Pick a specific distance in bin units to plot only interactions at
            that distance.
        hexbin : bool
            Pass False to skip plotting the hexbin plot, leaving only the
            estimated variances or dispersions.
        logx, logy : bool
            Whether or not to log the x- or y-axis, respectively.
        kwargs : kwargs
            Typical plotter kwargs.

        Returns
        -------
        pyplot axis
            The axis plotted on.
        """
        # short circuit to plot_ddr() if possible
        if xaxis == 'dist' and yaxis == 'disp' and scatter_fit == -1 \
                and distance is None and hexbin is False and logx is False \
                and logy is False:
            return self.plot_ddr(cond, dist_max=dist_max,
                                 scatter_size=scatter_size, **kwargs)

        # resolve max_dist
        if dist_max is None:
            dist_max = self.dist_thresh_max

        # identify cond_idx
        cond_idx = self.design.columns.tolist().index(cond)

        # load everything
        disp_idx, _ = self.load_data('disp_idx', 'all')
        scaled = self.load_data(
            'scaled', 'all', idx=disp_idx)[0][:, self.design[cond]]
        disp = self.load_data('disp', 'all')[0][:, cond_idx]
        try:
            disp_per_dist = self.load_data('disp_per_dist')[:, cond_idx]
        except IOError:
            # per-distance dispersion estimates are not available; remember
            # that explicitly so the code below never touches an unbound name
            disp_per_dist = None
            disp_per_bin = None
            dist_per_bin = None
        else:
            finite = np.isfinite(disp_per_dist)
            disp_per_bin = disp_per_dist[finite]
            dist_per_bin = np.arange(self.dist_thresh_max + 1)[finite]
        row, _ = self.load_data('row', 'all', idx=disp_idx)
        col, _ = self.load_data('col', 'all', idx=disp_idx)
        dist = col - row

        # compute mean and sample variance
        mean = np.mean(scaled, axis=1)
        var = np.var(scaled, ddof=1, axis=1)

        # resolve distance
        if distance is not None:
            dist_idx = dist == distance
            mean = mean[dist_idx]
            var = var[dist_idx]
            if disp_per_dist is not None:
                # use the single per-distance estimate for every pixel
                disp = np.ones(dist_idx.sum()) * disp_per_dist[distance]
            else:
                # no per-distance estimates were saved: fall back to the
                # per-pixel fitted dispersions at this distance
                disp = disp[dist_idx]
            dist = None
            dist_per_bin = None
            disp_per_bin = None
            fit_align_dist = False
        else:
            fit_align_dist = xaxis == 'mean' or yaxis == 'var'

        return plot_mvr(
            pixel_mean=mean,
            pixel_var=var,
            pixel_dist=dist,
            pixel_disp_fit=disp,
            dist_per_bin=dist_per_bin,
            disp_per_bin=disp_per_bin,
            fit_align_dist=fit_align_dist,
            xaxis=xaxis, yaxis=yaxis,
            dist_max=dist_max, mean_min=self.mean_thresh,
            scatter_fit=scatter_fit, scatter_size=scatter_size, hexbin=hexbin,
            logx=logx, logy=logy, **kwargs
        )

    def plot_ddr(self, cond, dist_max=None, scatter_size=36, **kwargs):
        """
        Fast alternative to plot_dispersion_fit() that only supports plotting
        distance versus dispersion, with no hexbin or ``scatter_points``
        support.

        Parameters
        ----------
        cond : str
            The name of the condition to plot the fit for.
        dist_max : int
            The maximum distance to include on the plot in bin units. Pass None
            to use ``self.dist_thresh_max``.
        scatter_size : int
            The marker size when plotting scatterplots.
        kwargs : kwargs
            Typical plotter kwargs.

        Returns
        -------
        pyplot axis
            The axis plotted on.
        """
        # resolve max_dist
        if dist_max is None:
            dist_max = self.dist_thresh_max

        # identify cond_idx
        cond_idx = self.design.columns.tolist().index(cond)

        # load everything, keeping only distances that have a finite
        # dispersion estimate and do not exceed dist_max
        disp_per_dist = self.load_data('disp_per_dist')[:, cond_idx]
        dists = np.arange(self.dist_thresh_max + 1)
        keep = np.isfinite(disp_per_dist) & (dists <= dist_max)
        disp_per_bin = disp_per_dist[keep]
        dist_per_bin = dists[keep]
        disp_fn = self.load_disp_fn(cond)

        # plot
        return plot_ddr(dist_per_bin, disp_per_bin, disp_fn,
                        scatter_size=scatter_size, **kwargs)

    def plot_pvalue_distribution(self, idx='disp', **kwargs):
        """
        Plots the p-value distribution across all chromosomes.

        Parameters
        ----------
        idx : {'disp', 'loop'}
            Pass 'disp' to plot p-values for all points for which dispersion was
            estimated. Pass 'loop' to plot p-values for all points which are in
            loops (available only if ``loop_patterns`` was passed to the
            constructor).
        kwargs : kwargs
            Typical plotter kwargs.

        Returns
        -------
        pyplot axis
            The axis plotted on.

        Raises
        ------
        ValueError
            If ``idx`` is neither 'disp' nor 'loop'.
        """
        # load everything
        if idx == 'loop':
            loop_idx, _ = self.load_data('loop_idx', 'all')
            pvalues, _ = self.load_data('pvalues', 'all', idx=loop_idx)
        elif idx == 'disp':
            pvalues, _ = self.load_data('pvalues', 'all')
        else:
            raise ValueError('idx must be loop or disp')

        # plot
        return plot_pvalue_histogram(pvalues, **kwargs)

    def plot_qvalue_distribution(self, **kwargs):
        """
        Plots the q-value distribution across all chromosomes.

        Parameters
        ----------
        kwargs : kwargs
            Typical plotter kwargs.

        Returns
        -------
        pyplot axis
            The axis plotted on.
        """
        # load everything
        qvalues, _ = self.load_data('qvalues', 'all')

        # plot (the p-value histogram plotter is reused with a new x label)
        return plot_pvalue_histogram(qvalues, xlabel='qvalue', **kwargs)

    def plot_ma(self, fdr=0.05, conds=None, include_non_loops=True, s=-1,
                nonloop_s=None, density_dpi=72, vmax=None, nonloop_vmax=None,
                ax=None, legend=True, **kwargs):
        """
        Plots an MA plot using data loaded across all chromosomes.

        Parameters
        ----------
        fdr : float
            The threshold to use for labeling significantly differential loop
            pixels.
        conds : tuple of str, optional
            Pass a tuple of two condition names to compare those two
            conditions. Pass None to compare the first two conditions.
        include_non_loops : bool
            Whether or not to include non-looping pixels in the MA plot.
        s : float
            The marker size to use for the scatterplot, or -1 to use a
            scatter density plot.
        nonloop_s : float, optional
            Pass a separate marker size to use specifically for the non-loop
            pixels if `include_non_loops` is True. Useful for drawing just the
            non-loop pixels as a density by passing `s=1, nonloop_s=-1`. Pass
            None to use `s` as the size for both loop and non-loop pixels.
        density_dpi : int
            If `s` or `nonloop_s` are -1 this specifies the DPI to use for the
            density grid.
        vmax, nonloop_vmax : float, optional
            The vmax to use for `ax.scatter_density()` if `s` or `nonloop_s` is
            -1, respectively. Pass None to choose values automatically.
        ax : pyplot axis
            The axis to plot to. Must have been created with
            `projection='scatter_density'`. Pass None to create a new axis.
        legend : bool
            Pass True to add a legend. Note that passing `legend='outside'` is
            not supported.
        kwargs : kwargs
            Typical plotter kwargs.

        Returns
        -------
        pyplot axis
            The axis plotted on (whatever the underlying MA plotter returns).
        """
        # resolve conds
        cond_names = self.design.columns.tolist()
        if conds is None:
            conds = cond_names[:2]
        cond_idx = [cond_names.index(cond) for cond in conds]

        # load data
        disp_idx, _ = self.load_data('disp_idx', 'all')
        loop_idx, _ = self.load_data('loop_idx', 'all')
        scaled, _ = self.load_data('scaled', 'all', idx=disp_idx)
        qvalues, _ = self.load_data('qvalues', 'all')

        # compute mean
        mean = np.dot(scaled, self.design) / np.sum(self.design, axis=0).values
        mean = mean[:, cond_idx]

        # prepare sig_idx
        sig_idx = qvalues < fdr

        # stuff common kwargs into kwargs
        kwargs['names'] = conds
        kwargs['s'] = s
        kwargs['nonloop_s'] = nonloop_s
        kwargs['density_dpi'] = density_dpi
        kwargs['vmax'] = vmax
        # forward the caller's non-loop vmax rather than reusing vmax
        kwargs['nonloop_vmax'] = nonloop_vmax
        kwargs['ax'] = ax
        kwargs['legend'] = legend

        # plot
        if include_non_loops:
            return plot_ma(mean, sig_idx, loop_idx=loop_idx, **kwargs)
        return plot_ma(mean[loop_idx], sig_idx, **kwargs)

    def plot_correlation_matrix(self, stage='scaled', idx='loop',
                                correlation='spearman', colorscale=(0.75, 1.0),
                                **kwargs):
        """
        Plots a matrix of pairwise correlations among all replicates.

        Parameters
        ----------
        stage : {'raw', 'scaled'}
            Specify the stage of the data to compute correlations between.
        idx : {'disp', 'loop'}
            Pass 'disp' to compute correlations for all points for which
            dispersion was estimated. Pass 'loop' to compute correlations for
            all points which are in loops (available only if ``loop_patterns``
            was passed to the constructor).
        correlation : {'spearman', 'pearson'}
            Which correlation coefficient to compute.
        colorscale : tuple of float
            The min and max values of the correlation to use to color the
            matrix.
        kwargs : kwargs
            Typical plotter kwargs.

        Returns
        -------
        pyplot axis
            The axis plotted on.

        Raises
        ------
        ValueError
            If ``idx`` is neither 'disp' nor 'loop'.
        """
        # resolve idx into the index (or pair of indices) handed to load_data
        if idx == 'disp':
            data_idx, _ = self.load_data('disp_idx', 'all')
        elif idx == 'loop':
            data_idx = (
                self.load_data('disp_idx', 'all')[0],
                self.load_data('loop_idx', 'all')[0]
            )
        else:
            raise ValueError('idx must be \'disp\' or \'loop\'')

        # load data, transposed before being handed to the correlation helper
        data = self.load_data(stage, 'all', idx=data_idx)[0].T

        # plot
        return plot_correlation_matrix(
            make_pairwise_correlation_matrix_from_counts_matrix(
                data, correlation=correlation
            ),
            label_values=self.design.index.tolist(),
            colorscale=colorscale,
            **kwargs
        )

    def plot_heatmap(self, chrom, row_slice, col_slice, stage='scaled',
                     rep=None, cond=None, cmap='Reds', vmin=0, vmax=100,
                     **kwargs):
        """
        Plots a simple heatmap of a slice of the contact matrix.

        Parameters
        ----------
        chrom : str
            The chromosome to plot.
        row_slice, col_slice : slice
            The row and column slice, respectively, to plot.
        stage : str
            The stage of the data to plot.
        rep, cond : str, optional
            Pass the rep name or condition name if the data specified by
            ``stage`` has multiple columns.
        cmap : matplotlib colormap or dict
            The colormap to use for the heatmap.
        vmin, vmax : float
            The vmin and vmax to use for the heatmap colorscale.
        kwargs : kwargs
            Typical plotter kwargs.

        Returns
        -------
        pyplot axis
            The axis plotted on (whatever the underlying heatmap plotter
            returns).
        """
        matrix = self.get_matrix(
            stage, chrom, row_slice, col_slice, rep=rep, cond=cond)
        return plot_heatmap(matrix, cmap=cmap, vmin=vmin, vmax=vmax, **kwargs)

    def plot_grid(self, chrom, i, j, w, vmax=100, fdr=0.05, cluster_size=3,
                  fdr_vmid=0.05,
                  color_cycle=('blue', 'green', 'purple', 'yellow', 'cyan',
                               'red'),
                  despine=False, **kwargs):
        """
        Plots a combination visualization grid focusing on a specific pixel on a
        specific chromosome, combining heatmaps, cluster outlines, and
        stripplots.

        Parameters
        ----------
        chrom : str
            The name of the chromosome to slice matrices from.
        i, j : int
            The row and column index of the pixel to focus on.
        w : int
            The size of the heatmap will be ``2*w + 1`` bins in each dimension.
        vmax : float
            The maximum of the colorscale to use when plotting normalized
            heatmaps.
        fdr : float
            The FDR threshold to use when outlining clusters.
        cluster_size : int
            The cluster size threshold to use when outlining clusters.
        fdr_vmid : float
            The FDR value at the middle of the colorscale used for plotting the
            q-value heatmap.
        color_cycle : list of matplotlib colors
            The color cycle to use over conditions.
        despine : bool
            Forwarded unchanged to the underlying grid plotter (presumably
            controls whether axis spines are removed; confirm against that
            plotter).
        kwargs : kwargs
            Typical plotter kwargs.

        Returns
        -------
        pyplot axis, grid of pyplot axes, function
            The first pyplot axis returned is injected by ``@plotter``.
            The grid of pyplot axes is the second return value from the call to
            ``plt.subplots()`` that is used to create the grid.
            The function takes two args, an FDR and a cluster size, and redraws
            the cluster outlines using the new parameters.
        """
        # load everything
        row = self.load_data('row', chrom)
        col = self.load_data('col', chrom)
        raw = self.load_data('raw', chrom)
        scaled = self.load_data('scaled', chrom)
        disp_idx = self.load_data('disp_idx', chrom)
        loop_idx = self.load_data('loop_idx', chrom)
        mu_hat_alt = self.load_data('mu_hat_alt', chrom)
        mu_hat_null = self.load_data('mu_hat_null', chrom)
        qvalues = self.load_data('qvalues', chrom)

        return plot_grid(i, j, w, row, col, raw, scaled, mu_hat_alt,
                         mu_hat_null, qvalues, disp_idx, loop_idx, self.design,
                         fdr, cluster_size, vmax=vmax, fdr_vmid=fdr_vmid,
                         color_cycle=color_cycle, despine=despine, **kwargs)