# Source code for hic3defdr.plotting.distance_bias

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from lib5c.util.plotting import plotter


@plotter
def plot_distance_bias(ob, bins, bin_labels=None, idx='disp', threshold=0.05,
                       colors=None, labels=None, xlabel='distance range',
                       legend_label='group', **kwargs):
    """
    Plots a bar plot illustrating the degree to which p-values are biased among
    different distance scales.

    This method visualizes distance bias by computing a specified percentile of
    all p-values called in an analysis, and then computing the proportion of
    pixels with p-values strictly below this percentile in each distance bin.
    A dashed horizontal line is drawn at ``threshold`` for reference.

    Parameters
    ----------
    ob : hic3defdr.HiC3DeFDR object or list of hic3defdr.HiC3DeFDR objects
        The analysis or analyses to inspect for distance bias.
    bins : list of tuple of int
        Each tuple represents a distance bin as a (min, max) pair, where min and
        max are distances in bin units and the ranges are inclusive. If either
        min or max is None, the distance bin will be considered unbounded on
        that end.
    bin_labels : list of str, optional
        Pass a list of labels to describe the distance bins, one per entry of
        ``bins``. Pass None to generate labels automatically from ``bins``.
    idx : {'disp', 'loop'}
        Pass 'disp' to use p-values for all points for which dispersion was
        estimated. Pass 'loop' to only use p-values for points which are in
        loops. Any value other than 'loop' is treated as 'disp'.
    threshold : float
        The percentile to use for the comparison, expressed as a fraction
        (e.g., 0.05 selects the 5th percentile).
    colors : str or list of str, optional
        If ``ob`` is a single object, pass a single color to color the bars in
        the barplot (None falls back to black). If ``ob`` is a list of objects,
        pass a list of colors. Pass None to use automatic colors.
    labels : list of str, optional
        If ``ob`` is a list of objects, you must pass a list of strings to label
        the objects (one per object). Otherwise, this kwarg has no visible
        effect on the plot.
    xlabel : str
        The label for the x-axis.
    legend_label : str
        If ``ob`` is a list of objects, the label to use for the legend title.
        Otherwise, this kwarg has no visible effect on the plot.
    kwargs : kwargs
        Typical plotter kwargs.

    Returns
    -------
    pyplot axis
        The axis plotted on. NOTE(review): this function body has no explicit
        return; the return value is presumably supplied by the ``plotter``
        decorator - confirm against lib5c.

    Raises
    ------
    ValueError
        If ``ob`` is a list or tuple and ``labels`` is None or does not have
        one entry per object, or if ``bin_labels`` is passed but does not have
        one entry per distance bin.
    """
    # promote ob to list and resolve labels
    if not isinstance(ob, (list, tuple)):
        ob = [ob]
        hue = None
        color = 'k' if colors is None else colors
        colors = None
        if labels is None:
            labels = ['group1']
    else:
        hue = legend_label
        color = None
        if labels is None:
            raise ValueError('must pass labels if ob is a list or tuple')
        # zip() below would otherwise silently drop unmatched analyses
        if len(labels) != len(ob):
            raise ValueError('labels must have one entry per object in ob')

    # resolve bin_labels
    if bin_labels is None:
        bin_labels = []
        for min_dist, max_dist in bins:
            if min_dist is None and max_dist is not None:
                label = '<= %s' % max_dist
            elif min_dist is not None and max_dist is None:
                label = '>= %s' % min_dist
            elif min_dist is None and max_dist is None:
                label = 'all'
            else:
                label = '%s to %s' % (min_dist, max_dist)
            bin_labels.append(label)
    elif len(bin_labels) != len(bins):
        # zip() below would otherwise silently drop unmatched distance bins
        raise ValueError('bin_labels must have one entry per distance bin')

    data = []
    for o, label in zip(ob, labels):
        # load data; loop_idx is only loaded when it is actually needed
        disp_idx, _ = o.load_data('disp_idx', 'all')
        if idx == 'loop':
            loop_idx, _ = o.load_data('loop_idx', 'all')
            rc_idx = (disp_idx, loop_idx)
            p_idx = loop_idx
        else:
            rc_idx = disp_idx
            p_idx = None
        row, _ = o.load_data('row', 'all', idx=rc_idx)
        col, _ = o.load_data('col', 'all', idx=rc_idx)
        dist = col - row
        pvalues, _ = o.load_data('pvalues', 'all', idx=p_idx)

        # threshold is a fraction, np.percentile expects a value in [0, 100]
        p_star = np.percentile(pvalues, 100*threshold)

        # process each distance bin
        for bin_label, (min_dist, max_dist) in zip(bin_labels, bins):
            # start with every pixel selected, then knock out those that fall
            # outside the (inclusive) bounds of this distance bin
            dist_idx = np.ones(len(dist), dtype=bool)
            if min_dist is not None:
                dist_idx[dist < min_dist] = False
            if max_dist is not None:
                dist_idx[dist > max_dist] = False
            if dist_idx.any():
                perc = np.mean(pvalues[dist_idx] < p_star)
            else:
                # empty bin: same NaN result np.mean() would give, but without
                # emitting a "mean of empty slice" warning
                perc = np.nan
            data.append({legend_label: label, xlabel: bin_label,
                         'percentage significant': perc})
    df = pd.DataFrame(data)
    sns.barplot(data=df, x=xlabel, y='percentage significant', hue=hue,
                color=color, palette=colors)

    # draw the reference line across the full plot width, then restore the
    # x-limits captured before the line was drawn
    xlim = plt.xlim()
    plt.hlines(threshold, *xlim, color='gray', linestyle='--')
    plt.xlim(xlim)