Source code for hic3defdr.util.evaluation

import numpy as np

from hic3defdr.util.progress import tqdm_maybe as tqdm

try:
    from sklearn.metrics import roc_curve, confusion_matrix

    sklearn_avail = True
except ImportError:
    sklearn_avail = False
    roc_curve = None
    confusion_matrix = None


def make_y_true(row, col, clusters, labels):
    """
    Makes a boolean vector of the true labels for each pixel, given a list of
    clusters and the true label for each cluster.

    Parameters
    ----------
    row, col : np.ndarray
        The row and column indices of pixels to be labeled.
    clusters : list of list of tuple
        The outer list is a list of clusters. Each cluster is a list of
        (i, j) tuples marking the position of significant points which belong
        to that cluster.
    labels : list of str
        List of labels for each cluster, parallel to ``clusters``.

    Returns
    -------
    np.ndarray
        Boolean vector with the same length as ``row``/``col``. Its `i`th
        element is False when the pixel at `(row[i], col[i])` is in a cluster
        with label 'constit' and is True otherwise.
    """
    # identify the clusters whose label is anything other than 'constit';
    # np.asarray() allows ``labels`` to be passed as a plain list
    sig_idx = np.asarray(labels) != 'constit'
    # collect the pixels of all significant clusters into one set
    sig_pixels = set().union(
        *[c for i, c in enumerate(clusters) if sig_idx[i]])
    # label each queried pixel by its membership in the significant set
    return np.array([(r, c) in sig_pixels for r, c in zip(row, col)])


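# A minimal usage sketch of make_y_true() (illustrative only, not part of the
# library): the toy pixel coordinates, clusters, and labels below are made-up
# assumptions for demonstration purposes.
def _example_make_y_true():
    row = np.array([0, 1, 2, 3])
    col = np.array([5, 6, 7, 8])
    # two significant clusters and one constitutive cluster
    clusters = [[(0, 5), (1, 6)], [(2, 7)], [(3, 8)]]
    labels = np.array(['up', 'constit', 'down'])
    y_true = make_y_true(row, col, clusters, labels)
    # the pixel in the 'constit' cluster is False; all others are True
    assert y_true.tolist() == [True, True, False, True]
    return y_true

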
def evaluate(y_true, qvalues, n_fdr_points=100):
    """
    Evaluates how good a vector of q-values (or p-values) is at predicting the
    vector of true labels.

    Parameters
    ----------
    y_true : np.ndarray
        The boolean vector of true labels.
    qvalues : np.ndarray
        Vector of q-values or p-values which are supposed to predict the
        boolean label in ``y_true``.
    n_fdr_points : int
        The maximum number of points at which to compute FDR. The FDR
        computation is not parallelized, so increasing this number will slow
        down the evaluation. The default value of 100 should be sufficient to
        visualize the FDR control curve.

    Returns
    -------
    fdr, fpr, tpr, thresh : np.ndarray
        Parallel arrays giving the FDR, FPR, and TPR at each threshold,
        together with the thresholds themselves (in ``1 - qvalue`` space). The
        thresholds are selected to represent the convex edge of the ROC curve.
        The FDR will only be evaluated at about ``n_fdr_points`` selected
        thresholds and will be set to ``np.nan`` at the un-evaluated
        thresholds.
    """
    if not sklearn_avail:
        raise ImportError('failed to import scikit-learn - is it installed?')
    # convert q-values to scores where larger means more significant
    y_pred = 1 - qvalues
    fpr, tpr, thresh = roc_curve(y_true, y_pred)
    # fill FDR with nan; it is only computed at a subset of the thresholds
    fdr = np.ones_like(fpr) * np.nan
    rate = max(int(len(thresh) / n_fdr_points), 1)
    # start at the first threshold with nonzero TPR so that compute_fdr()
    # never divides by zero
    for i in tqdm(range(np.argmax(tpr > 0), len(thresh), rate)):
        fdr[i] = compute_fdr(y_true, y_pred >= thresh[i])
    return fdr, fpr, tpr, thresh


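# A minimal usage sketch of evaluate() (illustrative only, not part of the
# library; requires scikit-learn): the true labels and q-values below are
# made-up assumptions.
def _example_evaluate():
    y_true = np.array([True, True, False, False, True, False])
    qvalues = np.array([0.01, 0.2, 0.8, 0.5, 0.05, 0.9])
    fdr, fpr, tpr, thresh = evaluate(y_true, qvalues, n_fdr_points=10)
    # fdr is np.nan at any threshold where it was not evaluated; fpr, tpr, and
    # thresh trace the ROC curve of the q-values against the true labels
    return fdr, fpr, tpr, thresh

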
def compute_fdr(y_true, y_pred):
    """
    Computes the observed false discovery rate from boolean vectors of true
    and predicted labels.

    Parameters
    ----------
    y_true, y_pred : np.ndarray
        Boolean vectors of the true and predicted labels, respectively.

    Returns
    -------
    float
        The false discovery rate.
    """
    if not sklearn_avail:
        raise ImportError('failed to import scikit-learn - is it installed?')
    # FDR = FP / (FP + TP): the fraction of predicted positives that are
    # actually negative
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    return fp / float(fp + tp)


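# A minimal usage sketch of compute_fdr() (illustrative only, not part of the
# library; requires scikit-learn): with 2 true positives and 1 false positive
# among the predicted positives, the observed FDR is 1 / (1 + 2).
def _example_compute_fdr():
    y_true = np.array([True, False, True, False])
    y_pred = np.array([True, True, True, False])
    fdr = compute_fdr(y_true, y_pred)
    assert np.isclose(fdr, 1.0 / 3.0)
    return fdr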