Source code for hic3defdr.analysis.simulation

import os

import numpy as np
import pandas as pd
import scipy.sparse as sparse

from lib5c.util.system import check_outdir
from lib5c.util.statistics import adjust_pvalues

from hic3defdr.util.printing import eprint
from hic3defdr.util.clusters import load_clusters
from hic3defdr.util.simulation import simulate
from hic3defdr.util.evaluation import make_y_true, evaluate
from hic3defdr.util.progress import tqdm_maybe as tqdm
from hic3defdr.util.parallelization import parallel_apply


[docs]class SimulatingHiC3DeFDR(object): """ Mixin class containing plotting functions for HiC3DeFDR. """
[docs] def simulate(self, cond, chrom=None, beta=0.5, p_diff=0.4, skip_bias=False, loop_pattern=None, outdir='sim', n_threads=-1, verbose=True): """ Simulates raw contact matrices based on previously fitted scaled means and dispersions in a specific condition. Can only be run after ``estimate_dispersions()`` has been run. Parameters ---------- cond : str Name of the condition to base the simulation on. chrom : str, optional Name of the chromosome to simulate. Pass None to simulate all chromosomes in series. beta : float The effect size of the loop perturbations to use when simulating. Perturbed loops will be strengthened or weakened by this fraction of their original strength. p_diff : float or list of float Pass a single float to specify the probability that a loop will be perturbed across the simulated conditions. Pass four floats to specify the probabilities of all four specific perturbations: up in A, down in A, up in B, down in B. The remaining loops will be constitutive. skip_bias : bool Pass True to set all bias factors and size factors to 1, effectively simulating "unbiased" raw data. loop_pattern : str, optional File path pattern to sparse JSON formatted cluster files representing loop cluster locations for the simulation. Should contain at least one '<chrom>' which will be replaced with the chromosome name when loading data for specific chromosomes. Pass None to use ``self.loop_patterns[cond]``. outdir : str Path to a directory to store the simulated data to. n_threads : int The number of threads (technically GIL-avoiding child processes) to use to process multiple chromosomes in parallel. Pass -1 to use as many threads as there are CPUs. Pass 0 to process the chromosomes serially. verbose : bool Pass False to silence reporting of progress to stderr. """ if chrom is None: if n_threads: parallel_apply( self.simulate, [{'cond': cond, 'chrom': c, 'beta': beta, 'p_diff': p_diff, 'skip_bias': skip_bias, 'loop_pattern': loop_pattern, 'outdir': outdir, 'verbose': False} for c in self.chroms], n_threads=n_threads ) else: for chrom in self.chroms: self.simulate(cond, chrom=chrom, beta=beta, p_diff=p_diff, loop_pattern=loop_pattern, outdir=outdir) return eprint('simulating data for chrom %s' % chrom) # resolve loop_pattern if loop_pattern is None: loop_pattern = self.loop_patterns[cond] # load everything bias = self.load_bias(chrom)[:, self.design[cond]] size_factors = self.load_data('size_factors', chrom) if len(size_factors.shape) == 2: size_factors = size_factors[:, self.design[cond]] else: size_factors = size_factors[self.design[cond]] row = self.load_data('row', chrom) col = self.load_data('col', chrom) scaled = self.load_data('scaled', chrom)[:, self.design[cond]] disp_fn = self.load_disp_fn(cond) clusters = load_clusters(loop_pattern.replace('<chrom>', chrom)) # compute pixel-wise mean of normalized data mean = np.mean(scaled, axis=1) # book keeping check_outdir('%s/' % outdir) n_sim_per_cond = size_factors.shape[-1] repnames = sum((['%s%i' % (c, i+1) for i in range(n_sim_per_cond)] for c in ['A', 'B']), []) # write design to disk if not present design_file = '%s/design.csv' % outdir if not os.path.isfile(design_file): pd.DataFrame( {'A': [1]*n_sim_per_cond + [0]*n_sim_per_cond, 'B': [0]*n_sim_per_cond + [1]*n_sim_per_cond}, dtype=bool, index=repnames ).to_csv(design_file) # rewrite size_factor matrix in terms of distance if len(size_factors.shape) == 2: eprint(' converting size factors', skip=not verbose) dist = col - row n_dists = dist.max() + 1 new_size_factors = np.zeros((n_dists, size_factors.shape[1])) for d in tqdm(range(n_dists)): idx = np.argmax(dist == d) new_size_factors[d, :] = size_factors[idx, :] size_factors = new_size_factors # get rid of bias if skip_bias: bias = np.ones_like(bias) size_factors = np.ones_like(size_factors) # tile bias and size_factors bias = np.tile(bias, 2) size_factors = np.tile(size_factors, 2) # simulate and save classes, sim_iter = simulate( row, col, mean, disp_fn, bias, size_factors, clusters, beta=beta, p_diff=p_diff, trend='dist', verbose=verbose) np.savetxt('%s/labels_%s.txt' % (outdir, chrom), classes, fmt='%s') for rep, csr in zip(repnames, sim_iter): sparse.save_npz('%s/%s_%s_raw.npz' % (outdir, rep, chrom), csr)
[docs] def evaluate(self, cluster_pattern, label_pattern, min_dist=None, max_dist=None, rerun_bh=False, outfile=None): """ Evaluates the results of this analysis, comparing it to true labels. Parameters ---------- cluster_pattern : str File path pattern to sparse JSON formatted cluster files representing loop cluster locations. Should contain at least one '<chrom>' which will be replaced with the chromosome name when loading data for specific chromosomes. Pass a condition name to use ``self.loop_patterns[cluster_pattern]`` instead. label_pattern : str File path pattern to true label files for each chromosome. Should contain at least one '<chrom>' which will be replaced with the chromosome name when loading data for specific chromosomes. Files should be loadable with ``np.loadtxt(..., dtype='U7')`` to yield a vector of true labels parallel to the clusters pointed to by ``cluster_pattern``. min_dist, max_dist : int, optional Specify minimum and maximum distances to evaluate performance within, respectively. Pass None to leave one or both ends unbounded. rerun_bh : bool If ``min_dist`` and/or ``max_dist`` are used to constrain the distances, pass True to re-run BH-FDR on the subset of p-values at the selected distances. Pass False to use the original dataset-wide q-values. Does nothing if ``min_dist`` and ``max_dist`` are both None. outfile : str, optional Name of a file to save the evaluation results to inside this object's ``outdir``. Default is 'eval.npz' if ``min_dist`` and ``max_dist`` are both None, otherwise it is 'eval_<min_dist>_<max_dist>.npz'. """ # resolve outfile if outfile is None: if min_dist is None and max_dist is None: outfile = 'eval.npz' else: outfile = 'eval_%s_%s.npz' % (min_dist, max_dist) # resolve case where a condition name was passed to cluster_pattern if cluster_pattern in self.loop_patterns.keys(): cluster_pattern = self.loop_patterns[cluster_pattern] # make y_true and pvalues/qvalues (if necessary) one chrom at a time y_true = [] pvalues = [] qvalues = [] for chrom in self.chroms: # load data disp_idx = self.load_data('disp_idx', chrom) loop_idx = self.load_data('loop_idx', chrom) row = self.load_data('row', chrom, idx=(disp_idx, loop_idx)) col = self.load_data('col', chrom, idx=(disp_idx, loop_idx)) clusters = load_clusters(cluster_pattern.replace('<chrom>', chrom)) labels = np.loadtxt(label_pattern.replace('<chrom>', chrom), dtype='U7') # construct dist_idx dist = col - row dist_idx = np.ones(len(dist), dtype=bool) if min_dist is not None: dist_idx[dist < min_dist] = False if max_dist is not None: dist_idx[dist > max_dist] = False # append to y_true and pvalues/qvalues (if necessary) y_true.append(make_y_true( row[dist_idx], col[dist_idx], clusters, labels)) if min_dist is not None or max_dist is not None: if rerun_bh: pvalues.append(self.load_data('pvalues', chrom, idx=(loop_idx, dist_idx))) else: qvalues.append(self.load_data('qvalues', chrom, idx=dist_idx)) # concatenate y_true and make or load qvalues y_true = np.concatenate(y_true) if pvalues: qvalues = adjust_pvalues(np.concatenate(pvalues)) elif qvalues: qvalues = np.concatenate(qvalues) else: qvalues, _ = self.load_data('qvalues', 'all') # evaluate and save to disk fdr, fpr, tpr, thresh = evaluate(y_true, qvalues) # save to disk np.savez('%s/%s' % (self.outdir, outfile), **{'fdr': fdr, 'fpr': fpr, 'tpr': tpr, 'thresh': thresh})