Source code for hic3defdr.analysis.constructor

import pandas as pd
import dill as pickle

from lib5c.util.system import check_outdir

from hic3defdr.analysis.core import CoreHiC3DeFDR
from hic3defdr.analysis.analysis import AnalyzingHiC3DeFDR
from hic3defdr.analysis.simulation import SimulatingHiC3DeFDR
from hic3defdr.analysis.plotting import PlottingHiC3DeFDR


[docs]class HiC3DeFDR(CoreHiC3DeFDR, AnalyzingHiC3DeFDR, SimulatingHiC3DeFDR, PlottingHiC3DeFDR): """ Main object for hic3defdr analysis. Attributes ---------- raw_npz_patterns : list of str File path patterns to ``scipy.sparse`` formatted NPZ files containing raw contact matrices for each replicate, in order. Each file path pattern should contain at least one '<chrom>' which will be replaced with the chromosome name when loading data for specific chromosomes. bias_patterns : list of str File path patterns to ``np.savetxt()`` formatted files containing bias vector information for each replicate, in order. ach file path pattern should contain at least one '<chrom>' which will be replaced with the chromosome name when loading data for specific chromosomes. chroms : list of str List of chromosome names as strings. These names will be substituted in for '<chroms>' in the ``raw_npz_patterns`` and ``bias_patterns``. design : pd.DataFrame or str Pass a DataFrame with boolean dtype whose rows correspond to replicates and whose columns correspond to conditions. Replicate and condition names will be inferred from the row and column labels, respectively. If you pass a string, the DataFrame will be loaded via ``pd.read_csv(design, index_col=0)``. outdir : str Specify a directory to store the results of the analysis. Two different HiC3DeFDR analyses cannot co-exist in the same directory. The directory will be created if it does not exist. dist_thresh_min, dist_thresh_max : int The minimum and maximum interaction distance (in bin units) to include in the analysis. bias_thresh : float Bins with a bias factor below this threshold or above its reciprocal in any replicate will be filtered out of the analysis. mean_thresh : float Pixels with mean value below this threshold will be filtered out at the dispersion fitting stage. loop_patterns : dict of str, optional Keys should be condition names as strings, values should be file path patterns to sparse JSON formatted cluster files representing called loops in that condition. Each file path pattern should contain at least one '<chrom>' which will be replaced with the chromosome name when loading data for specific chromosomes. res : int, optional The bin resolution, in base pair units, of the input contact matrix data. Used only when printing TSV output. Pass None to skip printing TSV output during the ``threshold()`` and ``classify()`` steps. """ def __init__(self, raw_npz_patterns, bias_patterns, chroms, design, outdir, dist_thresh_min=4, dist_thresh_max=200, bias_thresh=0.1, mean_thresh=1.0, loop_patterns=None, res=None): """ Base constructor. See ``help(HiC3DeFDR)`` for details. """ self.raw_npz_patterns = raw_npz_patterns self.bias_patterns = bias_patterns self.chroms = chroms if type(design) == str: self.design = pd.read_csv(design, index_col=0) else: self.design = design self.outdir = outdir self.dist_thresh_min = dist_thresh_min self.dist_thresh_max = dist_thresh_max self.bias_thresh = bias_thresh self.mean_thresh = mean_thresh self.loop_patterns = loop_patterns self.res = res state = self.__dict__.copy() del state['outdir'] check_outdir(self.picklefile) with open(self.picklefile, 'wb') as handle: pickle.dump(state, handle, -1)