Source code for hic3defdr.util.cluster_table

import numpy as np
import scipy.sparse as sparse
import pandas as pd

from lib5c.util.bed import parse_feature_from_string
from lib5c.util.primers import natural_sort_key
from hic3defdr.util.clusters import cluster_to_loop_id, cluster_from_string


COLUMN_ORDER = ['loop_id', 'us_chrom', 'us_start', 'us_end', 'ds_chrom',
                'ds_start', 'ds_end', 'cluster_size', 'cluster']


def clusters_to_table(clusters, chrom, res):
    """
    Creates a DataFrame which tabulates cluster information.

    The DataFrame's first column (and index) will be a "loop_id" in the form
    "chr:start-end_chr:start-end". Its other columns will be "us_chrom",
    "us_start", "us_end", and "ds_chrom", "ds_start", "ds_end", representing
    the BED-style chromosome, start coordinate, and end coordinate of the
    upstream ("us", smaller coordinate values) and downstream ("ds", larger
    coordinate values) anchors of the loop, respectively. These anchors
    together form a rectangular "bounding box" that completely encloses the
    significant pixels in the cluster. The DataFrame will also have a
    "cluster_size" column representing the total number of significant pixels
    in the cluster. Finally, the exact indices of the significant pixels in
    the cluster will be recorded in a "cluster" column in a JSON-like format
    (using only square brackets).

    Parameters
    ----------
    clusters : list of list of tuple
        The outer list is a list of clusters. Each cluster is a list of (i, j)
        tuples marking the position of significant points which belong to
        that cluster.
    chrom : str
        The name of the chromosome these clusters are on.
    res : int
        The resolution of the contact matrix referred to by the row and
        column indices in ``clusters``, in units of base pairs.

    Returns
    -------
    pd.DataFrame
        The table of loop information.

    Examples
    --------
    >>> from hic3defdr.util.cluster_table import clusters_to_table
    >>> clusters = [[(1, 2), (1, 1)], [(4, 4), (3, 4)]]
    >>> df = clusters_to_table(clusters, 'chrX', 10000)
    >>> df.iloc[0, :]
    us_chrom                    chrX
    us_start                   10000
    us_end                     20000
    ds_chrom                    chrX
    ds_start                   10000
    ds_end                     30000
    cluster_size                   2
    cluster         [[1, 2], [1, 1]]
    Name: chrX:10000-20000_chrX:10000-30000, dtype: object
    """
    loops = []
    for cluster in clusters:
        cluster = list(cluster)
        loop_id = cluster_to_loop_id(cluster, chrom, res)
        us_anchor, ds_anchor = map(
            parse_feature_from_string, loop_id.split('_'))
        loops.append({
            'loop_id': loop_id,
            'us_chrom': us_anchor['chrom'],
            'us_start': us_anchor['start'],
            'us_end': us_anchor['end'],
            'ds_chrom': ds_anchor['chrom'],
            'ds_start': ds_anchor['start'],
            'ds_end': ds_anchor['end'],
            'cluster_size': len(cluster),
            'cluster': [list(c) for c in cluster]
        })
    return sort_cluster_table(
        pd.DataFrame(loops, columns=COLUMN_ORDER).set_index('loop_id'))
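

# The sketch below is a hypothetical demo (not part of hic3defdr) illustrating
# how the "us"/"ds" anchor columns relate to the cluster's pixel indices: in
# the doctest above, the upstream anchor spans the row (i) extent and the
# downstream anchor spans the column (j) extent, each scaled by the bin
# resolution. The exact loop_id string is built by cluster_to_loop_id(); this
# only checks the anchor arithmetic inferred from that example.
def _demo_bounding_box(res=10000):
    cluster = [(1, 2), (1, 1)]
    df = clusters_to_table([cluster], 'chrX', res)
    i, j = zip(*cluster)
    # upstream anchor covers bins min(i)..max(i); downstream covers
    # bins min(j)..max(j) (assumed convention, matching the doctest)
    assert df['us_start'].iloc[0] == min(i) * res        # 10000
    assert df['us_end'].iloc[0] == (max(i) + 1) * res    # 20000
    assert df['ds_start'].iloc[0] == min(j) * res        # 10000
    assert df['ds_end'].iloc[0] == (max(j) + 1) * res    # 30000
    return df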


def sort_cluster_table(cluster_table):
    r"""
    Sorts the rows of a cluster table in the expected order.

    This function does not operate in-place.

    We expect this to get a lot easier after this pandas issue is fixed:
    https://github.com/pandas-dev/pandas/issues/3942

    Parameters
    ----------
    cluster_table : pd.DataFrame
        The cluster table to sort. Must have all the expected columns.

    Returns
    -------
    pd.DataFrame
        The sorted cluster table.

    Examples
    --------
    >>> from hic3defdr.util.cluster_table import clusters_to_table, \
    ...     sort_cluster_table
    >>> clusters = [[(4, 4), (3, 4)], [(1, 2), (1, 1)]]
    >>> res = 10000
    >>> df1 = clusters_to_table(clusters, 'chr1', res)
    >>> df2 = clusters_to_table(clusters, 'chr2', res)
    >>> df3 = clusters_to_table(clusters, 'chr11', res)
    >>> df4 = clusters_to_table(clusters, 'chrX', res)
    >>> df = pd.concat([df4, df3, df2, df1], axis=0)
    >>> sort_cluster_table(df).index
    Index(['chr1:10000-20000_chr1:10000-30000',
           'chr1:30000-50000_chr1:40000-50000',
           'chr2:10000-20000_chr2:10000-30000',
           'chr2:30000-50000_chr2:40000-50000',
           'chr11:10000-20000_chr11:10000-30000',
           'chr11:30000-50000_chr11:40000-50000',
           'chrX:10000-20000_chrX:10000-30000',
           'chrX:30000-50000_chrX:40000-50000'],
          dtype='object', name='loop_id')
    """
    # these are the six BED-like columns
    sort_order = COLUMN_ORDER[1:7]
    # we can't sort directly on *_chrom
    # we will add these two surrogate columns to sort on instead
    sort_order[0] = 'us_chrom_idx'
    sort_order[3] = 'ds_chrom_idx'
    # create a mapping from chromosome names to their index in the sort order
    idx_to_chrom = sorted(list(set(list(cluster_table['us_chrom'].unique()) +
                                   list(cluster_table['ds_chrom'].unique()))),
                          key=natural_sort_key)
    chrom_to_idx = {idx_to_chrom[i]: i for i in range(len(idx_to_chrom))}
    # use this mapping to add our surrogate columns
    cluster_table['us_chrom_idx'] = cluster_table['us_chrom']\
        .replace(chrom_to_idx)
    cluster_table['ds_chrom_idx'] = cluster_table['ds_chrom']\
        .replace(chrom_to_idx)
    # sort, then drop the surrogate columns before returning
    return cluster_table.sort_values(sort_order)\
        .drop(columns=['us_chrom_idx', 'ds_chrom_idx'])
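

# Hypothetical demo (not part of hic3defdr) of the surrogate-column trick used
# above: chromosome names can't be sorted lexicographically ('chr11' would come
# before 'chr2'), so they are ranked with natural_sort_key and the rank column
# is sorted instead. The expected ordering is taken from the doctest above.
def _demo_natural_chrom_sort():
    chroms = ['chrX', 'chr11', 'chr2', 'chr1']
    # expected: ['chr1', 'chr2', 'chr11', 'chrX'], matching the doctest
    order = sorted(chroms, key=natural_sort_key)
    rank = {name: i for i, name in enumerate(order)}
    df = pd.DataFrame({'us_chrom': chroms, 'us_start': [0, 0, 0, 0]})
    df['us_chrom_idx'] = df['us_chrom'].replace(rank)
    return df.sort_values(['us_chrom_idx', 'us_start'])\
        .drop(columns='us_chrom_idx')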


def load_cluster_table(table_filename):
    r"""
    Loads a cluster table from a TSV file on disk to a DataFrame.

    This function will ensure that the "cluster" column of the DataFrame is
    converted from a string representation to a list of list of int to
    simplify downstream processing. See the example below for details on how
    this function assumes the cluster table was saved.

    Parameters
    ----------
    table_filename : str
        String reference to the location of the TSV file.

    Returns
    -------
    pd.DataFrame
        The loaded cluster table.

    Examples
    --------
    >>> from tempfile import TemporaryFile
    >>> from hic3defdr.util.cluster_table import clusters_to_table, \
    ...     load_cluster_table
    >>> clusters = [[(1, 2), (1, 1)], [(4, 4), (3, 4)]]
    >>> df = clusters_to_table(clusters, 'chrX', 10000)
    >>> f = TemporaryFile(mode='w+')  # simulates a file on disk
    >>> df.to_csv(f, sep='\t')
    >>> position = f.seek(0)
    >>> loaded_df = load_cluster_table(f)
    >>> df.equals(loaded_df)
    True
    >>> loaded_df['cluster'][0]
    [[1, 2], [1, 1]]
    """
    df = pd.read_csv(table_filename, sep='\t', index_col=0)
    df['cluster'] = df['cluster'].apply(cluster_from_string)
    return df
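

# Hypothetical round-trip demo (not part of hic3defdr): the doctest above uses
# a file object, but a plain path string works the same way, since the TSV is
# written by DataFrame.to_csv(sep='\t') and read back by load_cluster_table().
# The path 'clusters.tsv' is a made-up name used only for illustration.
def _demo_round_trip(path='clusters.tsv'):
    clusters = [[(1, 2), (1, 1)], [(4, 4), (3, 4)]]
    df = clusters_to_table(clusters, 'chrX', 10000)
    df.to_csv(path, sep='\t')          # index (loop_id) is written as column 0
    loaded = load_cluster_table(path)  # "cluster" strings parsed back to lists
    assert df.equals(loaded)
    return loaded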


def add_columns_to_cluster_table(cluster_table, name_pattern, row, col, data,
                                 labels=None, reducer='mean', chrom=None):
    r"""
    Adds new data columns to an existing cluster table by evaluating a sparse
    dataset specified by ``row``, ``col``, ``data`` at the pixels in each
    cluster and combining the resulting values using ``reducer``.

    This function operates in-place.

    Parameters
    ----------
    cluster_table : pd.DataFrame
        Must contain a "cluster" column. If the values in this column are
        strings, they will be "corrected" to list of lists of int in-place.
    name_pattern : str
        The name of the column to fill in. If ``data`` contains more than one
        column, multiple columns will be added - include exactly one ``%s``
        in the ``name_pattern``, then the ``i`` th new column will be called
        ``name_pattern % labels[i]``.
    row, col, data : np.ndarray
        Sparse format data to use to determine the value to fill in for each
        cluster for each new column. ``row`` and ``col`` must be parallel to
        the first dimension of ``data``. If ``data`` is two-dimensional, you
        must pass ``labels`` to label the columns and include a ``%s`` in
        ``name_pattern``.
    labels : list of str, optional
        If ``data`` is two-dimensional, pass a list of strings labeling the
        columns of ``data``.
    reducer : {'mean', 'max', 'min'}
        The function to use to combine the values for the pixels in each
        cluster.
    chrom : str, optional
        If the ``cluster_table`` contains data from multiple chromosomes,
        pass the name of the chromosome that ``row, col, data`` correspond to
        and only clusters for that chromosome will have their new column
        created/updated. If the ``cluster_table`` contains data from only one
        chromosome, pass None to update all clusters in the
        ``cluster_table``.

    Examples
    --------
    >>> import numpy as np
    >>> from hic3defdr.util.cluster_table import clusters_to_table, \
    ...     add_columns_to_cluster_table
    >>> # basic test: clusters all on one chromosome
    >>> clusters = [[(1, 2), (1, 1)], [(4, 4), (3, 4)]]
    >>> res = 10000
    >>> df = clusters_to_table(clusters, 'chrX', res)
    >>> row, col = zip(*sum(clusters, []))
    >>> data = np.array([[1, 2],
    ...                  [3, 4],
    ...                  [5, 6],
    ...                  [7, 8]], dtype=float)
    >>> add_columns_to_cluster_table(df, '%s_mean', row, col, data,
    ...                              labels=['rep1', 'rep2'])
    >>> df.iloc[0, :]
    us_chrom                    chrX
    us_start                   10000
    us_end                     20000
    ds_chrom                    chrX
    ds_start                   10000
    ds_end                     30000
    cluster_size                   2
    cluster         [[1, 2], [1, 1]]
    rep1_mean                      2
    rep2_mean                      3
    Name: chrX:10000-20000_chrX:10000-30000, dtype: object
    >>> # advanced test: two chromosomes
    >>> df1 = clusters_to_table(clusters, 'chr1', res)
    >>> df2 = clusters_to_table(clusters, 'chr2', res)
    >>> df = pd.concat([df1, df2], axis=0)
    >>> # add chr1 info
    >>> add_columns_to_cluster_table(df, '%s_mean', row, col, data,
    ...                              labels=['rep1', 'rep2'], chrom='chr1')
    >>> # chr1 cluster has data filled in
    >>> df.loc[df.index[0], ['rep1_mean', 'rep2_mean']]
    rep1_mean    2
    rep2_mean    3
    Name: chr1:10000-20000_chr1:10000-30000, dtype: object
    >>> # chr2 cluster has nans
    >>> df.loc[df.index[2], ['rep1_mean', 'rep2_mean']]
    rep1_mean   NaN
    rep2_mean   NaN
    Name: chr2:10000-20000_chr2:10000-30000, dtype: object
    >>> # add chr2 info, with different data (reversed row order)
    >>> add_columns_to_cluster_table(df, '%s_mean', row, col, data[::-1, :],
    ...                              labels=['rep1', 'rep2'], chrom='chr2')
    >>> # now the chr2 clusters have data
    >>> df.loc[df.index[2], ['rep1_mean', 'rep2_mean']]
    rep1_mean    6
    rep2_mean    7
    Name: chr2:10000-20000_chr2:10000-30000, dtype: object
    >>> # edge case: data is a vector
    >>> df = clusters_to_table(clusters, 'chrX', res)
    >>> add_columns_to_cluster_table(df, 'value', row, col, data[:, 0])
    >>> df.iloc[0, :]
    us_chrom                    chrX
    us_start                   10000
    us_end                     20000
    ds_chrom                    chrX
    ds_start                   10000
    ds_end                     30000
    cluster_size                   2
    cluster         [[1, 2], [1, 1]]
    value                          2
    Name: chrX:10000-20000_chrX:10000-30000, dtype: object
    """
    # determine reducer
    reducer = {
        'mean': np.mean,
        'max': np.max,
        'min': np.min
    }[reducer]

    # determine which rows of the cluster table lie on this chromosome
    row_idx = np.ones(len(cluster_table), dtype=bool)
    if chrom is not None:
        row_idx = (cluster_table['us_chrom'] == chrom) & \
            (cluster_table['ds_chrom'] == chrom)

    # promote data to 2D
    if data.ndim == 1:
        data = data[:, None]

    # create CSRs for each column of data
    csrs = [sparse.coo_matrix((data[:, i], (row, col))).tocsr()
            for i in range(data.shape[1])]

    # iterate over columns of data
    for i in range(data.shape[1]):
        # determine the name of the new column
        column_name = name_pattern % labels[i] if labels is not None \
            else name_pattern
        # add the new column if it doesn't exist yet
        if column_name not in cluster_table.columns:
            cluster_table[column_name] = np.ones(len(cluster_table)) * np.nan
        # fill in information for the new column
        cluster_table.loc[row_idx, column_name] = \
            cluster_table.loc[row_idx, 'cluster']\
            .map(lambda x: reducer(csrs[i][tuple(zip(*x))]))
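

# Hypothetical sketch (not part of hic3defdr) of the indexing trick used in
# add_columns_to_cluster_table(): the (row, col, data) triple is converted to
# CSR, the cluster's (i, j) tuples are transposed into fancy-index arrays, and
# the selected values are collapsed with the reducer (here np.mean), matching
# the rep1_mean value in the doctest above.
def _demo_reducer():
    clusters = [[(1, 2), (1, 1)], [(4, 4), (3, 4)]]
    row, col = zip(*sum(clusters, []))            # flatten pixel coordinates
    values = np.array([1., 3., 5., 7.])           # one value per pixel (rep1)
    csr = sparse.coo_matrix((values, (row, col))).tocsr()
    cluster = clusters[0]                         # [(1, 2), (1, 1)]
    pixel_values = csr[tuple(zip(*cluster))]      # the two pixel values: 1., 3.
    return np.mean(pixel_values)                  # 2.0, as in the doctest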