Source code for hic3defdr.util.cluster_table

import numpy as np
import scipy.sparse as sparse
import pandas as pd

from lib5c.util.bed import parse_feature_from_string
from lib5c.util.primers import natural_sort_key
from hic3defdr.util.clusters import cluster_to_loop_id, cluster_from_string


COLUMN_ORDER = ['loop_id', 'us_chrom', 'us_start', 'us_end', 'ds_chrom',
                'ds_start', 'ds_end', 'cluster_size', 'cluster']


def clusters_to_table(clusters, chrom, res):
    """
    Creates a DataFrame which tabulates cluster information.

    The DataFrame's first column (and index) will be a "loop_id" in the form
    "chr:start-end_chr:start-end". Its other columns will be "us_chrom",
    "us_start", "us_end", and "ds_chrom", "ds_start", "ds_end", representing
    the BED-style chromosome, start coordinate, and end coordinate of the
    upstream ("us", smaller coordinate values) and downstream ("ds", larger
    coordinate values) anchors of the loop, respectively. These anchors
    together form a rectangular "bounding box" that completely encloses the
    significant pixels in the cluster. The DataFrame will also have a
    "cluster_size" column representing the total number of significant pixels
    in the cluster. Finally, the exact indices of the significant pixels in
    the cluster will be recorded in a "cluster" column in a JSON-like format
    (using only square brackets).

    Parameters
    ----------
    clusters : list of list of tuple
        The outer list is a list of clusters. Each cluster is a list of (i, j)
        tuples marking the position of significant points which belong to
        that cluster.
    chrom : str
        The name of the chromosome these clusters are on.
    res : int
        The resolution of the contact matrix referred to by the row and
        column indices in ``clusters``, in units of base pairs.

    Returns
    -------
    pd.DataFrame
        The table of loop information.

    Examples
    --------
    >>> from hic3defdr.util.cluster_table import clusters_to_table
    >>> clusters = [[(1, 2), (1, 1)], [(4, 4), (3, 4)]]
    >>> df = clusters_to_table(clusters, 'chrX', 10000)
    >>> df.iloc[0, :]
    us_chrom                    chrX
    us_start                   10000
    us_end                     20000
    ds_chrom                    chrX
    ds_start                   10000
    ds_end                     30000
    cluster_size                   2
    cluster         [[1, 2], [1, 1]]
    Name: chrX:10000-20000_chrX:10000-30000, dtype: object
    """
    loops = []
    for cluster in clusters:
        cluster = list(cluster)
        loop_id = cluster_to_loop_id(cluster, chrom, res)
        us_anchor, ds_anchor = map(
            parse_feature_from_string, loop_id.split('_'))
        loops.append({
            'loop_id': loop_id,
            'us_chrom': us_anchor['chrom'],
            'us_start': us_anchor['start'],
            'us_end': us_anchor['end'],
            'ds_chrom': ds_anchor['chrom'],
            'ds_start': ds_anchor['start'],
            'ds_end': ds_anchor['end'],
            'cluster_size': len(cluster),
            'cluster': [list(c) for c in cluster]
        })
    return sort_cluster_table(
        pd.DataFrame(loops, columns=COLUMN_ORDER).set_index('loop_id'))
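

# The sketch below is a hypothetical demo (not part of hic3defdr) illustrating
# how the "us"/"ds" anchor columns relate to the cluster's pixel indices: in
# the doctest above, the upstream anchor spans the row (i) extent and the
# downstream anchor spans the column (j) extent, each scaled by the bin
# resolution. The exact loop_id string is built by cluster_to_loop_id(); this
# only checks the anchor arithmetic inferred from that example.
def _demo_bounding_box(res=10000):
    cluster = [(1, 2), (1, 1)]
    df = clusters_to_table([cluster], 'chrX', res)
    i, j = zip(*cluster)
    # upstream anchor covers bins min(i)..max(i); downstream covers
    # bins min(j)..max(j) (assumed convention, matching the doctest)
    assert df['us_start'].iloc[0] == min(i) * res        # 10000
    assert df['us_end'].iloc[0] == (max(i) + 1) * res    # 20000
    assert df['ds_start'].iloc[0] == min(j) * res        # 10000
    assert df['ds_end'].iloc[0] == (max(j) + 1) * res    # 30000
    return df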


def sort_cluster_table(cluster_table):
    r"""
    Sorts the rows of a cluster table in the expected order.

    This function does not operate in-place.

    We expect this to get a lot easier after this pandas issue is fixed:
    https://github.com/pandas-dev/pandas/issues/3942

    Parameters
    ----------
    cluster_table : pd.DataFrame
        The cluster table to sort. Must have all the expected columns.

    Returns
    -------
    pd.DataFrame
        The sorted cluster table.

    Examples
    --------
    >>> from hic3defdr.util.cluster_table import clusters_to_table, \
    ...     sort_cluster_table
    >>> clusters = [[(4, 4), (3, 4)], [(1, 2), (1, 1)]]
    >>> res = 10000
    >>> df1 = clusters_to_table(clusters, 'chr1', res)
    >>> df2 = clusters_to_table(clusters, 'chr2', res)
    >>> df3 = clusters_to_table(clusters, 'chr11', res)
    >>> df4 = clusters_to_table(clusters, 'chrX', res)
    >>> df = pd.concat([df4, df3, df2, df1], axis=0)
    >>> sort_cluster_table(df).index
    Index(['chr1:10000-20000_chr1:10000-30000',
           'chr1:30000-50000_chr1:40000-50000',
           'chr2:10000-20000_chr2:10000-30000',
           'chr2:30000-50000_chr2:40000-50000',
           'chr11:10000-20000_chr11:10000-30000',
           'chr11:30000-50000_chr11:40000-50000',
           'chrX:10000-20000_chrX:10000-30000',
           'chrX:30000-50000_chrX:40000-50000'],
          dtype='object', name='loop_id')
    """
    # these are the six BED-like columns
    sort_order = COLUMN_ORDER[1:7]
    # we can't sort directly on *_chrom
    # we will add these two surrogate columns to sort on instead
    sort_order[0] = 'us_chrom_idx'
    sort_order[3] = 'ds_chrom_idx'
    # create a mapping from chromosome names to their index in the sort order
    idx_to_chrom = sorted(list(set(list(cluster_table['us_chrom'].unique()) +
                                   list(cluster_table['ds_chrom'].unique()))),
                          key=natural_sort_key)
    chrom_to_idx = {idx_to_chrom[i]: i for i in range(len(idx_to_chrom))}
    # use this mapping to add our surrogate columns
    cluster_table['us_chrom_idx'] = cluster_table['us_chrom']\
        .replace(chrom_to_idx)
    cluster_table['ds_chrom_idx'] = cluster_table['ds_chrom']\
        .replace(chrom_to_idx)
    # sort, then drop the surrogate columns before returning
    return cluster_table.sort_values(sort_order)\
        .drop(columns=['us_chrom_idx', 'ds_chrom_idx'])
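

# Hypothetical demo (not part of hic3defdr) of the surrogate-column trick used
# above: chromosome names can't be sorted lexicographically ('chr11' would come
# before 'chr2'), so they are ranked with natural_sort_key and the rank column
# is sorted instead. The expected ordering is taken from the doctest above.
def _demo_natural_chrom_sort():
    chroms = ['chrX', 'chr11', 'chr2', 'chr1']
    # expected: ['chr1', 'chr2', 'chr11', 'chrX'], matching the doctest
    order = sorted(chroms, key=natural_sort_key)
    rank = {name: i for i, name in enumerate(order)}
    df = pd.DataFrame({'us_chrom': chroms, 'us_start': [0, 0, 0, 0]})
    df['us_chrom_idx'] = df['us_chrom'].replace(rank)
    return df.sort_values(['us_chrom_idx', 'us_start'])\
        .drop(columns='us_chrom_idx')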


def load_cluster_table(table_filename):
    r"""
    Loads a cluster table from a TSV file on disk to a DataFrame.

    This function will ensure that the "cluster" column of the DataFrame is
    converted from a string representation to a list of list of int to
    simplify downstream processing. See the example below for details on how
    this function assumes the cluster table was saved.

    Parameters
    ----------
    table_filename : str
        String reference to the location of the TSV file.

    Returns
    -------
    pd.DataFrame
        The loaded cluster table.

    Examples
    --------
    >>> from tempfile import TemporaryFile
    >>> from hic3defdr.util.cluster_table import clusters_to_table, \
    ...     load_cluster_table
    >>> clusters = [[(1, 2), (1, 1)], [(4, 4), (3, 4)]]
    >>> df = clusters_to_table(clusters, 'chrX', 10000)
    >>> f = TemporaryFile(mode='w+')  # simulates a file on disk
    >>> df.to_csv(f, sep='\t')
    >>> position = f.seek(0)
    >>> loaded_df = load_cluster_table(f)
    >>> df.equals(loaded_df)
    True
    >>> loaded_df['cluster'][0]
    [[1, 2], [1, 1]]
    """
    df = pd.read_csv(table_filename, sep='\t', index_col=0)
    df['cluster'] = df['cluster'].apply(cluster_from_string)
    return df
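

# Hypothetical round-trip demo (not part of hic3defdr): the doctest above uses
# a file object, but a plain path string works the same way, since the TSV is
# written by DataFrame.to_csv(sep='\t') and read back by load_cluster_table().
# The path 'clusters.tsv' is a made-up name used only for illustration.
def _demo_round_trip(path='clusters.tsv'):
    clusters = [[(1, 2), (1, 1)], [(4, 4), (3, 4)]]
    df = clusters_to_table(clusters, 'chrX', 10000)
    df.to_csv(path, sep='\t')          # index (loop_id) is written as column 0
    loaded = load_cluster_table(path)  # "cluster" strings parsed back to lists
    assert df.equals(loaded)
    return loaded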


def add_columns_to_cluster_table(cluster_table, name_pattern, row, col, data,
                                 labels=None, reducer='mean', chrom=None):
    r"""
    Adds new data columns to an existing cluster table by evaluating a sparse
    dataset specified by ``row``, ``col``, ``data`` at the pixels in each
    cluster and combining the resulting values using ``reducer``.

    This function operates in-place.

    Parameters
    ----------
    cluster_table : pd.DataFrame
        Must contain a "cluster" column. If the values in this column are
        strings, they will be "corrected" to list of lists of int in-place.
    name_pattern : str
        The name of the column to fill in. If ``data`` contains more than one
        column, multiple columns will be added - include exactly one ``%s``
        in the ``name_pattern``, then the ``i`` th new column will be called
        ``name_pattern % labels[i]``.
    row, col, data : np.ndarray
        Sparse format data to use to determine the value to fill in for each
        cluster for each new column. ``row`` and ``col`` must be parallel to
        the first dimension of ``data``. If ``data`` is two-dimensional, you
        must pass ``labels`` to label the columns and include a ``%s`` in
        ``name_pattern``.
    labels : list of str, optional
        If ``data`` is two-dimensional, pass a list of strings labeling the
        columns of ``data``.
    reducer : {'mean', 'max', 'min'}
        The function to use to combine the values for the pixels in each
        cluster.
    chrom : str, optional
        If the ``cluster_table`` contains data from multiple chromosomes,
        pass the name of the chromosome that ``row, col, data`` correspond to
        and only clusters for that chromosome will have their new column
        created/updated. If the ``cluster_table`` contains data from only one
        chromosome, pass None to update all clusters in the
        ``cluster_table``.

    Examples
    --------
    >>> import numpy as np
    >>> from hic3defdr.util.cluster_table import clusters_to_table, \
    ...     add_columns_to_cluster_table
    >>> # basic test: clusters all on one chromosome
    >>> clusters = [[(1, 2), (1, 1)], [(4, 4), (3, 4)]]
    >>> res = 10000
    >>> df = clusters_to_table(clusters, 'chrX', res)
    >>> row, col = zip(*sum(clusters, []))
    >>> data = np.array([[1, 2],
    ...                  [3, 4],
    ...                  [5, 6],
    ...                  [7, 8]], dtype=float)
    >>> add_columns_to_cluster_table(df, '%s_mean', row, col, data,
    ...                              labels=['rep1', 'rep2'])
    >>> df.iloc[0, :]
    us_chrom                    chrX
    us_start                   10000
    us_end                     20000
    ds_chrom                    chrX
    ds_start                   10000
    ds_end                     30000
    cluster_size                   2
    cluster         [[1, 2], [1, 1]]
    rep1_mean                      2
    rep2_mean                      3
    Name: chrX:10000-20000_chrX:10000-30000, dtype: object
    >>> # advanced test: two chromosomes
    >>> df1 = clusters_to_table(clusters, 'chr1', res)
    >>> df2 = clusters_to_table(clusters, 'chr2', res)
    >>> df = pd.concat([df1, df2], axis=0)
    >>> # add chr1 info
    >>> add_columns_to_cluster_table(df, '%s_mean', row, col, data,
    ...                              labels=['rep1', 'rep2'], chrom='chr1')
    >>> # chr1 cluster has data filled in
    >>> df.loc[df.index[0], ['rep1_mean', 'rep2_mean']]
    rep1_mean    2
    rep2_mean    3
    Name: chr1:10000-20000_chr1:10000-30000, dtype: object
    >>> # chr2 cluster has nans
    >>> df.loc[df.index[2], ['rep1_mean', 'rep2_mean']]
    rep1_mean   NaN
    rep2_mean   NaN
    Name: chr2:10000-20000_chr2:10000-30000, dtype: object
    >>> # add chr2 info, with different data (reversed row order)
    >>> add_columns_to_cluster_table(df, '%s_mean', row, col, data[::-1, :],
    ...                              labels=['rep1', 'rep2'], chrom='chr2')
    >>> # now the chr2 clusters have data
    >>> df.loc[df.index[2], ['rep1_mean', 'rep2_mean']]
    rep1_mean    6
    rep2_mean    7
    Name: chr2:10000-20000_chr2:10000-30000, dtype: object
    >>> # edge case: data is a vector
    >>> df = clusters_to_table(clusters, 'chrX', res)
    >>> add_columns_to_cluster_table(df, 'value', row, col, data[:, 0])
    >>> df.iloc[0, :]
    us_chrom                    chrX
    us_start                   10000
    us_end                     20000
    ds_chrom                    chrX
    ds_start                   10000
    ds_end                     30000
    cluster_size                   2
    cluster         [[1, 2], [1, 1]]
    value                          2
    Name: chrX:10000-20000_chrX:10000-30000, dtype: object
    """
    # determine reducer
    reducer = {
        'mean': np.mean,
        'max': np.max,
        'min': np.min
    }[reducer]

    # determine which rows of the cluster table lie on this chromosome
    row_idx = np.ones(len(cluster_table), dtype=bool)
    if chrom is not None:
        row_idx = (cluster_table['us_chrom'] == chrom) & \
            (cluster_table['ds_chrom'] == chrom)

    # promote data to 2D
    if data.ndim == 1:
        data = data[:, None]

    # create CSRs for each column of data
    csrs = [sparse.coo_matrix((data[:, i], (row, col))).tocsr()
            for i in range(data.shape[1])]

    # iterate over columns of data
    for i in range(data.shape[1]):
        # determine the name of the new column
        column_name = name_pattern % labels[i] if labels is not None \
            else name_pattern
        # add the new column if it doesn't exist yet
        if column_name not in cluster_table.columns:
            cluster_table[column_name] = np.ones(len(cluster_table)) * np.nan
        # fill in information for the new column
        cluster_table.loc[row_idx, column_name] = \
            cluster_table.loc[row_idx, 'cluster']\
            .map(lambda x: reducer(csrs[i][tuple(zip(*x))]))
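

# Hypothetical sketch (not part of hic3defdr) of the indexing trick used in
# add_columns_to_cluster_table(): the (row, col, data) triple is converted to
# CSR, the cluster's (i, j) tuples are transposed into fancy-index arrays, and
# the selected values are collapsed with the reducer (here np.mean), matching
# the rep1_mean value in the doctest above.
def _demo_reducer():
    clusters = [[(1, 2), (1, 1)], [(4, 4), (3, 4)]]
    row, col = zip(*sum(clusters, []))            # flatten pixel coordinates
    values = np.array([1., 3., 5., 7.])           # one value per pixel (rep1)
    csr = sparse.coo_matrix((values, (row, col))).tocsr()
    cluster = clusters[0]                         # [(1, 2), (1, 1)]
    pixel_values = csr[tuple(zip(*cluster))]      # the two pixel values: 1., 3.
    return np.mean(pixel_values)                  # 2.0, as in the doctest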