diff --git a/CHANGELOG.md b/CHANGELOG.md index 6beffa6a..77635cb4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,10 @@ # calour changelog +## Version 2024.8.25 + +New features: +* Add CorrelationExperiment class for working with correlation matrices and showing significance in heatmap +Other changes: +* Update experiment classes to provide the _get_abundance_info() method for the interactive heatmap (instead of being produced by the heatmap() method). This allows experiment class specific information to be shown in the heatmap abundance field when clicking on a feature/sample. ## Version 2024.5.30 add mRNAExperiment class for handling rna-seq data. interactive heatmap gene information is via the rna_calour module using Harmonizome server (https://maayanlab.cloud/Harmonizome) diff --git a/calour/__init__.py b/calour/__init__.py index 5379192a..c2109b98 100644 --- a/calour/__init__.py +++ b/calour/__init__.py @@ -12,23 +12,25 @@ from .experiment import Experiment from .amplicon_experiment import AmpliconExperiment +from .correlation_experiment import CorrelationExperiment from .ms1_experiment import MS1Experiment from .mrna_experiment import mRNAExperiment -from .io import read, read_amplicon, read_ms, read_qiime2 +from .io import read, read_amplicon, read_ms, read_qiime2, read_correlation from .util import set_log_level, register_functions __credits__ = "https://github.com/biocore/calour/graphs/contributors" -__version__ = "2024.5.30" +__version__ = "2024.8.25" -__all__ = ['read', 'read_amplicon', 'read_ms', 'read_qiime2', +__all__ = ['read', 'read_amplicon', 'read_ms', 'read_qiime2', 'read_correlation', 'Experiment', 'AmpliconExperiment', 'MS1Experiment','mRNAExperiment', + 'CorrelationExperiment', 'set_log_level'] # add member functions to the class -register_functions((Experiment, AmpliconExperiment, MS1Experiment, mRNAExperiment)) +register_functions((Experiment, AmpliconExperiment, MS1Experiment, mRNAExperiment, CorrelationExperiment)) # setting False 
allows other logger to print log. -fileConfig(resource_filename(__package__, 'log.cfg'), disable_existing_loggers=False) +fileConfig(resource_filename(__package__, 'log.cfg'), disable_existing_loggers=False) \ No newline at end of file diff --git a/calour/amplicon_experiment.py b/calour/amplicon_experiment.py index 17de670f..41c3f475 100644 --- a/calour/amplicon_experiment.py +++ b/calour/amplicon_experiment.py @@ -84,6 +84,24 @@ class AmpliconExperiment(Experiment): def __init__(self, *args, databases=('dbbact',), **kwargs): super().__init__(*args, databases=databases, **kwargs) + def _get_abundance_info(self, row:int , col:int): + '''Get a string with the abundance information for display in the interactive heatmap + For an amplicon experiment (which is based on normalized discrete reads), we show the abundance in float format (with 2 decimal places). + + Parameters + ---------- + row : int + The row index + col : int + The column index + + Returns + ------- + str + The string with the abundance information + ''' + return '{:.2f}'.format(self.data[row, col]) + def heatmap(self, *args, **kwargs): '''Plot a heatmap for the amplicon experiment. diff --git a/calour/correlation_experiment.py b/calour/correlation_experiment.py new file mode 100644 index 00000000..66c5fdbd --- /dev/null +++ b/calour/correlation_experiment.py @@ -0,0 +1,302 @@ +''' +correlation experiment (:mod:`calour.correlation_experiment`) +============================================================= + +.. currentmodule:: calour.correlation_experiment + +Classes +^^^^^^^ +.. autosummary:: + :toctree: generated + + CorrelationExperiment +''' + +# ---------------------------------------------------------------------------- +# Copyright (c) 2016--, Calour development team. +# +# Distributed under the terms of the Modified BSD License. +# +# The full license is in the file COPYING.txt, distributed with this software. 
+# ---------------------------------------------------------------------------- + +from logging import getLogger + +import numpy as np +import pandas as pd +import scipy.stats +from statsmodels.stats.multitest import multipletests + +from .experiment import Experiment +from .util import _to_list +from .analysis import _new_experiment_from_pvals, _CALOUR_DIRECTION, _CALOUR_STAT + +logger = getLogger(__name__) + + +class CorrelationExperiment(Experiment): + '''This class stores correlation matrix data and corresponding analysis methods. + Besides the main data matrix (which is the correlation values) it also stores an additional Experiment (in self.qvals) that contains a matrix containing the q-values for each correlation. + These can be plotted on top of the correlation matrix to show the significance of each correlation. + + This is a child class of :class:`.Experiment`. + + Parameters + ---------- + data : numpy.ndarray or scipy.sparse.csr_matrix + The correlation values (between -1 and 1) + sample_metadata : pandas.DataFrame + The metadata on the samples (rows in the matrix, shown in columns in the heatmap) + feature_metadata : pandas.DataFrame + The metadata on the features (columns in the matrix, shown in rows in the heatmap) + qvals : numpy.ndarray or scipy.sparse.csr_matrix or None + The q-values for the correlation values + NOTE: This is not guaranteed to be in the same order as the data matrix (unless _sync_qvals() is called) + description : str + name of experiment + sparse : bool + store the data array in :class:`scipy.sparse.csr_matrix` + or :class:`numpy.ndarray` + databases: iterable of str, optional + database interface names to show by default in heatmap() function + by default use None (no databases) + For ASV correlations, can use 'dbbact' + For gene correlations, can use 'mrna' + + Attributes + ---------- + data : numpy.ndarray or scipy.sparse.csr_matrix + The correlation values (between -1 and 1). + Samples are in row and features in column. 
values are float (can be negative) + with np.nan indicating ratio for the specific feature does not exist. + sample_metadata : pandas.DataFrame + The metadata on the samples + feature_metadata : pandas.DataFrame + The metadata on the features + qvals: numpy.ndarray or scipy.sparse.csr_matrix or None + The q-values for the correlation values + shape : tuple of (int, int) + the dimension of data + sparse : bool + store the data as sparse matrix (scipy.sparse.csr_matrix) or dense numpy array. + info : dict + information about the experiment (data md5, filenames, etc.) + description : str + name of the experiment + databases : dict + keys are the database names (i.e. 'dbbact' / 'gnps') + values are the database specific data for the experiment (i.e. annotations for dbbact) + + See Also + -------- + Experiment + ''' + def __init__(self, *args, qvals=None, **kwargs): + '''Init the CorrelationExperiment class + By default we set sparse=False (as we usually have a dense matrix) + ''' + if 'sparse' not in kwargs: + kwargs['sparse'] = False + super().__init__(*args, **kwargs) + if qvals is not None: + if self.data.shape != qvals.shape: + raise ValueError('qvals shape %s does not match data shape %s' % (qvals.shape, self.data.shape)) + self.qvals = Experiment(data=qvals, sample_metadata=self.sample_metadata, feature_metadata=self.feature_metadata, sparse=self.sparse) + + def _sync_qvals(self): + '''Sync the q-values experiment with the main experiment + Used to make sure the q-values are in the same order as the data matrix. + ''' + self.qvals = self.qvals.filter_ids(self.feature_metadata.index, axis='f') + self.qvals = self.qvals.filter_ids(self.sample_metadata.index, axis='s') + + def _get_abundance_info(self, row:int , col:int): + '''Get a string with the abundance information for display in the interactive heatmap + Also returns the qvalue if it exists. 
+ + Parameters + ---------- + row : int + The row index + col : int + The column index + + Returns + ------- + str + The string with the abundance information + ''' + if self.qvals is None: + qval = 'NA' + else: + qval = self.qvals.data[row, col] + return '{:.2E}, qval: {:.2f}'.format(self.data[row, col], qval) + + def heatmap(self, significance_plot=['cmap'],significance_threshold=0.05, significance_plot_params={'color': 'red'}, cmap='bwr', *args, **kwargs): + '''Plot a heatmap for the correlation experiment. + The heatmap includes indication for significant correlations. This can be shown as a different set of colors for the significant correlations or by plotting a marker for the significant correlations. + + This method accepts the same parameters as its parent class method. + In addition, it accepts the following parameters: + significance_plot : list of str, optional + The type of significance plot to show. Can be 'cmap' and/or 'x' + significance_threshold : float, optional + The threshold for the q-values to be considered significant. + significance_plot_params : dict, optional + The parameters to be passed to the plot function for the significance values. + If 'cmap' is in the list, use the 'cmap' parameter in significance_plot_params to set the colormap for the significant values. + If 'x' is in the list, use the 'significance_plot_params' parameter to set the plot parameters for the significance values. + + See Also + -------- + Experiment.heatmap + + ''' + import matplotlib.pyplot as plt + from matplotlib.colors import LinearSegmentedColormap + + if 'clim' not in kwargs: + min_val = np.min(self.get_data()[:]) + max_val = np.max(self.get_data()[:]) + range_val = np.max([np.abs(min_val), np.abs(max_val)]) + kwargs['clim'] = (-range_val, range_val) + + if significance_plot is None or significance_plot == []: + if self.qvals is None: + raise ValueError('No qvals attached to experiment. 
Please provide a qvals matrix to plot the significance values or use significance_plot=[] to not plot significance values.') + else: + self._sync_qvals() + + data_changed = False + if 'cmap' in significance_plot: + # copy the data + old_data = self.get_data(copy=True) + data_changed = True + + # eps is added to the data to avoid overlap in the colormaps for significant/non-significant values + eps = 1e-7 + + max_val = kwargs['clim'][1] + min_val = kwargs['clim'][0] + self.data[self.data>max_val]=max_val + self.data[self.data CorrelationExperiment: + '''Read a saved correlation experiment. + Loads both the original correlation data experiment and the q-values experiment. + + Parameters + ---------- + prefix : str + The file to read the experiment from (the names passed to CorrelationExperiment.save) + **kwargs : dict + Additional arguments to pass to the read + ''' + # store the function parameters for call history + fparams = locals() + + # by default, don't normalize the data since it is correlation data + if 'normalize' not in kwargs: + kwargs['normalize'] = None + + # load the main correlation experiment + logger.debug('Reading correlation experiment from %s' % prefix) + exp = read(prefix+'.biom', sample_metadata_file=prefix+'_sample.txt', feature_metadata_file=prefix+'_feature.txt', cls=CorrelationExperiment, **kwargs) + # and load the q-values table + logger.debug('Reading correlation matrix %s_qvals.biom' % prefix) + exp.qvals = read(prefix+'_qvals.biom', sample_metadata_file=prefix+'_qvals_sample.txt', feature_metadata_file=prefix+'_qvals_feature.txt', normalize=None) + + # initialize the call history + param = ['{0!s}={1!r}'.format(k, v) for k, v in fparams.items()] + exp._call_history = ['{0}({1})'.format('read_correlation', ','.join(param))] + + return exp + + @ds.with_indent(4) def read_ms(data_file, sample_metadata_file=None, feature_metadata_file=None, gnps_file=None, data_file_type='mzmine2', sample_in_row=None, direct_ids=None, 
get_mz_rt_from_feature_id=None,