Skip to content

Commit

Permalink
Changed multiprocessing code to use Pathos. Removed dill recurse opti…
Browse files Browse the repository at this point in the history
…on. Ran black on all files
  • Loading branch information
Patrick Nicodemus committed Sep 27, 2024
1 parent 133cffc commit b6228d4
Show file tree
Hide file tree
Showing 17 changed files with 152 additions and 137 deletions.
8 changes: 4 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ The ```-p``` flag controls the port number on local host. For example, writing `
## Documentation
Extensive documentation, including several tutorials, can be found in [CAJAL's readthedocs.io website](https://cajal.readthedocs.io/en/latest/index.html). This website is under development and will continue to be substantially updated during the coming months.

## New in this release (v1.0.2, 8/22/2024)
- Released on PyPI
- Included a fix in cell segmentation contributed by @YuchenXiangEMBL
- Wheels are available on the Github Release page
## New in this release (v1.0.4, tbd)
- Added multiprocessing back to sample_swc

## Changelog since v1.0.3
168 changes: 77 additions & 91 deletions docs/notebooks/Example_1.ipynb

Large diffs are not rendered by default.

11 changes: 8 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "cajal"
version = "1.0.3"
version = "1.0.4"
description="A library for multi-modal cell morphology analyses using Gromov-Wasserstein (GW) distance."
readme="./README.md"
requires-python=">=3.9"
Expand All @@ -32,7 +32,7 @@ dependencies = [
"networkx>=2.8.8",
"numpy",
"cython >= 3",
"pathos",
"pathos >= 0.3.2",
"tqdm>=4.64.1",
"potpourri3d",
"pynndescent>=0.5.13",
Expand All @@ -50,7 +50,12 @@ dependencies = [
dev = [
"mypy>=0.991",
"pytest >= 7.2.1",
"pre-commit >= 2.20.0"
"pre-commit >= 2.20.0",
"myst_parser",
"plotly",
"nbsphinx",
"navis",
"sphinx_rtd_theme"
]

vis = [
Expand Down
1 change: 1 addition & 0 deletions src/cajal/partial.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""For testing partial matching."""

import itertools as it

import numpy as np
Expand Down
2 changes: 1 addition & 1 deletion src/cajal/qgw.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Functions for computing the quantized Gromov-Wasserstein distance and the SLB between \
metric measure spaces, and related utilities for file IO and parallel computation."""

# std lib dependencies
import csv
import itertools as it
Expand Down Expand Up @@ -446,7 +447,6 @@ def _quantized_gw_index(p: tuple[int, int]) -> tuple[int, int, float]:
"""Given input p= (i,j), compute the quantized GW distance between cells i \
and j in the global list of quantized cells."""
i, j = p
retval: tuple[int, int, float]
return (
i,
j,
Expand Down
1 change: 1 addition & 0 deletions src/cajal/run_gw.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Functionality to compute Gromov-Wasserstein distances \
using algorithms in Peyre et al. ICML 2016."""

from __future__ import annotations

import csv
Expand Down
1 change: 1 addition & 0 deletions src/cajal/sample_mesh.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""
Functions for sampling points from a triangular mesh
"""

from __future__ import annotations
import os
import sys
Expand Down
3 changes: 2 additions & 1 deletion src/cajal/sample_seg.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from pathos.pools import ProcessPool
from .utilities import write_csv_block


def _filter_to_cells(segmask: npt.NDArray[np.int_], background: int) -> list[int]:
"""
Return a list of identifiers for cells in the interior of the image.
Expand All @@ -24,6 +25,7 @@ def _filter_to_cells(segmask: npt.NDArray[np.int_], background: int) -> list[int
remove_cells.update(np.unique(segmask[:, -1]))
return list(cell_ids.difference(remove_cells))


def cell_boundaries(
imarray: npt.NDArray[np.int_],
n_sample: int,
Expand Down Expand Up @@ -80,7 +82,6 @@ def cell_boundaries(
return list(outlist)



def _compute_intracell_all(
infolder: str,
n_sample: int,
Expand Down
27 changes: 15 additions & 12 deletions src/cajal/sample_swc.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from typing import Callable, Iterator, Union

import numpy as np
from pathos.pools import ProcessPool
import numpy.typing as npt
from scipy.spatial.distance import euclidean, pdist
from tqdm import tqdm
Expand Down Expand Up @@ -481,6 +482,7 @@ def compute_icdm_all_euclidean(
infolder: str,
out_csv: str,
n_sample: int,
num_processes: int = 8,
preprocess: Callable[[SWCForest], Union[Err[T], SWCForest]] = lambda forest: forest,
name_validate: Callable[[str], bool] = default_name_validate,
) -> list[tuple[str, Err[T]]]:
Expand Down Expand Up @@ -537,13 +539,13 @@ def rpce(file_path: str) -> Union[Err[T], npt.NDArray[np.float64]]:
# args = zip([file_paths,repeat(n_sample),repeat(preprocess)])
icdms: Iterator[Union[Err[T], npt.NDArray[np.float64]]]
failed_cells: list[tuple[str, Err[T]]]
# with ProcessPool(nodes=num_processes) as pool:
# icdms = pool.imap(rpce, file_paths)
icdms = map(rpce, file_paths)
# icdms = pool.starmap(read_preprocess_compute_euclidean,args)
tq_icdms = tqdm(icdms, total=len(cell_names))

failed_cells = write_csv_block(out_csv, n_sample, zip(cell_names, tq_icdms), 3)
pool = ProcessPool(nodes=num_processes)
results = tqdm(pool.imap(rpce, file_paths), total=len(cell_names))
failed_cells = write_csv_block(out_csv, n_sample, zip(cell_names, results), 10)
pool.close()
pool.join()
pool.clear()
return failed_cells


Expand All @@ -555,7 +557,7 @@ def compute_icdm_all_geodesic(
preprocess: Callable[
[SWCForest], Union[Err[T], NeuronTree]
] = lambda forest: forest[0],
name_validate: Callable[[str], bool] = default_name_validate
name_validate: Callable[[str], bool] = default_name_validate,
) -> list[tuple[str, Err[T]]]:
"""
Compute the intracell geodesic distance matrices for all swc cells in `infolder`.
Expand All @@ -576,10 +578,11 @@ def rpcg(file_path) -> Union[Err[T], npt.NDArray[np.float64]]:

icdms: Iterator[Err[T] | npt.NDArray[np.float64]]
failed_cells: list[tuple[str, Err[T]]]
# with ProcessPool(nodes=num_processes) as pool:
# pool.restart(force=True)
icdms = map(rpcg, file_paths)
tq_icdms = tqdm(icdms, total=len(cell_names))
failed_cells = write_csv_block(out_csv, n_sample, zip(cell_names, tq_icdms), 10)

pool = ProcessPool(nodes=num_processes)
results = tqdm(pool.imap(rpcg, file_paths), total=len(cell_names))
failed_cells = write_csv_block(out_csv, n_sample, zip(cell_names, results), 10)
pool.close()
pool.join()
pool.clear()
return failed_cells
5 changes: 4 additions & 1 deletion src/cajal/swc.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
of an \*.swc file. Basic functions for manipulating, examining, validating and \
filtering \*.swc files. A function for reading \*.swc files from memory.
"""

from __future__ import annotations

import os
Expand All @@ -21,14 +22,15 @@

from .utilities import Err, T

dill.settings["recurse"] = True
# dill.settings["recurse"] = True


@dataclass
class NeuronNode:
r"""
A NeuronNode represents the contents of a single line in an \*.swc file.
"""

sample_number: int
structure_id: int
coord_triple: tuple[float, float, float]
Expand All @@ -44,6 +46,7 @@ class NeuronTree:
r"""
A NeuronTree represents one connected component of the graph coded in an \*.swc file.
"""

root: NeuronNode
child_subgraphs: list[NeuronTree]

Expand Down
8 changes: 5 additions & 3 deletions src/cajal/ternary.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import numpy as np
import numpy.typing as npt
from numpy.linalg import norm
from typing import Literal, Optional, Any
from typing import Literal, Optional

from scipy.spatial.distance import squareform
from scipy.stats import gaussian_kde
Expand Down Expand Up @@ -215,8 +215,10 @@ def ternary_distance_clusters(
coloring will vary more.
:param contour_lines: How many contour lines to draw.
:param figsize: Passed to matplotlib.pyplot.subplots.
:param clusters: Labels for clusters, should be the same length as the distance matrices featurei_dispersion
:param min_cluster_size: Ignore clusters below the threshold size (density plots are somewhat useless when there are very few observations)
:param clusters: Labels for clusters, should be the same length as the distance matrices
featurei_dispersion
:param min_cluster_size: Ignore clusters below the threshold size (density plots are somewhat
useless when there are very few observations)
:param mpl_params: Passed to matplotlib.
"""

Expand Down
1 change: 1 addition & 0 deletions src/cajal/utilities.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""
Helper functions.
"""

from dataclasses import dataclass
import csv
from scipy.spatial.distance import squareform
Expand Down
1 change: 1 addition & 0 deletions src/cajal/weighted_tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
Defines a WeightedTree class, to represent the information relevant in an SWC from the \
geodesic point of view. Defines functions for manipulating and processing WeightedTrees.
"""

from __future__ import annotations
from dataclasses import dataclass
from typing import Union
Expand Down
40 changes: 22 additions & 18 deletions src/cajal/wnn.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,17 +15,18 @@

class Modality:
"""
A Modality is a dataset profiling a collection of cells from one perspective or using one technology.
It can be constructed using either a set of observations in n-dimensional space (a `k` by `n`) matrix,
where `k` is the number of cells and `n` is the dimensionality of the ambient space;
or it can be constructed using a distance matrix (a `k` by `k`).
If only a distance matrix is supplied, then the constructor chooses an embedding of the points in the
distance matrix into n-dimensional space using Isomap,
so a set of observations in a vector space is preferable when it is available.
If using a distance matrix, a Modality object must be constructed together with a given number of neighbors to consider when
constructing the nearest neighbors graphs for the Isomap embedding.
This number should be at least as high as the number of neighbors you care about when analyzing the output of the WNN embedding.
A Modality is a dataset profiling a collection of cells from one perspective or using one
technology. It can be constructed using either a set of observations in n-dimensional space (a
`k` by `n` matrix), where `k` is the number of cells and `n` is the dimensionality of the
ambient space; or it can be constructed using a distance matrix (a `k` by `k`). If only a
distance matrix is supplied, then the constructor chooses an embedding of the points in the
distance matrix into n-dimensional space using Isomap, so a set of observations in a vector
space is preferable when it is available.
If using a distance matrix, a Modality object must be constructed together with a given number
of neighbors to consider when constructing the nearest neighbors graphs for the Isomap
embedding. This number should be at least as high as the number of neighbors you care about when
analyzing the output of the WNN embedding.
"""

def local_bandwidth(self, margin_count: int = 20):
Expand Down Expand Up @@ -75,8 +76,10 @@ def of_dmat(
"""
:param dmat: A distance matrix.
:param intrinsic_dim: If you have computed the intrinsic dimension
of your space by a technique other than MADA, feed the precomputed dimension in here as a parameter.
:param n_neighbors: How many nearest neighbors to build when constructing the Isomap embedding.
of your space by a technique other than MADA, feed the precomputed dimension in here as a
parameter.
:param n_neighbors: How many nearest neighbors to build when constructing the Isomap
embedding.
:returns: A Modality object constructed from the distance matrix.
"""
Expand All @@ -98,17 +101,18 @@ def of_dmat(
def wnn(modalities: list[Modality], k: int, epsilon: float = 1e-4):
"""
Compute the weighted nearest neighbors pairing, following
`Integrated analysis of multimodal single-cell data <https://www.sciencedirect.com/science/article/pii/S0092867421005833>`_
`Integrated analysis of multimodal single-cell data
<https://www.sciencedirect.com/science/article/pii/S0092867421005833>`_
This algorithm differs from the published algorithm in the paper in a few ways.
In particular we do not take the L2 normalization of columns of the matrix before we begin.
This algorithm differs from the published algorithm in the paper in a few ways. In particular we
do not take the L2 normalization of columns of the matrix before we begin.
:param modalities: list of modalities
:param k: how many nearest neighbors to consider
:param epsilon: This is a numerical stability parameter,
it is added to the denominator of a fraction to prevent dividing by zero.
:returns: A matrix of pairwise similarities (not distances!) which can be used in
training a k-nearest neighbors classifier to identify cells which are overall most like the query cell
:returns: A matrix of pairwise similarities (not distances!) which can be used in training a
k-nearest neighbors classifier to identify cells which are overall most like the query cell
from the perspective of multiple morphologies.
"""

Expand Down
4 changes: 2 additions & 2 deletions tests/test_qgw.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@ def test_combined_slb_quantized_gw():
gw_out_csv_location="tests/slb_qgw.csv",
num_processes=2,
num_clusters=20,
accuracy=.97,
accuracy=0.97,
nearest_neighbors=3,
verbose=False,
chunksize=20
chunksize=20,
)
1 change: 1 addition & 0 deletions tests/test_sample_swc.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ def test_compute_icdm_both():
infolder=swc_dir,
out_csv="tests/icdm_euclidean.csv",
n_sample=50,
num_processes=10,
)
compute_icdm_all_geodesic(
infolder=swc_dir,
Expand Down
7 changes: 6 additions & 1 deletion tests/test_ternary.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,12 @@ def test_isometry():
d12 = d1_dm_vf - d2_dm_vf + (1 / 3)
d23 = d2_dm_vf - d3_dm_vf + (1 / 3)
d31 = d3_dm_vf - d1_dm_vf + (1 / 3)
assert (np.allclose(d12 + d23 + d31, np.ones((4950),)))
assert np.allclose(
d12 + d23 + d31,
np.ones(
(4950),
),
)
xyz = np.stack((d12, d23, d31), axis=1)
xy = two_d_projection(xyz)
assert np.allclose(pdist(xyz), pdist(xy))

0 comments on commit b6228d4

Please sign in to comment.