Skip to content

Commit

Permalink
Remove Cython dependency on Numpy
Browse files Browse the repository at this point in the history
  • Loading branch information
jlumpe committed Aug 4, 2024
1 parent 000b0cf commit 1e6bb5e
Show file tree
Hide file tree
Showing 13 changed files with 72 additions and 62 deletions.
3 changes: 0 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,5 @@ requires = [
"setuptools",
"wheel",
"Cython >= 3.0",
# If the Numpy version is different at runtime than build time, the build version should be
# lower as the ABI is forward- but not backwards-compatible.
"oldest-supported-numpy",
]
build-backend = "setuptools.build_meta"
3 changes: 0 additions & 3 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,12 @@
from setuptools import setup
from distutils.extension import Extension
from Cython.Build import cythonize
import numpy


# Cython extensions
np_include = numpy.get_include()
extensions = [Extension(
'gambit._cython.*',
['src/gambit/_cython/*.pyx'],
include_dirs=[np_include],
extra_compile_args=['-fopenmp', '-Wno-sign-compare'],
extra_link_args=['-fopenmp'],
)]
Expand Down
8 changes: 4 additions & 4 deletions src/gambit/_cython/kmers.pxd
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
cimport numpy as np
from libc.stdint cimport uint64_t, intptr_t

ctypedef unsigned char CHAR


cdef np.uint64_t c_kmer_to_index(const CHAR[:], bint*) nogil
cdef np.uint64_t c_kmer_to_index_rc(const CHAR[:], bint*) nogil
cdef void c_index_to_kmer(np.uint64_t, CHAR[:]) nogil
cdef uint64_t c_kmer_to_index(const CHAR[:], bint*) nogil
cdef uint64_t c_kmer_to_index_rc(const CHAR[:], bint*) nogil
cdef void c_index_to_kmer(uint64_t, CHAR[:]) nogil
cdef void c_revcomp(const CHAR[:], CHAR[:]) nogil
14 changes: 7 additions & 7 deletions src/gambit/_cython/kmers.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ def kmer_to_index(const CHAR[:] kmer):
Convert k-mer byte string to its integer index.
"""
cdef:
np.uint64_t idx
uint64_t idx
bint exc = False

if kmer.shape[0] > 32:
Expand All @@ -28,9 +28,9 @@ def kmer_to_index(const CHAR[:] kmer):
return idx


cdef np.uint64_t c_kmer_to_index(const CHAR[:] kmer, bint *exc) nogil:
cdef uint64_t c_kmer_to_index(const CHAR[:] kmer, bint *exc) nogil:
cdef:
np.uint64_t idx = 0
uint64_t idx = 0
int i, k = kmer.shape[0]
CHAR nuc

Expand Down Expand Up @@ -61,7 +61,7 @@ def kmer_to_index_rc(const CHAR[:] kmer):
Get the integer index of the reverse complement of a k-mer byte string.
"""
cdef:
np.uint64_t idx
uint64_t idx
bint exc = False

if kmer.shape[0] > 32:
Expand All @@ -75,9 +75,9 @@ def kmer_to_index_rc(const CHAR[:] kmer):
return idx


cdef np.uint64_t c_kmer_to_index_rc(const CHAR[:] kmer, bint *exc) nogil:
cdef uint64_t c_kmer_to_index_rc(const CHAR[:] kmer, bint *exc) nogil:
cdef:
np.uint64_t idx = 0
uint64_t idx = 0
int i, k = kmer.shape[0]
CHAR nuc

Expand Down Expand Up @@ -112,7 +112,7 @@ def index_to_kmer(index, int k):
return bytes(buf)


cdef void c_index_to_kmer(np.uint64_t index, CHAR[:] out) nogil:
cdef void c_index_to_kmer(uint64_t index, CHAR[:] out) nogil:
"""Convert k-mer index to sequence."""
cdef:
int k = out.shape[0]
Expand Down
1 change: 1 addition & 0 deletions src/gambit/_cython/metric.pxd
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from libc.stdint cimport intptr_t
from .types cimport SCORE_T, BOUNDS_T, COORDS_T, COORDS_T_2

cdef SCORE_T c_jaccarddist(COORDS_T[:] coords1, COORDS_T_2[:] coords2) nogil
17 changes: 5 additions & 12 deletions src/gambit/_cython/metric.pyx
Original file line number Diff line number Diff line change
@@ -1,15 +1,8 @@
"""Cython functions for calculating k-mer distance metrics"""

cimport numpy as np
import numpy as np
from cython.parallel import prange, parallel


# Numpy dtypes equivalent to SCORE_T and BOUNDS_T
SCORE_DTYPE = np.dtype(np.float32)
BOUNDS_DTYPE = np.dtype(np.intp)


def jaccard(COORDS_T[:] coords1, COORDS_T_2[:] coords2):
"""Compute the Jaccard index between two k-mer sets in sparse coordinate format.
Expand Down Expand Up @@ -76,15 +69,15 @@ cdef SCORE_T c_jaccarddist(COORDS_T[:] coords1, COORDS_T_2[:] coords2) nogil:

cdef:
# Lengths of the two arrays
np.intp_t N = coords1.shape[0]
np.intp_t M = coords2.shape[0]
intptr_t N = coords1.shape[0]
intptr_t M = coords2.shape[0]

# Index and value of items in each array as we are iterating
np.intp_t i = 0, j = 0
intptr_t i = 0, j = 0
COORDS_T a
COORDS_T_2 b

np.intp_t u = 0 # Size of union
intptr_t u = 0 # Size of union

# Iterate through both arrays simultaneously, advance index for the array
# with the smaller value. Advance both if they are equal. Increment the
Expand Down Expand Up @@ -136,7 +129,7 @@ def _jaccarddist_parallel(COORDS_T[:] query, COORDS_T_2[:] ref_coords, BOUNDS_T[
out : numpy.ndarray
Pre-allocated array to write distances to.
"""
cdef np.intp_t N = ref_bounds.shape[0] - 1
cdef intptr_t N = ref_bounds.shape[0] - 1
cdef BOUNDS_T begin, end
cdef int i

Expand Down
38 changes: 28 additions & 10 deletions src/gambit/_cython/threads.pyx
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
"""OpenMP stuff."""

from cython import parallel
import array

import numpy as np
cimport numpy as np
cimport cython
from cpython cimport array
cimport openmp


Expand All @@ -25,18 +26,35 @@ def omp_get_max_threads():
return openmp.omp_get_max_threads()


def get_thread_ids(int num_threads):
"""Run a multithreaded loop and get the thread ID running in each iteration."""
@cython.boundscheck(True)
def get_thread_ids(int n):
"""Run a multithreaded loop and get the thread ID running in each iteration.
Used to check that Cython code parallelization is working correctly. Result should contain the
integers from 0 to ``num_threads - 1`` (where ``num_threads`` is the number of OpenMP threads
used), repeated cyclically up to length ``n``.
Parameters
----------
n: int
Size of loop. Make this at least as large as the expected number of threads.
Returns
-------
array.array
Array of size ``n`` containing the thread ID running in each loop iteration.
"""

cdef:
np.ndarray[np.intp_t, ndim=1] thread_ids
np.intp_t thread_id = -1
array.array thread_ids_arr = array.array('i')
int[:] thread_ids
int i

thread_ids = np.full(num_threads, -1, dtype=np.intp)
for i in range(n):
thread_ids_arr.append(-1)

thread_ids = thread_ids_arr

for i in parallel.prange(num_threads, nogil=True, schedule='static', chunksize=1):
thread_id = parallel.threadid()
thread_ids[i] = thread_id
for i in parallel.prange(n, nogil=True, schedule='static', chunksize=1):
thread_ids[i] = parallel.threadid()

return thread_ids
25 changes: 10 additions & 15 deletions src/gambit/_cython/types.pxd
Original file line number Diff line number Diff line change
@@ -1,28 +1,23 @@
"""Shared typedefs."""

cimport numpy as np
from libc.stdint cimport uint16_t, uint32_t, uint64_t, intptr_t


# Type for similarity scores
ctypedef np.float32_t SCORE_T
ctypedef float SCORE_T

# Type for bounds on c_jaccard_coords_col
ctypedef np.intp_t BOUNDS_T
# This should be equal to Numpy's intp dtype
ctypedef intptr_t BOUNDS_T

# Fused type for storing k-mer coordinates/indices
ctypedef fused COORDS_T:
np.int16_t
np.uint16_t
np.int32_t
np.uint32_t
np.int64_t
np.uint64_t
uint16_t
uint32_t
uint64_t

# Copy of COORDS_T, used when two arguments both have types in this set but may differ from each other.
ctypedef fused COORDS_T_2:
np.int16_t
np.uint16_t
np.int32_t
np.uint32_t
np.int64_t
np.uint64_t
uint16_t
uint32_t
uint64_t
9 changes: 7 additions & 2 deletions src/gambit/metric.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,18 @@

import numpy as np

from gambit._cython.metric import BOUNDS_DTYPE, SCORE_DTYPE, jaccard, jaccarddist, \
_jaccarddist_parallel
from gambit._cython.metric import jaccard, jaccarddist, _jaccarddist_parallel
from gambit.sigs.base import KmerSignature, SignatureArray, AbstractSignatureArray, SignatureList
from gambit.util.misc import chunk_slices
from gambit.util.progress import get_progress


#: Numpy dtype for output of Cython Jaccard distance calculation code
# Equivalent to SCORE_T in types.pxd
SCORE_DTYPE = np.dtype(np.float32)
BOUNDS_DTYPE = np.dtype(np.intp)


def jaccard_generic(set1: Iterable, set2: Iterable) -> float:
"""Get the Jaccard index of two arbitrary sets.
Expand Down
2 changes: 1 addition & 1 deletion src/gambit/sigs/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""Calculate and store collections of k-mer signatures."""

from .base import KmerSignature, SignatureArray, SignatureList, sigarray_eq, SignaturesMeta,\
AnnotatedSignatures, dump_signatures, load_signatures
AnnotatedSignatures, dump_signatures, load_signatures, BOUNDS_DTYPE
7 changes: 6 additions & 1 deletion src/gambit/sigs/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
from attr import attrs, attrib

from gambit.kmers import KmerSpec
from gambit._cython.metric import BOUNDS_DTYPE
from gambit.util.indexing import AdvancedIndexingMixin
from gambit.util.io import FilePath

Expand All @@ -15,6 +14,12 @@
# TODO - use nptyping package to specify dimensions and data type?


#: Preferred Numpy dtype for :attr:`.ConcatenatedSignatureArray.bounds`. Can be used in parallelized
#: Cython metric calculation code without conversion.
# Equivalent to BOUNDS_T in types.pxd
BOUNDS_DTYPE = np.dtype(np.intp)


def sigarray_eq(a1: Sequence[KmerSignature], a2: Sequence[KmerSignature]) -> bool:
"""Check two sequences of sparse k-mer signatures for equality.
Expand Down
3 changes: 1 addition & 2 deletions src/gambit/sigs/hdf5.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,8 @@
import h5py as h5

from .base import SignatureArray, ConcatenatedSignatureArray, AbstractSignatureArray, SignaturesMeta,\
ReferenceSignatures, SignaturesFileError
ReferenceSignatures, SignaturesFileError, BOUNDS_DTYPE
from gambit.kmers import KmerSpec
from gambit._cython.metric import BOUNDS_DTYPE
from gambit.util.io import FilePath


Expand Down
4 changes: 2 additions & 2 deletions tests/test_metric.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,9 @@
import numpy as np

from gambit.metric import jaccard, jaccarddist, jaccard_bits, jaccard_generic, jaccarddist_array, \
jaccarddist_matrix, jaccarddist_pairwise, num_pairs, SCORE_DTYPE, BOUNDS_DTYPE
jaccarddist_matrix, jaccarddist_pairwise, num_pairs, SCORE_DTYPE
from gambit.sigs.calc import sparse_to_dense
from gambit.sigs import SignatureArray, SignatureList, dump_signatures, load_signatures
from gambit.sigs import SignatureArray, SignatureList, dump_signatures, load_signatures, BOUNDS_DTYPE
from gambit.kmers import KmerSpec
from gambit.util.progress import check_progress
from .common import make_signatures
Expand Down

0 comments on commit 1e6bb5e

Please sign in to comment.