Skip to content

Commit

Permalink
WIP - untested.
Browse files Browse the repository at this point in the history
  • Loading branch information
pp-mo committed Jan 9, 2025
1 parent 8544972 commit a5e7fd5
Show file tree
Hide file tree
Showing 2 changed files with 164 additions and 0 deletions.
3 changes: 3 additions & 0 deletions lib/ncdata/utils/__init__.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
"""General user utility functions."""
from ._compare_nc_datasets import dataset_differences, variable_differences
from ._copy import ncdata_copy
from ._dim_indexing import Slicer, index_by_dimensions
from ._save_errors import save_errors

__all__ = [
"Slicer",
"dataset_differences",
"index_by_dimensions",
"ncdata_copy",
"save_errors",
"variable_differences",
Expand Down
161 changes: 161 additions & 0 deletions lib/ncdata/utils/_dim_indexing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
from numbers import Number
from typing import Any, List, Mapping, Union

import dask.array as da

from ncdata import NcData
from ncdata.utils import ncdata_copy


def index_by_dimensions(
ncdata: NcData, **dim_index_kwargs: Mapping[str, Any]
) -> NcData:
"""
Index an NcData over dimensions.
Parameters
----------
ncdata
input data
dim_index_kwargs
specify indexing to apply to dimensions.
E.G. ``x=1``, ``time=slice(0, 100)``, ``levels=[1,2,5]``.
Returns
-------
copy of input with dimensions, and all relevant variables, sub-indexed.
Notes
-----
Where a dimension key is a single value, the dimension will be *removed*.
This mimics how numpy arrays behave, i.e. the difference between a[1] and a[1:2]
Examples
--------
ncdata = index_by_dimensions(ncdata, time=slice(0, 10)) # equivalent to [:10]
ncdata = index_by_dimensions(ncdata, levels=[1,2,5])
ncdata = index_by_dimensions(ncdata, time=3, levels=slice(2, 10, 3))
See Also
--------
:class:`Slicer` provides the same function with a slicing syntax
"""
# Start by copying the input : then modify that in-place
ncdata = ncdata_copy(ncdata)
for dim_name, key in dim_index_kwargs.items():
# Dimension names must occur in the ncdata.
dimension = ncdata.dimensions.get(dim_name)
if dimension is None:
raise ValueError(
f"Dimension {dim_name!r} is not present in 'ncdata'."
)

# Check for and fail repeated dimensions: the meaning would be unclear (!)
matches = [name for name in dim_index_kwargs if name == dim_name]
if len(matches) > 1:
msg = (
f"Dimensions to index, {tuple(dim_index_kwargs.keys())}, "
f"includes dimension {dim_name!r} more than once."
)
raise ValueError(msg)

# Hopefully this replicates how numpy makes this decision?
remove_dim = isinstance(key, Number)

# TODO:
# Key types must be supported:
# * int (or other numeric, including numpy scalars ?)
# * list of int
# * slice object
# * 1-D array of numeric
# Key "special" types we could possibly error or convert, to avoid confusion
# with numpy behaviours ? :
# arrays, tuples, booleans, None, newaxis, ellipsis ...

# Index the data of all referencing variables
for var in ncdata.variables.values():
if dim_name in var.dimensions:
# construct a list of slice objects
(i_slicedim,) = [
i
for i, name in enumerate(var.dimensions)
if name == dim_name
]
slices = [slice(None) for dim in var.dimensions]
slices[i_slicedim] = key

# index the data
var.data = var.data[tuple(slices)]

# also remove the dim, if it will be removed
if remove_dim:
del var.dimensions[dim_name]

# Remove or reduce the dimension itself.
if remove_dim:
del ncdata.dimensions[dim_name]
else:
# calculate the new dim size, using numpy-like logic
# TODO: there is probably a better way of calculating this ?
new_size = da.zeros(dimension.size)[key].shape[0]
dimension.size = new_size

return ncdata


class Slicer:
"""
An object which can index an NcData over its dimensions.
This wraps the :meth:`index_by_dimensions` method for convenience, returning an
object which supports the Python extended slicing syntax.
Examples
--------
data = Slicer(ncdata, 'time')[:10]
data = Slicer(ncdata, 'level')[[1, 2, 5]]
data = Slicer(ncdata, 'level', 'time', 'x', 'y')[1, :3, 2:10:3, ::-1]
"""

def __init__(self, ncdata: NcData, dimensions: Union[str, List[str]]):
"""
Create an indexer for an NcData, applying to specific dimensions.
This can then be indexed to produce a derived (sub-indexed) dataset.
Parameters
----------
ncdata
input data
dimensions
one or more dimension names, to which successive index keys will be applied
"""
self.ncdata = ncdata
if isinstance(dimensions, str):
dimensions = [dimensions]
self.dim_names = tuple(dimensions)

def __getitem__(self, keys) -> NcData:
"""
Return an indexed portion of self.ncdata.
Index with 'keys' in the specified dimensions.
"""
if not isinstance(keys, tuple):
# Single key, e.g. 1, slice(None), [2,3,4], array([2,3])
# N.B. *otherwise* keys is always a tuple
# A single tuple argument is passed as-is, i.e. interprets as multiple keys
keys = [keys]

n_keys = len(keys)
if len(keys) > len(self.dim_names):
msg = (
f"Too many index keys, {n_keys}, for the specified indexing dimension "
"names, {self.dim_names!r}."
)
raise ValueError(msg)

# NB too *few* keys is not a problem, since 'zip' truncates for us.
dim_kwargs = {name: key for name, key in zip(self.dim_names, keys)}

return index_by_dimensions(self.ncdata, **dim_kwargs)

0 comments on commit a5e7fd5

Please sign in to comment.