Added crude dimension-based load chunking control.
pp-mo committed Jan 15, 2025
1 parent 03681b9 commit 583b912
Showing 1 changed file with 30 additions and 7 deletions.
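The heart of the change is that the loader now builds a per-variable dask chunk specification from the variable's dimension names, taking any caller-supplied size for a named dimension and falling back to "auto" for the rest. Roughly, as a sketch of the pattern introduced in the diff below (the dimension names and sizes here are hypothetical examples, not from the commit):

    # For a variable with dimensions ("time", "lat", "lon") and a caller-supplied
    # mapping dim_chunks={"time": 1000}, the per-dimension chunk spec becomes:
    dim_chunks = {"time": 1000}
    dimensions = ("time", "lat", "lon")
    chunks = [dim_chunks.get(name, "auto") for name in dimensions]
    # -> [1000, "auto", "auto"], which is then passed to dask.array.from_array.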
37 changes: 30 additions & 7 deletions lib/ncdata/netcdf4.py
@@ -208,9 +208,7 @@ def to_nc4(
         nc4ds.close()


-def _from_nc4_group(
-    nc4ds: Union[nc.Dataset, nc.Group],
-) -> NcData:
+def _from_nc4_group(nc4ds: Union[nc.Dataset, nc.Group], dim_chunks) -> NcData:
     """
     Inner routine for :func:`from_nc4`.

@@ -261,8 +259,9 @@ def _from_nc4_group(
                 variable_name=varname,
                 group_names_path=group_names_path,
             )
+            chunks = [dim_chunks.get(name, "auto") for name in var.dimensions]
             var.data = da.from_array(
-                proxy, chunks="auto", asarray=True, meta=np.ndarray
+                proxy, chunks=chunks, asarray=True, meta=np.ndarray
             )

         for attrname in nc4var.ncattrs():
@@ -277,13 +276,16 @@ def _from_nc4_group(

     # And finally, groups -- by the magic of recursion ...
     for group_name, group in nc4ds.groups.items():
-        ncdata.groups[group_name] = _from_nc4_group(nc4ds.groups[group_name])
+        ncdata.groups[group_name] = _from_nc4_group(
+            nc4ds.groups[group_name], dim_chunks=dim_chunks
+        )

     return ncdata


 def from_nc4(
-    nc4_dataset_or_file: Union[nc.Dataset, nc.Group, Path, str]
+    nc4_dataset_or_file: Union[nc.Dataset, nc.Group, Path, str],
+    dim_chunks: Dict[str, Union[int, str]] = None,
 ) -> NcData:
     """
     Load NcData from a :class:`netCDF4.Dataset` or netCDF file.
@@ -294,18 +296,39 @@ def from_nc4(
         source of load data. Can be either a :class:`netCDF4.Dataset`,
         a :class:`netCDF4.Group`, a :class:`pathlib.Path` or a string.

+    dim_chunks
+        a dictionary of chunk sizes (number, or -1 or "auto") for specific
+        dimensions, specified by dimension name.
+        Defaults to "auto" for all unspecified dimensions.
+
     Returns
     -------
     ncdata : NcData

+    Examples
+    --------
+    For example, to avoid cases where a simple dask ``from_array(chunks="auto")``
+    will fail
+
+    >>> from ncdata.netcdf4 import from_nc4
+    >>> from tests import testdata_dir
+    >>> path = testdata_dir / "toa_brightness_temperature.nc"
+    >>> ds = from_nc4(path, dim_chunks={"x": 15})
+    >>> ds.variables["data"].data.chunksize
+    (160, 15)
+    >>>
+
     """
+    if dim_chunks is None:
+        dim_chunks = {}
     caller_owns_dataset = hasattr(nc4_dataset_or_file, "variables")
     if caller_owns_dataset:
         nc4ds = nc4_dataset_or_file
     else:
         nc4ds = nc.Dataset(nc4_dataset_or_file)
     try:
-        ncdata = _from_nc4_group(nc4ds)
+        ncdata = _from_nc4_group(nc4ds, dim_chunks)
     finally:
         if not caller_owns_dataset:
             nc4ds.close()
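Taken together, `from_nc4` now threads the `dim_chunks` mapping down through `_from_nc4_group` (including the recursion into groups), so every lazily-loaded variable honours the caller's per-dimension chunk sizes. A minimal usage sketch, assuming a hypothetical local netCDF file and variable name (only the `from_nc4` signature itself comes from this commit):

    from ncdata.netcdf4 import from_nc4

    # Pin the chunk size along a (hypothetical) large "time" dimension to 1000,
    # leaving all other dimensions on dask's default "auto" chunking.
    ncdata = from_nc4("some_large_file.nc", dim_chunks={"time": 1000})

    # Each variable's data is a lazy dask array, chunked as requested on "time".
    print(ncdata.variables["air_temperature"].data.chunksize)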
