diff --git a/docs/releases.md b/docs/releases.md
index e38b4a12..42544559 100644
--- a/docs/releases.md
+++ b/docs/releases.md
@@ -23,6 +23,7 @@
   [Chuck Daniels](https://github.com/chuckwondo).
 - Now throws a warning if you attempt to write an entirely non-virtual dataset to a virtual references format ([#657](https://github.com/zarr-developers/VirtualiZarr/pull/657)). By [Tom Nicholas](https://github.com/TomNicholas).
+- Support big-endian data via zarr-python 3.0.9 and zarr v3's new data types system ([#618](https://github.com/zarr-developers/VirtualiZarr/issues/618), [#677](https://github.com/zarr-developers/VirtualiZarr/issues/677)). By [Max Jones](https://github.com/maxrjones) and [Tom Nicholas](https://github.com/TomNicholas).

 ### Breaking changes

diff --git a/pyproject.toml b/pyproject.toml
index d4a3763d..5bf43250 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -27,7 +27,7 @@ dependencies = [
     "numcodecs>=0.15.1",
     "ujson",
     "packaging",
-    "zarr>=3.0.8,!=3.1.0",
+    "zarr>=3.1.0",
     "obstore>=0.5.1",
 ]

@@ -169,7 +169,7 @@ rust = "*"
 xarray = "==2025.3.0"
 numpy = "==2.0.0"
 numcodecs = "==0.15.1"
-zarr = "==3.0.8"
+zarr = "==3.1.0"
 obstore = "==0.5.1"

 # Define commands to run within the test environments
diff --git a/virtualizarr/manifests/array.py b/virtualizarr/manifests/array.py
index 642a4d62..f6d93819 100644
--- a/virtualizarr/manifests/array.py
+++ b/virtualizarr/manifests/array.py
@@ -87,8 +87,10 @@ def chunks(self) -> tuple[int, ...]:

     @property
     def dtype(self) -> np.dtype:
-        dtype_str = self.metadata.data_type
-        return dtype_str.to_numpy()
+        """The native dtype of the data (typically a numpy dtype)."""
+        zdtype = self.metadata.data_type
+        dtype = zdtype.to_native_dtype()
+        return dtype

     @property
     def shape(self) -> tuple[int, ...]:
@@ -111,9 +113,11 @@ def __repr__(self) -> str:
     @property
     def nbytes_virtual(self) -> int:
         """
-        Size required to hold these references in memory in bytes.
+        The total number of bytes required to hold these virtual references in memory.

-        Note this is not the size of the referenced array if it were actually loaded into memory,
+        Notes
+        -----
+        This is not the size of the referenced array if it were actually loaded into memory (use `.nbytes`),
         this is only the size of the pointers to the chunk locations.
         If you were to load the data into memory it would be ~1e6x larger for 1MB chunks.
""" @@ -170,7 +174,7 @@ def __eq__( # type: ignore[override] if self.shape != other.shape: raise NotImplementedError("Unsure how to handle broadcasting like this") - if not utils.metadata_identical(self.metadata, other.metadata): + if not self.metadata.to_dict() == other.metadata.to_dict(): return np.full(shape=self.shape, fill_value=False, dtype=np.dtype(bool)) else: if self.manifest == other.manifest: diff --git a/virtualizarr/manifests/utils.py b/virtualizarr/manifests/utils.py index f1deb129..222ac7a1 100644 --- a/virtualizarr/manifests/utils.py +++ b/virtualizarr/manifests/utils.py @@ -8,6 +8,7 @@ parse_dimension_names, parse_shapelike, ) +from zarr.dtype import parse_data_type from virtualizarr.codecs import convert_to_codec_pipeline, get_codecs @@ -75,15 +76,16 @@ def create_v3_array_metadata( ArrayV3Metadata A configured ArrayV3Metadata instance with standard defaults """ + zdtype = parse_data_type(data_type, zarr_format=3) return ArrayV3Metadata( shape=shape, - data_type=data_type.name if hasattr(data_type, "name") else data_type, + data_type=zdtype, chunk_grid={ "name": "regular", "configuration": {"chunk_shape": chunk_shape}, }, chunk_key_encoding=chunk_key_encoding, - fill_value=fill_value, + fill_value=zdtype.default_scalar() if fill_value is None else fill_value, codecs=convert_to_codec_pipeline( codecs=codecs or [], dtype=data_type, @@ -155,24 +157,6 @@ def check_same_shapes(shapes: list[tuple[int, ...]]) -> None: ) -# TODO remove this once https://github.com/zarr-developers/zarr-python/issues/2929 is solved upstream -def metadata_identical(metadata1: ArrayV3Metadata, metadata2: ArrayV3Metadata) -> bool: - """Checks the metadata of two zarr arrays are identical, including special treatment for NaN fill_values.""" - metadata_dict1 = metadata1.to_dict() - metadata_dict2 = metadata2.to_dict() - - # fill_value is a special case because numpy NaNs cannot be compared using __eq__, see https://stackoverflow.com/a/10059796 - fill_value1 = metadata_dict1.pop("fill_value") - fill_value2 = metadata_dict2.pop("fill_value") - if np.isnan(fill_value1) and np.isnan(fill_value2): # type: ignore[arg-type] - fill_values_equal = fill_value1.dtype == fill_value2.dtype # type: ignore[union-attr] - else: - fill_values_equal = fill_value1 == fill_value2 - - # everything else in ArrayV3Metadata is a string, Enum, or Dataclass - return fill_values_equal and metadata_dict1 == metadata_dict2 - - def _remove_element_at_position(t: tuple[int, ...], pos: int) -> tuple[int, ...]: new_l = list(t) new_l.pop(pos) diff --git a/virtualizarr/tests/test_integration.py b/virtualizarr/tests/test_integration.py index 14b74d79..d7db3536 100644 --- a/virtualizarr/tests/test_integration.py +++ b/virtualizarr/tests/test_integration.py @@ -343,12 +343,12 @@ def test_non_dimension_coordinates( for coord in ds.coords: assert ds.coords[coord].attrs == roundtrip.coords[coord].attrs - @pytest.mark.xfail( - reason="Datetime and timedelta data types not yet supported by zarr-python 3.0" # https://github.com/zarr-developers/zarr-python/issues/2616 - ) def test_datetime64_dtype_fill_value( self, tmpdir, roundtrip_func, array_v3_metadata ): + if roundtrip_func == roundtrip_as_in_memory_icechunk: + pytest.xfail(reason="xarray can't decode the ns datetime fill_value") + chunks_dict = { "0.0.0": {"path": "/foo.nc", "offset": 100, "length": 100}, } diff --git a/virtualizarr/tests/test_manifests/test_array.py b/virtualizarr/tests/test_manifests/test_array.py index 9f6ac5b1..6b09cb0a 100644 --- 
+++ b/virtualizarr/tests/test_manifests/test_array.py
@@ -394,9 +394,9 @@ def test_refuse_combine(array_v3_metadata):
         with pytest.raises(NotImplementedError, match="different codecs"):
             func([marr1, marr2], axis=0)

-    metadata_copy = metadata_common.to_dict().copy()
-    metadata_copy["data_type"] = np.dtype("int64")
-    metadata_wrong_dtype = ArrayV3Metadata.from_dict(metadata_copy)
+    metadata_wrong_dtype = array_v3_metadata(
+        shape=shape, chunks=chunks, data_type=np.dtype("int64")
+    )
     marr2 = ManifestArray(metadata=metadata_wrong_dtype, chunkmanifest=chunkmanifest2)
     for func in [np.concatenate, np.stack]:
         with pytest.raises(ValueError, match="inconsistent dtypes"):
diff --git a/virtualizarr/tests/test_parsers/test_dmrpp.py b/virtualizarr/tests/test_parsers/test_dmrpp.py
index 4945696c..c1e245bf 100644
--- a/virtualizarr/tests/test_parsers/test_dmrpp.py
+++ b/virtualizarr/tests/test_parsers/test_dmrpp.py
@@ -354,7 +354,7 @@ def test_parse_variable(tmp_path):
     basic_dmrpp = dmrparser(DMRPP_XML_STRINGS["basic"], tmp_path=tmp_path)

     var = basic_dmrpp._parse_variable(basic_dmrpp.find_node_fqn("/data"))
-    assert var.metadata.dtype == "float32"
+    assert var.metadata.dtype.to_native_dtype() == "float32"
     assert var.metadata.dimension_names == ("x", "y")
     assert var.shape == (720, 1440)
     assert var.chunks == (360, 720)
diff --git a/virtualizarr/tests/test_writers/test_kerchunk.py b/virtualizarr/tests/test_writers/test_kerchunk.py
index 6abc38a7..24985930 100644
--- a/virtualizarr/tests/test_writers/test_kerchunk.py
+++ b/virtualizarr/tests/test_writers/test_kerchunk.py
@@ -211,7 +211,7 @@ def testconvert_v3_to_v2_metadata(array_v3_metadata):

     assert isinstance(v2_metadata, ArrayV2Metadata)
     assert v2_metadata.shape == shape
-    assert v2_metadata.dtype == np.dtype("int32")
+    assert v2_metadata.dtype.to_native_dtype() == np.dtype("int32")
     assert v2_metadata.chunks == chunks
     assert v2_metadata.fill_value == 0
     compressor_config = v2_metadata.compressor.get_config()
diff --git a/virtualizarr/utils.py b/virtualizarr/utils.py
index fd7f46df..f0c94e0e 100644
--- a/virtualizarr/utils.py
+++ b/virtualizarr/utils.py
@@ -157,7 +157,7 @@ def convert_v3_to_v2_metadata(

     v2_metadata = ArrayV2Metadata(
         shape=v3_metadata.shape,
-        dtype=v3_metadata.data_type.to_numpy(),
+        dtype=v3_metadata.data_type,
         chunks=v3_metadata.chunks,
         fill_value=fill_value or v3_metadata.fill_value,
         compressor=compressor_config,
@@ -175,6 +175,7 @@ def kerchunk_refs_as_json(refs: KerchunkStoreRefs) -> JSON:

     See https://github.com/zarr-developers/VirtualiZarr/issues/679 for context as to why this is needed.
""" + normalized_result: dict[str, JSON] = copy.deepcopy(refs) v0_refs: dict[str, JSON] = refs["refs"] diff --git a/virtualizarr/writers/icechunk.py b/virtualizarr/writers/icechunk.py index 85f85162..a1136ce6 100644 --- a/virtualizarr/writers/icechunk.py +++ b/virtualizarr/writers/icechunk.py @@ -344,7 +344,7 @@ def write_virtual_variable_to_icechunk( name=name, shape=metadata.shape, chunks=metadata.chunks, - dtype=metadata.data_type.to_numpy(), + dtype=metadata.data_type.to_native_dtype(), filters=filters, compressors=compressors, dimension_names=var.dims, diff --git a/virtualizarr/writers/kerchunk.py b/virtualizarr/writers/kerchunk.py index 139a8619..0431f004 100644 --- a/virtualizarr/writers/kerchunk.py +++ b/virtualizarr/writers/kerchunk.py @@ -11,6 +11,7 @@ from xarray.conventions import encode_dataset_coordinates from zarr.core.common import JSON from zarr.core.metadata.v2 import ArrayV2Metadata +from zarr.dtype import parse_data_type from virtualizarr.manifests import ManifestArray from virtualizarr.manifests.manifest import join @@ -161,7 +162,9 @@ def variable_to_kerchunk_arr_refs(var: Variable, var_name: str) -> KerchunkArrRe array_v2_metadata = ArrayV2Metadata( chunks=np_arr.shape, shape=np_arr.shape, - dtype=np_arr.dtype, + dtype=parse_data_type( + np_arr.dtype, zarr_format=2 + ), # needed unless zarr-python fixes https://github.com/zarr-developers/zarr-python/issues/3253 order="C", fill_value=None, )