diff --git a/pyproject.toml b/pyproject.toml
index e6ed8542b..3045670bc 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -28,6 +28,7 @@ dependencies = [
     "rich (>=14.1.0,<15.0.0)",
     "segy (>=0.4.2,<0.5.0)",
     "tqdm (>=4.67.1,<5.0.0)",
+    "universal-pathlib>=0.2.6",
     "xarray>=2025.7.1",
     "zarr (>=3.1.2,<4.0.0)",
 ]
diff --git a/src/mdio/__init__.py b/src/mdio/__init__.py
index bb66c689f..5d14853c1 100644
--- a/src/mdio/__init__.py
+++ b/src/mdio/__init__.py
@@ -2,36 +2,16 @@
 
 from importlib import metadata
 
-from mdio.api import MDIOReader
-from mdio.api import MDIOWriter
-from mdio.api.convenience import copy_mdio
-from mdio.api.opener import open_dataset
+from mdio.api.io import open_mdio
+from mdio.api.io import to_mdio
 from mdio.converters import mdio_to_segy
-from mdio.converters import numpy_to_mdio
 from mdio.converters import segy_to_mdio
-from mdio.core.dimension import Dimension
-from mdio.core.factory import MDIOCreateConfig
-from mdio.core.factory import MDIOVariableConfig
-from mdio.core.factory import create_empty
-from mdio.core.factory import create_empty_like
-from mdio.core.grid import Grid
-from mdio.core.storage_location import StorageLocation
 
 __all__ = [
-    "MDIOReader",
-    "MDIOWriter",
-    "copy_mdio",
-    "open_dataset",
+    "open_mdio",
+    "to_mdio",
     "mdio_to_segy",
-    "numpy_to_mdio",
     "segy_to_mdio",
-    "Dimension",
-    "MDIOCreateConfig",
-    "MDIOVariableConfig",
-    "create_empty",
-    "create_empty_like",
-    "Grid",
-    "StorageLocation",
 ]
diff --git a/src/mdio/api/__init__.py b/src/mdio/api/__init__.py
index ca6daf799..f731a65c2 100644
--- a/src/mdio/api/__init__.py
+++ b/src/mdio/api/__init__.py
@@ -1,6 +1 @@
-"""API Module."""
-
-from .accessor import MDIOReader
-from .accessor import MDIOWriter
-
-__all__ = ["MDIOReader", "MDIOWriter"]
+"""Public API."""
diff --git a/src/mdio/api/accessor.py b/src/mdio/api/accessor.py
deleted file mode 100644
index dfb6a35eb..000000000
--- a/src/mdio/api/accessor.py
+++ /dev/null
@@ -1,558 +0,0 @@
-"""MDIO accessor APIs."""
-
-from __future__ import annotations
-
-import logging
-from typing import TYPE_CHECKING
-
-import numpy as np
-import numpy.typing as npt
-import zarr
-
-from mdio.api.io_utils import open_zarr_array
-from mdio.api.io_utils import open_zarr_array_dask
-from mdio.api.io_utils import process_url
-from mdio.core import Grid
-from mdio.core.exceptions import MDIONotFoundError
-from mdio.exceptions import ShapeError
-
-if TYPE_CHECKING:
-    from dask.array import Array as DaskArray
-    from numpy.typing import NDArray
-
-
-logger = logging.getLogger(__name__)
-
-TEXT_HEADER_ROWS = 40
-
-
-class MDIOAccessor:
-    """Accessor class for MDIO files.
-
-    The accessor can be used to read and write MDIO files. It allows you to open an MDIO file
-    in several `mode` and `access_pattern` combinations.
-
-    Access pattern defines the dimensions that are chunked. For instance if you have a 3-D array
-    that is chunked in every direction (i.e. a 3-D seismic stack consisting of inline, crossline,
-    and sample dimensions) its access pattern would be "012". If it was only chunked in the first
-    two dimensions (i.e. seismic inline and crossline), it would be "01".
-
-    By default, MDIO will try to open with "012" access pattern, and will raise an error if that
-    pattern doesn't exist.
-
-    After dataset is opened, when the accessor is sliced it will either return just seismic trace
-    data as a Numpy array or a tuple of live mask, headers, and seismic trace in Numpy based on
-    the parameter `return_metadata`.
- - Regarding object store access, if the user credentials have been set system-wide on local - machine or VM; there is no need to specify credentials. However, the `storage_options` option - allows users to specify credentials for the store that is being accessed. Please see the - `fsspec` documentation for configuring storage options. - - MDIO currently supports `Zarr` and `Dask` backends. The Zarr backend is useful for reading - small amounts of data with minimal overhead. However, by utilizing the `Dask` backend with - a larger chunk size using the `new_chunks` argument, the data can be read in parallel using - a Dask LocalCluster or a distributed Cluster. - - The accessor also allows users to enable `fsspec` caching. These are particularly useful - when we are accessing the data from a high-latency store such as object stores, or mounted - network drives with high latency. We can use the `disk_cache` option to fetch chunks the - local temporary directory for faster repetitive access. - - Args: - mdio_path_or_buffer: Store or URL for MDIO file. - mode: Read or read/write mode. The file must exist. Options are in {'r', 'r+', 'w'}. - 'r' is read only, 'r+' is append mode where only existing arrays can be modified, - 'w' is similar to 'r+' but rechunking or other file-wide operations are allowed. - access_pattern: Chunk access pattern, optional. Default is "012". Examples: '012', '01'. - storage_options: Options for the storage backend. By default, system-wide credentials - will be used. - return_metadata: Flag for returning live mask, headers, and traces or just the trace data. - Default is False, which means just trace data will be returned. - new_chunks: Chunk sizes used in Dask backend. Ignored for Zarr backend. By default, the - disk-chunks will be used. However, if we want to stream groups of chunks to a Dask - worker, we can rechunk here. Then each Dask worker can asynchronously fetch multiple - chunks before working. - backend: Backend selection, optional. Default is "zarr". Must be in {'zarr', 'dask'}. - disk_cache: Disk cache implemented by `fsspec`, optional. Default is False, which turns - off disk caching. See `simplecache` from `fsspec` documentation for more details. - - Raises: - MDIONotFoundError: If the MDIO file can not be opened. - - Examples: - Assuming we ingested `my_3d_seismic.segy` as `my_3d_seismic.mdio` we can open the file - in read-only mode like this. - - >>> from mdio import MDIOReader - >>> - >>> - >>> mdio = MDIOReader("my_3d_seismic.mdio") - - This will open the file with the lazy `Zarr` backend. To access a specific inline, - crossline, or sample index we can do: - - >>> inline = mdio[15] # get the 15th inline - >>> crossline = mdio[:, 15] # get the 50th crossline - >>> samples = mdio[..., 250] # get the 250th sample slice - - The above will variables will be Numpy arrays of the relevant trace data. If we want to - retreive the live mask and trace headers for our sliding we need to open the file with - the `return_metadata` option. - - >>> mdio = MDIOReader("my_3d_seismic.mdio", return_metadata=True) - - Then we can fetch the data like this (for inline): - - >>> il_live, il_headers, il_traces = mdio[15] - - Since MDIOAccessor returns a tuple with these three Numpy arrays, we can directly unpack it - and use it further down our code. 
- """ - - _stats_keys = {"mean", "std", "rms", "min", "max"} - _array_load_function_mapper = { - "zarr": open_zarr_array, - "dask": open_zarr_array_dask, - } - - def __init__( # noqa: PLR0913 - self, - mdio_path_or_buffer: str, - mode: str, - access_pattern: str, - storage_options: dict | None, - return_metadata: bool, - new_chunks: tuple[int, ...] | None, - backend: str, - disk_cache: bool, - ): - # Set public attributes - self.url = mdio_path_or_buffer - self.mode = mode - self.access_pattern = access_pattern - - # Set private attributes for public interface. - # Pep8 complains because they are defined outside __init__ - self._chunks = None - self._live_mask = None - self._root = None - self._n_dim = None - self._orig_chunks = None - self._shape = None - self._trace_count = None - - # Private attributes - self._array_loader = self._array_load_function_mapper[backend] - self._backend = backend - self._return_metadata = return_metadata - self._new_chunks = new_chunks - self._storage_options = storage_options - self._disk_cache = disk_cache - - # Call methods to finish initialization - self._process_url() - try: - self._connect() - except FileNotFoundError as exc: - msg = ( - f"MDIO file not found or corrupt at {self.url}. Please check the URL or ensure " - "it is not a deprecated version of MDIO file." - ) - raise MDIONotFoundError(msg) from exc - self._deserialize_grid() - self._set_attributes() - self._open_arrays() - - def _process_url(self) -> None: - """Method to validate the provided store.""" - self.url = process_url( - url=self.url, - disk_cache=self._disk_cache, - ) - - def _connect(self) -> None: - """Open the zarr root.""" - kwargs = {"store": self.url, "storage_options": self._storage_options} - if self.mode in {"r", "r+"}: - self.root = zarr.open_consolidated(mode=self.mode, **kwargs) - elif self.mode == "w": - self.root = zarr.open(mode="r+", **kwargs) - else: - msg = f"Invalid mode: {self.mode}" - raise ValueError(msg) - - def _consolidate_metadata(self) -> None: - """Flush optimized MDIO metadata, run after modifying it.""" - zarr.consolidate_metadata(self.root.store) - - def _deserialize_grid(self) -> None: - """Deserialize grid from Zarr metadata.""" - self.grid = Grid.from_zarr(self.root) - - def _set_attributes(self) -> None: - """Deserialize attributes from Zarr metadata.""" - self.trace_count = self.root.attrs["trace_count"] - - # Grid based attributes - self.shape = self.grid.shape - self.n_dim = len(self.shape) - - # Access pattern attributes - data_array_name = f"chunked_{self.access_pattern}" - self.chunks = self._data_group[data_array_name].chunks - self._orig_chunks = self.chunks - - if self._backend == "dask" and self._new_chunks is not None: - # Handle None values (take original chunksize) - new_chunks = tuple(self.chunks[idx] if dim is None else dim for idx, dim in enumerate(self._new_chunks)) - - # Handle "-1" values, which means don't chunk that dimension - new_chunks = tuple(self.shape[idx] if dim == -1 else dim for idx, dim in enumerate(new_chunks)) - - self._orig_chunks = self.chunks - self.chunks = new_chunks - - def _open_arrays(self) -> None: - """Open arrays with requested backend.""" - data_array_name = f"chunked_{self.access_pattern}" - header_array_name = f"chunked_{self.access_pattern}_trace_headers" - - trace_kwargs = {"group_handle": self._data_group, "name": data_array_name} - - if self._backend == "dask": - trace_kwargs["chunks"] = self.chunks - - self._traces = self._array_loader(**trace_kwargs) - - if self._backend == "dask" and 
self._orig_chunks != self._chunks: - dask_chunks = self._traces.chunks - logger.info("Setting MDIO in-memory chunks to %s", dask_chunks) - self.chunks = dask_chunks - - header_kwargs = { - "group_handle": self._metadata_group, - "name": header_array_name, - } - - if self._backend == "dask": - header_kwargs["chunks"] = self.chunks[:-1] - - self._headers = self._array_loader(**header_kwargs) - - self.grid.live_mask = self._array_loader(self._metadata_group, name="live_mask") - self.live_mask = self.grid.live_mask - - @property - def live_mask(self) -> npt.ArrayLike | DaskArray: - """Get live mask (i.e. not-null value mask).""" - return self._live_mask - - @live_mask.setter - def live_mask(self, value: npt.ArrayLike | DaskArray) -> None: - """Set live mask (i.e. not-null value mask).""" - self._live_mask = value - - @property - def n_dim(self) -> int: - """Get number of dimensions for dataset.""" - return self._n_dim - - @n_dim.setter - def n_dim(self, value: int) -> None: - """Set number of dimensions for dataset.""" - self._n_dim = value - - @property - def shape(self) -> tuple[int, ...]: - """Get shape of dataset.""" - return self._shape - - @shape.setter - def shape(self, value: tuple[int, ...]) -> None: - """Validate and set shape of dataset.""" - if not isinstance(value, tuple): - msg = "Array shape needs to be a tuple" - raise AttributeError(msg) - self._shape = value - - @property - def trace_count(self) -> int: - """Get trace count from seismic MDIO.""" - return self._trace_count - - @trace_count.setter - def trace_count(self, value: int) -> None: - """Validate and set trace count for seismic MDIO.""" - if not isinstance(value, int): - msg = "Live trace count needs to be an integer" - raise AttributeError(msg) - self._trace_count = value - - @property - def text_header(self) -> list: - """Get seismic text header.""" - return self._metadata_group.attrs["text_header"] - - @text_header.setter - def text_header(self, value: list) -> None: - """Validate and set seismic text header.""" - if not isinstance(value, list) or len(value) != TEXT_HEADER_ROWS: - msg = "Text header must be a list of str with 40 elements" - raise AttributeError(msg) - self._metadata_group.attrs["text_header"] = value - self._consolidate_metadata() - - @property - def binary_header(self) -> dict: - """Get seismic binary header metadata.""" - return self._metadata_group.attrs["binary_header"] - - @binary_header.setter - def binary_header(self, value: dict) -> None: - """Validate and set seismic binary header metadata.""" - if not isinstance(value, dict): - msg = "Binary header has to be a dictionary type collection" - raise AttributeError(msg) - self._metadata_group.attrs["binary_header"] = value - self._consolidate_metadata() - - @property - def chunks(self) -> tuple[int, ...]: - """Get dataset chunk sizes.""" - return self._chunks - - @chunks.setter - def chunks(self, value: tuple[int, ...]) -> None: - """Set dataset chunk sizes.""" - self._chunks = value - - @property - def stats(self) -> dict: - """Get global statistics like min/max/rms/std.""" - return {key: self.root.attrs[key] for key in self._stats_keys} - - @stats.setter - def stats(self, value: dict) -> None: - """Set global statistics like min/max/rms/std.""" - if not isinstance(value, dict) or not self._stats_keys.issubset(value.keys()): - msg = f"For settings status, you must provide keys: {self._stats_keys}" - raise AttributeError(msg) - self.root.attrs.update(value) - self._consolidate_metadata() - - @property - def _metadata_group(self) -> zarr.Group: 
- """Get metadata zarr.group handle.""" - return self.root["metadata"] - - @property - def _data_group(self) -> zarr.Group: - """Get data zarr.Group handle.""" - return self.root["data"] - - def __getitem__(self, item: int | tuple) -> npt.ArrayLike | DaskArray | tuple: - """Data getter.""" - if self._return_metadata is True: - if isinstance(item, int | slice): - meta_index = item - elif len(item) == len(self.shape): - meta_index = tuple(dim for dim in item[:-1]) - else: - meta_index = item - - return ( - self.live_mask[meta_index], - self._headers[meta_index], - self._traces[item], - ) - - return self._traces[item] - - def __setitem__(self, key: int | tuple, value: npt.ArrayLike) -> None: - """Data setter.""" - self._traces[key] = value - self._live_mask[key] = True - - def coord_to_index( - self, - *args: list[int] | int, - dimensions: str | list[str] | None = None, - ) -> tuple[NDArray[int], ...]: - """Convert dimension coordinate to zero-based index. - - The coordinate labels of the array dimensions are converted to - zero-based indices. For instance if we have an inline dimension like - this: - - `[10, 20, 30, 40, 50]` - - then the indices would be: - - `[0, 1, 2, 3, 4]` - - This method converts from coordinate labels of a dimension to - equivalent indices. - - Multiple dimensions can be queried at the same time, see the examples. - - Args: - *args: Variable length argument queries. - dimensions: Name of the dimensions to query. If not provided, it - will query all dimensions in the grid and will require - `len(args) == grid.ndim` - - Returns: - Zero-based indices of coordinates. Each item in result corresponds - to indicies of that dimension - - Raises: - ShapeError: if number of queries don't match requested dimensions. - ValueError: if requested coordinates don't exist. - - Examples: - Opening an MDIO file. - - >>> from mdio import MDIOReader - >>> - >>> - >>> mdio = MDIOReader("path_to.mdio") - >>> mdio.coord_to_index([10, 7, 15], dimensions='inline') - array([ 8, 5, 13], dtype=uint16) - - >>> ils, xls = [10, 7, 15], [5, 10] - >>> mdio.coord_to_index(ils, xls, dimensions=['inline', 'crossline']) - (array([ 8, 5, 13], dtype=uint16), array([3, 8], dtype=uint16)) - - With the above indices, we can slice the data: - - >>> mdio[ils] # only inlines - >>> mdio[:, xls] # only crosslines - >>> mdio[ils, xls] # intersection of the lines - - Note that some fancy-indexing may not work with Zarr backend. - The Dask backend is more flexible when it comes to indexing. - - If we are querying all dimensions of a 3D array, we can omit the - `dimensions` argument. - - >>> mdio.coord_to_index(10, 5, [50, 100]) - (array([8], dtype=uint16), - array([3], dtype=uint16), - array([25, 50], dtype=uint16)) - """ - queries = [np.atleast_1d(dim_query) for dim_query in args] - - # Ensure dimensions is a list - if dimensions is not None and not isinstance(dimensions, list): - dimensions = [dimensions] - - # Ensure the query arrays and dimensions match size - ndim_expect = self.grid.ndim if dimensions is None else len(dimensions) - - if len(queries) != ndim_expect: - msg = "Coordinate queries not the same size as n_dimensions" - raise ShapeError(msg, ("# Coord Dims", "# Dimensions"), (len(queries), ndim_expect)) - - dims = self.grid.dims if dimensions is None else [self.grid.select_dim(dim_name) for dim_name in dimensions] - - dim_indices = () - for mdio_dim, dim_query_coords in zip(dims, queries): # noqa: B905 - # Make sure all coordinates exist. 
- query_diff = np.setdiff1d(dim_query_coords, mdio_dim.coords) - if len(query_diff) > 0: - msg = f"{mdio_dim.name} dimension does not have coordinate(s) {query_diff}" - raise ValueError(msg) - - sorter = mdio_dim.coords.argsort() - dim_idx = np.searchsorted(mdio_dim, dim_query_coords, sorter=sorter) - dim_idx = dim_idx.astype("uint32") # cast max: 2,147,483,647 - dim_indices += (dim_idx,) - - return dim_indices if len(dim_indices) > 1 else dim_indices[0] - - -class MDIOReader(MDIOAccessor): - """Read-only accessor for MDIO files. - - For detailed documentation see MDIOAccessor. - - Args: - mdio_path_or_buffer: Store or URL for MDIO file. - access_pattern: Chunk access pattern, optional. Default is "012". Examples: '012', '01'. - storage_options: Options for the storage backend. By default, system-wide credentials - will be used. - return_metadata: Flag for returning live mask, headers, and traces or just the trace data. - Default is False, which means just trace data will be returned. - new_chunks: Chunk sizes used in Dask backend. Ignored for Zarr backend. By default, the - disk-chunks will be used. However, if we want to stream groups of chunks to a Dask - worker, we can rechunk here. Then each Dask worker can asynchronously fetch multiple - chunks before working. - backend: Backend selection, optional. Default is "zarr". Must be in {'zarr', 'dask'}. - disk_cache: Disk cache implemented by `fsspec`, optional. Default is False, which turns - off disk caching. See `simplecache` from `fsspec` documentation for more details. - """ - - def __init__( # noqa: PLR0913 - self, - mdio_path_or_buffer: str, - access_pattern: str = "012", - storage_options: dict = None, - return_metadata: bool = False, - new_chunks: tuple[int, ...] = None, - backend: str = "zarr", - disk_cache: bool = False, - ): - super().__init__( - mdio_path_or_buffer=mdio_path_or_buffer, - mode="r", - access_pattern=access_pattern, - storage_options=storage_options, - return_metadata=return_metadata, - new_chunks=new_chunks, - backend=backend, - disk_cache=disk_cache, - ) - - -class MDIOWriter(MDIOAccessor): - """Writable accessor for MDIO files. - - For detailed documentation see MDIOAccessor. - - Args: - mdio_path_or_buffer: Store or URL for MDIO file. - access_pattern: Chunk access pattern, optional. Default is "012". Examples: '012', '01'. - storage_options: Options for the storage backend. By default, system-wide credentials - will be used. - return_metadata: Flag for returning live mask, headers, and traces or just the trace data. - Default is False, which means just trace data will be returned. - new_chunks: Chunk sizes used in Dask backend. Ignored for Zarr backend. By default, the - disk-chunks will be used. However, if we want to stream groups of chunks to a Dask - worker, we can rechunk here. Then each Dask worker can asynchronously fetch multiple - chunks before working. - backend: Backend selection, optional. Default is "zarr". Must be in {'zarr', 'dask'}. - disk_cache: Disk cache implemented by `fsspec`, optional. Default is False, which turns - off disk caching. See `simplecache` from `fsspec` documentation for more details. - """ - - def __init__( # noqa: PLR0913 - self, - mdio_path_or_buffer: str, - access_pattern: str = "012", - storage_options: dict = None, - return_metadata: bool = False, - new_chunks: tuple[int, ...] 
= None, - backend: str = "zarr", - disk_cache: bool = False, - ): - super().__init__( - mdio_path_or_buffer=mdio_path_or_buffer, - mode="w", - access_pattern=access_pattern, - storage_options=storage_options, - return_metadata=return_metadata, - new_chunks=new_chunks, - backend=backend, - disk_cache=disk_cache, - ) diff --git a/src/mdio/api/convenience.py b/src/mdio/api/convenience.py deleted file mode 100644 index 1a47e199e..000000000 --- a/src/mdio/api/convenience.py +++ /dev/null @@ -1,278 +0,0 @@ -"""Convenience APIs for working with MDIO files.""" - -from __future__ import annotations - -from typing import TYPE_CHECKING - -import zarr -from numcodecs import Blosc -from tqdm.auto import tqdm - -from mdio import MDIOReader -from mdio import MDIOWriter -from mdio.core.factory import create_empty_like -from mdio.core.indexing import ChunkIterator - -if TYPE_CHECKING: - from typing import Any - - from numcodecs.abc import Codec - from numpy.typing import NDArray - from zarr import Array - from zarr.core.array import CompressorsLike - - from mdio import MDIOAccessor - - -def copy_mdio( # noqa: PLR0913 - source_path: str, - target_path: str, - overwrite: bool = False, - copy_traces: bool = False, - copy_headers: bool = False, - storage_options_input: dict[str, Any] | None = None, - storage_options_output: dict[str, Any] | None = None, -) -> None: - """Copy MDIO file. - - This function copies an MDIO file from a source path to a target path, optionally including - trace data, headers, or both, for all access patterns. It creates a new MDIO file at the target - path with the same structure as the source, and selectively copies data based on the provided - flags. The function supports custom storage options for both input and output, enabling - compatibility with various filesystems via FSSpec. - - Args: - source_path: Source MDIO path. Data will be copied from here - target_path: Destination path. Could be any FSSpec mapping. - overwrite: Overwrite destination or not. - copy_traces: Flag to enable copying trace data for all access patterns. - copy_headers: Flag to enable copying headers for all access patterns. - storage_options_input: Storage options for input MDIO. - storage_options_output: Storage options for output MDIO. 
- - """ - create_empty_like( - source_path, - target_path, - overwrite, - storage_options_input, - storage_options_output, - ) - - source_root = zarr.open_consolidated( - source_path, - mode="r", - storage_options=storage_options_input, - ) - src_data_grp = source_root["data"] - access_patterns = [key.removeprefix("chunked_") for key in src_data_grp] - - if not copy_traces and not copy_headers: - return - - for access_pattern in access_patterns: - reader = MDIOReader(source_path, access_pattern, storage_options_input) - writer = MDIOWriter(target_path, access_pattern, storage_options_output) - - writer.live_mask[:] = reader.live_mask[:] - - shape = reader._traces.shape - chunks = reader._traces.chunks - chunks = chunks[:-1] + (shape[-1],) # don't chunk samples - - iterator = ChunkIterator(shape=shape, chunks=chunks) - progress = tqdm(iterator, unit="block") - progress.set_description(desc=f"Copying data for '{access_pattern=}'") - for slice_ in progress: - if copy_traces: - writer.stats = reader.stats - writer._traces[slice_] = reader._traces[slice_] - - if copy_headers: - meta_slice = slice_[:-1] - writer._headers[meta_slice] = reader._headers[meta_slice] - - zarr.consolidate_metadata(writer.root.store) - - -MAX_BUFFER = 512 - - -def create_rechunk_plan( - source: MDIOAccessor, - chunks_list: list[tuple[int, ...]], - suffix_list: list[str], - compressors: CompressorsLike = None, - overwrite: bool = False, -) -> tuple[[list[Array]], list[Array], NDArray, ChunkIterator]: - """Create a rechunk plan based on source and user input. - - It will buffer 512 x n-dimensions in memory. Approximately - 128MB. However, if you need to adjust the buffer size, change - the `MAX_BUFFER` variable in this module. - - Args: - source: MDIO accessor instance. Data will be copied from here. - chunks_list: List of tuples containing new chunk sizes. - suffix_list: List of suffixes to append to new chunk sizes. - compressors: Data compressor to use, optional. Default is Blosc('zstd'). - overwrite: Overwrite destination or not. - - Returns: - Tuple containing the rechunk plan variables and iterator. - - Raises: - NameError: if trying to write to original data. 
- """ - zarr.config.set({"write_empty_chunks": False}) - - data_group = source._data_group - metadata_group = source._metadata_group - - data_array = source._traces - metadata_array = source._headers - live_mask = source.live_mask[:] - - metadata_arrs = [] - data_arrs = [] - - header_compressor = Blosc("zstd") - trace_compressor = Blosc("zstd") if compressors is None else compressors - - for chunks, suffix in zip(chunks_list, suffix_list, strict=True): - norm_chunks = tuple(min(chunk, size) for chunk, size in zip(chunks, source.shape, strict=True)) - - if suffix == source.access_pattern: - msg = f"Can't write over source data with suffix {suffix}" - raise NameError(msg) - - metadata_arrs.append( - metadata_group.zeros( - name=f"chunked_{suffix}_trace_headers", - shape=metadata_array.shape, - dtype=metadata_array.dtype, - chunks=norm_chunks[:-1], - compressor=header_compressor, - overwrite=overwrite, - zarr_format=2, - dimension_separator="/", - ) - ) - - data_arrs.append( - data_group.zeros( - name=f"chunked_{suffix}", - shape=data_array.shape, - dtype=data_array.dtype, - chunks=norm_chunks, - compressor=trace_compressor, - overwrite=overwrite, - zarr_format=2, - dimension_separator="/", - ) - ) - - zarr.consolidate_metadata(source.root.store) - - n_dimension = len(data_array.shape) - dummy_array = zarr.empty(shape=data_array.shape, chunks=(MAX_BUFFER,) * n_dimension) - - shape = dummy_array.shape - chunks = dummy_array.chunks - iterator = ChunkIterator(shape=shape, chunks=chunks) - - return metadata_arrs, data_arrs, live_mask, iterator - - -def write_rechunked_values( # noqa: PLR0913 - source: MDIOAccessor, - suffix_list: list[str], - metadata_arrs_out: list[Array], - data_arrs_out: list[Array], - live_mask: NDArray, - iterator: ChunkIterator, -) -> None: - """Create rechunk plan based on source and user input. - - Args: - source: MDIO accessor instance. Data will be copied from here. - suffix_list: List of suffixes to append to new chunk sizes. - metadata_arrs_out: List of new metadata Zarr arrays. - data_arrs_out: List of new data Zarr arrays. - live_mask: Live mask to apply during copies. - iterator: The chunk iterator to use. - """ - suffix_names = ",".join(suffix_list) - for slice_ in tqdm(iterator, desc=f"Rechunking to {suffix_names}", unit="chunk"): - meta_slice = slice_[:-1] - - if live_mask[meta_slice].sum() == 0: - continue - - for array in metadata_arrs_out: - array[meta_slice] = source._headers[meta_slice] - - for array in data_arrs_out: - array[slice_] = source._traces[slice_] - - -def rechunk_batch( - source: MDIOAccessor, - chunks_list: list[tuple[int, ...]], - suffix_list: list[str], - compressor: Codec | None = None, - overwrite: bool = False, -) -> None: - """Rechunk MDIO file to multiple variables, reading it once. - - Args: - source: MDIO accessor instance. Data will be copied from here. - chunks_list: List of tuples containing new chunk sizes. - suffix_list: List of suffixes to append to new chunk sizes. - compressor: Data compressor to use, optional. Default is Blosc('zstd'). - overwrite: Overwrite destination or not. - - Examples: - To rechunk multiple variables we can do things like: - - >>> accessor = MDIOAccessor(...) 
-        >>> rechunk_batch(
-        >>>     accessor,
-        >>>     chunks_list=[(1, 1024, 1024), (1024, 1, 1024), (1024, 1024, 1)],
-        >>>     suffix_list=["fast_il", "fast_xl", "fast_z"],
-        >>> )
-    """
-    plan = create_rechunk_plan(
-        source,
-        chunks_list=chunks_list,
-        suffix_list=suffix_list,
-        compressors=compressor,
-        overwrite=overwrite,
-    )
-
-    write_rechunked_values(source, suffix_list, *plan)
-
-
-def rechunk(
-    source: MDIOAccessor,
-    chunks: tuple[int, ...],
-    suffix: str,
-    compressor: Codec | None = None,
-    overwrite: bool = False,
-) -> None:
-    """Rechunk MDIO file adding a new variable.
-
-    Args:
-        source: MDIO accessor instance. Data will be copied from here.
-        chunks: Tuple containing chunk sizes for new rechunked array.
-        suffix: Suffix to append to new rechunked array.
-        compressor: Data compressor to use, optional. Default is Blosc('zstd').
-        overwrite: Overwrite destination or not.
-
-    Examples:
-        To rechunk a single variable we can do this
-
-        >>> accessor = MDIOAccessor(...)
-        >>> rechunk(accessor, (1, 1024, 1024), suffix="fast_il")
-    """
-    rechunk_batch(source, [chunks], [suffix], compressor, overwrite)
diff --git a/src/mdio/api/io.py b/src/mdio/api/io.py
new file mode 100644
index 000000000..358c8970b
--- /dev/null
+++ b/src/mdio/api/io.py
@@ -0,0 +1,91 @@
+"""Utils for reading MDIO dataset."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+from typing import Any
+from typing import Literal
+
+from upath import UPath
+from xarray import Dataset as xr_Dataset
+from xarray import open_zarr as xr_open_zarr
+from xarray.backends.api import to_zarr as xr_to_zarr
+
+if TYPE_CHECKING:
+    from collections.abc import Mapping
+    from pathlib import Path
+
+    from xarray import Dataset
+    from xarray.core.types import T_Chunks
+    from xarray.core.types import ZarrWriteModes
+
+
+def _normalize_path(path: UPath | Path | str) -> UPath:
+    return UPath(path)
+
+
+def _normalize_storage_options(path: UPath) -> dict[str, Any] | None:
+    return None if len(path.storage_options) == 0 else path.storage_options
+
+
+def open_mdio(input_path: UPath | Path | str, chunks: T_Chunks = None) -> xr_Dataset:
+    """Open a Zarr dataset from the specified universal file path.
+
+    Args:
+        input_path: Universal input path for the MDIO dataset.
+        chunks: If provided, loads data into dask arrays with new chunking.
+            - ``chunks="auto"`` will use dask ``auto`` chunking
+            - ``chunks=None`` skips using dask, which is generally faster for small arrays.
+            - ``chunks=-1`` loads the data with dask using a single chunk for all arrays.
+            - ``chunks={}`` loads the data with dask using the engine's preferred chunk size (on disk).
+            - ``chunks={dim: chunk, ...}`` loads the data with dask using the specified chunk size for each dimension.
+
+            See dask chunking for more details.
+
+    Returns:
+        An Xarray dataset opened from the input path.
+    """
+    input_path = _normalize_path(input_path)
+    storage_options = _normalize_storage_options(input_path)
+    return xr_open_zarr(input_path.path, chunks=chunks, storage_options=storage_options)
+
+
+def to_mdio(  # noqa: PLR0913
+    dataset: Dataset,
+    output_path: UPath | Path | str,
+    mode: ZarrWriteModes | None = None,
+    *,
+    compute: bool = True,
+    region: Mapping[str, slice | Literal["auto"]] | Literal["auto"] | None = None,
+    zarr_format: int = 3,
+) -> None:
+    """Write dataset contents to an MDIO output_path.
+
+    Args:
+        dataset: The dataset to write.
+        output_path: The universal path of the output MDIO file.
+        mode: Persistence mode: "w" means create (overwrite if exists)
+            "w-" means create (fail if exists)
+            "a" means override all existing variables including dimension coordinates (create if does not exist)
+            "a-" means only append those variables that have ``append_dim``.
+            "r+" means modify existing array *values* only (raise an error if any metadata or shapes would change).
+            The default mode is "r+" if ``region`` is set and ``w-`` otherwise.
+        compute: If True write array data immediately; otherwise return a ``dask.delayed.Delayed`` object that
+            can be computed to write array data later. Metadata is always updated eagerly.
+        region: Optional mapping from dimension names to either a) ``"auto"``, or b) integer slices, indicating
+            the region of existing MDIO array(s) in which to write this dataset's data.
+        zarr_format: The desired zarr format to target. The default is 3.
+    """
+    output_path = _normalize_path(output_path)
+    storage_options = _normalize_storage_options(output_path)
+    xr_to_zarr(
+        dataset,
+        store=output_path.path,
+        mode=mode,
+        compute=compute,
+        consolidated=False,
+        region=region,
+        storage_options=storage_options,
+        zarr_format=zarr_format,
+        write_empty_chunks=False,
+    )
diff --git a/src/mdio/api/io_utils.py b/src/mdio/api/io_utils.py
deleted file mode 100644
index 6b25b0add..000000000
--- a/src/mdio/api/io_utils.py
+++ /dev/null
@@ -1,75 +0,0 @@
-"""Utilities related to API functions and classes."""
-
-from __future__ import annotations
-
-from typing import TYPE_CHECKING
-from typing import Any
-
-from dask.array import from_array
-
-if TYPE_CHECKING:
-    from dask.array import Array as DaskArray
-    from zarr import Array as ZarrArray
-    from zarr import Group as ZarrGroup
-
-
-def process_url(url: str, disk_cache: bool) -> str:
-    """Process URL based on options.
-
-    File cache is only valid for remote stores. The LRU caching works on both remote and local.
-
-    Args:
-        url: FSSpec compliant url
-        disk_cache: This enables FSSpec's `simplecache` if True.
-
-    Returns:
-        String to store with augmentations like cache, etc.
-
-    Examples:
-        If we want to access an MDIO file from S3 without using disk caching, the simplecache
-        protocol is not used, and therefore we only need to specify the s3 filesystem options:
-
-        >>> from mdio.api.convenience import process_url
-        >>>
-        >>>
-        >>> process_url(
-        ...     url="s3://bucket/key",
-        ...     disk_cache=False,
-        ... )
-    """
-    if disk_cache is True:
-        url = f"simplecache::{url}"
-
-    return url
-
-
-def open_zarr_array(group_handle: ZarrGroup, name: str) -> ZarrArray:
-    """Open Zarr array lazily using Zarr.
-
-    Note: All other kwargs are ignored, used for API compatibility for dask backend.
-
-    Args:
-        group_handle: Group handle where the array is located
-        name: Name of the array within the group
-
-    Returns:
-        Zarr array opened with default engine.
-    """
-    return group_handle[name]
-
-
-def open_zarr_array_dask(group_handle: ZarrGroup, name: str, **kwargs: Any) -> DaskArray:  # noqa: ANN401
-    """Open Zarr array lazily using Dask.
-
-    Note: All other kwargs get passed to dask.array.from_zarr()
-
-    Args:
-        group_handle: Group handle where the array is located
-        name: Name of the array within the group
-        **kwargs: Extra keyword arguments for Dask from_zarr.
-
-    Returns:
-        Zarr array opened with Dask engine.
- """ - zarr_array = open_zarr_array(group_handle=group_handle, name=name) - return from_array(zarr_array, **kwargs, inline_array=True) diff --git a/src/mdio/api/opener.py b/src/mdio/api/opener.py deleted file mode 100644 index 6e2ea2809..000000000 --- a/src/mdio/api/opener.py +++ /dev/null @@ -1,35 +0,0 @@ -"""Utils for reading MDIO dataset.""" - -from __future__ import annotations - -from typing import TYPE_CHECKING - -import xarray as xr - -if TYPE_CHECKING: - from xarray.core.types import T_Chunks - - from mdio.core.storage_location import StorageLocation - - -def open_dataset(storage_location: StorageLocation, chunks: T_Chunks = None) -> xr.Dataset: - """Open a Zarr dataset from the specified storage location. - - Args: - storage_location: StorageLocation for the dataset. - chunks: If provided, loads data into dask arrays with new chunking. - - ``chunks="auto"`` will use dask ``auto`` chunking - - ``chunks=None`` skips using dask, which is generally faster for small arrays. - - ``chunks=-1`` loads the data with dask using a single chunk for all arrays. - - ``chunks={}`` loads the data with dask using the engine's preferred chunk size (on disk). - - ``chunks={dim: chunk, ...}`` loads the data with dask using the specified chunk size for each dimension. - - See dask chunking for more details. - - Returns: - An Xarray dataset opened from the storage location. - """ - # NOTE: If mask_and_scale is not set, - # Xarray will convert int to float and replace _FillValue with NaN - # Fixed in Zarr v3, so we can fix this later. - return xr.open_dataset(storage_location.uri, engine="zarr", chunks=chunks, mask_and_scale=False) diff --git a/src/mdio/constants.py b/src/mdio/constants.py index 5b8384e02..0036f5a90 100644 --- a/src/mdio/constants.py +++ b/src/mdio/constants.py @@ -1,50 +1,48 @@ """Constant values used across MDIO.""" -from numpy import finfo as np_finfo -from numpy import iinfo as np_iinfo -from numpy import nan as np_nan +import numpy as np from mdio.schemas.dtype import ScalarType -FLOAT16_MAX = np_finfo("float16").max -FLOAT16_MIN = np_finfo("float16").min +FLOAT16_MAX = np.finfo("float16").max +FLOAT16_MIN = np.finfo("float16").min -FLOAT32_MAX = np_finfo("float32").max -FLOAT32_MIN = np_finfo("float32").min +FLOAT32_MAX = np.finfo("float32").max +FLOAT32_MIN = np.finfo("float32").min -FLOAT64_MIN = np_finfo("float64").min -FLOAT64_MAX = np_finfo("float64").max +FLOAT64_MIN = np.finfo("float64").min +FLOAT64_MAX = np.finfo("float64").max -INT8_MIN = np_iinfo("int8").min -INT8_MAX = np_iinfo("int8").max +INT8_MIN = np.iinfo("int8").min +INT8_MAX = np.iinfo("int8").max -INT16_MIN = np_iinfo("int16").min -INT16_MAX = np_iinfo("int16").max +INT16_MIN = np.iinfo("int16").min +INT16_MAX = np.iinfo("int16").max -INT32_MIN = np_iinfo("int32").min -INT32_MAX = np_iinfo("int32").max +INT32_MIN = np.iinfo("int32").min +INT32_MAX = np.iinfo("int32").max -INT64_MIN = np_iinfo("int64").min -INT64_MAX = np_iinfo("int64").max +INT64_MIN = np.iinfo("int64").min +INT64_MAX = np.iinfo("int64").max UINT8_MIN = 0 -UINT8_MAX = np_iinfo("uint8").max +UINT8_MAX = np.iinfo("uint8").max UINT16_MIN = 0 -UINT16_MAX = np_iinfo("uint16").max +UINT16_MAX = np.iinfo("uint16").max UINT32_MIN = 0 -UINT32_MAX = np_iinfo("uint32").max +UINT32_MAX = np.iinfo("uint32").max UINT64_MIN = 0 -UINT64_MAX = np_iinfo("uint64").max +UINT64_MAX = np.iinfo("uint64").max # Zarr fill values for different scalar types fill_value_map = { ScalarType.BOOL: None, - ScalarType.FLOAT16: np_nan, - ScalarType.FLOAT32: np_nan, - 
ScalarType.FLOAT64: np_nan, + ScalarType.FLOAT16: np.nan, + ScalarType.FLOAT32: np.nan, + ScalarType.FLOAT64: np.nan, ScalarType.UINT8: UINT8_MAX, ScalarType.UINT16: UINT16_MAX, ScalarType.UINT32: UINT32_MAX, @@ -53,7 +51,7 @@ ScalarType.INT16: INT16_MAX, ScalarType.INT32: INT32_MAX, ScalarType.INT64: INT64_MAX, - ScalarType.COMPLEX64: complex(np_nan, np_nan), - ScalarType.COMPLEX128: complex(np_nan, np_nan), - ScalarType.COMPLEX256: complex(np_nan, np_nan), + ScalarType.COMPLEX64: complex(np.nan, np.nan), + ScalarType.COMPLEX128: complex(np.nan, np.nan), + ScalarType.COMPLEX256: complex(np.nan, np.nan), } diff --git a/src/mdio/converters/__init__.py b/src/mdio/converters/__init__.py index 92ca63e95..0c3ebe922 100644 --- a/src/mdio/converters/__init__.py +++ b/src/mdio/converters/__init__.py @@ -1,7 +1,6 @@ """MDIO Data conversion API.""" from .mdio import mdio_to_segy -from .numpy import numpy_to_mdio from .segy import segy_to_mdio -__all__ = ["mdio_to_segy", "segy_to_mdio", "numpy_to_mdio"] +__all__ = ["mdio_to_segy", "segy_to_mdio"] diff --git a/src/mdio/converters/mdio.py b/src/mdio/converters/mdio.py index 4da32f143..183ee345c 100644 --- a/src/mdio/converters/mdio.py +++ b/src/mdio/converters/mdio.py @@ -3,7 +3,6 @@ from __future__ import annotations import os -from pathlib import Path from tempfile import TemporaryDirectory from typing import TYPE_CHECKING @@ -11,7 +10,8 @@ from psutil import cpu_count from tqdm.dask import TqdmCallback -from mdio.api.opener import open_dataset +from mdio.api.io import _normalize_path +from mdio.api.io import open_mdio from mdio.segy.blocked_io import to_segy from mdio.segy.creation import concat_files from mdio.segy.creation import mdio_spec_to_segy @@ -23,9 +23,11 @@ distributed = None if TYPE_CHECKING: + from pathlib import Path + from segy.schema import SegySpec + from upath import UPath - from mdio.core.storage_location import StorageLocation default_cpus = cpu_count(logical=True) NUM_CPUS = int(os.getenv("MDIO__EXPORT__CPU_COUNT", default_cpus)) @@ -33,8 +35,8 @@ def mdio_to_segy( # noqa: PLR0912, PLR0913, PLR0915 segy_spec: SegySpec, - input_location: StorageLocation, - output_location: StorageLocation, + input_path: UPath | Path | str, + output_path: UPath | Path | str, selection_mask: np.ndarray = None, client: distributed.Client = None, ) -> None: @@ -50,8 +52,8 @@ def mdio_to_segy( # noqa: PLR0912, PLR0913, PLR0915 Args: segy_spec: The SEG-Y specification to use for the conversion. - input_location: Store or URL (and cloud options) for MDIO file. - output_location: Path to the output SEG-Y file. + input_path: Store or URL (and cloud options) for MDIO file. + output_path: Path to the output SEG-Y file. selection_mask: Array that lists the subset of traces client: Dask client. If `None` we will use local threaded scheduler. If `auto` is used we will create multiple processes (with 8 threads each). @@ -64,17 +66,19 @@ def mdio_to_segy( # noqa: PLR0912, PLR0913, PLR0915 To export an existing local MDIO file to SEG-Y we use the code snippet below. This will export the full MDIO (without padding) to SEG-Y format. 
- >>> from mdio import mdio_to_segy, StorageLocation + >>> from upath import UPath + >>> from mdio import mdio_to_segy >>> - >>> input_location = StorageLocation("prefix2/file.mdio") - >>> output_location = StorageLocation("prefix/file.segy") - >>> mdio_to_segy(input_location, output_location) + >>> input_path = UPath("prefix2/file.mdio") + >>> output_path = UPath("prefix/file.segy") + >>> mdio_to_segy(input_path, output_path) """ - output_segy_path = Path(output_location.uri) + input_path = _normalize_path(input_path) + output_path = _normalize_path(output_path) # First we open with vanilla zarr backend and then get some info # We will re-open with `new_chunks` and Dask later in mdio_spec_to_segy - dataset = open_dataset(input_location) + dataset = open_mdio(input_path) default_variable_name = dataset.attrs["attributes"]["default_variable_name"] amplitude = dataset[default_variable_name] @@ -83,7 +87,7 @@ def mdio_to_segy( # noqa: PLR0912, PLR0913, PLR0915 dtype = amplitude.dtype new_chunks = segy_export_rechunker(chunks, sizes, dtype) - creation_args = [segy_spec, input_location, output_location, new_chunks] + creation_args = [segy_spec, input_path, output_path, new_chunks] if client is not None: if distributed is not None: @@ -128,7 +132,7 @@ def mdio_to_segy( # noqa: PLR0912, PLR0913, PLR0915 dataset["trace_mask"] = dataset["trace_mask"] & selection_mask # tmp file root - out_dir = output_segy_path.parent + out_dir = output_path.parent tmp_dir = TemporaryDirectory(dir=out_dir) with tmp_dir: @@ -147,7 +151,7 @@ def mdio_to_segy( # noqa: PLR0912, PLR0913, PLR0915 block_records = block_records.compute(num_workers=NUM_CPUS) ordered_files = [rec.path for rec in block_records.ravel() if rec != 0] - ordered_files = [output_segy_path] + ordered_files + ordered_files = [output_path] + ordered_files if client is not None: _ = client.submit(concat_files, paths=ordered_files).result() diff --git a/src/mdio/converters/numpy.py b/src/mdio/converters/numpy.py deleted file mode 100644 index aa78784e3..000000000 --- a/src/mdio/converters/numpy.py +++ /dev/null @@ -1,162 +0,0 @@ -"""Conversion from Numpy to MDIO.""" - -from __future__ import annotations - -from typing import TYPE_CHECKING - -import numpy as np - -from mdio.api.accessor import MDIOWriter -from mdio.core.dimension import Dimension -from mdio.core.factory import MDIOCreateConfig -from mdio.core.factory import MDIOVariableConfig -from mdio.core.factory import create_empty -from mdio.core.grid import Grid - -if TYPE_CHECKING: - from typing import Any - - from numpy.typing import DTypeLike - from numpy.typing import NDArray - - -def numpy_to_mdio( # noqa: PLR0913 - array: NDArray, - mdio_path_or_buffer: str, - chunksize: tuple[int, ...], - index_names: list[str] | None = None, - index_coords: dict[str, NDArray] | None = None, - header_dtype: DTypeLike | None = None, - lossless: bool = True, - compression_tolerance: float = 0.01, - storage_options: dict[str, Any] | None = None, - overwrite: bool = False, -) -> None: - """Conversion from NumPy array to MDIO format. - - This module provides functionality to convert a NumPy array into the MDIO format. The - conversion process organizes the input array into a multidimensional tensor with specified - indexing and compression options. - - Args: - array: Input NumPy array to be converted to MDIO format. - mdio_path_or_buffer: Output path for the MDIO file, either local or cloud-based (e.g., - with `s3://`, `gcs://`, or `abfs://` protocols). 
- chunksize: Tuple specifying the chunk sizes for each dimension of the array. It must match - the number of dimensions in the input array. - index_names: List of names for the index dimensions. If not provided, defaults to `dim_0`, - `dim_1`, ..., with the last dimension named `sample`. - index_coords: Dictionary mapping dimension names to their coordinate arrays. If not - provided, defaults to sequential integers (0 to size-1) for each dimension. - header_dtype: Data type for trace headers, if applicable. Defaults to None. - lossless: If True, uses lossless Blosc compression with zstandard. If False, uses ZFP lossy - compression (requires `zfpy` library). - compression_tolerance: Tolerance for ZFP compression in lossy mode. Ignored if - `lossless=True`. Default is 0.01, providing ~70% size reduction. - storage_options: Dictionary of storage options for the MDIO output file (e.g., - cloud credentials). Defaults to None (anonymous access). - overwrite: If True, overwrites existing MDIO file at the specified path. - - Raises: - ValueError: When length of `chunksize` does not match the number of dims in the input array - or if an element of `index_names` not included in the `index_coords` dictionary. Also - raised when size of a coordinate array in does not match the corresponding dimension. - - - Examples: - To convert a 3D NumPy array to MDIO format locally with default chunking: - - >>> import numpy as np - >>> from mdio.converters import numpy_to_mdio - >>> - >>> array = np.random.rand(100, 200, 300) - >>> numpy_to_mdio( - ... array=array, - ... mdio_path_or_buffer="output/file.mdio", - ... chunksize=(64, 64, 64), - ... index_names=["inline", "crossline", "sample"], - ... ) - - For a cloud-based output on AWS S3 with custom coordinates: - - >>> coords = { - ... "inline": np.arange(0, 100, 2), - ... "crossline": np.arange(0, 200, 4), - ... "sample": np.linspace(0, 0.3, 300), - ... } - >>> numpy_to_mdio( - ... array=array, - ... mdio_path_or_buffer="s3://bucket/file.mdio", - ... chunksize=(32, 32, 128), - ... index_names=["inline", "crossline", "sample"], - ... index_coords=coords, - ... lossless=False, - ... compression_tolerance=0.01, - ... ) - - To convert a 2D array with default indexing and lossless compression: - - >>> array_2d = np.random.rand(500, 1000) - >>> numpy_to_mdio( - ... array=array_2d, - ... mdio_path_or_buffer="output/file_2d.mdio", - ... chunksize=(512, 512), - ... 
) - """ - storage_options = storage_options or {} - - if len(chunksize) != array.ndim: - message = (f"Length of chunks={len(chunksize)} must be equal to array dimensions={array.ndim}",) - raise ValueError(message) - - if index_names is None: - index_names = index_names or [f"dim_{i}" for i in range(array.ndim - 1)] - index_names.append("sample") - - if index_coords is None: - index_coords = {} - for name, size in zip(index_names, array.shape, strict=True): - index_coords[name] = np.arange(size) - else: - for name, size in zip(index_names, array.shape, strict=True): - if name not in index_coords: - message = f"Index name {name} not found in index_coords" - raise ValueError(message) - - if index_coords[name].size != size: - message = ( - f"Size of index_coords[{name}]={index_coords[name].size} does not match array dimension={size}" - ) - raise ValueError(message) - - suffix = [dim_chunks if dim_chunks > 0 else None for dim_chunks in chunksize] - suffix = [str(idx) for idx, value in enumerate(suffix) if value is not None] - suffix = "".join(suffix) - - # TODO(Dmitrit Repin): Implement Numpy converted in MDIO v1 - # https://github.com/TGSAI/mdio-python/issues/596 - def get_compressor(lossless: bool, tolerance: float) -> list[str]: - pass - - compressors = get_compressor(lossless, compression_tolerance) - mdio_var = MDIOVariableConfig( - name=f"chunked_{suffix}", - dtype=str(array.dtype), - chunks=chunksize, - compressors=compressors, - header_dtype=header_dtype, - ) - - dims = [Dimension(name=name, coords=index_coords[name]) for name in index_names] - create_conf = MDIOCreateConfig(path=mdio_path_or_buffer, grid=Grid(dims), variables=[mdio_var]) - create_empty(create_conf, overwrite, storage_options) - - writer = MDIOWriter(mdio_path_or_buffer, suffix, storage_options) - writer[:] = array - writer.stats = { - "mean": array.mean().item(), - "std": array.std().item(), - "rms": np.sqrt((array**2).sum() / array.size).item(), - "min": array.min().item(), - "max": array.max().item(), - } diff --git a/src/mdio/converters/segy.py b/src/mdio/converters/segy.py index 3052433c8..b5f43dff6 100644 --- a/src/mdio/converters/segy.py +++ b/src/mdio/converters/segy.py @@ -12,6 +12,8 @@ from segy.standards.codes import MeasurementSystem as segy_MeasurementSystem from segy.standards.fields.trace import Rev0 as TraceHeaderFieldsRev0 +from mdio.api.io import _normalize_path +from mdio.api.io import to_mdio from mdio.constants import UINT32_MAX from mdio.converters.exceptions import EnvironmentFormatError from mdio.converters.exceptions import GridTraceCountError @@ -26,14 +28,15 @@ from mdio.segy.utilities import get_grid_plan if TYPE_CHECKING: + from pathlib import Path from typing import Any from segy.arrays import HeaderArray as SegyHeaderArray from segy.schema import SegySpec + from upath import UPath from xarray import Dataset as xr_Dataset from mdio.core.dimension import Dimension - from mdio.core.storage_location import StorageLocation from mdio.schemas.v1.dataset import Dataset from mdio.schemas.v1.templates.abstract_dataset_template import AbstractDatasetTemplate @@ -325,8 +328,8 @@ def _add_segy_ingest_attributes(dataset: Dataset, segy_file: SegyFile, grid_over def segy_to_mdio( # noqa PLR0913 segy_spec: SegySpec, mdio_template: AbstractDatasetTemplate, - input_location: StorageLocation, - output_location: StorageLocation, + input_path: UPath | Path | str, + output_path: UPath | Path | str, overwrite: bool = False, grid_overrides: dict[str, Any] | None = None, ) -> None: @@ -337,20 +340,23 
@@ def segy_to_mdio( # noqa PLR0913 Args: segy_spec: The SEG-Y specification to use for the conversion. mdio_template: The MDIO template to use for the conversion. - input_location: The storage location of the input SEG-Y file. - output_location: The storage location for the output MDIO v1 file. + input_path: The universal path of the input SEG-Y file. + output_path: The universal path for the output MDIO v1 file. overwrite: Whether to overwrite the output file if it already exists. Defaults to False. grid_overrides: Option to add grid overrides. Raises: FileExistsError: If the output location already exists and overwrite is False. """ - if not overwrite and output_location.exists(): - err = f"Output location '{output_location.uri}' exists. Set `overwrite=True` if intended." + input_path = _normalize_path(input_path) + output_path = _normalize_path(output_path) + + if not overwrite and output_path.exists(): + err = f"Output location '{output_path.path}' exists. Set `overwrite=True` if intended." raise FileExistsError(err) - segy_settings = SegySettings(storage_options=input_location.options) - segy_file = SegyFile(url=input_location.uri, spec=segy_spec, settings=segy_settings) + segy_settings = SegySettings(storage_options=input_path.storage_options) + segy_file = SegyFile(url=input_path.path, spec=segy_spec, settings=segy_settings) segy_dimensions, segy_headers = _scan_for_headers(segy_file, mdio_template, grid_overrides) @@ -381,18 +387,15 @@ def segy_to_mdio( # noqa PLR0913 xr_dataset.trace_mask.data[:] = grid.live_mask - # TODO(Dmitriy Repin): Write out text and binary headers. - # https://github.com/TGSAI/mdio-python/issues/595 - # IMPORTANT: Do not drop the "trace_mask" here, as it will be used later in # blocked_io.to_zarr() -> _workers.trace_worker() # This will create the Zarr store with the correct structure but with empty arrays - xr_dataset.to_zarr(store=output_location.uri, mode="w", write_empty_chunks=False, zarr_format=2, compute=False) + to_mdio(xr_dataset, output_path=output_path, mode="w", compute=False) # This will write the non-dimension coordinates and trace mask meta_ds = xr_dataset[drop_vars_delayed + ["trace_mask"]] - meta_ds.to_zarr(store=output_location.uri, mode="r+", write_empty_chunks=False, zarr_format=2, compute=True) + to_mdio(meta_ds, output_path=output_path, mode="r+", compute=True) # Now we can drop them to simplify chunked write of the data variable xr_dataset = xr_dataset.drop_vars(drop_vars_delayed) @@ -403,7 +406,7 @@ def segy_to_mdio( # noqa PLR0913 # performed in chunks to save the memory blocked_io.to_zarr( segy_file=segy_file, - output_location=output_location, + output_path=output_path, grid_map=grid.map, dataset=xr_dataset, data_variable_name=default_variable_name, diff --git a/src/mdio/core/dimension.py b/src/mdio/core/dimension.py index c5fe369c8..9aafbed35 100644 --- a/src/mdio/core/dimension.py +++ b/src/mdio/core/dimension.py @@ -2,14 +2,12 @@ from __future__ import annotations -import inspect from dataclasses import dataclass from typing import TYPE_CHECKING from typing import Any import numpy as np -from mdio.core.serialization import Serializer from mdio.exceptions import ShapeError if TYPE_CHECKING: @@ -88,36 +86,3 @@ def min(self) -> NDArray[float]: def max(self) -> NDArray[float]: """Get maximum value of dimension.""" return np.max(self.coords) - - def serialize(self, stream_format: str) -> str: - """Serialize the dimension into buffer.""" - serializer = DimensionSerializer(stream_format) - return 
serializer.serialize(self) - - @classmethod - def deserialize(cls, stream: str, stream_format: str) -> Dimension: - """Deserialize buffer into Dimension.""" - serializer = DimensionSerializer(stream_format) - return serializer.deserialize(stream) - - -class DimensionSerializer(Serializer): - """Serializer implementation for Dimension.""" - - def serialize(self, dimension: Dimension) -> str: - """Serialize Dimension into buffer.""" - payload = { - "name": dimension.name, - "length": len(dimension), - "coords": dimension.coords.tolist(), - } - return self.serialize_func(payload) - - def deserialize(self, stream: str) -> Dimension: - """Deserialize buffer into Dimension.""" - signature = inspect.signature(Dimension) - - payload = self.deserialize_func(stream) - payload = self.validate_payload(payload, signature) - - return Dimension(**payload) diff --git a/src/mdio/core/factory.py b/src/mdio/core/factory.py deleted file mode 100644 index e84ea6793..000000000 --- a/src/mdio/core/factory.py +++ /dev/null @@ -1,234 +0,0 @@ -"""Module for creating empty MDIO datasets. - -This module provides tools to configure and initialize empty MDIO datasets, which are -used for storing multidimensional data with associated metadata. It includes: - -- `MDIOVariableConfig`: Config for individual variables in the dataset. -- `MDIOCreateConfig`: Config for the dataset, including path, grid, and variables. -- `create_empty`: Function to create the empty dataset based on provided configuration. -- `create_empty_like`: Create an empty dataset with same structure as an existing one. - -The `create_empty` function sets up the Zarr hierarchy with metadata and data groups, -creates datasets for each variable and their trace headers, and initializes attributes -such as creation time, API version, grid dimensions, and basic statistics. - -The `create_empty_like` function creates a new empty dataset by replicating the -structure of an existing MDIO dataset, including its grid, variables, and headers. - -For detailed usage and parameters, see the docstring of the `create_empty` function. -""" - -from dataclasses import dataclass -from datetime import UTC -from datetime import datetime -from importlib import metadata -from typing import Any - -import zarr -from numcodecs import Blosc -from numpy.typing import DTypeLike -from zarr import Group -from zarr import open_group -from zarr.core.array import CompressorsLike - -from mdio.api.accessor import MDIOWriter -from mdio.api.io_utils import process_url -from mdio.core.grid import Grid -from mdio.core.utils_write import get_live_mask_chunksize -from mdio.core.utils_write import write_attribute -from mdio.segy.compat import mdio_segy_spec -from mdio.segy.helpers_segy import create_zarr_hierarchy - -try: - API_VERSION = metadata.version("multidimio") -except metadata.PackageNotFoundError: - API_VERSION = "unknown" - -DEFAULT_TEXT = [f"C{idx:02d}" + " " * 77 for idx in range(40)] -DEFAULT_TRACE_HEADER_DTYPE = mdio_segy_spec().trace.header.dtype - - -@dataclass -class MDIOVariableConfig: - """Configuration for creating an MDIO variable. - - This dataclass defines the parameters required to configure a variable - in an MDIO dataset, including its name, data type, chunking strategy, - compression method, and optional header data type. - - Attributes: - name: The name of the variable. - dtype: The data type of the variable (e.g., 'float32', 'int16'). - chunks: The chunk size for the variable along each dimension. - compressors: The compression algorithm(s) to use. 
- header_dtype: The data type for the variable's header. - """ - - name: str - dtype: str - chunks: tuple[int, ...] | None = None - compressors: CompressorsLike | None = None - header_dtype: DTypeLike | None = None - - -@dataclass -class MDIOCreateConfig: - """Configuration for creating an MDIO dataset. - - This dataclass encapsulates the parameters needed to create an MDIO dataset, - including the storage path, grid specification, and a list of variable - configurations. - - Attributes: - path: The file path or URI where the MDIO dataset will be created. - grid: The grid specification defining the dataset's spatial structure. - variables: A list of configurations for variables to be included in dataset. - """ - - path: str - grid: Grid - variables: list[MDIOVariableConfig] - - -def create_empty( - config: MDIOCreateConfig, - overwrite: bool = False, - storage_options: dict[str, Any] | None = None, - consolidate_meta: bool = True, -) -> Group: - """Create an empty MDIO dataset. - - This function initializes a new MDIO dataset at the specified path based on the provided - configuration. It constructs a Zarr hierarchy with groups for metadata and data, creates - datasets for each variable and its associated trace headers, and sets initial attributes - such as creation time, API version, grid dimensions, and basic statistics (all initialized - to zero). An empty 'live_mask' dataset is also created to track valid traces. - - Important: It is up to the user to update live_mask and other attributes. - - Args: - config: Configuration object specifying the dataset's path, grid structure, and a - list of variable configurations (e.g., name, dtype, chunks). - overwrite: If True, overwrites any existing dataset at the specified path. If - False, an error is raised if the dataset exists. Defaults to False. - storage_options: Options for the storage backend, such as credentials or settings for - cloud storage (e.g., S3, GCS). Defaults to None. - consolidate_meta: If True, consolidates metadata into a single file after creation, - improving performance for large metadata. Defaults to True. - - Returns: - Group: The root Zarr group representing the newly created MDIO dataset. - """ - zarr.config.set({"default_zarr_format": 2, "write_empty_chunks": False}) - - url = process_url(url=config.path, disk_cache=False) - root_group = open_group(url, mode="w", storage_options=storage_options) - root_group = create_zarr_hierarchy(root_group, overwrite) - - meta_group = root_group["metadata"] - data_group = root_group["data"] - - # Get UTC time, then add local timezone information offset. 
- iso_datetime = datetime.now(UTC).isoformat() - dimensions_dict = [dim.to_dict() for dim in config.grid.dims] - - write_attribute(name="created", zarr_group=root_group, attribute=iso_datetime) - write_attribute(name="api_version", zarr_group=root_group, attribute=API_VERSION) - write_attribute(name="dimension", zarr_group=root_group, attribute=dimensions_dict) - write_attribute(name="trace_count", zarr_group=root_group, attribute=0) - write_attribute(name="text_header", zarr_group=meta_group, attribute=DEFAULT_TEXT) - write_attribute(name="binary_header", zarr_group=meta_group, attribute={}) - - live_shape = config.grid.shape[:-1] - live_chunks = get_live_mask_chunksize(live_shape) - meta_group.create_array( - name="live_mask", - shape=live_shape, - chunks=live_chunks, - dtype="bool", - chunk_key_encoding={"name": "v2", "separator": "/"}, - ) - - for variable in config.variables: - data_group.create_array( - name=variable.name, - shape=config.grid.shape, - dtype=variable.dtype, - compressors=variable.compressors, - chunks=variable.chunks, - chunk_key_encoding={"name": "v2", "separator": "/"}, - ) - - header_dtype = variable.header_dtype or DEFAULT_TRACE_HEADER_DTYPE - meta_group.create_array( - name=f"{variable.name}_trace_headers", - shape=config.grid.shape[:-1], # Same spatial shape as data - chunks=variable.chunks[:-1], # Same spatial chunks as data - compressors=Blosc("zstd"), - dtype=header_dtype, - chunk_key_encoding={"name": "v2", "separator": "/"}, - ) - - stats = {"mean": 0, "std": 0, "rms": 0, "min": 0, "max": 0} - - for key, value in stats.items(): - write_attribute(name=key, zarr_group=root_group, attribute=value) - - if consolidate_meta: - zarr.consolidate_metadata(root_group.store) - - return root_group - - -def create_empty_like( - source_path: str, - dest_path: str, - overwrite: bool = False, - storage_options_input: dict[str, Any] | None = None, - storage_options_output: dict[str, Any] | None = None, -) -> None: - """Create an empty MDIO dataset with the same structure as an existing one. - - This function initializes a new empty MDIO dataset at the specified - destination path, replicating the structure of an existing dataset, including - its grid, variables, chunking strategy, compression, and headers. It copies - metadata such as text and binary headers from the source dataset and sets - initial attributes like creation time, API version, and zeroed statistics. - - Important: It is up to the user to update headers, `live_mask` and stats. - - Args: - source_path: The path or URI of the existing MDIO dataset to replicate. - dest_path: The path or URI where the new MDIO dataset will be created. - overwrite: If True, overwrites any existing dataset at the destination. - storage_options_input: Options for storage backend of the source dataset. - storage_options_output: Options for storage backend of the destination dataset. 
- """ - source_root = zarr.open_consolidated( - source_path, - mode="r", - storage_options=storage_options_input, - ) - src_data_grp = source_root["data"] - src_meta_grp = source_root["metadata"] - - grid = Grid.from_zarr(source_root) - - variables = [] - for var_name in src_data_grp: - variable = MDIOVariableConfig( - name=var_name, - dtype=src_data_grp[var_name].dtype, - chunks=src_data_grp[var_name].chunks, - compressors=src_data_grp[var_name].compressors, - header_dtype=src_meta_grp[f"{var_name}_trace_headers"].dtype, - ) - variables.append(variable) - - config = MDIOCreateConfig(path=dest_path, grid=grid, variables=variables) - - create_empty(config=config, overwrite=overwrite, storage_options=storage_options_output) - - writer = MDIOWriter(dest_path, storage_options=storage_options_output) - writer.text_header = src_meta_grp.attrs["text_header"] - writer.binary_header = src_meta_grp.attrs["binary_header"] diff --git a/src/mdio/core/grid.py b/src/mdio/core/grid.py index ec66ee471..97e3bfac2 100644 --- a/src/mdio/core/grid.py +++ b/src/mdio/core/grid.py @@ -2,22 +2,22 @@ from __future__ import annotations -import inspect from dataclasses import dataclass from typing import TYPE_CHECKING import numpy as np import zarr +from zarr.codecs import BloscCodec from mdio.constants import UINT32_MAX -from mdio.core import Dimension -from mdio.core.serialization import Serializer from mdio.core.utils_write import get_constrained_chunksize if TYPE_CHECKING: from segy.arrays import HeaderArray from zarr import Array as ZarrArray + from mdio.core import Dimension + @dataclass class Grid: @@ -90,24 +90,6 @@ def get_max(self, name: str) -> float: """Get maximum value of a dimension by name.""" return self.select_dim(name).max().item() - def serialize(self, stream_format: str) -> str: - """Serialize the grid to a string buffer.""" - serializer = GridSerializer(stream_format) - return serializer.serialize(self) - - @classmethod - def deserialize(cls, stream: str, stream_format: str) -> Grid: - """Deserialize a string buffer into a Grid instance.""" - serializer = GridSerializer(stream_format) - return serializer.deserialize(stream) - - @classmethod - def from_zarr(cls, zarr_root: zarr.Group) -> Grid: - """Create a Grid instance from Zarr group attributes.""" - dims_list = zarr_root.attrs["dimension"] - dims_list = [Dimension.from_dict(dim) for dim in dims_list] - return cls(dims_list) - def build_map(self, index_headers: HeaderArray) -> None: """Build trace mapping and live mask from header indices. 
@@ -126,8 +108,10 @@ def build_map(self, index_headers: HeaderArray) -> None: dtype=map_dtype, max_bytes=self._INTERNAL_CHUNK_SIZE_TARGET, ) - self.map = zarr.full(live_shape, fill_value, dtype=map_dtype, chunks=chunks) - self.live_mask = zarr.zeros(live_shape, dtype=bool, chunks=chunks) + grid_compressor = BloscCodec(cname="zstd") + common_kwargs = {"shape": live_shape, "chunks": chunks, "compressors": grid_compressor, "store": None} + self.map = zarr.create_array(fill_value=fill_value, dtype=map_dtype, **common_kwargs) + self.live_mask = zarr.create_array(fill_value=0, dtype=bool, **common_kwargs) # Calculate batch size memory_per_trace_index = index_headers.itemsize @@ -151,22 +135,3 @@ def build_map(self, index_headers: HeaderArray) -> None: self.map.vindex[live_dim_indices] = trace_indices self.live_mask.vindex[live_dim_indices] = True - - -class GridSerializer(Serializer): - """Serializer implementation for Grid.""" - - def serialize(self, grid: Grid) -> str: - """Serialize Grid into buffer.""" - payload = [dim.to_dict() for dim in grid.dims] - return self.serialize_func(payload) - - def deserialize(self, stream: str) -> Grid: - """Deserialize buffer into Grid.""" - signature = inspect.signature(Grid) - - payload = self.deserialize_func(stream) - payload = [Dimension.from_dict(dim) for dim in payload] - payload = self.validate_payload({"dims": payload}, signature) - - return Grid(**payload) diff --git a/src/mdio/core/serialization.py b/src/mdio/core/serialization.py deleted file mode 100644 index ace45fce1..000000000 --- a/src/mdio/core/serialization.py +++ /dev/null @@ -1,93 +0,0 @@ -"""(De)serialization factory design pattern. - -Current support for JSON and YAML. -""" - -import json -from abc import ABC -from abc import abstractmethod -from collections.abc import Callable -from inspect import Signature - -import yaml - - -class Serializer(ABC): - """Serializer base class. - - Here we define the interface for any serializer implementation. - - Args: - stream_format: Stream format. Must be in {"JSON", "YAML"}. 
- """ - - def __init__(self, stream_format: str) -> None: - self.format = stream_format - self.serialize_func = get_serializer(stream_format) - self.deserialize_func = get_deserializer(stream_format) - - @abstractmethod - def serialize(self, payload: dict) -> str: - """Abstract method for serialize.""" - - @abstractmethod - def deserialize(self, stream: str) -> dict: - """Abstract method for deserialize.""" - - @staticmethod - def validate_payload(payload: dict, signature: Signature) -> dict: - """Validate if required keys exist in the payload for a function signature.""" - observed = set(payload) - expected = set(signature.parameters) - - if not expected.issubset(observed): - msg = f"Key mismatch: {observed}, expected {expected}" - raise KeyError(msg) - - if len(observed) != len(expected): - print(f"Ignoring extra key: {observed - expected}") - payload = {key: payload[key] for key in expected} - - return payload - - -def get_serializer(stream_format: str) -> Callable: - """Get serializer based on format.""" - stream_format = stream_format.upper() - if stream_format == "JSON": - return _serialize_to_json - if stream_format == "YAML": - return _serialize_to_yaml - msg = f"Unsupported serializer for format: {stream_format}" - raise ValueError(msg) - - -def get_deserializer(stream_format: str) -> Callable: - """Get deserializer based on format.""" - stream_format = stream_format.upper() - if stream_format == "JSON": - return _deserialize_json - if stream_format == "YAML": - return _deserialize_yaml - msg = f"Unsupported deserializer for format: {stream_format}" - raise ValueError(msg) - - -def _serialize_to_json(payload: dict) -> str: - """Convert dictionary to JSON string.""" - return json.dumps(payload) - - -def _serialize_to_yaml(payload: dict) -> str: - """Convert dictionary to YAML string.""" - return yaml.dump(payload, sort_keys=False) - - -def _deserialize_json(stream: str) -> dict: - """Convert JSON string to dictionary.""" - return json.loads(stream) - - -def _deserialize_yaml(stream: str) -> dict: - """Convert YAML string to dictionary.""" - return yaml.safe_load(stream) diff --git a/src/mdio/core/storage_location.py b/src/mdio/core/storage_location.py deleted file mode 100644 index 3a5adb2a1..000000000 --- a/src/mdio/core/storage_location.py +++ /dev/null @@ -1,87 +0,0 @@ -"""StorageLocation class for managing local and cloud storage locations.""" - -from pathlib import Path -from typing import Any - -import fsspec - - -# TODO(Dmitriy Repin): Reuse fsspec functions for some methods we implemented here -# https://github.com/TGSAI/mdio-python/issues/597 -class StorageLocation: - """A class to represent a local or cloud storage location for SEG-Y or MDIO files. - - This class abstracts the storage location, allowing for both local file paths and - cloud storage URIs (e.g., S3, GCS). It uses fsspec to check existence and manage options. - Note, we do not want to make it a dataclass because we want the uri and the options to - be read-only immutable properties. - - uri: The URI of the storage location (e.g., '/path/to/file', 'file:///path/to/file', - 's3://bucket/path', 'gs://bucket/path'). - options: Optional dictionary of options for the cloud, such as credentials. 
- - """ - - def __init__(self, uri: str = "", options: dict[str, Any] = None): - self._uri = uri - self._options = options or {} - self._fs = None - - if uri.startswith(("s3://", "gs://")): - return - - if uri.startswith("file://"): - self._uri = self._uri.removeprefix("file://") - # For local paths, ensure they are absolute and resolved - self._uri = str(Path(self._uri).resolve()) - return - - @property - def uri(self) -> str: - """Get the URI (read-only).""" - return self._uri - - @property - def options(self) -> dict[str, Any]: - """Get the options (read-only).""" - # Return a copy to prevent external modification - return self._options.copy() - - @property - def _filesystem(self) -> fsspec.AbstractFileSystem: - """Get the fsspec filesystem instance for this storage location.""" - if self._fs is None: - self._fs = fsspec.filesystem(self._protocol, **self._options) - return self._fs - - @property - def _path(self) -> str: - """Extract the path portion from the URI.""" - if "://" in self._uri: - return self._uri.split("://", 1)[1] - return self._uri # For local paths without file:// prefix - - @property - def _protocol(self) -> str: - """Extract the protocol/scheme from the URI.""" - if "://" in self._uri: - return self._uri.split("://", 1)[0] - return "file" # Default to file protocol - - def exists(self) -> bool: - """Check if the storage location exists using fsspec.""" - try: - return self._filesystem.exists(self._path) - except Exception as e: - # Log the error and return False for safety - # In a production environment, you might want to use proper logging - print(f"Error checking existence of {self._uri}: {e}") - return False - - def __str__(self) -> str: - """String representation of the storage location.""" - return self._uri - - def __repr__(self) -> str: - """Developer representation of the storage location.""" - return f"StorageLocation(uri='{self._uri}', options={self._options})" diff --git a/src/mdio/schemas/chunk_grid.py b/src/mdio/schemas/chunk_grid.py index 40d573a47..757b6103d 100644 --- a/src/mdio/schemas/chunk_grid.py +++ b/src/mdio/schemas/chunk_grid.py @@ -10,13 +10,13 @@ class RegularChunkShape(CamelCaseStrictModel): """Represents regular chunk sizes along each dimension.""" - chunk_shape: list[int] = Field(..., description="Lengths of the chunk along each dimension of the array.") + chunk_shape: tuple[int, ...] = Field(..., description="Lengths of the chunk along each dimension of the array.") class RectilinearChunkShape(CamelCaseStrictModel): """Represents irregular chunk sizes along each dimension.""" - chunk_shape: list[list[int]] = Field( + chunk_shape: tuple[tuple[int, ...], ...] 
= Field( ..., description="Lengths of the chunk along each dimension of the array.", ) diff --git a/src/mdio/schemas/compressors.py b/src/mdio/schemas/compressors.py index 2eb87f7ad..bb4729fbc 100644 --- a/src/mdio/schemas/compressors.py +++ b/src/mdio/schemas/compressors.py @@ -7,51 +7,25 @@ from __future__ import annotations -from enum import IntEnum from enum import StrEnum from pydantic import Field from pydantic import model_validator +from zarr.codecs import BloscCname +from zarr.codecs import BloscShuffle from mdio.schemas.core import CamelCaseStrictModel -class BloscAlgorithm(StrEnum): - """Enum for Blosc algorithm options.""" - - BLOSCLZ = "blosclz" - LZ4 = "lz4" - LZ4HC = "lz4hc" - ZLIB = "zlib" - ZSTD = "zstd" - - -class BloscShuffle(IntEnum): - """Enum for Blosc shuffle options.""" - - NOSHUFFLE = 0 - SHUFFLE = 1 - BITSHUFFLE = 2 - AUTOSHUFFLE = -1 - - class Blosc(CamelCaseStrictModel): """Data Model for Blosc options.""" name: str = Field(default="blosc", description="Name of the compressor.") - algorithm: BloscAlgorithm = Field( - default=BloscAlgorithm.LZ4, - description="The Blosc compression algorithm to be used.", - ) - level: int = Field(default=5, ge=0, le=9, description="The compression level.") - shuffle: BloscShuffle = Field( - default=BloscShuffle.SHUFFLE, - description="The shuffle strategy to be applied before compression.", - ) - blocksize: int = Field( - default=0, - description="The size of the block to be used for compression.", - ) + cname: BloscCname = Field(default=BloscCname.zstd, description="Compression algorithm name.") + clevel: int = Field(default=5, ge=0, le=9, description="Compression level (integer 0–9)") + shuffle: BloscShuffle | None = Field(default=None, description="Shuffling mode before compression.") + typesize: int | None = Field(default=None, description="The size in bytes that the shuffle is performed over.") + blocksize: int = Field(default=0, description="The size (in bytes) of blocks to divide data before compression.") zfp_mode_map = { diff --git a/src/mdio/schemas/v0/__init__.py b/src/mdio/schemas/v0/__init__.py deleted file mode 100644 index 9b9533049..000000000 --- a/src/mdio/schemas/v0/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -"""Schema specific to MDIO v0.""" - -from mdio.schemas.v0.dataset import DatasetModelV0 - -__all__ = ["DatasetModelV0"] diff --git a/src/mdio/schemas/v0/dataset.py b/src/mdio/schemas/v0/dataset.py deleted file mode 100644 index 50103736c..000000000 --- a/src/mdio/schemas/v0/dataset.py +++ /dev/null @@ -1,82 +0,0 @@ -"""Dataset model for MDIO V0.""" - -from __future__ import annotations - -from pydantic import AwareDatetime -from pydantic import Field - -from mdio.schemas.base import BaseArray -from mdio.schemas.base import BaseDataset -from mdio.schemas.core import CamelCaseStrictModel -from mdio.schemas.core import StrictModel - - -class DimensionModelV0(CamelCaseStrictModel): - """Represents dimension schema for MDIO v0.""" - - name: str = Field(..., description="Name of the dimension.") - coords: list[int] = Field(..., description="Coordinate labels (ticks).") - - -class DatasetMetadataModelV0(StrictModel): - """Represents dataset attributes schema for MDIO v0.""" - - api_version: str = Field( - ..., - description="MDIO version.", - ) - - created: AwareDatetime = Field( - ..., - description="Creation time with TZ info.", - ) - - dimension: list[DimensionModelV0] = Field( - ..., - description="Dimensions.", - ) - - mean: float | None = Field( - default=None, - description="Mean value of the samples.", - ) 
- - # Statistical information - std: float | None = Field(default=None, description="Standard deviation of the samples.") - - rms: float | None = Field(default=None, description="Root mean squared value of the samples.") - - min: float | None = Field( - default=None, - description="Minimum value of the samples.", - ) - - max: float | None = Field( - default=None, - description="Maximum value of the samples.", - ) - - trace_count: int | None = Field(default=None, description="Number of traces in the SEG-Y file.") - - -class VariableModelV0(BaseArray): - """Represents an MDIO v0 variable schema.""" - - -class DatasetModelV0(BaseDataset): - """Represents an MDIO v0 dataset schema.""" - - seismic: list[VariableModelV0] = Field( - ..., - description="Variable containing seismic.", - ) - - headers: list[VariableModelV0] = Field( - ..., - description="Variable containing headers.", - ) - - metadata: DatasetMetadataModelV0 = Field( - ..., - description="Dataset metadata.", - ) diff --git a/src/mdio/schemas/v1/dataset_builder.py b/src/mdio/schemas/v1/dataset_builder.py index 7c56c21cd..ba9fe528a 100644 --- a/src/mdio/schemas/v1/dataset_builder.py +++ b/src/mdio/schemas/v1/dataset_builder.py @@ -136,7 +136,7 @@ def add_coordinate( # noqa: PLR0913 name: str, *, long_name: str = None, - dimensions: list[str], + dimensions: tuple[str, ...], data_type: ScalarType, compressor: Blosc | ZFP | None = None, metadata_info: CoordinateMetadataList | None = None, @@ -218,10 +218,10 @@ def add_variable( # noqa: PLR0913 name: str, *, long_name: str = None, - dimensions: list[str], + dimensions: tuple[str, ...], data_type: ScalarType | StructuredType, compressor: Blosc | ZFP | None = None, - coordinates: list[str] | None = None, + coordinates: tuple[str, ...] | None = None, metadata_info: VariableMetadataList | None = None, ) -> "MDIODatasetBuilder": """Add a variable after adding at least one dimension and, optionally, coordinate. 
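With the builder signatures above switching from `list[str]` to `tuple[str, ...]`, call sites now pass tuples for `dimensions` and `coordinates`. A hypothetical call-site sketch (builder construction and dimension registration are omitted; `builder`, the coordinate/variable names, and the `ScalarType` import path follow this diff's usage elsewhere and are illustrative only):

    from mdio.schemas import ScalarType

    builder.add_coordinate(
        "cdp_x",
        dimensions=("inline", "crossline"),   # tuple, not list, after this change
        data_type=ScalarType.FLOAT64,
    )
    builder.add_variable(
        name="amplitude",
        dimensions=("inline", "crossline", "time"),
        data_type=ScalarType.FLOAT32,
        coordinates=("cdp_x", "cdp_y"),       # illustrative coordinate names
    )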
diff --git a/src/mdio/schemas/v1/dataset_serializer.py b/src/mdio/schemas/v1/dataset_serializer.py index af513f8b6..1e0a92d77 100644 --- a/src/mdio/schemas/v1/dataset_serializer.py +++ b/src/mdio/schemas/v1/dataset_serializer.py @@ -2,10 +2,10 @@ import numpy as np from dask import array as dask_array -from numcodecs import Blosc as nc_Blosc +from dask.array.core import normalize_chunks from xarray import DataArray as xr_DataArray from xarray import Dataset as xr_Dataset -from zarr.core.chunk_key_encodings import V2ChunkKeyEncoding +from zarr.codecs import BloscCodec from mdio.converters.type_converter import to_numpy_dtype @@ -114,25 +114,20 @@ def _get_zarr_shape(var: Variable, all_named_dims: dict[str, NamedDimension]) -> def _get_zarr_chunks(var: Variable, all_named_dims: dict[str, NamedDimension]) -> tuple[int, ...]: """Get the chunk shape for a variable, defaulting to its shape if no chunk grid is defined.""" if var.metadata is not None and var.metadata.chunk_grid is not None: - return tuple(var.metadata.chunk_grid.configuration.chunk_shape) + return var.metadata.chunk_grid.configuration.chunk_shape # Default to full shape if no chunk grid is defined return _get_zarr_shape(var, all_named_dims=all_named_dims) def _convert_compressor( compressor: mdio_Blosc | mdio_ZFP | None, -) -> nc_Blosc | zfpy_ZFPY | None: +) -> BloscCodec | zfpy_ZFPY | None: """Convert a compressor to a numcodecs compatible format.""" if compressor is None: return None if isinstance(compressor, mdio_Blosc): - return nc_Blosc( - cname=compressor.algorithm.value, - clevel=compressor.level, - shuffle=compressor.shuffle.value, - blocksize=compressor.blocksize if compressor.blocksize > 0 else 0, - ) + return BloscCodec(**compressor.model_dump(exclude={"name"})) if isinstance(compressor, mdio_ZFP): if zfpy_ZFPY is None: @@ -150,17 +145,13 @@ def _convert_compressor( def _get_fill_value(data_type: ScalarType | StructuredType | str) -> any: - """Get the fill value for a given data type. - - The Zarr fill_value is a scalar value providing the default value to use for - uninitialized portions of the array, or null if no fill_value is to be used - https://zarr-specs.readthedocs.io/en/latest/v2/v2.0.html - """ + """Get the fill value for a given data type.""" if isinstance(data_type, ScalarType): return fill_value_map.get(data_type) if isinstance(data_type, StructuredType): - d_type = to_numpy_dtype(data_type) - return np.zeros((), dtype=d_type) + numpy_dtype = to_numpy_dtype(data_type) + fill_value = (0,) * len(numpy_dtype.fields) + return np.void(fill_value, dtype=numpy_dtype) if isinstance(data_type, str): return "" # If we do not have a fill value for this type, use None @@ -183,20 +174,31 @@ def to_xarray_dataset(mdio_ds: Dataset) -> xr_Dataset: # noqa: PLR0912 Returns: The constructed dataset with proper MDIO structure and metadata. 
""" - # See the xarray tutorial for more details on how to create datasets: - # https://tutorial.xarray.dev/fundamentals/01.1_creating_data_structures.html - all_named_dims = _get_all_named_dimensions(mdio_ds) # First pass: Build all variables data_arrays: dict[str, xr_DataArray] = {} for v in mdio_ds.variables: + # Retrieve the array shape, data type, and original chunk sizes shape = _get_zarr_shape(v, all_named_dims=all_named_dims) dtype = to_numpy_dtype(v.data_type) - chunks = _get_zarr_chunks(v, all_named_dims=all_named_dims) + original_chunks = _get_zarr_chunks(v, all_named_dims=all_named_dims) + + # For efficient lazy array creation with Dask use larger chunks to minimize the task graph size + # Initialize with original chunks for lazy array creation + lazy_chunks = original_chunks + if shape != original_chunks: + # Compute automatic chunk sizes based on heuristics, respecting original chunks where possible + auto_chunks = normalize_chunks("auto", shape=shape, dtype=dtype, previous_chunks=original_chunks) + + # Extract the primary (uniform) chunk size for each dimension, ignoring any variable remainder chunks + uniform_auto = tuple(dim_chunks[0] for dim_chunks in auto_chunks) + + # Ensure creation chunks are at least as large as the original chunks to avoid splitting chunks + lazy_chunks = tuple(max(auto, orig) for auto, orig in zip(uniform_auto, original_chunks, strict=True)) + + data = dask_array.full(shape=shape, dtype=dtype, chunks=lazy_chunks, fill_value=_get_fill_value(v.data_type)) - # Use dask.array.zeros to create a lazy array - data = dask_array.full(shape=shape, dtype=dtype, chunks=chunks, fill_value=_get_fill_value(v.data_type)) # Create a DataArray for the variable. We will set coords in the second pass dim_names = _get_dimension_names(v) data_array = xr_DataArray(data, dims=dim_names) @@ -213,20 +215,11 @@ def to_xarray_dataset(mdio_ds: Dataset) -> xr_Dataset: # noqa: PLR0912 if v.long_name: data_array.attrs["long_name"] = v.long_name - # Create a custom chunk key encoding with "/" as separator - chunk_key_encoding = V2ChunkKeyEncoding(separator="/").to_dict() encoding = { - "chunks": chunks, - "chunk_key_encoding": chunk_key_encoding, + "chunks": original_chunks, "compressor": _convert_compressor(v.compressor), + "fill_value": _get_fill_value(v.data_type), } - # NumPy structured data types have fields attribute, while scalar types do not. - if not hasattr(v.data_type, "fields"): - # TODO(Dmitriy Repin): work around of the bug - # https://github.com/TGSAI/mdio-python/issues/582 - # For structured data types we will not use the _FillValue - # NOTE: See Zarr documentation on use of fill_value and _FillValue in Zarr v2 vs v3 - encoding["_FillValue"] = _get_fill_value(v.data_type) data_array.encoding = encoding diff --git a/src/mdio/schemas/v1/templates/abstract_dataset_template.py b/src/mdio/schemas/v1/templates/abstract_dataset_template.py index f3e1f8027..cc7d333be 100644 --- a/src/mdio/schemas/v1/templates/abstract_dataset_template.py +++ b/src/mdio/schemas/v1/templates/abstract_dataset_template.py @@ -19,54 +19,25 @@ class AbstractDatasetTemplate(ABC): """Abstract base class that defines the template method for Dataset building factory. - The template method defines the skeleton of the data processing algorithm, - while allowing subclasses to override specific steps. + The template method defines the skeleton of the data processing algorithm, while allowing subclasses + to override specific steps. 
""" def __init__(self, domain: str = "") -> None: - # Template attributes to be overridden by subclasses - # Domain of the seismic data, e.g. "time" or "depth" self._trace_domain = domain.lower() - # Names of all coordinate dimensions in the dataset - # e.g. ["cdp"] for 2D post-stack depth - # e.g. ["inline", "crossline"] for 3D post-stack - # e.g. ["inline", "crossline"] for 3D pre-stack CDP gathers - # Note: For pre-stack Shot gathers, the coordinates are defined differently - # and are not directly tied to _coord_dim_names. - self._coord_dim_names = [] - # *ORDERED* list of names of all dimensions in the dataset - # e.g. ["cdp", "depth"] for 2D post-stack depth - # e.g. ["inline", "crossline", "depth"] for 3D post-stack depth - # e.g. ["inline", "crossline", "offset", "depth"] for 3D pre-stack depth CPD gathers - # e.g. ["shot_point", "cable", "channel", "time"] for 3D pre-stack - # time Shot gathers - self._dim_names = [] - # Names of all coordinates in the dataset - # e.g. ["cdp_x", "cdp_y"] for 2D post-stack depth - # e.g. ["cdp_x", "cdp_y"] for 3D post-stack depth - # e.g. ["cdp_x", "cdp_y"] for 3D pre-stack CPD depth - # e.g. ["gun", "source_coord_x", "source_coord_y", "group_coord_x", "group_coord_y"] - # for 3D pre-stack time Shot gathers - self._coord_names = [] - # Chunk shape for the variable in the dataset - # e.g. [1024, 1024] for 2D post-stack depth - # e.g. [128, 128, 128] for 3D post-stack depth - # e.g. [1, 1, 512, 4096] for 3D pre-stack CPD depth - # e.g. [1, 1, 512, 4096] for 3D pre-stack time Shot gathers - self._var_chunk_shape = [] - - # Variables instantiated when build_dataset() is called - self._builder: MDIODatasetBuilder = None - # Sizes of the dimensions in the dataset, to be set when build_dataset() is called - self._dim_sizes = [] - # Horizontal units for the coordinates (e.g, "m", "ft"), to be set when - # build_dataset() is called + self._coord_dim_names = () + self._dim_names = () + self._coord_names = () + self._var_chunk_shape = () + + self._builder: MDIODatasetBuilder | None = None + self._dim_sizes = () self._horizontal_coord_unit = None def build_dataset( self, name: str, - sizes: list[int], + sizes: tuple[int, ...], horizontal_coord_unit: AllUnits, headers: StructuredType = None, ) -> Dataset: @@ -111,17 +82,17 @@ def trace_domain(self) -> str: return self._trace_domain @property - def dimension_names(self) -> list[str]: + def dimension_names(self) -> tuple[str, ...]: """Returns the names of the dimensions.""" return copy.deepcopy(self._dim_names) @property - def coordinate_names(self) -> list[str]: + def coordinate_names(self) -> tuple[str, ...]: """Returns the names of the coordinates.""" return copy.deepcopy(self._coord_names) @property - def full_chunk_size(self) -> list[int]: + def full_chunk_size(self) -> tuple[int, ...]: """Returns the chunk size for the variables.""" return copy.deepcopy(self._var_chunk_shape) @@ -133,7 +104,7 @@ def _name(self) -> str: Must be implemented by subclasses. Returns: - str: The name of the template + The name of the template """ @property @@ -144,7 +115,7 @@ def _default_variable_name(self) -> str: custom data variable name. 
Returns: - str: The name of the data variable + The name of the data variable """ return "amplitude" @@ -177,7 +148,7 @@ def _add_coordinates(self) -> None: for name in self._dim_names: self._builder.add_coordinate( name, - dimensions=[name], + dimensions=(name,), data_type=ScalarType.INT32, metadata_info=None, ) @@ -204,7 +175,7 @@ def _add_trace_mask(self) -> None: name="trace_mask", dimensions=self._dim_names[:-1], # All dimensions except vertical (the last one) data_type=ScalarType.BOOL, - compressor=compressors.Blosc(algorithm=compressors.BloscAlgorithm.ZSTD), + compressor=compressors.Blosc(cname=compressors.BloscCname.zstd), # also default in zarr3 coordinates=self._coord_names, metadata_info=None, ) @@ -217,7 +188,7 @@ def _add_trace_headers(self, headers: StructuredType) -> None: name="headers", dimensions=self._dim_names[:-1], # All dimensions except vertical (the last one) data_type=headers, - compressor=compressors.Blosc(algorithm=compressors.BloscAlgorithm.ZSTD), + compressor=compressors.Blosc(cname=compressors.BloscCname.zstd), # also default in zarr3 coordinates=self._coord_names, metadata_info=[ ChunkGridMetadata( @@ -236,11 +207,11 @@ def _add_variables(self) -> None: name=self.default_variable_name, dimensions=self._dim_names, data_type=ScalarType.FLOAT32, - compressor=compressors.Blosc(algorithm=compressors.BloscAlgorithm.ZSTD), + compressor=compressors.Blosc(cname=compressors.BloscCname.zstd), # also default in zarr3 coordinates=self._coord_names, metadata_info=[ ChunkGridMetadata( chunk_grid=RegularChunkGrid(configuration=RegularChunkShape(chunk_shape=self._var_chunk_shape)) - ) + ), ], ) diff --git a/src/mdio/schemas/v1/templates/seismic_2d_poststack.py b/src/mdio/schemas/v1/templates/seismic_2d_poststack.py index 297d598bc..d22e24236 100644 --- a/src/mdio/schemas/v1/templates/seismic_2d_poststack.py +++ b/src/mdio/schemas/v1/templates/seismic_2d_poststack.py @@ -10,10 +10,10 @@ class Seismic2DPostStackTemplate(AbstractDatasetTemplate): def __init__(self, domain: str): super().__init__(domain=domain) - self._coord_dim_names = ["cdp"] - self._dim_names = [*self._coord_dim_names, self._trace_domain] - self._coord_names = ["cdp_x", "cdp_y"] - self._var_chunk_shape = [1024, 1024] + self._coord_dim_names = ("cdp",) + self._dim_names = (*self._coord_dim_names, self._trace_domain) + self._coord_names = ("cdp_x", "cdp_y") + self._var_chunk_shape = (1024, 1024) @property def _name(self) -> str: diff --git a/src/mdio/schemas/v1/templates/seismic_2d_prestack_cdp.py b/src/mdio/schemas/v1/templates/seismic_2d_prestack_cdp.py index 0f9884d71..f52035ab3 100644 --- a/src/mdio/schemas/v1/templates/seismic_2d_prestack_cdp.py +++ b/src/mdio/schemas/v1/templates/seismic_2d_prestack_cdp.py @@ -10,10 +10,10 @@ class Seismic2DPreStackCDPTemplate(AbstractDatasetTemplate): def __init__(self, domain: str): super().__init__(domain=domain) - self._coord_dim_names = ["cdp", "offset"] - self._dim_names = [*self._coord_dim_names, self._trace_domain] - self._coord_names = ["cdp_x", "cdp_y"] - self._var_chunk_shape = [1, 512, 4096] + self._coord_dim_names = ("cdp", "offset") + self._dim_names = (*self._coord_dim_names, self._trace_domain) + self._coord_names = ("cdp_x", "cdp_y") + self._var_chunk_shape = (1, 512, 4096) @property def _name(self) -> str: diff --git a/src/mdio/schemas/v1/templates/seismic_2d_prestack_shot.py b/src/mdio/schemas/v1/templates/seismic_2d_prestack_shot.py index 6113aba46..1db964738 100644 --- a/src/mdio/schemas/v1/templates/seismic_2d_prestack_shot.py +++ 
b/src/mdio/schemas/v1/templates/seismic_2d_prestack_shot.py @@ -12,10 +12,10 @@ class Seismic2DPreStackShotTemplate(AbstractDatasetTemplate): def __init__(self, domain: str): super().__init__(domain=domain) - self._coord_dim_names = ["shot_point", "channel"] # Custom coordinate definition for shot gathers - self._dim_names = [*self._coord_dim_names, self._trace_domain] - self._coord_names = ["gun", "source_coord_x", "source_coord_y", "group_coord_x", "group_coord_y"] - self._var_chunk_shape = [1, 512, 4096] + self._coord_dim_names = ("shot_point", "channel") # Custom coordinate definition for shot gathers + self._dim_names = (*self._coord_dim_names, self._trace_domain) + self._coord_names = ("gun", "source_coord_x", "source_coord_y", "group_coord_x", "group_coord_y") + self._var_chunk_shape = (1, 512, 4096) @property def _name(self) -> str: @@ -35,7 +35,7 @@ def _add_coordinates(self) -> None: for name in self._dim_names: self._builder.add_coordinate( name, - dimensions=[name], + dimensions=(name,), data_type=ScalarType.INT32, metadata_info=None, ) @@ -43,31 +43,31 @@ def _add_coordinates(self) -> None: # Add non-dimension coordinates self._builder.add_coordinate( "gun", - dimensions=["shot_point", "channel"], + dimensions=("shot_point", "channel"), data_type=ScalarType.UINT8, metadata_info=[AllUnits(units_v1=None)], ) self._builder.add_coordinate( "source_coord_x", - dimensions=["shot_point", "channel"], + dimensions=("shot_point", "channel"), data_type=ScalarType.FLOAT64, metadata_info=[self._horizontal_coord_unit], ) self._builder.add_coordinate( "source_coord_y", - dimensions=["shot_point", "channel"], + dimensions=("shot_point", "channel"), data_type=ScalarType.FLOAT64, metadata_info=[self._horizontal_coord_unit], ) self._builder.add_coordinate( "group_coord_x", - dimensions=["shot_point", "channel"], + dimensions=("shot_point", "channel"), data_type=ScalarType.FLOAT64, metadata_info=[self._horizontal_coord_unit], ) self._builder.add_coordinate( "group_coord_y", - dimensions=["shot_point", "channel"], + dimensions=("shot_point", "channel"), data_type=ScalarType.FLOAT64, metadata_info=[self._horizontal_coord_unit], ) diff --git a/src/mdio/schemas/v1/templates/seismic_3d_poststack.py b/src/mdio/schemas/v1/templates/seismic_3d_poststack.py index 78021f72e..f21ffdbe7 100644 --- a/src/mdio/schemas/v1/templates/seismic_3d_poststack.py +++ b/src/mdio/schemas/v1/templates/seismic_3d_poststack.py @@ -10,10 +10,10 @@ class Seismic3DPostStackTemplate(AbstractDatasetTemplate): def __init__(self, domain: str): super().__init__(domain=domain) # Template attributes to be overridden by subclasses - self._coord_dim_names = ["inline", "crossline"] - self._dim_names = [*self._coord_dim_names, self._trace_domain] - self._coord_names = ["cdp_x", "cdp_y"] - self._var_chunk_shape = [128, 128, 128] + self._coord_dim_names = ("inline", "crossline") + self._dim_names = (*self._coord_dim_names, self._trace_domain) + self._coord_names = ("cdp_x", "cdp_y") + self._var_chunk_shape = (128, 128, 128) @property def _name(self) -> str: diff --git a/src/mdio/schemas/v1/templates/seismic_3d_prestack_cdp.py b/src/mdio/schemas/v1/templates/seismic_3d_prestack_cdp.py index 2ae22b550..b35e047d1 100644 --- a/src/mdio/schemas/v1/templates/seismic_3d_prestack_cdp.py +++ b/src/mdio/schemas/v1/templates/seismic_3d_prestack_cdp.py @@ -10,10 +10,10 @@ class Seismic3DPreStackCDPTemplate(AbstractDatasetTemplate): def __init__(self, domain: str): super().__init__(domain=domain) - self._coord_dim_names = ["inline", "crossline", 
"offset"] - self._dim_names = [*self._coord_dim_names, self._trace_domain] - self._coord_names = ["cdp_x", "cdp_y"] - self._var_chunk_shape = [1, 1, 512, 4096] + self._coord_dim_names = ("inline", "crossline", "offset") + self._dim_names = (*self._coord_dim_names, self._trace_domain) + self._coord_names = ("cdp_x", "cdp_y") + self._var_chunk_shape = (1, 1, 512, 4096) @property def _name(self) -> str: diff --git a/src/mdio/schemas/v1/templates/seismic_3d_prestack_coca.py b/src/mdio/schemas/v1/templates/seismic_3d_prestack_coca.py index 0903fc770..808b849d1 100644 --- a/src/mdio/schemas/v1/templates/seismic_3d_prestack_coca.py +++ b/src/mdio/schemas/v1/templates/seismic_3d_prestack_coca.py @@ -12,10 +12,10 @@ class Seismic3DPreStackCocaTemplate(AbstractDatasetTemplate): def __init__(self, domain: str): super().__init__(domain=domain) - self._coord_dim_names = ["inline", "crossline", "offset", "azimuth"] - self._dim_names = [*self._coord_dim_names, self._trace_domain] - self._coord_names = ["cdp_x", "cdp_y"] - self._var_chunk_shape = [8, 8, 32, 1, 1024] + self._coord_dim_names = ("inline", "crossline", "offset", "azimuth") + self._dim_names = (*self._coord_dim_names, self._trace_domain) + self._coord_names = ("cdp_x", "cdp_y") + self._var_chunk_shape = (8, 8, 32, 1, 1024) @property def _name(self) -> str: @@ -34,43 +34,43 @@ def _add_coordinates(self) -> None: # Add dimension coordinates self._builder.add_coordinate( "inline", - dimensions=["inline"], + dimensions=("inline",), data_type=ScalarType.INT32, ) self._builder.add_coordinate( "crossline", - dimensions=["crossline"], + dimensions=("crossline",), data_type=ScalarType.INT32, ) self._builder.add_coordinate( "offset", - dimensions=["offset"], + dimensions=("offset",), data_type=ScalarType.INT32, metadata_info=[self._horizontal_coord_unit], ) angle_unit = AllUnits(units_v1={"angle": "deg"}) self._builder.add_coordinate( "azimuth", - dimensions=["azimuth"], + dimensions=("azimuth",), data_type=ScalarType.FLOAT32, metadata_info=[angle_unit], ) self._builder.add_coordinate( self.trace_domain, - dimensions=[self.trace_domain], + dimensions=(self.trace_domain,), data_type=ScalarType.INT32, ) # Add non-dimension coordinates self._builder.add_coordinate( "cdp_x", - dimensions=["inline", "crossline"], + dimensions=("inline", "crossline"), data_type=ScalarType.FLOAT64, metadata_info=[self._horizontal_coord_unit], ) self._builder.add_coordinate( "cdp_y", - dimensions=["inline", "crossline"], + dimensions=("inline", "crossline"), data_type=ScalarType.FLOAT64, metadata_info=[self._horizontal_coord_unit], ) diff --git a/src/mdio/schemas/v1/templates/seismic_3d_prestack_shot.py b/src/mdio/schemas/v1/templates/seismic_3d_prestack_shot.py index d50f621c9..ff02377b4 100644 --- a/src/mdio/schemas/v1/templates/seismic_3d_prestack_shot.py +++ b/src/mdio/schemas/v1/templates/seismic_3d_prestack_shot.py @@ -12,10 +12,10 @@ class Seismic3DPreStackShotTemplate(AbstractDatasetTemplate): def __init__(self, domain: str): super().__init__(domain=domain) - self._coord_dim_names = ["shot_point", "cable", "channel"] # Custom coordinates for shot gathers - self._dim_names = [*self._coord_dim_names, self._trace_domain] - self._coord_names = ["gun", "source_coord_x", "source_coord_y", "group_coord_x", "group_coord_y"] - self._var_chunk_shape = [1, 1, 512, 4096] + self._coord_dim_names = ("shot_point", "cable", "channel") # Custom coordinates for shot gathers + self._dim_names = (*self._coord_dim_names, self._trace_domain) + self._coord_names = ("gun", "source_coord_x", 
"source_coord_y", "group_coord_x", "group_coord_y") + self._var_chunk_shape = (1, 1, 512, 4096) @property def _name(self) -> str: @@ -35,7 +35,7 @@ def _add_coordinates(self) -> None: for name in self._dim_names: self._builder.add_coordinate( name, - dimensions=[name], + dimensions=(name,), data_type=ScalarType.INT32, metadata_info=None, ) @@ -43,31 +43,31 @@ def _add_coordinates(self) -> None: # Add non-dimension coordinates self._builder.add_coordinate( "gun", - dimensions=["shot_point", "cable", "channel"], + dimensions=("shot_point", "cable", "channel"), data_type=ScalarType.UINT8, metadata_info=[AllUnits(units_v1=None)], ) self._builder.add_coordinate( "source_coord_x", - dimensions=["shot_point", "cable", "channel"], + dimensions=("shot_point", "cable", "channel"), data_type=ScalarType.FLOAT64, metadata_info=[self._horizontal_coord_unit], ) self._builder.add_coordinate( "source_coord_y", - dimensions=["shot_point", "cable", "channel"], + dimensions=("shot_point", "cable", "channel"), data_type=ScalarType.FLOAT64, metadata_info=[self._horizontal_coord_unit], ) self._builder.add_coordinate( "group_coord_x", - dimensions=["shot_point", "cable", "channel"], + dimensions=("shot_point", "cable", "channel"), data_type=ScalarType.FLOAT64, metadata_info=[self._horizontal_coord_unit], ) self._builder.add_coordinate( "group_coord_y", - dimensions=["shot_point", "cable", "channel"], + dimensions=("shot_point", "cable", "channel"), data_type=ScalarType.FLOAT64, metadata_info=[self._horizontal_coord_unit], ) diff --git a/src/mdio/segy/_workers.py b/src/mdio/segy/_workers.py index 1f4a91ca6..de3257e7e 100644 --- a/src/mdio/segy/_workers.py +++ b/src/mdio/segy/_workers.py @@ -10,17 +10,17 @@ import numpy as np from segy import SegyFile +from mdio.api.io import to_mdio from mdio.schemas import ScalarType if TYPE_CHECKING: from segy.arrays import HeaderArray from segy.config import SegySettings from segy.schema import SegySpec + from upath import UPath from xarray import Dataset as xr_Dataset from zarr import Array as zarr_Array - from mdio.core.storage_location import StorageLocation - from xarray import Variable from zarr.core.config import config as zarr_config @@ -66,7 +66,8 @@ def header_scan_worker( trace_header = segy_file.header[slice_] if subset is not None: - trace_header = trace_header[subset] + # struct field selection needs a list, not a tuple; a subset is a tuple from the template. + trace_header = trace_header[list(subset)] # Get non-void fields from dtype and copy to new array for memory efficiency fields = trace_header.dtype.fields @@ -82,7 +83,7 @@ def header_scan_worker( def trace_worker( # noqa: PLR0913 segy_kw: SegyFileArguments, - output_location: StorageLocation, + output_path: UPath, data_variable_name: str, region: dict[str, slice], grid_map: zarr_Array, @@ -92,7 +93,7 @@ def trace_worker( # noqa: PLR0913 Args: segy_kw: Arguments to open SegyFile instance. - output_location: StorageLocation for the output Zarr dataset + output_path: Universal Path for the output Zarr dataset (e.g. local file path or cloud storage URI) the location also includes storage options for cloud storage. data_variable_name: Name of the data variable to write. @@ -103,22 +104,21 @@ def trace_worker( # noqa: PLR0913 Returns: SummaryStatistics object containing statistics about the written traces. 
""" - if not dataset.trace_mask.any(): - return None + region_slices = tuple(region.values()) + local_grid_map = grid_map[region_slices[:-1]] # minus last (vertical) axis - # Setting the zarr config to 1 thread to ensure we honor the `MDIO__IMPORT__MAX_WORKERS` - # environment variable. - # Since the release of the Zarr 3 engine, it will default to many threads. - # This can cause resource contention and unpredicted memory consumption. - zarr_config.set({"threading.max_workers": 1}) + not_null = local_grid_map != UINT32_MAX + if not not_null.any(): + return None - # Open the SEG-Y file in every new process / spawned worker since the - # open file handles cannot be shared across processes. + # Open the SEG-Y file in this process since the open file handles cannot be shared across processes. segy_file = SegyFile(**segy_kw) - not_null = grid_map != UINT32_MAX + # Setting the zarr config to 1 thread to ensure we honor the `MDIO__IMPORT__MAX_WORKERS` environment variable. + # The Zarr 3 engine utilizes multiple threads. This can lead to resource contention and unpredictable memory usage. + zarr_config.set({"threading.max_workers": 1}) - live_trace_indexes = grid_map[not_null].tolist() + live_trace_indexes = local_grid_map[not_null].tolist() traces = segy_file.trace[live_trace_indexes] header_key = "headers" @@ -160,7 +160,7 @@ def trace_worker( # noqa: PLR0913 encoding=ds_to_write[data_variable_name].encoding, # Not strictly necessary, but safer than not doing it. ) - ds_to_write.to_zarr(output_location.uri, region=region, mode="r+", write_empty_chunks=False, zarr_format=2) + to_mdio(ds_to_write, output_path=output_path, region=region, mode="r+") histogram = CenteredBinHistogram(bin_centers=[], counts=[]) return SummaryStatistics( diff --git a/src/mdio/segy/blocked_io.py b/src/mdio/segy/blocked_io.py index 52a23b22e..4a3a8d3ad 100644 --- a/src/mdio/segy/blocked_io.py +++ b/src/mdio/segy/blocked_io.py @@ -14,9 +14,9 @@ from dask.array import map_blocks from psutil import cpu_count from tqdm.auto import tqdm -from zarr import consolidate_metadata as zarr_consolidate_metadata from zarr import open_group as zarr_open_group +from mdio.api.io import _normalize_storage_options from mdio.core.indexing import ChunkIterator from mdio.schemas.v1.stats import CenteredBinHistogram from mdio.schemas.v1.stats import SummaryStatistics @@ -30,11 +30,10 @@ from numpy.typing import NDArray from segy import SegyFactory from segy import SegyFile + from upath import UPath from xarray import Dataset as xr_Dataset from zarr import Array as zarr_Array - from mdio.core.storage_location import StorageLocation - default_cpus = cpu_count(logical=True) @@ -53,18 +52,16 @@ def _update_stats(final_stats: SummaryStatistics, partial_stats: SummaryStatisti def to_zarr( # noqa: PLR0913, PLR0915 segy_file: SegyFile, - output_location: StorageLocation, + output_path: UPath, grid_map: zarr_Array, dataset: xr_Dataset, data_variable_name: str, -) -> None: +) -> SummaryStatistics: """Blocked I/O from SEG-Y to chunked `xarray.Dataset`. Args: segy_file: SEG-Y file instance. - output_location: StorageLocation for the output Zarr dataset - (e.g. local file path or cloud storage URI) the location - also includes storage options for cloud storage. + output_path: Output universal path for the output MDIO dataset. grid_map: Zarr array with grid map for the traces. dataset: Handle for xarray.Dataset we are writing trace data data_variable_name: Name of the data variable in the dataset. 
@@ -76,22 +73,11 @@ def to_zarr( # noqa: PLR0913, PLR0915 final_stats = _create_stats() - # Must use data.encoding.get instead of data.chunks - chunks_t_of_t = (data.encoding.get("chunks"),) - # Unroll tuple of tuples into a flat list - chunks = [c for sub_tuple in chunks_t_of_t for c in sub_tuple] - # We will not chunk traces (old option chunk_samples=False) - chunks[-1] = data.shape[-1] - dim_names = list(data.dims) - # Initialize chunk iterator - # Since the dimensions are provided, it will return a dict of slices - chunk_iter = ChunkIterator(shape=data.shape, chunks=chunks, dim_names=dim_names) + data_variable_chunks = data.encoding.get("chunks") + worker_chunks = data_variable_chunks[:-1] + (data.shape[-1],) # un-chunk sample axis + chunk_iter = ChunkIterator(shape=data.shape, chunks=worker_chunks, dim_names=data.dims) num_chunks = chunk_iter.num_chunks - # The following could be extracted in a function to allow executor injection - # (e.g. for unit testing or for debugging with non-parallelized processing) - # def _create_executor(num_chunks: int)-> ProcessPoolExecutor: - # For Unix async writes with s3fs/fsspec & multiprocessing, use 'spawn' instead of default # 'fork' to avoid deadlocks on cloud stores. Slower but necessary. Default on Windows. num_cpus = int(os.getenv("MDIO__IMPORT__CPU_COUNT", default_cpus)) @@ -106,14 +92,9 @@ def to_zarr( # noqa: PLR0913, PLR0915 } with executor: futures = [] - common_args = (segy_kw, output_location, data_variable_name) + common_args = (segy_kw, output_path, data_variable_name) for region in chunk_iter: - index_slices = tuple(region[key] for key in data.dims[:-1]) - subset_args = ( - region, - grid_map[index_slices], - dataset.isel(region), - ) + subset_args = (region, grid_map, dataset.isel(region)) future = executor.submit(trace_worker, *common_args, *subset_args) futures.append(future) @@ -129,17 +110,13 @@ def to_zarr( # noqa: PLR0913, PLR0915 if result is not None: _update_stats(final_stats, result) - # Xarray doesn't directly support incremental attribute updates when appending to an - # existing Zarr store. + # Xarray doesn't directly support incremental attribute updates when appending to an existing Zarr store. # HACK: We will update the array attribute using zarr's API directly. 
- # Open the Zarr store using zarr directly - zarr_group = zarr_open_group(output_location.uri, mode="a") + # Use the data_variable_name to get the array in the Zarr group and write "statistics" metadata there + storage_options = _normalize_storage_options(output_path) + zarr_group = zarr_open_group(output_path.path, mode="a", storage_options=storage_options) attr_json = final_stats.model_dump_json() - # Use the data_variable_name to get the array in the Zarr group - # and write "statistics" metadata there zarr_group[data_variable_name].attrs.update({"statsV1": attr_json}) - # Consolidate metadata (important for Xarray to recognize changes) - zarr_consolidate_metadata(output_location.uri) return final_stats diff --git a/src/mdio/segy/creation.py b/src/mdio/segy/creation.py index 774ebe060..de916518b 100644 --- a/src/mdio/segy/creation.py +++ b/src/mdio/segy/creation.py @@ -12,15 +12,14 @@ from segy.factory import SegyFactory from tqdm.auto import tqdm -from mdio.api.opener import open_dataset +from mdio.api.io import open_mdio from mdio.segy.compat import revision_encode if TYPE_CHECKING: import xarray as xr from numpy.typing import NDArray from segy.schema import SegySpec - - from mdio.core.storage_location import StorageLocation + from upath import UPath logger = logging.getLogger(__name__) @@ -40,8 +39,8 @@ def make_segy_factory(dataset: xr.Dataset, spec: SegySpec) -> SegyFactory: def mdio_spec_to_segy( segy_spec: SegySpec, - input_location: StorageLocation, - output_location: StorageLocation, + input_path: UPath, + output_path: UPath, new_chunks: tuple[int, ...] | None = None, ) -> tuple[xr.Dataset, SegyFactory]: """Create SEG-Y file without any traces given MDIO specification. @@ -56,14 +55,14 @@ def mdio_spec_to_segy( Args: segy_spec: The SEG-Y specification to use for the conversion. - input_location: Store or URL (and cloud options) for MDIO file. - output_location: Path to the output SEG-Y file. + input_path: Store or URL (and cloud options) for MDIO file. + output_path: Path to the output SEG-Y file. new_chunks: Set in memory chunksize for export or other reasons. Returns: Opened Xarray Dataset for MDIO file and SegyFactory """ - dataset = open_dataset(input_location, chunks=new_chunks) + dataset = open_mdio(input_path, chunks=new_chunks) factory = make_segy_factory(dataset, spec=segy_spec) attr = dataset.attrs["attributes"] @@ -77,7 +76,7 @@ def mdio_spec_to_segy( binary_header = revision_encode(bin_header, mdio_file_version) bin_hdr_bytes = factory.create_binary_header(binary_header) - with Path(output_location.uri).open(mode="wb") as fp: + with output_path.open(mode="wb") as fp: fp.write(text_bytes) fp.write(bin_hdr_bytes) diff --git a/tests/integration/test_numpy_ingest.py b/tests/integration/test_numpy_ingest.py deleted file mode 100644 index 4eaef7467..000000000 --- a/tests/integration/test_numpy_ingest.py +++ /dev/null @@ -1,106 +0,0 @@ -"""Module for testing NumPy to MDIO conversion functionality. - -This module contains tests for the `numpy_to_mdio` function, ensuring proper conversion -of NumPy arrays to MDIO format, including validation of grid dimensions, chunk sizes, -and coordinate handling. 
-""" - -from __future__ import annotations - -from typing import TYPE_CHECKING - -import numpy as np -import numpy.testing as npt -import pytest - -from mdio.api.accessor import MDIOReader -from mdio.converters.numpy import numpy_to_mdio -from mdio.core.dimension import Dimension -from mdio.core.grid import Grid - -if TYPE_CHECKING: - from numpy.typing import NDArray - - -TEST_DIMS = [ - Dimension(name="inline", coords=np.arange(101, 131, 2)), - Dimension(name="crossline", coords=np.arange(10, 20, 1)), - Dimension(name="sample", coords=np.arange(0, 100, 5)), -] - - -@pytest.fixture -def mock_grid() -> Grid: - """Make a mock grid using test dimensions.""" - return Grid(dims=TEST_DIMS) - - -@pytest.fixture -def mock_array(mock_grid: Grid) -> NDArray: - """Make a mock array using mock grid.""" - rng = np.random.default_rng() - return rng.uniform(size=mock_grid.shape).astype("float32") - - -CHUNK_SIZE = (8, 8, 8) - - -def test_npy_to_mdio(mock_array: NDArray, mock_grid: Grid) -> None: - """Test basic NumPy to MDIO conversion without custom coordinates.""" - numpy_to_mdio(mock_array, "memory://npy.mdio", CHUNK_SIZE) - reader = MDIOReader("memory://npy.mdio") - - npt.assert_array_equal(reader._traces, mock_array) - assert reader.grid.dim_names == ("dim_0", "dim_1", "sample") - assert reader.chunks == CHUNK_SIZE - assert reader.shape == mock_grid.shape - assert reader.grid.dims != mock_grid.dims - - -def test_npy_to_mdio_coords(mock_array: NDArray, mock_grid: Grid) -> None: - """Test NumPy to MDIO conversion with custom coordinates.""" - index_names = mock_grid.dim_names - index_coords = {dim.name: dim.coords for dim in mock_grid.dims} - numpy_to_mdio(mock_array, "memory://npy_coord.mdio", CHUNK_SIZE, index_names, index_coords) - reader = MDIOReader("memory://npy_coord.mdio") - - npt.assert_array_equal(reader._traces, mock_array) - assert reader.chunks == CHUNK_SIZE - assert reader.shape == mock_grid.shape - assert reader.grid.dims == mock_grid.dims - - -def test_npy_to_mdio_chunksize_mismatch(mock_array: NDArray) -> None: - """Test error handling for mismatched chunk size dimensions.""" - with pytest.raises(ValueError, match="equal to array dimensions"): - numpy_to_mdio(mock_array, "", (5, 10, 15, 20, 25)) - - -def test_npy_to_mdio_coord_missing(mock_array: NDArray, mock_grid: Grid) -> None: - """Test error handling for missing coordinate names.""" - index_names = ["mismatch", "dimension", "names"] - index_coords = {dim.name: dim.coords for dim in mock_grid.dims} - - with pytest.raises(ValueError, match="not found in index_coords"): - numpy_to_mdio( - mock_array, - "", - CHUNK_SIZE, - index_names, - index_coords, - ) - - -def test_npy_to_mdio_coord_size_error(mock_array: NDArray, mock_grid: Grid) -> None: - """Test error handling for coordinate size mismatch.""" - index_names = mock_grid.dim_names - index_coords = {dim.name: np.arange(5) for dim in mock_grid.dims} - - with pytest.raises(ValueError, match="does not match array dimension"): - numpy_to_mdio( - mock_array, - "", - CHUNK_SIZE, - index_names, - index_coords, - ) diff --git a/tests/integration/test_segy_import_export.py b/tests/integration/test_segy_import_export.py index 51f693d00..cc3750325 100644 --- a/tests/integration/test_segy_import_export.py +++ b/tests/integration/test_segy_import_export.py @@ -21,10 +21,9 @@ from tests.integration.testing_helpers import validate_variable from mdio import mdio_to_segy -from mdio.api.opener import open_dataset +from mdio.api.io import open_mdio from mdio.converters.exceptions import 
GridTraceSparsityError from mdio.converters.segy import segy_to_mdio -from mdio.core.storage_location import StorageLocation from mdio.schemas.v1.templates.template_registry import TemplateRegistry from mdio.segy.geometry import StreamerShotGeometryType @@ -53,13 +52,11 @@ def test_import_4d_segy( # noqa: PLR0913 segy_spec: SegySpec = get_segy_mock_4d_spec() segy_path = segy_mock_4d_shots[chan_header_type] - input_location = StorageLocation(str(segy_path)) - output_location = StorageLocation(str(zarr_tmp)) segy_to_mdio( segy_spec=segy_spec, mdio_template=TemplateRegistry().get("PreStackShotGathers3DTime"), - input_location=input_location, - output_location=output_location, + input_path=segy_path, + output_path=zarr_tmp, overwrite=True, grid_overrides=grid_override, ) @@ -70,7 +67,7 @@ def test_import_4d_segy( # noqa: PLR0913 cables = [0, 101, 201, 301] receivers_per_cable = [1, 5, 7, 5] - ds = open_dataset(output_location) + ds = open_mdio(zarr_tmp) assert ds.attrs["attributes"]["binaryHeader"]["samples_per_trace"] == num_samples assert ds.attrs["attributes"]["gridOverrides"] == grid_override @@ -102,12 +99,11 @@ def test_import_4d_segy( # noqa: PLR0913 segy_spec: SegySpec = get_segy_mock_4d_spec() segy_path = segy_mock_4d_shots[chan_header_type] - output_location = StorageLocation(str(zarr_tmp)) segy_to_mdio( segy_spec=segy_spec, mdio_template=TemplateRegistry().get("PreStackShotGathers3DTime"), - input_location=StorageLocation(str(segy_path)), - output_location=output_location, + input_path=segy_path, + output_path=zarr_tmp, overwrite=True, grid_overrides=grid_override, ) @@ -118,7 +114,7 @@ def test_import_4d_segy( # noqa: PLR0913 cables = [0, 101, 201, 301] receivers_per_cable = [1, 5, 7, 5] - ds = open_dataset(output_location) + ds = open_mdio(zarr_tmp) assert ds.attrs["attributes"]["binaryHeader"]["samples_per_trace"] == num_samples assert ds.attrs["attributes"].get("gridOverrides", None) == grid_override # may not exist, so default=None @@ -156,8 +152,8 @@ def test_import_4d_segy( # noqa: PLR0913 segy_to_mdio( segy_spec=segy_spec, mdio_template=TemplateRegistry().get("PreStackShotGathers3DTime"), - input_location=StorageLocation(str(segy_path)), - output_location=StorageLocation(str(zarr_tmp)), + input_path=segy_path, + output_path=zarr_tmp, overwrite=True, ) @@ -182,12 +178,11 @@ def test_import_6d_segy( # noqa: PLR0913 segy_spec: SegySpec = get_segy_mock_4d_spec() segy_path = segy_mock_4d_shots[chan_header_type] - output_location = StorageLocation(str(zarr_tmp)) segy_to_mdio( segy_spec=segy_spec, mdio_template=TemplateRegistry().get("XYZ"), # Placeholder for the template - input_location=StorageLocation(str(segy_path)), - output_location=output_location, + input_path=segy_path, + output_path=zarr_tmp, overwrite=True, grid_overrides=grid_override, ) @@ -203,7 +198,7 @@ def test_import_6d_segy( # noqa: PLR0913 guns = [1, 2] receivers_per_cable = [1, 5, 7, 5] - ds = open_dataset(output_location) + ds = open_mdio(zarr_tmp) xrt.assert_duckarray_equal(ds["gun"], guns) xrt.assert_duckarray_equal(ds["shot_point"], shots) @@ -228,8 +223,8 @@ def test_3d_import(segy_input: Path, zarr_tmp: Path) -> None: segy_to_mdio( segy_spec=custom_teapot_dome_segy_spec(keep_unaltered=True), mdio_template=TemplateRegistry().get("PostStack3DTime"), - input_location=StorageLocation(str(segy_input)), - output_location=StorageLocation(str(zarr_tmp)), + input_path=segy_input, + output_path=zarr_tmp, overwrite=True, ) @@ -243,7 +238,7 @@ class TestReader: def test_dataset_metadata(self, zarr_tmp: Path) -> None: 
"""Metadata reading tests.""" - ds = open_dataset(StorageLocation(str(zarr_tmp))) + ds = open_mdio(zarr_tmp) expected_attrs = { "apiVersion": "1.0.0a1", "createdOn": "2025-08-06 16:21:54.747880+00:00", @@ -271,7 +266,7 @@ def test_dataset_metadata(self, zarr_tmp: Path) -> None: def test_variable_metadata(self, zarr_tmp: Path) -> None: """Metadata reading tests.""" - ds = open_dataset(StorageLocation(str(zarr_tmp))) + ds = open_mdio(zarr_tmp) expected_attrs = { "count": 97354860, "sum": -8594.551666259766, @@ -285,7 +280,7 @@ def test_variable_metadata(self, zarr_tmp: Path) -> None: def test_grid(self, zarr_tmp: Path) -> None: """Test validating MDIO variables.""" - ds = open_dataset(StorageLocation(str(zarr_tmp))) + ds = open_mdio(zarr_tmp) # Validate the dimension coordinate variables validate_variable(ds, "inline", (345,), ("inline",), np.int32, range(1, 346), get_values) @@ -327,14 +322,14 @@ def test_grid(self, zarr_tmp: Path) -> None: def test_inline_reads(self, zarr_tmp: Path) -> None: """Read and compare every 75 inlines' mean and std. dev.""" - ds = open_dataset(StorageLocation(str(zarr_tmp))) + ds = open_mdio(zarr_tmp) inlines = ds["amplitude"][::75, :, :] mean, std = inlines.mean(), inlines.std() npt.assert_allclose([mean, std], [1.0555277e-04, 6.0027051e-01]) def test_crossline_reads(self, zarr_tmp: Path) -> None: """Read and compare every 75 crosslines' mean and std. dev.""" - ds = open_dataset(StorageLocation(str(zarr_tmp))) + ds = open_mdio(zarr_tmp) xlines = ds["amplitude"][:, ::75, :] mean, std = xlines.mean(), xlines.std() @@ -342,7 +337,7 @@ def test_crossline_reads(self, zarr_tmp: Path) -> None: def test_zslice_reads(self, zarr_tmp: Path) -> None: """Read and compare every 225 z-slices' mean and std. dev.""" - ds = open_dataset(StorageLocation(str(zarr_tmp))) + ds = open_mdio(zarr_tmp) slices = ds["amplitude"][:, :, ::225] mean, std = slices.mean(), slices.std() npt.assert_allclose([mean, std], [0.005236923, 0.61279935]) @@ -359,11 +354,7 @@ def test_3d_export(self, segy_input: Path, zarr_tmp: Path, segy_export_tmp: Path """Test 3D export to IBM and IEEE.""" spec = custom_teapot_dome_segy_spec(keep_unaltered=True) - mdio_to_segy( - segy_spec=spec, - input_location=StorageLocation(str(zarr_tmp)), - output_location=StorageLocation(str(segy_export_tmp)), - ) + mdio_to_segy(segy_spec=spec, input_path=zarr_tmp, output_path=segy_export_tmp) # Check if file sizes match on IBM file. 
assert segy_input.stat().st_size == segy_export_tmp.stat().st_size diff --git a/tests/integration/test_segy_import_export_masked.py b/tests/integration/test_segy_import_export_masked.py index 4c0cd2c97..9290dae80 100644 --- a/tests/integration/test_segy_import_export_masked.py +++ b/tests/integration/test_segy_import_export_masked.py @@ -23,9 +23,8 @@ from tests.conftest import DEBUG_MODE from mdio import mdio_to_segy -from mdio.api.opener import open_dataset +from mdio.api.io import open_mdio from mdio.converters.segy import segy_to_mdio -from mdio.core.storage_location import StorageLocation from mdio.schemas.v1.templates.template_registry import TemplateRegistry if TYPE_CHECKING: @@ -300,8 +299,8 @@ def test_import(self, test_conf: MaskedExportConfig, export_masked_path: Path) - segy_to_mdio( segy_spec=segy_spec, mdio_template=TemplateRegistry().get(template_name), - input_location=StorageLocation(str(segy_path)), - output_location=StorageLocation(str(mdio_path)), + input_path=segy_path, + output_path=mdio_path, overwrite=True, ) @@ -315,7 +314,7 @@ def test_ingested_mdio(self, test_conf: MaskedExportConfig, export_masked_path: mdio_path = export_masked_path / f"{grid_conf.name}.mdio" # Open the MDIO file - ds = open_dataset(StorageLocation(str(mdio_path))) + ds = open_mdio(mdio_path) # Test dimensions and ingested dimension headers expected_dims = grid_conf.dims @@ -376,8 +375,8 @@ def test_export(self, test_conf: MaskedExportConfig, export_masked_path: Path) - mdio_to_segy( segy_spec=_segy_spec_mock_nd_segy(grid_conf, segy_factory_conf), - input_location=StorageLocation(str(mdio_path)), - output_location=StorageLocation(str(segy_rt_path)) + input_path=mdio_path, + output_path=segy_rt_path ) expected_sgy = SegyFile(segy_path) @@ -414,8 +413,8 @@ def test_export_masked(self, test_conf: MaskedExportConfig, export_masked_path: mdio_to_segy( segy_spec=_segy_spec_mock_nd_segy(grid_conf, segy_factory_conf), - input_location=StorageLocation(str(mdio_path)), - output_location=StorageLocation(str(segy_rt_path)), + input_path=mdio_path, + output_path=segy_rt_path, selection_mask=selection_mask ) diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index c87aa33e6..0ac9fa04c 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -8,18 +8,10 @@ import numpy as np import pytest -from mdio import MDIOReader -from mdio import MDIOWriter from mdio.core import Dimension from mdio.core import Grid -from mdio.core.factory import MDIOCreateConfig -from mdio.core.factory import MDIOVariableConfig -from mdio.core.factory import create_empty -from mdio.core.utils_write import write_attribute if TYPE_CHECKING: - from pathlib import Path - from numpy.typing import NDArray API_VERSION = metadata.version("multidimio") @@ -65,62 +57,3 @@ def mock_data(mock_grid: Grid, mock_ilxl_values: tuple[NDArray, ...]) -> NDArray sample_axis = mock_grid.select_dim("sample").coords data = il_grid / xl_grid return data[..., None] + sample_axis[None, None, :] - - -@pytest.fixture -def mock_mdio( - mock_mdio_dir: Path, - mock_grid: Grid, - mock_ilxl_values: tuple[NDArray, NDArray], - mock_data: NDArray, - mock_bin: dict[str, int], -) -> Path: - """This mocks most of mdio.converters.segy in memory.""" - il_grid, xl_grid = mock_ilxl_values - mock_header_dtype = np.dtype([("inline", "i4"), ("crossline", "i4")]) - mock_grid.live_mask = np.ones(mock_grid.shape[:-1], dtype=bool) - - var = MDIOVariableConfig( - name="chunked_012", - dtype="float64", - chunks=mock_grid.shape, - header_dtype=mock_header_dtype, - ) - - 
conf = MDIOCreateConfig(path=mock_mdio_dir, grid=mock_grid, variables=[var]) - zarr_root = create_empty(conf, overwrite=True) - trace_count = np.count_nonzero(mock_grid.live_mask) - write_attribute(name="trace_count", zarr_group=zarr_root, attribute=trace_count) - - writer = MDIOWriter(mock_mdio_dir) - writer.binary_header = mock_bin - - writer._headers["inline"] = il_grid - writer._headers["crossline"] = xl_grid - writer[:] = mock_data - - stats = { - "mean": mock_data.mean(), - "std": mock_data.std(), - "rms": np.sqrt((mock_data**2).sum() / mock_data.size), - "min": mock_data.min(), - "max": mock_data.max(), - } - writer.stats = stats - return mock_mdio_dir - - -@pytest.fixture -def mock_reader(mock_mdio: Path) -> MDIOReader: - """Reader that points to the mocked data to be used later.""" - return MDIOReader(str(mock_mdio)) - - -@pytest.fixture -def mock_reader_cached(mock_mdio: Path) -> MDIOReader: - """Reader that points to the mocked data to be used later. (with local caching).""" - return MDIOReader( - str(mock_mdio), - disk_cache=True, - storage_options={"simplecache": {"cache_storage": "./mdio_test_cache"}}, - ) diff --git a/tests/unit/test_accessor.py b/tests/unit/test_accessor.py deleted file mode 100644 index 3dbcfe40b..000000000 --- a/tests/unit/test_accessor.py +++ /dev/null @@ -1,143 +0,0 @@ -"""Test for MDIO accessors.""" - -from __future__ import annotations - -import shutil -from pathlib import Path -from typing import TYPE_CHECKING - -import numpy as np -import numpy.testing as npt -import pytest - -from mdio import MDIOReader -from mdio.core.exceptions import MDIOAlreadyExistsError -from mdio.core.exceptions import MDIONotFoundError -from mdio.core.factory import DEFAULT_TEXT -from mdio.exceptions import ShapeError -from mdio.segy.helpers_segy import create_zarr_hierarchy - -if TYPE_CHECKING: - from numpy.typing import NDArray - - -class TestReader: - """Tests for reader units.""" - - def test_basic_attrs(self, mock_reader: MDIOReader, mock_data: NDArray) -> None: - """Compare ingested basic attrs to original.""" - assert mock_reader.n_dim == mock_data.ndim - assert mock_reader.trace_count == np.prod(mock_data.shape[:-1]) - - def test_basic_stats(self, mock_reader: MDIOReader, mock_data: NDArray) -> None: - """Ensure access to stats work properly.""" - assert mock_reader.stats["mean"] == mock_data.mean() - assert mock_reader.stats["std"] == mock_data.std() - assert mock_reader.stats["min"] == mock_data.min() - assert mock_reader.stats["max"] == mock_data.max() - - def test_text_hdr(self, mock_reader: MDIOReader) -> None: - """Compare ingested text header to original.""" - assert mock_reader.text_header == DEFAULT_TEXT - - def test_bin_hdr(self, mock_reader: MDIOReader, mock_bin: dict[str, int]) -> None: - """Compare ingested binary header to original.""" - assert mock_reader.binary_header == mock_bin - - def test_shape(self, mock_reader: MDIOReader, mock_data: NDArray) -> None: - """Compare ingested shape to expected.""" - assert mock_reader.shape == mock_data.shape - assert mock_reader.chunks == mock_data.shape - - def test_live_mask(self, mock_reader: MDIOReader) -> None: - """Check if live mask is full as expected.""" - assert np.all(mock_reader.live_mask[:]) - - @pytest.mark.parametrize( - ("il_coord", "il_index", "xl_coord", "xl_index", "z_coord", "z_index"), - [ - (101, 0, 10, 0, 0, 0), - (115, 7, 15, 5, 50, 10), - (129, 14, 19, 9, 95, 19), - ([101, 115, 129], [0, 7, 14], 11, 1, 10, 2), - ([101, 129], [0, 14], 11, 1, [10, 95], [2, 19]), - ([101], [0], [11], [1], 
[95], [19]), - ], - ) - def test_coord_slicing( # noqa: PLR0913 - self, - il_coord: int | list[int], - il_index: int | list[int], - xl_coord: int | list[int], - xl_index: int | list[int], - z_coord: int | list[int], - z_index: int | list[int], - mock_reader: MDIOReader, - mock_data: NDArray, - ) -> None: - """Test IL/XL number to Index slicing.""" - il_indices = mock_reader.coord_to_index(il_coord, dimensions="inline") - xl_indices = mock_reader.coord_to_index(xl_coord, dimensions="crossline") - z_indices = mock_reader.coord_to_index(z_coord, dimensions="sample") - - # 2-D should work too - _ = mock_reader.coord_to_index( - il_coord, - xl_coord, - dimensions=["inline", "crossline"], - ) - - # All dims should also work without specifying - _ = mock_reader.coord_to_index(il_coord, xl_coord, z_coord) - - il_indices = np.atleast_1d(il_indices) - il_index = np.atleast_1d(il_index) - xl_indices = np.atleast_1d(xl_indices) - xl_index = np.atleast_1d(xl_index) - z_indices = np.atleast_1d(z_indices) - z_index = np.atleast_1d(z_index) - - npt.assert_array_equal(il_indices, il_index) - npt.assert_array_equal(xl_indices, xl_index) - npt.assert_array_equal(z_indices, z_index) - - for act_idx, exp_idx in zip(il_indices, il_index, strict=True): - npt.assert_almost_equal(mock_reader[act_idx], mock_data[exp_idx]) - - for act_idx, exp_idx in zip(xl_indices, xl_index, strict=True): - npt.assert_almost_equal(mock_reader[:, act_idx], mock_data[:, exp_idx]) - - for act_idx, exp_idx in zip(z_indices, z_index, strict=True): - npt.assert_almost_equal(mock_reader[..., act_idx], mock_data[..., exp_idx]) - - @pytest.mark.usefixtures("mock_reader_cached") - def test_local_caching(self) -> None: - """Test local caching.""" - cache_path = Path("./mdio_test_cache") - assert cache_path.is_dir() - shutil.rmtree(cache_path) - - -class TestExceptions: - """Test custom exceptions and if they're raised properly.""" - - def test_mdio_not_found(self) -> None: - """MDIO doesn't exist or corrupt.""" - with pytest.raises(MDIONotFoundError): - MDIOReader("prefix/file_that_doesnt_exist.mdio") - - def test_wrong_size_index(self, mock_reader: MDIOReader) -> None: - """If user asks for N dimensions but didn't specify all.""" - with pytest.raises(ShapeError, match="queries not the same size as n_dimensions"): - mock_reader.coord_to_index(0, 0, dimensions="inline") - - def test_wrong_index(self, mock_reader: MDIOReader) -> None: - """If user asks for an index that doesn't exist.""" - with pytest.raises(ValueError, match="Invalid dimension name"): - mock_reader.coord_to_index(0, dimensions="non_existent") - - def test_mdio_exists(self, mock_reader: MDIOReader) -> None: - """MDIO doesn't exist or corrupt.""" - mock_root = mock_reader.root - with pytest.raises(MDIOAlreadyExistsError, match="MDIO file with data already exists"): - create_zarr_hierarchy(mock_root, overwrite=False) diff --git a/tests/unit/test_convenience.py b/tests/unit/test_convenience.py deleted file mode 100644 index fb01b475c..000000000 --- a/tests/unit/test_convenience.py +++ /dev/null @@ -1,87 +0,0 @@ -"""Test convenience functions in user facing API.""" - -import numpy as np -import pytest - -from mdio import MDIOReader -from mdio import MDIOWriter -from mdio.api.convenience import copy_mdio -from mdio.api.convenience import rechunk_batch - - -def test_copy_without_data(mock_reader: MDIOReader) -> None: - """Test MDIO copy with data excluding the data copy operation.""" - # Define destination path for the new dataset - dest_path = mock_reader.url + "_copy" - - 
copy_mdio( - source_path=mock_reader.url, - target_path=dest_path, - overwrite=True, - ) - - actual_reader = MDIOReader(dest_path) - assert actual_reader.grid.dims == mock_reader.grid.dims - - # Expected mismatches - with pytest.raises(AssertionError, match="Not equal to tolerance"): - np.testing.assert_allclose(actual_reader._traces, mock_reader._traces) - with pytest.raises(AssertionError, match="Arrays are not equal"): - np.testing.assert_array_equal(actual_reader._headers, mock_reader._headers) - with pytest.raises(AssertionError, match="Arrays are not equal"): - np.testing.assert_array_equal(actual_reader.live_mask, mock_reader.live_mask) - - -@pytest.mark.dependency -def test_copy_with_data(mock_reader: MDIOReader) -> None: - """Test MDIO copy with data included in the copy operation.""" - dest_path = mock_reader.url + "_copy2" - - copy_mdio( - source_path=mock_reader.url, - target_path=dest_path, - overwrite=True, - copy_headers=True, - copy_traces=True, - ) - - actual_reader = MDIOReader(dest_path) - assert actual_reader.grid.dims == mock_reader.grid.dims - - np.testing.assert_allclose(actual_reader._traces, mock_reader._traces) - np.testing.assert_array_equal(actual_reader._headers, mock_reader._headers) - np.testing.assert_array_equal(actual_reader.live_mask, mock_reader.live_mask) - - -@pytest.mark.dependency(depends=["test_copy_with_data"]) -def test_rechunk(mock_reader: MDIOReader) -> None: - """Test rechunking functionality.""" - dest_path = mock_reader.url + "_copy2" - - writer = MDIOWriter(dest_path) - - # Capture the original data and chunk sizes - expected_traces = writer._traces[:] - expected_headers = writer._headers[:] - original_chunks = writer.chunks - - expected_chunks = (8, 8, 8) - - # Perform rechunking with a new suffix. - rechunk_batch(writer, [expected_chunks], ["new_ap"], overwrite=True) - - # After rechunk, we need to reinitialize the reader to access the new chunks - reader_new_ap = MDIOReader(dest_path, access_pattern="new_ap") - - # Get the rechunked data using the accessor's methods - actual_traces = reader_new_ap._traces[:] - actual_headers = reader_new_ap._headers[:] - actual_chunks = reader_new_ap.chunks # New chunk sizes - - # Validate that the underlying data has not changed. 
- np.testing.assert_array_equal(actual_traces, expected_traces) - np.testing.assert_array_equal(actual_headers, expected_headers) - - # Validate that the new chunk sizes match what we specified - assert actual_chunks == expected_chunks - assert actual_chunks != original_chunks diff --git a/tests/unit/test_dimension.py b/tests/unit/test_dimension.py index e5645290e..2f3157041 100644 --- a/tests/unit/test_dimension.py +++ b/tests/unit/test_dimension.py @@ -38,22 +38,6 @@ def test_hash_equality(self, my_dimension: Dimension) -> None: assert my_dimension == other_dim1 assert my_dimension != other_dim2 - @pytest.mark.parametrize( - ("stream_format", "stream"), - [ - ("json", '{"name": "dim_0", "length": 4, "coords": [10, 12, 14, 16]}'), - ("yaml", "name: dim_0\nlength: 4\ncoords:\n- 10\n- 12\n- 14\n- 16\n"), - ], - ) - def test_serialization(self, my_dimension: Dimension, stream_format: str, stream: str) -> None: - """Test serialization and deserialization.""" - observed_stream = my_dimension.serialize(stream_format) - assert observed_stream == stream - - # Deserialize - observed_dimension = Dimension.deserialize(stream, stream_format) - assert observed_dimension == my_dimension - class TestExceptions: """Test custom exceptions and if they're raised properly.""" diff --git a/tests/unit/test_factory.py b/tests/unit/test_factory.py deleted file mode 100644 index dcef46d07..000000000 --- a/tests/unit/test_factory.py +++ /dev/null @@ -1,56 +0,0 @@ -"""Test module for MDIO creation.""" - -from datetime import datetime - -from mdio import MDIOReader -from mdio.core.factory import create_empty_like - - -def test_create_empty_like(mock_reader: MDIOReader) -> None: - """Test create_empty_like function to ensure it replicates an existing MDIO dataset.""" - # Define a destination path for the new dataset - dest_path = mock_reader.url + "_copy" - - # Call create_empty_like - create_empty_like( - source_path=mock_reader.url, - dest_path=dest_path, - overwrite=True, - ) - - source_reader = mock_reader - dest_reader = MDIOReader(dest_path) - assert source_reader.grid.dims == dest_reader.grid.dims - assert source_reader.live_mask != dest_reader.grid.live_mask - - source_traces = source_reader._traces - dest_traces = dest_reader._traces - - assert source_traces.dtype == dest_traces.dtype - assert source_traces.shape == dest_traces.shape - assert source_traces.chunks == dest_traces.chunks - assert source_traces.compressors == dest_traces.compressors - - source_headers = source_reader._headers - dest_headers = dest_reader._headers - - assert source_headers.dtype == dest_headers.dtype - assert source_headers.shape == dest_headers.shape - assert source_headers.chunks == dest_headers.chunks - assert source_headers.compressors == dest_headers.compressors - - assert source_reader.text_header == dest_reader.text_header - assert source_reader.binary_header == dest_reader.binary_header - - # Verify live_mask - assert dest_reader.live_mask[:].sum() == 0 - - # Verify attributes - assert dest_reader.trace_count == 0 - for stat_value in dest_reader.stats.values(): - assert stat_value == 0 - - # Verify creation time is recent - source_time = datetime.fromisoformat(source_reader.root.attrs["created"]) - dest_time = datetime.fromisoformat(dest_reader.root.attrs["created"]) - assert (dest_time - source_time).total_seconds() > 0 diff --git a/tests/unit/test_schema.py b/tests/unit/test_schema.py index 0b2a9f545..3aefd7e2d 100644 --- a/tests/unit/test_schema.py +++ b/tests/unit/test_schema.py @@ -13,7 +13,7 @@ "name": 
"actual_variable", "data_type": "float32", "dimensions": ["dim0", "dim1"], - "compressor": {"name": "blosc", "level": 3}, + "compressor": {"name": "blosc", "clevel": 3}, "coordinates": ["coord"], "metadata": { "chunk_grid": { diff --git a/tests/unit/test_serialization.py b/tests/unit/test_serialization.py deleted file mode 100644 index 86f59b7cb..000000000 --- a/tests/unit/test_serialization.py +++ /dev/null @@ -1,94 +0,0 @@ -"""Check lower-level serialization functions.""" - -from __future__ import annotations - -from inspect import Parameter -from inspect import Signature -from typing import Any - -import pytest - -from mdio.core.serialization import Serializer -from mdio.core.serialization import get_deserializer -from mdio.core.serialization import get_serializer - - -@pytest.mark.parametrize( - ("mappable", "json_str"), - [ - ({"a": 5}, '{"a": 5}'), - ({"a": 5, "b": [-1, 5]}, '{"a": 5, "b": [-1, 5]}'), - ({"b": 5, "a": [-1, 5]}, '{"b": 5, "a": [-1, 5]}'), - ({"b": 5, "a": [-11], "c": "5"}, '{"b": 5, "a": [-11], "c": "5"}'), - ({"k": {"l": [1, 5], "m": "v"}}, '{"k": {"l": [1, 5], "m": "v"}}'), - ], -) -class TestJSON: - """JSON conversion and back.""" - - def test_json_serialize(self, mappable: dict[str, Any], json_str: str) -> None: - """Dictionary to JSON.""" - serializer = get_serializer("json") - assert serializer(mappable) == json_str - - def test_json_deserialize(self, mappable: dict[str, Any], json_str: str) -> None: - """JSON to dictionary.""" - deserializer = get_deserializer("json") - assert deserializer(json_str) == mappable - - -@pytest.mark.parametrize( - ("mappable", "yaml_str"), - [ - ({"a": 5}, "a: 5\n"), - ({"a": 5, "b": [-1, 5]}, "a: 5\nb:\n- -1\n- 5\n"), - ({"b": 5, "a": [-1, 5]}, "b: 5\na:\n- -1\n- 5\n"), - ({"b": 5, "a": [-11], "c": "5"}, "b: 5\na:\n- -11\nc: '5'\n"), - ({"k": {"l": [1, 5], "m": "v"}}, "k:\n l:\n - 1\n - 5\n m: v\n"), - ], -) -class TestYAML: - """YAML conversion and back.""" - - def test_yaml_serialize(self, mappable: dict[str, Any], yaml_str: str) -> None: - """Dictionary to YAML.""" - serializer = get_serializer("yaml") - assert serializer(mappable) == yaml_str - - def test_yaml_deserialize(self, mappable: dict[str, Any], yaml_str: str) -> None: - """YAML to dictionary.""" - deserializer = get_deserializer("yaml") - assert deserializer(yaml_str) == mappable - - -class TestExceptions: - """Test if exceptions are raised properly.""" - - def test_unsupported_format_serializer(self) -> None: - """Unknown serializer format.""" - with pytest.raises(ValueError, match="Unsupported serializer"): - get_serializer("unsupported") - - def test_unsupported_format_deserializer(self) -> None: - """Unknown deserializer format.""" - with pytest.raises(ValueError, match="Unsupported deserializer"): - get_deserializer("unsupported") - - def test_missing_key(self) -> None: - """Raise if required key is missing.""" - mock_signature = Signature( - [ - Parameter("param1", Parameter.POSITIONAL_ONLY), - Parameter("param2", Parameter.POSITIONAL_ONLY), - ] - ) - - exact_input = {"param1": 1, "param2": 2} - extra_inputs = {"param1": 1, "param2": 2, "extra_param": 5} - - assert exact_input == Serializer.validate_payload(exact_input, mock_signature) - assert exact_input == Serializer.validate_payload(extra_inputs, mock_signature) - - missing_key = {"wrong_param1": 1, "param2": 2} - with pytest.raises(KeyError): - Serializer.validate_payload(missing_key, mock_signature) diff --git a/tests/unit/v1/converters/test_type_converter.py 
b/tests/unit/v1/converters/test_type_converter.py index 4cb3f55a2..903e9eb28 100644 --- a/tests/unit/v1/converters/test_type_converter.py +++ b/tests/unit/v1/converters/test_type_converter.py @@ -1,7 +1,7 @@ """Unit tests for the type converter module.""" +import numpy as np import pytest -from numpy import dtype as np_dtype from mdio.converters.type_converter import to_numpy_dtype from mdio.converters.type_converter import to_scalar_type @@ -58,19 +58,19 @@ def test_to_numpy_dtype(supported_scalar_types_map: tuple[ScalarType, str], a_st # Test 1: ScalarType cases - all supported scalar types for scalar_type, expected_numpy_type in supported_scalar_types_map: result = to_numpy_dtype(scalar_type) - expected = np_dtype(expected_numpy_type) + expected = np.dtype(expected_numpy_type) assert result == expected - assert isinstance(result, np_dtype) + assert isinstance(result, np.dtype) assert result.name == expected.name # Test 2: StructuredType with multiple fields result_multi = to_numpy_dtype(a_structured_type) - expected_multi = np_dtype( + expected_multi = np.dtype( [("x", "float64"), ("y", "float64"), ("z", "float64"), ("id", "int32"), ("valid", "bool")] ) assert result_multi == expected_multi - assert isinstance(result_multi, np_dtype) + assert isinstance(result_multi, np.dtype) assert len(result_multi.names) == 5 assert set(result_multi.names) == {"x", "y", "z", "id", "valid"} @@ -78,14 +78,14 @@ def test_to_numpy_dtype(supported_scalar_types_map: tuple[ScalarType, str], a_st def test_to_scalar_type(supported_scalar_types_map: tuple[ScalarType, str]) -> None: """Test for to_scalar_type function.""" for expected_mdio_type, numpy_type in supported_scalar_types_map: - result = to_scalar_type(np_dtype(numpy_type)) + result = to_scalar_type(np.dtype(numpy_type)) assert result == expected_mdio_type def test_to_structured_type(a_structured_type: StructuredType) -> None: """Test for to_structured_type function.""" - dtype = np_dtype([("x", "float64"), ("y", "float64"), ("z", "float64"), ("id", "int32"), ("valid", "bool")]) + dtype = np.dtype([("x", "float64"), ("y", "float64"), ("z", "float64"), ("id", "int32"), ("valid", "bool")]) assert a_structured_type == to_structured_type(dtype) - dtype = np_dtype([("x", " None: - """Test the exists() method of StorageLocation.""" - # Test exists() returns True when file exists. - mock_fs = Mock() - mock_fs.exists.return_value = True - mock_filesystem.return_value = mock_fs - location = StorageLocation("/test/existing/file") - result = location.exists() - assert result is True - mock_fs.exists.assert_called_once() - - # Test exists() returns False when file does not exist. - mock_fs = Mock() - mock_fs.exists.return_value = False - mock_filesystem.return_value = mock_fs - location = StorageLocation("/test/nonexistent/file") - result = location.exists() - assert result is False - mock_fs.exists.assert_called_once() - - # Test exists() handles exceptions gracefully. - mock_fs = Mock() - mock_fs.exists.side_effect = Exception("Connection failed") - mock_filesystem.return_value = mock_fs - location = StorageLocation("s3://bucket/file") - result = location.exists() - assert result is False - captured = capsys.readouterr() - assert "Error checking existence of s3://bucket/file: Connection failed" in captured.out - - def test_representations(self) -> None: - """Test string and developer representations of StorageLocation.""" - # Test string representation of StorageLocation. 
- location = StorageLocation("/test/path") - assert str(location) == "/test/path" - - # Test developer representation of StorageLocation. - - uri = "s3://my-bucket/file.segy" - options = {"region": "us-west-2"} - location = StorageLocation(uri=uri, options=options) - expected = "StorageLocation(uri='s3://my-bucket/file.segy', options={'region': 'us-west-2'})" - assert repr(location) == expected - - def test_from_path(self) -> None: - """Test from_path class method.""" - # Test with string path. - path_str = "/home/user/data.segy" - location = StorageLocation(path_str) - # Should resolve to absolute path - expected_path = str(Path(path_str).resolve()) - assert location.uri == expected_path - assert location.options == {} - - # Test with path uri path. - location = StorageLocation(f"file://{path_str}") - # Should resolve to absolute path - expected_path = str(Path(path_str).resolve()) - assert location.uri == expected_path - assert location.options == {} - - # Test with real local file operations. - # Create a temporary file for testing - with tempfile.NamedTemporaryFile(delete=False) as temp_file: - temp_path = Path(temp_file.name) - temp_file.write(b"test content") - try: - # Test with real local file - location = StorageLocation(str(temp_path)) - # Should exist - assert location.exists() is True - # Should have correct URI - assert location.uri == str(temp_path.resolve()) - finally: - # Clean up - temp_path.unlink() - # Now should not exist - assert location.exists() is False - - def test_from_cloud(self) -> None: - """Test class for cloud storage URIs.""" - # Test from_s3 without options. - s3_uri = "s3://bucket/file" - location = StorageLocation(s3_uri) - assert location.uri == s3_uri - assert location.options == {} - - # Test from_s3 with valid S3 URI. 
- s3_uri = "s3://my-bucket/path/to/file.segy" - options = {"region": "us-west-2", "aws_access_key_id": "key123"} - location = StorageLocation(s3_uri, options=options) - assert location.uri == s3_uri - assert location.options == options - - def test_options_immutability(self) -> None: - """Test that options property returns a defensive copy.""" - original_options = {"region": "us-east-1", "timeout": 30} - location = StorageLocation(uri="s3://bucket/file", options=original_options) - - # Get options through property - returned_options = location.options - - # Verify it's equal to original - assert returned_options == original_options - - # Modify the returned dict - returned_options["new_key"] = "new_value" - returned_options["timeout"] = 60 - - # Original should be unchanged - assert location.options == original_options - assert "new_key" not in location.options - assert location.options["timeout"] == 30 diff --git a/tests/unit/v1/helpers.py b/tests/unit/v1/helpers.py index baf8af427..536594a93 100644 --- a/tests/unit/v1/helpers.py +++ b/tests/unit/v1/helpers.py @@ -1,8 +1,11 @@ """Helper methods used in unit tests.""" +from pathlib import Path + from mdio.schemas.chunk_grid import RegularChunkGrid from mdio.schemas.chunk_grid import RegularChunkShape from mdio.schemas.compressors import Blosc +from mdio.schemas.compressors import BloscCname from mdio.schemas.dtype import ScalarType from mdio.schemas.dtype import StructuredField from mdio.schemas.dtype import StructuredType @@ -137,7 +140,7 @@ def _get_all_coordinates(dataset: Dataset) -> list[Coordinate]: return list(all_coords.values()) -def output_path(file_dir: str, file_name: str, debugging: bool = False) -> str: +def output_path(file_dir: Path, file_name: str, debugging: bool = False) -> Path: """Generate the output path for the test file-system output. Note: @@ -145,12 +148,9 @@ def output_path(file_dir: str, file_name: str, debugging: bool = False) -> str: purposes. Otherwise, the files will be created in-memory and not saved to disk. 
""" if debugging: - # Use the following for debugging: - file_path = f"{file_dir}/mdio-tests/{file_name}.zarr" - else: - # Use the following for normal runs: - file_path = f"memory://path_to_zarr/mdio-tests/{file_name}.zarr" - return file_path + return file_dir / f"mdio-tests/{file_name}.zarr" + + return file_dir / f"{file_name}.zarr" def make_seismic_poststack_3d_acceptance_dataset(dataset_name: str) -> Dataset: @@ -173,24 +173,24 @@ def make_seismic_poststack_3d_acceptance_dataset(dataset_name: str) -> Dataset: ds.add_dimension("inline", 256) ds.add_dimension("crossline", 512) ds.add_dimension("depth", 384) - ds.add_coordinate("inline", dimensions=["inline"], data_type=ScalarType.UINT32) - ds.add_coordinate("crossline", dimensions=["crossline"], data_type=ScalarType.UINT32) + ds.add_coordinate("inline", dimensions=("inline",), data_type=ScalarType.UINT32) + ds.add_coordinate("crossline", dimensions=("crossline",), data_type=ScalarType.UINT32) ds.add_coordinate( "depth", - dimensions=["depth"], + dimensions=("depth",), data_type=ScalarType.UINT32, metadata_info=[AllUnits(units_v1=LengthUnitModel(length=LengthUnitEnum.METER))], ) # Add coordinates ds.add_coordinate( "cdp_x", - dimensions=["inline", "crossline"], + dimensions=("inline", "crossline"), data_type=ScalarType.FLOAT32, metadata_info=[AllUnits(units_v1=LengthUnitModel(length=LengthUnitEnum.METER))], ) ds.add_coordinate( "cdp_y", - dimensions=["inline", "crossline"], + dimensions=("inline", "crossline"), data_type=ScalarType.FLOAT32, metadata_info=[AllUnits(units_v1=LengthUnitModel(length=LengthUnitEnum.METER))], ) @@ -198,22 +198,22 @@ def make_seismic_poststack_3d_acceptance_dataset(dataset_name: str) -> Dataset: # Add image variable ds.add_variable( name="image", - dimensions=["inline", "crossline", "depth"], + dimensions=("inline", "crossline", "depth"), data_type=ScalarType.FLOAT32, - compressor=Blosc(algorithm="zstd"), - coordinates=["cdp_x", "cdp_y"], + compressor=Blosc(cname=BloscCname.zstd), # also default in zarr3 + coordinates=("cdp_x", "cdp_y"), metadata_info=[ ChunkGridMetadata( - chunk_grid=RegularChunkGrid(configuration=RegularChunkShape(chunk_shape=[128, 128, 128])) + chunk_grid=RegularChunkGrid(configuration=RegularChunkShape(chunk_shape=(128, 128, 128))) ), StatisticsMetadata( stats_v1=SummaryStatistics( count=100, sum=1215.1, - sumSquares=125.12, + sum_squares=125.12, min=5.61, max=10.84, - histogram=CenteredBinHistogram(binCenters=[1, 2], counts=[10, 15]), + histogram=CenteredBinHistogram(bin_centers=[1, 2], counts=[10, 15]), ) ), UserAttributes(attributes={"fizz": "buzz"}), @@ -222,12 +222,12 @@ def make_seismic_poststack_3d_acceptance_dataset(dataset_name: str) -> Dataset: # Add velocity variable ds.add_variable( name="velocity", - dimensions=["inline", "crossline", "depth"], + dimensions=("inline", "crossline", "depth"), data_type=ScalarType.FLOAT16, - coordinates=["cdp_x", "cdp_y"], + coordinates=("cdp_x", "cdp_y"), metadata_info=[ ChunkGridMetadata( - chunk_grid=RegularChunkGrid(configuration=RegularChunkShape(chunk_shape=[128, 128, 128])) + chunk_grid=RegularChunkGrid(configuration=RegularChunkShape(chunk_shape=(128, 128, 128))) ), AllUnits(units_v1=SpeedUnitModel(speed=SpeedUnitEnum.METER_PER_SECOND)), ], @@ -236,19 +236,19 @@ def make_seismic_poststack_3d_acceptance_dataset(dataset_name: str) -> Dataset: ds.add_variable( name="image_inline", long_name="inline optimized version of 3d_stack", - dimensions=["inline", "crossline", "depth"], + dimensions=("inline", "crossline", "depth"), 
data_type=ScalarType.FLOAT32, - compressor=Blosc(algorithm="zstd"), - coordinates=["cdp_x", "cdp_y"], + compressor=Blosc(cname=BloscCname.zstd), # also default in zarr3 + coordinates=("cdp_x", "cdp_y"), metadata_info=[ - ChunkGridMetadata(chunk_grid=RegularChunkGrid(configuration=RegularChunkShape(chunk_shape=[4, 512, 512]))) + ChunkGridMetadata(chunk_grid=RegularChunkGrid(configuration=RegularChunkShape(chunk_shape=(4, 512, 512)))) ], ) # Add headers variable with structured dtype ds.add_variable( name="image_headers", - dimensions=["inline", "crossline"], - coordinates=["cdp_x", "cdp_y"], + dimensions=("inline", "crossline"), + coordinates=("cdp_x", "cdp_y"), data_type=StructuredType( fields=[ StructuredField(name="cdp_x", format=ScalarType.INT32), @@ -258,7 +258,7 @@ def make_seismic_poststack_3d_acceptance_dataset(dataset_name: str) -> Dataset: ] ), metadata_info=[ - ChunkGridMetadata(chunk_grid=RegularChunkGrid(configuration=RegularChunkShape(chunk_shape=[128, 128]))) + ChunkGridMetadata(chunk_grid=RegularChunkGrid(configuration=RegularChunkShape(chunk_shape=(128, 128)))) ], ) return ds.build() diff --git a/tests/unit/v1/templates/test_seismic_2d_poststack.py b/tests/unit/v1/templates/test_seismic_2d_poststack.py index be4fdb168..d5aa4e7ff 100644 --- a/tests/unit/v1/templates/test_seismic_2d_poststack.py +++ b/tests/unit/v1/templates/test_seismic_2d_poststack.py @@ -82,14 +82,14 @@ def test_configuration_depth(self) -> None: # Template attributes assert t._trace_domain == "depth" - assert t._coord_dim_names == ["cdp"] - assert t._dim_names == ["cdp", "depth"] - assert t._coord_names == ["cdp_x", "cdp_y"] - assert t._var_chunk_shape == [1024, 1024] + assert t._coord_dim_names == ("cdp",) + assert t._dim_names == ("cdp", "depth") + assert t._coord_names == ("cdp_x", "cdp_y") + assert t._var_chunk_shape == (1024, 1024) # Variables instantiated when build_dataset() is called assert t._builder is None - assert t._dim_sizes == [] + assert t._dim_sizes == () assert t._horizontal_coord_unit is None # Verify dataset attributes @@ -108,14 +108,14 @@ def test_configuration_time(self) -> None: # Template attributes assert t._trace_domain == "time" - assert t._coord_dim_names == ["cdp"] - assert t._dim_names == ["cdp", "time"] - assert t._coord_names == ["cdp_x", "cdp_y"] - assert t._var_chunk_shape == [1024, 1024] + assert t._coord_dim_names == ("cdp",) + assert t._dim_names == ("cdp", "time") + assert t._coord_names == ("cdp_x", "cdp_y") + assert t._var_chunk_shape == (1024, 1024) # Variables instantiated when build_dataset() is called assert t._builder is None - assert t._dim_sizes == [] + assert t._dim_sizes == () assert t._horizontal_coord_unit is None # Verify dataset attributes @@ -145,7 +145,7 @@ def test_build_dataset_depth(self, structured_headers: StructuredType) -> None: dataset = t.build_dataset( "Seismic 2D Depth Line 001", - sizes=[2048, 4096], + sizes=(2048, 4096), horizontal_coord_unit=_UNIT_METER, headers=structured_headers, ) @@ -167,7 +167,7 @@ def test_build_dataset_depth(self, structured_headers: StructuredType) -> None: dtype=ScalarType.FLOAT32, ) assert isinstance(seismic.metadata.chunk_grid, RegularChunkGrid) - assert seismic.metadata.chunk_grid.configuration.chunk_shape == [1024, 1024] + assert seismic.metadata.chunk_grid.configuration.chunk_shape == (1024, 1024) assert seismic.metadata.stats_v1 is None def test_build_dataset_time(self, structured_headers: StructuredType) -> None: @@ -198,7 +198,7 @@ def test_build_dataset_time(self, structured_headers: StructuredType) -> 
None: dtype=ScalarType.FLOAT32, ) assert isinstance(v.metadata.chunk_grid, RegularChunkGrid) - assert v.metadata.chunk_grid.configuration.chunk_shape == [1024, 1024] + assert v.metadata.chunk_grid.configuration.chunk_shape == (1024, 1024) assert v.metadata.stats_v1 is None def test_time_vs_depth_comparison(self) -> None: diff --git a/tests/unit/v1/templates/test_seismic_3d_poststack.py b/tests/unit/v1/templates/test_seismic_3d_poststack.py index fb800bf23..d4c314fed 100644 --- a/tests/unit/v1/templates/test_seismic_3d_poststack.py +++ b/tests/unit/v1/templates/test_seismic_3d_poststack.py @@ -4,6 +4,7 @@ from mdio.schemas.chunk_grid import RegularChunkGrid from mdio.schemas.compressors import Blosc +from mdio.schemas.compressors import BloscCname from mdio.schemas.dtype import ScalarType from mdio.schemas.dtype import StructuredType from mdio.schemas.v1.dataset import Dataset @@ -98,14 +99,14 @@ def test_configuration_depth(self) -> None: # Template attributes to be overridden by subclasses assert t._trace_domain == "depth" # Domain should be lowercased - assert t._coord_dim_names == ["inline", "crossline"] - assert t._dim_names == ["inline", "crossline", "depth"] - assert t._coord_names == ["cdp_x", "cdp_y"] - assert t._var_chunk_shape == [128, 128, 128] + assert t._coord_dim_names == ("inline", "crossline") + assert t._dim_names == ("inline", "crossline", "depth") + assert t._coord_names == ("cdp_x", "cdp_y") + assert t._var_chunk_shape == (128, 128, 128) # Variables instantiated when build_dataset() is called assert t._builder is None - assert t._dim_sizes == [] + assert t._dim_sizes == () assert t._horizontal_coord_unit is None # Verify dataset attributes @@ -123,14 +124,14 @@ def test_configuration_time(self) -> None: # Template attributes to be overridden by subclasses assert t._trace_domain == "time" # Domain should be lowercased - assert t._coord_dim_names == ["inline", "crossline"] - assert t._dim_names == ["inline", "crossline", "time"] - assert t._coord_names == ["cdp_x", "cdp_y"] - assert t._var_chunk_shape == [128, 128, 128] + assert t._coord_dim_names == ("inline", "crossline") + assert t._dim_names == ("inline", "crossline", "time") + assert t._coord_names == ("cdp_x", "cdp_y") + assert t._var_chunk_shape == (128, 128, 128) # Variables instantiated when build_dataset() is called assert t._builder is None - assert t._dim_sizes == [] + assert t._dim_sizes == () assert t._horizontal_coord_unit is None assert t._load_dataset_attributes().attributes == { @@ -160,7 +161,7 @@ def test_build_dataset_depth(self, structured_headers: StructuredType) -> None: assert t.name == "PostStack3DDepth" dataset = t.build_dataset( "Seismic 3D", - sizes=[256, 512, 1024], + sizes=(256, 512, 1024), horizontal_coord_unit=_UNIT_METER, headers=structured_headers, ) @@ -181,9 +182,9 @@ def test_build_dataset_depth(self, structured_headers: StructuredType) -> None: dtype=ScalarType.FLOAT32, ) assert isinstance(seismic.compressor, Blosc) - assert seismic.compressor.algorithm == "zstd" + assert seismic.compressor.cname == BloscCname.zstd assert isinstance(seismic.metadata.chunk_grid, RegularChunkGrid) - assert seismic.metadata.chunk_grid.configuration.chunk_shape == [128, 128, 128] + assert seismic.metadata.chunk_grid.configuration.chunk_shape == (128, 128, 128) assert seismic.metadata.stats_v1 is None def test_build_dataset_time(self, structured_headers: StructuredType) -> None: @@ -193,7 +194,7 @@ def test_build_dataset_time(self, structured_headers: StructuredType) -> None: assert t.name == 
"PostStack3DTime" dataset = t.build_dataset( "Seismic 3D", - sizes=[256, 512, 1024], + sizes=(256, 512, 1024), horizontal_coord_unit=_UNIT_METER, headers=structured_headers, ) @@ -214,7 +215,7 @@ def test_build_dataset_time(self, structured_headers: StructuredType) -> None: dtype=ScalarType.FLOAT32, ) assert isinstance(seismic.compressor, Blosc) - assert seismic.compressor.algorithm == "zstd" + assert seismic.compressor.cname == BloscCname.zstd assert isinstance(seismic.metadata.chunk_grid, RegularChunkGrid) - assert seismic.metadata.chunk_grid.configuration.chunk_shape == [128, 128, 128] + assert seismic.metadata.chunk_grid.configuration.chunk_shape == (128, 128, 128) assert seismic.metadata.stats_v1 is None diff --git a/tests/unit/v1/templates/test_seismic_3d_prestack_cdp.py b/tests/unit/v1/templates/test_seismic_3d_prestack_cdp.py index 488b510ca..067977b2f 100644 --- a/tests/unit/v1/templates/test_seismic_3d_prestack_cdp.py +++ b/tests/unit/v1/templates/test_seismic_3d_prestack_cdp.py @@ -4,6 +4,7 @@ from mdio.schemas.chunk_grid import RegularChunkGrid from mdio.schemas.compressors import Blosc +from mdio.schemas.compressors import BloscCname from mdio.schemas.dtype import ScalarType from mdio.schemas.dtype import StructuredType from mdio.schemas.v1.dataset import Dataset @@ -107,14 +108,14 @@ def test_configuration_depth(self) -> None: # Template attributes for prestack CDP assert t._trace_domain == "depth" - assert t._coord_dim_names == ["inline", "crossline", "offset"] - assert t._dim_names == ["inline", "crossline", "offset", "depth"] - assert t._coord_names == ["cdp_x", "cdp_y"] - assert t._var_chunk_shape == [1, 1, 512, 4096] + assert t._coord_dim_names == ("inline", "crossline", "offset") + assert t._dim_names == ("inline", "crossline", "offset", "depth") + assert t._coord_names == ("cdp_x", "cdp_y") + assert t._var_chunk_shape == (1, 1, 512, 4096) # Variables instantiated when build_dataset() is called assert t._builder is None - assert t._dim_sizes == [] + assert t._dim_sizes == () assert t._horizontal_coord_unit is None # Verify prestack CDP attributes @@ -132,14 +133,14 @@ def test_configuration_time(self) -> None: # Template attributes for prestack CDP assert t._trace_domain == "time" - assert t._coord_dim_names == ["inline", "crossline", "offset"] - assert t._dim_names == ["inline", "crossline", "offset", "time"] - assert t._coord_names == ["cdp_x", "cdp_y"] - assert t._var_chunk_shape == [1, 1, 512, 4096] + assert t._coord_dim_names == ("inline", "crossline", "offset") + assert t._dim_names == ("inline", "crossline", "offset", "time") + assert t._coord_names == ("cdp_x", "cdp_y") + assert t._var_chunk_shape == (1, 1, 512, 4096) # Variables instantiated when build_dataset() is called assert t._builder is None - assert t._dim_sizes == [] + assert t._dim_sizes == () assert t._horizontal_coord_unit is None # Verify prestack CDP attributes @@ -171,7 +172,7 @@ def test_build_dataset_depth(self, structured_headers: StructuredType) -> None: assert t.name == "PreStackCdpGathers3DDepth" dataset = t.build_dataset( "North Sea 3D Prestack Depth", - sizes=[512, 768, 36, 1536], + sizes=(512, 768, 36, 1536), horizontal_coord_unit=_UNIT_METER, headers=structured_headers, ) @@ -192,9 +193,9 @@ def test_build_dataset_depth(self, structured_headers: StructuredType) -> None: dtype=ScalarType.FLOAT32, ) assert isinstance(seismic.compressor, Blosc) - assert seismic.compressor.algorithm == "zstd" + assert seismic.compressor.cname == BloscCname.zstd assert isinstance(seismic.metadata.chunk_grid, 
RegularChunkGrid) - assert seismic.metadata.chunk_grid.configuration.chunk_shape == [1, 1, 512, 4096] + assert seismic.metadata.chunk_grid.configuration.chunk_shape == (1, 1, 512, 4096) assert seismic.metadata.stats_v1 is None def test_build_dataset_time(self, structured_headers: StructuredType) -> None: @@ -204,7 +205,7 @@ def test_build_dataset_time(self, structured_headers: StructuredType) -> None: assert t.name == "PreStackCdpGathers3DTime" dataset = t.build_dataset( "Santos Basin 3D Prestack", - sizes=[512, 768, 36, 1536], + sizes=(512, 768, 36, 1536), horizontal_coord_unit=_UNIT_METER, headers=structured_headers, ) @@ -225,7 +226,7 @@ def test_build_dataset_time(self, structured_headers: StructuredType) -> None: dtype=ScalarType.FLOAT32, ) assert isinstance(seismic.compressor, Blosc) - assert seismic.compressor.algorithm == "zstd" + assert seismic.compressor.cname == BloscCname.zstd assert isinstance(seismic.metadata.chunk_grid, RegularChunkGrid) - assert seismic.metadata.chunk_grid.configuration.chunk_shape == [1, 1, 512, 4096] + assert seismic.metadata.chunk_grid.configuration.chunk_shape == (1, 1, 512, 4096) assert seismic.metadata.stats_v1 is None diff --git a/tests/unit/v1/templates/test_seismic_3d_prestack_coca.py b/tests/unit/v1/templates/test_seismic_3d_prestack_coca.py index 503e63270..bfaa14108 100644 --- a/tests/unit/v1/templates/test_seismic_3d_prestack_coca.py +++ b/tests/unit/v1/templates/test_seismic_3d_prestack_coca.py @@ -4,6 +4,7 @@ from mdio.schemas.chunk_grid import RegularChunkGrid from mdio.schemas.compressors import Blosc +from mdio.schemas.compressors import BloscCname from mdio.schemas.dtype import ScalarType from mdio.schemas.dtype import StructuredType from mdio.schemas.v1.dataset import Dataset @@ -116,14 +117,14 @@ def test_configuration_time(self) -> None: t = Seismic3DPreStackCocaTemplate(domain="time") # Template attributes - assert t._coord_dim_names == ["inline", "crossline", "offset", "azimuth"] - assert t._dim_names == ["inline", "crossline", "offset", "azimuth", "time"] - assert t._coord_names == ["cdp_x", "cdp_y"] - assert t._var_chunk_shape == [8, 8, 32, 1, 1024] + assert t._coord_dim_names == ("inline", "crossline", "offset", "azimuth") + assert t._dim_names == ("inline", "crossline", "offset", "azimuth", "time") + assert t._coord_names == ("cdp_x", "cdp_y") + assert t._var_chunk_shape == (8, 8, 32, 1, 1024) # Variables instantiated when build_dataset() is called assert t._builder is None - assert t._dim_sizes == [] + assert t._dim_sizes == () assert t._horizontal_coord_unit is None # Verify dataset attributes @@ -141,7 +142,7 @@ def test_build_dataset_time(self, structured_headers: StructuredType) -> None: dataset = t.build_dataset( "Permian Basin 3D CDP Coca Gathers", - sizes=[256, 256, 100, 6, 2048], + sizes=(256, 256, 100, 6, 2048), horizontal_coord_unit=_UNIT_METER, headers=structured_headers, ) @@ -162,7 +163,7 @@ def test_build_dataset_time(self, structured_headers: StructuredType) -> None: dtype=ScalarType.FLOAT32, ) assert isinstance(seismic.compressor, Blosc) - assert seismic.compressor.algorithm == "zstd" + assert seismic.compressor.cname == BloscCname.zstd assert isinstance(seismic.metadata.chunk_grid, RegularChunkGrid) - assert seismic.metadata.chunk_grid.configuration.chunk_shape == [8, 8, 32, 1, 1024] + assert seismic.metadata.chunk_grid.configuration.chunk_shape == (8, 8, 32, 1, 1024) assert seismic.metadata.stats_v1 is None diff --git a/tests/unit/v1/templates/test_seismic_3d_prestack_shot.py 
b/tests/unit/v1/templates/test_seismic_3d_prestack_shot.py index 1d519c82f..4f26546a3 100644 --- a/tests/unit/v1/templates/test_seismic_3d_prestack_shot.py +++ b/tests/unit/v1/templates/test_seismic_3d_prestack_shot.py @@ -4,6 +4,7 @@ from mdio.schemas.chunk_grid import RegularChunkGrid from mdio.schemas.compressors import Blosc +from mdio.schemas.compressors import BloscCname from mdio.schemas.dtype import ScalarType from mdio.schemas.dtype import StructuredType from mdio.schemas.v1.dataset import Dataset @@ -133,14 +134,14 @@ def test_configuration_depth(self) -> None: # Template attributes for prestack shot assert t._trace_domain == "depth" - assert t._coord_dim_names == ["shot_point", "cable", "channel"] - assert t._dim_names == ["shot_point", "cable", "channel", "depth"] - assert t._coord_names == ["gun", "source_coord_x", "source_coord_y", "group_coord_x", "group_coord_y"] - assert t._var_chunk_shape == [1, 1, 512, 4096] + assert t._coord_dim_names == ("shot_point", "cable", "channel") + assert t._dim_names == ("shot_point", "cable", "channel", "depth") + assert t._coord_names == ("gun", "source_coord_x", "source_coord_y", "group_coord_x", "group_coord_y") + assert t._var_chunk_shape == (1, 1, 512, 4096) # Variables instantiated when build_dataset() is called assert t._builder is None - assert t._dim_sizes == [] + assert t._dim_sizes == () assert t._horizontal_coord_unit is None # Verify prestack shot attributes @@ -158,14 +159,14 @@ def test_configuration_time(self) -> None: # Template attributes for prestack shot assert t._trace_domain == "time" - assert t._coord_dim_names == ["shot_point", "cable", "channel"] - assert t._dim_names == ["shot_point", "cable", "channel", "time"] - assert t._coord_names == ["gun", "source_coord_x", "source_coord_y", "group_coord_x", "group_coord_y"] - assert t._var_chunk_shape == [1, 1, 512, 4096] + assert t._coord_dim_names == ("shot_point", "cable", "channel") + assert t._dim_names == ("shot_point", "cable", "channel", "time") + assert t._coord_names == ("gun", "source_coord_x", "source_coord_y", "group_coord_x", "group_coord_y") + assert t._var_chunk_shape == (1, 1, 512, 4096) # Variables instantiated when build_dataset() is called assert t._builder is None - assert t._dim_sizes == [] + assert t._dim_sizes == () assert t._horizontal_coord_unit is None # Verify prestack shot attributes @@ -197,7 +198,7 @@ def test_build_dataset_depth(self, structured_headers: StructuredType) -> None: assert t.name == "PreStackShotGathers3DDepth" dataset = t.build_dataset( "Gulf of Mexico 3D Shot Depth", - sizes=[256, 512, 24, 2048], + sizes=(256, 512, 24, 2048), horizontal_coord_unit=_UNIT_METER, headers=structured_headers, ) @@ -218,9 +219,9 @@ def test_build_dataset_depth(self, structured_headers: StructuredType) -> None: dtype=ScalarType.FLOAT32, ) assert isinstance(seismic.compressor, Blosc) - assert seismic.compressor.algorithm == "zstd" + assert seismic.compressor.cname == BloscCname.zstd assert isinstance(seismic.metadata.chunk_grid, RegularChunkGrid) - assert seismic.metadata.chunk_grid.configuration.chunk_shape == [1, 1, 512, 4096] + assert seismic.metadata.chunk_grid.configuration.chunk_shape == (1, 1, 512, 4096) assert seismic.metadata.stats_v1 is None def test_build_dataset_time(self, structured_headers: StructuredType) -> None: @@ -230,7 +231,7 @@ def test_build_dataset_time(self, structured_headers: StructuredType) -> None: assert t.name == "PreStackShotGathers3DTime" dataset = t.build_dataset( "North Sea 3D Shot Time", - sizes=[256, 512, 24, 2048], + 
sizes=(256, 512, 24, 2048), horizontal_coord_unit=_UNIT_METER, headers=structured_headers, ) @@ -251,7 +252,7 @@ def test_build_dataset_time(self, structured_headers: StructuredType) -> None: dtype=ScalarType.FLOAT32, ) assert isinstance(seismic.compressor, Blosc) - assert seismic.compressor.algorithm == "zstd" + assert seismic.compressor.cname == BloscCname.zstd assert isinstance(seismic.metadata.chunk_grid, RegularChunkGrid) - assert seismic.metadata.chunk_grid.configuration.chunk_shape == [1, 1, 512, 4096] + assert seismic.metadata.chunk_grid.configuration.chunk_shape == (1, 1, 512, 4096) assert seismic.metadata.stats_v1 is None diff --git a/tests/unit/v1/test_dataset_builder_add_coordinate.py b/tests/unit/v1/test_dataset_builder_add_coordinate.py index 56d7b9356..153687885 100644 --- a/tests/unit/v1/test_dataset_builder_add_coordinate.py +++ b/tests/unit/v1/test_dataset_builder_add_coordinate.py @@ -1,6 +1,7 @@ """Tests the schema v1 dataset_builder.add_coordinate() public API.""" import pytest +from zarr.codecs import BloscCname from mdio.schemas.compressors import Blosc from mdio.schemas.dtype import ScalarType @@ -96,7 +97,7 @@ def test_coordinate_with_full_parameters() -> None: long_name="Common Depth Point", dimensions=["inline", "crossline"], data_type=ScalarType.FLOAT16, - compressor=Blosc(algorithm="zstd"), + compressor=Blosc(cname=BloscCname.zstd), metadata_info=[ AllUnits(units_v1=LengthUnitModel(length=LengthUnitEnum.FOOT)), UserAttributes(attributes={"MGA": 51, "UnitSystem": "Imperial"}), @@ -106,7 +107,7 @@ def test_coordinate_with_full_parameters() -> None: c = validate_coordinate(builder, name="cdp", dims=[("inline", 100), ("crossline", 200)], dtype=ScalarType.FLOAT16) assert c.long_name == "Common Depth Point" assert isinstance(c.compressor, Blosc) - assert c.compressor.algorithm == "zstd" + assert c.compressor.cname == BloscCname.zstd assert c.metadata.attributes["MGA"] == 51 assert c.metadata.attributes["UnitSystem"] == "Imperial" assert c.metadata.units_v1.length == LengthUnitEnum.FOOT @@ -118,7 +119,7 @@ def test_coordinate_with_full_parameters() -> None: dtype=ScalarType.FLOAT16, ) assert isinstance(v.compressor, Blosc) - assert v.compressor.algorithm == "zstd" + assert v.compressor.cname == BloscCname.zstd assert isinstance(v.metadata, VariableMetadata) assert v.metadata.units_v1.length == LengthUnitEnum.FOOT assert v.metadata.attributes["MGA"] == 51 diff --git a/tests/unit/v1/test_dataset_builder_add_variable.py b/tests/unit/v1/test_dataset_builder_add_variable.py index 6e0db5322..c896e1ccb 100644 --- a/tests/unit/v1/test_dataset_builder_add_variable.py +++ b/tests/unit/v1/test_dataset_builder_add_variable.py @@ -5,6 +5,7 @@ from mdio.schemas.chunk_grid import RegularChunkGrid from mdio.schemas.chunk_grid import RegularChunkShape from mdio.schemas.compressors import Blosc +from mdio.schemas.compressors import BloscCname from mdio.schemas.dtype import ScalarType from mdio.schemas.metadata import ChunkGridMetadata from mdio.schemas.metadata import UserAttributes @@ -30,7 +31,7 @@ def test_add_variable_no_coords() -> None: # Validate: Must add at least one dimension before adding variables msg = "Must add at least one dimension before adding variables" with pytest.raises(ValueError, match=msg): - builder.add_variable("amplitude", dimensions=["speed"], data_type=ScalarType.FLOAT32) + builder.add_variable("amplitude", dimensions=("speed",), data_type=ScalarType.FLOAT32) # Add dimension before we can add a data variable builder.add_dimension("inline", 100) @@ -40,21 
+41,21 @@ def test_add_variable_no_coords() -> None: # Validate: required parameters must be preset bad_name = None with pytest.raises(ValueError, match="'name' must be a non-empty string"): - builder.add_variable(bad_name, dimensions=["speed"], data_type=ScalarType.FLOAT32) + builder.add_variable(bad_name, dimensions=("speed",), data_type=ScalarType.FLOAT32) with pytest.raises(ValueError, match="'name' must be a non-empty string"): - builder.add_variable("", dimensions=["speed"], data_type=ScalarType.FLOAT32) + builder.add_variable("", dimensions=("speed",), data_type=ScalarType.FLOAT32) with pytest.raises(ValueError, match="'dimensions' must be a non-empty list"): builder.add_variable("bad_amplitude", dimensions=None, data_type=ScalarType.FLOAT32) with pytest.raises(ValueError, match="'dimensions' must be a non-empty list"): - builder.add_variable("bad_amplitude", dimensions=[], data_type=ScalarType.FLOAT32) + builder.add_variable("bad_amplitude", dimensions=(), data_type=ScalarType.FLOAT32) # Validate: Add a variable using non-existent dimensions is not allowed msg = "Pre-existing dimension named 'il' is not found" with pytest.raises(ValueError, match=msg): - builder.add_variable("bad_amplitude", dimensions=["il", "xl", "depth"], data_type=ScalarType.FLOAT32) + builder.add_variable("bad_amplitude", dimensions=("il", "xl", "depth"), data_type=ScalarType.FLOAT32) # Add a variable without coordinates - builder.add_variable("amplitude", dimensions=["inline", "crossline", "depth"], data_type=ScalarType.FLOAT32) + builder.add_variable("amplitude", dimensions=("inline", "crossline", "depth"), data_type=ScalarType.FLOAT32) validate_builder(builder, _BuilderState.HAS_VARIABLES, n_dims=3, n_coords=0, n_var=1) validate_variable( builder, @@ -67,7 +68,7 @@ def test_add_variable_no_coords() -> None: # Validate: adding a variable with the same name twice is not allowed msg = "Adding variable with the same name twice is not allowed" with pytest.raises(ValueError, match=msg): - builder.add_variable("amplitude", dimensions=["inline", "crossline", "depth"], data_type=ScalarType.FLOAT32) + builder.add_variable("amplitude", dimensions=("inline", "crossline", "depth"), data_type=ScalarType.FLOAT32) def test_add_variable_with_coords() -> None: @@ -78,24 +79,24 @@ def test_add_variable_with_coords() -> None: builder.add_dimension("depth", 300) # Add dimension coordinates before we can add a data variable - builder.add_coordinate("inline", dimensions=["inline"], data_type=ScalarType.UINT32) - builder.add_coordinate("crossline", dimensions=["crossline"], data_type=ScalarType.UINT32) + builder.add_coordinate("inline", dimensions=("inline",), data_type=ScalarType.UINT32) + builder.add_coordinate("crossline", dimensions=("crossline",), data_type=ScalarType.UINT32) # Validate: adding a variable with a coordinate that has not been pre-created is not allowed msg = "Pre-existing coordinate named 'depth' is not found" with pytest.raises(ValueError, match=msg): builder.add_variable( "ampl", - dimensions=["inline", "crossline", "depth"], - coordinates=["inline", "crossline", "depth"], + dimensions=("inline", "crossline", "depth"), + coordinates=("inline", "crossline", "depth"), data_type=ScalarType.FLOAT32, ) # Add a variable with pre-defined dimension coordinates builder.add_variable( "ampl", - dimensions=["inline", "crossline", "depth"], - coordinates=["inline", "crossline"], + dimensions=("inline", "crossline", "depth"), + coordinates=("inline", "crossline"), data_type=ScalarType.FLOAT32, ) validate_builder(builder, 
_BuilderState.HAS_VARIABLES, n_dims=3, n_coords=2, n_var=3) @@ -108,14 +109,14 @@ def test_add_variable_with_coords() -> None: ) # Add non-dim coordinates (e.g., 2D coordinates) - builder.add_coordinate("cdp_x", dimensions=["inline", "crossline"], data_type=ScalarType.FLOAT32) - builder.add_coordinate("cdp_y", dimensions=["inline", "crossline"], data_type=ScalarType.FLOAT32) + builder.add_coordinate("cdp_x", dimensions=("inline", "crossline"), data_type=ScalarType.FLOAT32) + builder.add_coordinate("cdp_y", dimensions=("inline", "crossline"), data_type=ScalarType.FLOAT32) # Add a variable with pre-defined dimension and non-dimension coordinates builder.add_variable( "ampl2", - dimensions=["inline", "crossline", "depth"], - coordinates=["inline", "crossline", "cdp_x", "cdp_y"], + dimensions=("inline", "crossline", "depth"), + coordinates=("inline", "crossline", "cdp_x", "cdp_y"), data_type=ScalarType.FLOAT32, ) validate_builder(builder, _BuilderState.HAS_VARIABLES, n_dims=3, n_coords=4, n_var=6) @@ -136,17 +137,17 @@ def test_add_variable_with_defaults() -> None: builder.add_dimension("crossline", 200) builder.add_dimension("depth", 300) # Add dimension coordinates - builder.add_coordinate("inline", dimensions=["inline"], data_type=ScalarType.UINT32) - builder.add_coordinate("crossline", dimensions=["crossline"], data_type=ScalarType.UINT32) + builder.add_coordinate("inline", dimensions=("inline",), data_type=ScalarType.UINT32) + builder.add_coordinate("crossline", dimensions=("crossline",), data_type=ScalarType.UINT32) builder.add_coordinate( "depth", - dimensions=["depth"], + dimensions=("depth",), data_type=ScalarType.UINT32, metadata_info=[AllUnits(units_v1=LengthUnitModel(length=LengthUnitEnum.METER))], ) # Add data variable using defaults - builder.add_variable("ampl", dimensions=["inline", "crossline", "depth"], data_type=ScalarType.FLOAT32) + builder.add_variable("ampl", dimensions=("inline", "crossline", "depth"), data_type=ScalarType.FLOAT32) validate_builder(builder, _BuilderState.HAS_VARIABLES, n_dims=3, n_coords=3, n_var=4) v = validate_variable( builder, @@ -169,21 +170,21 @@ def test_add_variable_full_parameters() -> None: builder.add_dimension("crossline", 200) builder.add_dimension("depth", 300) # Add dimension coordinates - builder.add_coordinate("inline", dimensions=["inline"], data_type=ScalarType.UINT32) - builder.add_coordinate("crossline", dimensions=["crossline"], data_type=ScalarType.UINT32) - builder.add_coordinate("depth", dimensions=["depth"], data_type=ScalarType.UINT32) + builder.add_coordinate("inline", dimensions=("inline",), data_type=ScalarType.UINT32) + builder.add_coordinate("crossline", dimensions=("crossline",), data_type=ScalarType.UINT32) + builder.add_coordinate("depth", dimensions=("depth",), data_type=ScalarType.UINT32) # Add coordinates before we can add a data variable - builder.add_coordinate("cdp_x", dimensions=["inline", "crossline"], data_type=ScalarType.FLOAT64) - builder.add_coordinate("cdp_y", dimensions=["inline", "crossline"], data_type=ScalarType.FLOAT64) + builder.add_coordinate("cdp_x", dimensions=("inline", "crossline"), data_type=ScalarType.FLOAT64) + builder.add_coordinate("cdp_y", dimensions=("inline", "crossline"), data_type=ScalarType.FLOAT64) # Add data variable with full parameters builder.add_variable( "ampl", long_name="Amplitude (dimensionless)", - dimensions=["inline", "crossline", "depth"], + dimensions=("inline", "crossline", "depth"), data_type=ScalarType.FLOAT32, - compressor=Blosc(algorithm="zstd"), - 
coordinates=["inline", "crossline", "depth", "cdp_x", "cdp_y"], + compressor=Blosc(cname=BloscCname.zstd), + coordinates=("inline", "crossline", "depth", "cdp_x", "cdp_y"), metadata_info=[ AllUnits(units_v1=LengthUnitModel(length=LengthUnitEnum.FOOT)), UserAttributes(attributes={"MGA": 51, "UnitSystem": "Imperial"}), @@ -210,7 +211,7 @@ def test_add_variable_full_parameters() -> None: ) assert v.long_name == "Amplitude (dimensionless)" assert isinstance(v.compressor, Blosc) - assert v.compressor.algorithm == "zstd" + assert v.compressor.cname == BloscCname.zstd assert len(v.coordinates) == 5 assert v.metadata.stats_v1.count == 100 assert isinstance(v.metadata, VariableMetadata) @@ -218,7 +219,7 @@ def test_add_variable_full_parameters() -> None: assert v.metadata.attributes["MGA"] == 51 assert v.metadata.attributes["UnitSystem"] == "Imperial" assert v.metadata.chunk_grid.name == "regular" - assert v.metadata.chunk_grid.configuration.chunk_shape == [20] + assert v.metadata.chunk_grid.configuration.chunk_shape == (20,) assert v.metadata.stats_v1.count == 100 assert v.metadata.stats_v1.sum == 1215.1 assert v.metadata.stats_v1.sum_squares == 125.12 diff --git a/tests/unit/v1/test_dataset_builder_build.py b/tests/unit/v1/test_dataset_builder_build.py index 71763e269..06ef2d620 100644 --- a/tests/unit/v1/test_dataset_builder_build.py +++ b/tests/unit/v1/test_dataset_builder_build.py @@ -1,5 +1,6 @@ """Tests the schema v1 dataset_builder.build() public API.""" +from mdio.schemas.compressors import BloscCname from mdio.schemas.dtype import ScalarType from mdio.schemas.dtype import StructuredField from mdio.schemas.dtype import StructuredType @@ -18,15 +19,15 @@ def test_build() -> None: MDIODatasetBuilder("test_dataset") .add_dimension("inline", 100) .add_dimension("crossline", 200) - .add_coordinate("inline", dimensions=["inline"], data_type=ScalarType.FLOAT64) - .add_coordinate("crossline", dimensions=["crossline"], data_type=ScalarType.FLOAT64) - .add_coordinate("x_coord", dimensions=["inline", "crossline"], data_type=ScalarType.FLOAT32) - .add_coordinate("y_coord", dimensions=["inline", "crossline"], data_type=ScalarType.FLOAT32) + .add_coordinate("inline", dimensions=("inline",), data_type=ScalarType.FLOAT64) + .add_coordinate("crossline", dimensions=("crossline",), data_type=ScalarType.FLOAT64) + .add_coordinate("x_coord", dimensions=("inline", "crossline"), data_type=ScalarType.FLOAT32) + .add_coordinate("y_coord", dimensions=("inline", "crossline"), data_type=ScalarType.FLOAT32) .add_variable( "data", long_name="Test Data", - dimensions=["inline", "crossline"], - coordinates=["inline", "crossline", "x_coord", "y_coord"], + dimensions=("inline", "crossline"), + coordinates=("inline", "crossline", "x_coord", "y_coord"), data_type=ScalarType.FLOAT32, ) .build() @@ -99,8 +100,8 @@ def test_build_seismic_poststack_3d_acceptance_dataset() -> None: # noqa: PLR09 dtype=ScalarType.FLOAT32, ) assert image.metadata.units_v1 is None # No units defined for image - assert image.compressor.algorithm == "zstd" - assert image.metadata.chunk_grid.configuration.chunk_shape == [128, 128, 128] + assert image.compressor.cname == BloscCname.zstd + assert image.metadata.chunk_grid.configuration.chunk_shape == (128, 128, 128) assert image.metadata.stats_v1.count == 100 velocity = validate_variable( @@ -111,7 +112,7 @@ def test_build_seismic_poststack_3d_acceptance_dataset() -> None: # noqa: PLR09 dtype=ScalarType.FLOAT16, ) assert velocity.compressor is None - assert 
velocity.metadata.chunk_grid.configuration.chunk_shape == [128, 128, 128] + assert velocity.metadata.chunk_grid.configuration.chunk_shape == (128, 128, 128) assert velocity.metadata.units_v1.speed == SpeedUnitEnum.METER_PER_SECOND image_inline = validate_variable( @@ -122,8 +123,8 @@ def test_build_seismic_poststack_3d_acceptance_dataset() -> None: # noqa: PLR09 dtype=ScalarType.FLOAT32, ) assert image_inline.long_name == "inline optimized version of 3d_stack" - assert image_inline.compressor.algorithm == "zstd" - assert image_inline.metadata.chunk_grid.configuration.chunk_shape == [4, 512, 512] + assert image_inline.compressor.cname == BloscCname.zstd + assert image_inline.metadata.chunk_grid.configuration.chunk_shape == (4, 512, 512) # Verify image_headers variable headers = next(v for v in dataset.variables if v.name == "image_headers") @@ -145,4 +146,4 @@ def test_build_seismic_poststack_3d_acceptance_dataset() -> None: # noqa: PLR09 ] ), ) - assert headers.metadata.chunk_grid.configuration.chunk_shape == [128, 128] + assert headers.metadata.chunk_grid.configuration.chunk_shape == (128, 128) diff --git a/tests/unit/v1/test_dataset_serializer.py b/tests/unit/v1/test_dataset_serializer.py index b2ba589a6..3d1c2ded6 100644 --- a/tests/unit/v1/test_dataset_serializer.py +++ b/tests/unit/v1/test_dataset_serializer.py @@ -2,15 +2,13 @@ from pathlib import Path +import numpy as np import pytest -from dask import array as dask_array -from numpy import array as np_array -from numpy import dtype as np_dtype -from numpy import isnan as np_isnan -from numpy import zeros as np_zeros from xarray import DataArray as xr_DataArray from zarr import zeros as zarr_zeros +from zarr.codecs import BloscCodec +from mdio import to_mdio from mdio.constants import fill_value_map from mdio.schemas.chunk_grid import RegularChunkGrid from mdio.schemas.chunk_grid import RegularChunkShape @@ -45,12 +43,11 @@ ZFPY = None HAS_ZFPY = False -from numcodecs import Blosc as nc_Blosc from mdio.schemas.compressors import ZFP as MDIO_ZFP from mdio.schemas.compressors import Blosc as mdio_Blosc -from mdio.schemas.compressors import BloscAlgorithm as mdio_BloscAlgorithm -from mdio.schemas.compressors import BloscShuffle as mdio_BloscShuffle +from mdio.schemas.compressors import BloscCname +from mdio.schemas.compressors import BloscShuffle from mdio.schemas.compressors import ZFPMode as mdio_ZFPMode @@ -190,7 +187,7 @@ def test_get_fill_value() -> None: ScalarType.FLOAT64, ] for scalar_type in scalar_types: - assert np_isnan(_get_fill_value(scalar_type)) + assert np.isnan(_get_fill_value(scalar_type)) scalar_types = [ ScalarType.UINT8, @@ -212,8 +209,8 @@ def test_get_fill_value() -> None: for scalar_type in scalar_types: val = _get_fill_value(scalar_type) assert isinstance(val, complex) - assert np_isnan(val.real) - assert np_isnan(val.imag) + assert np.isnan(val.real) + assert np.isnan(val.imag) # Test 2: StructuredType f1 = StructuredField(name="cdp_x", format=ScalarType.INT32) @@ -222,9 +219,9 @@ def test_get_fill_value() -> None: f4 = StructuredField(name="some_scalar", format=ScalarType.FLOAT16) structured_type = StructuredType(fields=[f1, f2, f3, f4]) - expected = np_array( + expected = np.array( (0, 0, 0.0, 0.0), - dtype=np_dtype([("cdp_x", " None: assert result_none is None # Test 2: mdio_Blosc compressor - should return nc_Blosc - result_blosc = _convert_compressor( - mdio_Blosc( - algorithm=mdio_BloscAlgorithm.LZ4, - level=5, - shuffle=mdio_BloscShuffle.AUTOSHUFFLE, - blocksize=1024, - ) - ) - assert 
isinstance(result_blosc, nc_Blosc) - assert result_blosc.cname == "lz4" # BloscAlgorithm.LZ4.value + mdio_compressor = mdio_Blosc(cname=BloscCname.lz4, clevel=5, shuffle=BloscShuffle.bitshuffle, blocksize=1024) + result_blosc = _convert_compressor(mdio_compressor) + + assert isinstance(result_blosc, BloscCodec) + assert result_blosc.cname == BloscCname.lz4 assert result_blosc.clevel == 5 - assert result_blosc.shuffle == -1 # BloscShuffle.UTOSHUFFLE = -1 + assert result_blosc.shuffle == BloscShuffle.bitshuffle assert result_blosc.blocksize == 1024 - # Test 3: mdio_Blosc with blocksize 0 - should use 0 as blocksize - result_blosc_zero = _convert_compressor( - mdio_Blosc( - algorithm=mdio_BloscAlgorithm.ZSTD, - level=3, - shuffle=mdio_BloscShuffle.AUTOSHUFFLE, - blocksize=0, - ) - ) - assert isinstance(result_blosc_zero, nc_Blosc) - assert result_blosc_zero.blocksize == 0 - - # Test 4: mdio_ZFP compressor - should return zfpy_ZFPY if available + # Test 3: mdio_ZFP compressor - should return zfpy_ZFPY if available zfp_compressor = MDIO_ZFP(mode=mdio_ZFPMode.FIXED_RATE, tolerance=0.01, rate=8.0, precision=16) if HAS_ZFPY: @@ -308,15 +288,15 @@ def test_to_xarray_dataset(tmp_path: Path) -> None: .add_dimension("inline", 100) .add_dimension("crossline", 200) .add_dimension("depth", 300) - .add_coordinate("inline", dimensions=["inline"], data_type=ScalarType.FLOAT64) - .add_coordinate("crossline", dimensions=["crossline"], data_type=ScalarType.FLOAT64) - .add_coordinate("x_coord", dimensions=["inline", "crossline"], data_type=ScalarType.FLOAT32) - .add_coordinate("y_coord", dimensions=["inline", "crossline"], data_type=ScalarType.FLOAT32) + .add_coordinate("inline", dimensions=("inline",), data_type=ScalarType.FLOAT64) + .add_coordinate("crossline", dimensions=("crossline",), data_type=ScalarType.FLOAT64) + .add_coordinate("x_coord", dimensions=("inline", "crossline"), data_type=ScalarType.FLOAT32) + .add_coordinate("y_coord", dimensions=("inline", "crossline"), data_type=ScalarType.FLOAT32) .add_variable( "data", long_name="Test Data", - dimensions=["inline", "crossline", "depth"], - coordinates=["inline", "crossline", "x_coord", "y_coord"], + dimensions=("inline", "crossline", "depth"), + coordinates=("inline", "crossline", "x_coord", "y_coord"), data_type=ScalarType.FLOAT32, ) .build() @@ -325,7 +305,7 @@ def test_to_xarray_dataset(tmp_path: Path) -> None: xr_ds = to_xarray_dataset(dataset) file_path = output_path(tmp_path, f"{xr_ds.attrs['name']}", debugging=False) - xr_ds.to_zarr(store=file_path, mode="w", zarr_format=2, compute=False) + xr_ds.to_zarr(store=file_path, mode="w", compute=False) def test_seismic_poststack_3d_acceptance_to_xarray_dataset(tmp_path: Path) -> None: @@ -335,35 +315,7 @@ def test_seismic_poststack_3d_acceptance_to_xarray_dataset(tmp_path: Path) -> No xr_ds = to_xarray_dataset(dataset) file_path = output_path(tmp_path, f"{xr_ds.attrs['name']}", debugging=False) - xr_ds.to_zarr(store=file_path, mode="w", zarr_format=2, compute=False) - - -@pytest.mark.skip(reason="Bug reproducer for the issue 582") -def test_buf_reproducer_dask_to_zarr(tmp_path: Path) -> None: - """Bug reproducer for the issue https://github.com/TGSAI/mdio-python/issues/582.""" - # TODO(Dmitriy Repin): Remove this test after the bug is fixed - # https://github.com/TGSAI/mdio-python/issues/582 - - # Create a data type and the fill value - dtype = np_dtype([("inline", "int32"), ("cdp_x", "float64")]) - dtype_fill_value = np_zeros((), dtype=dtype) - - # Use '_FillValue' instead 
of 'fill_value' - # 'fill_value' is not a valid encoding key in Zarr v2 - my_attr_encoding = { - "_FillValue": dtype_fill_value, - "chunk_key_encoding": {"name": "v2", "separator": "/"}, - } - - # Create a dask array using the data type - # Do not specify encoding as the array attribute - data = dask_array.zeros((36,), dtype=dtype, chunks=(36,)) - aa = xr_DataArray(name="myattr", data=data) - - # Specify encoding per array - encoding = {"myattr": my_attr_encoding} - file_path = output_path(tmp_path, "to_zarr/zarr_dask", debugging=False) - aa.to_zarr(file_path, mode="w", zarr_format=2, encoding=encoding, compute=False) + to_mdio(xr_ds, output_path=file_path, mode="w-", compute=False) def test_to_zarr_from_zarr_zeros_1(tmp_path: Path) -> None: @@ -372,24 +324,18 @@ def test_to_zarr_from_zarr_zeros_1(tmp_path: Path) -> None: Set encoding in as DataArray attributes """ # Create a data type and the fill value - dtype = np_dtype([("inline", "int32"), ("cdp_x", "float64")]) - dtype_fill_value = np_zeros((), dtype=dtype) + dtype = np.dtype([("inline", "int32"), ("cdp_x", "float64")]) - # Use '_FillValue' instead of 'fill_value' - # 'fill_value' is not a valid encoding key in Zarr v2 - my_attr_encoding = { - "_FillValue": dtype_fill_value, - "chunk_key_encoding": {"name": "v2", "separator": "/"}, - } + my_attr_encoding = {"fill_value": np.void((0, 0), dtype=dtype)} # Create a zarr array using the data type, # Specify encoding as the array attribute - data = zarr_zeros((36, 36), dtype=dtype, zarr_format=2) + data = zarr_zeros((36, 36), dtype=dtype) aa = xr_DataArray(name="myattr", data=data) aa.encoding = my_attr_encoding file_path = output_path(tmp_path, "to_zarr/zarr_zarr_zerros_1", debugging=False) - aa.to_zarr(file_path, mode="w", zarr_format=2, compute=False) + aa.to_zarr(file_path, mode="w", compute=False) def test_to_zarr_from_zarr_zeros_2(tmp_path: Path) -> None: @@ -398,46 +344,34 @@ def test_to_zarr_from_zarr_zeros_2(tmp_path: Path) -> None: Set encoding in the to_zar method """ # Create a data type and the fill value - dtype = np_dtype([("inline", "int32"), ("cdp_x", "float64")]) - dtype_fill_value = np_zeros((), dtype=dtype) + dtype = np.dtype([("inline", "int32"), ("cdp_x", "float64")]) - # Use '_FillValue' instead of 'fill_value' - # 'fill_value' is not a valid encoding key in Zarr v2 - my_attr_encoding = { - "_FillValue": dtype_fill_value, - "chunk_key_encoding": {"name": "v2", "separator": "/"}, - } + my_attr_encoding = {"fill_value": np.void((0, 0), dtype=dtype)} # Create a zarr array using the data type, # Do not specify encoding as the array attribute - data = zarr_zeros((36, 36), dtype=dtype, zarr_format=2) + data = zarr_zeros((36, 36), dtype=dtype) aa = xr_DataArray(name="myattr", data=data) file_path = output_path(tmp_path, "to_zarr/zarr_zarr_zerros_2", debugging=False) # Specify encoding per array encoding = {"myattr": my_attr_encoding} - aa.to_zarr(file_path, mode="w", zarr_format=2, encoding=encoding, compute=False) + aa.to_zarr(file_path, mode="w", encoding=encoding, compute=False) def test_to_zarr_from_np(tmp_path: Path) -> None: """Test writing XArray dataset with data as NumPy array to Zarr.""" # Create a data type and the fill value - dtype = np_dtype([("inline", "int32"), ("cdp_x", "float64")]) - dtype_fill_value = np_zeros((), dtype=dtype) + dtype = np.dtype([("inline", "int32"), ("cdp_x", "float64")]) - # Use '_FillValue' instead of 'fill_value' - # 'fill_value' is not a valid encoding key in Zarr v2 - my_attr_encoding = { - "_FillValue": dtype_fill_value, - 
"chunk_key_encoding": {"name": "v2", "separator": "/"}, - } + my_attr_encoding = {"fill_value": np.void((0, 0), dtype=dtype)} # Create a zarr array using the data type # Do not specify encoding as the array attribute - data = np_zeros((36, 36), dtype=dtype) + data = np.zeros((36, 36), dtype=dtype) aa = xr_DataArray(name="myattr", data=data) file_path = output_path(tmp_path, "to_zarr/zarr_np", debugging=False) # Specify encoding per array encoding = {"myattr": my_attr_encoding} - aa.to_zarr(file_path, mode="w", zarr_format=2, encoding=encoding, compute=False) + aa.to_zarr(file_path, mode="w", encoding=encoding, compute=False) diff --git a/uv.lock b/uv.lock index c172a301c..27f05a006 100644 --- a/uv.lock +++ b/uv.lock @@ -38,7 +38,7 @@ wheels = [ [[package]] name = "aiobotocore" -version = "2.24.1" +version = "2.24.2" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "aiohttp" }, @@ -49,9 +49,9 @@ dependencies = [ { name = "python-dateutil" }, { name = "wrapt" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/1b/02/b4ed1af4b3437c2fc6e6111e7fdee011b34cf1c0cc8f314474f843e10019/aiobotocore-2.24.1.tar.gz", hash = "sha256:59237f1b2d4ff619f9a9e78360b691d59b92fdd4d03d054dbd2eeff8ada5667e", size = 119754, upload-time = "2025-08-15T15:49:53.209Z" } +sdist = { url = "https://files.pythonhosted.org/packages/05/93/9f5243c2fd2fc22cff92f8d8a7e98d3080171be60778d49aeabb555a463d/aiobotocore-2.24.2.tar.gz", hash = "sha256:dfb21bdb2610e8de4d22f401e91a24d50f1330a302d03c62c485757becd439a9", size = 119837, upload-time = "2025-09-05T12:13:46.963Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/20/26/c3c93209084e24990ad1b4214f67dce1c0183454cec9cd2cad9433f493bb/aiobotocore-2.24.1-py3-none-any.whl", hash = "sha256:557922823455ca65bbd065b363b54846f16b9c4b6bd0b61ecdfa01ca13a04531", size = 85216, upload-time = "2025-08-15T15:49:51.442Z" }, + { url = "https://files.pythonhosted.org/packages/87/03/2330062ac4ea9fa6447e02b0625f24efd6f05b6c44d61d86610b3555ee66/aiobotocore-2.24.2-py3-none-any.whl", hash = "sha256:808c63b2bd344b91e2f2acb874831118a9f53342d248acd16a68455a226e283a", size = 85441, upload-time = "2025-09-05T12:13:45.378Z" }, ] [[package]] @@ -316,16 +316,16 @@ wheels = [ [[package]] name = "botocore" -version = "1.39.11" +version = "1.40.18" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "jmespath" }, { name = "python-dateutil" }, { name = "urllib3" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/6d/d0/9d64261186cff650fe63168441edb4f4cd33f085a74c0c54455630a71f91/botocore-1.39.11.tar.gz", hash = "sha256:953b12909d6799350e346ab038e55b6efe622c616f80aef74d7a6683ffdd972c", size = 14217749, upload-time = "2025-07-22T19:26:40.723Z" } +sdist = { url = "https://files.pythonhosted.org/packages/6a/91/2e745382793fa7d30810a7d5ca3e05f6817b6db07601ca5aaab12720caf9/botocore-1.40.18.tar.gz", hash = "sha256:afd69bdadd8c55cc89d69de0799829e555193a352d87867f746e19020271cc0f", size = 14375007, upload-time = "2025-08-26T19:21:24.996Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/1c/2c/8a0b02d60a1dbbae7faa5af30484b016aa3023f9833dfc0d19b0b770dd6a/botocore-1.39.11-py3-none-any.whl", hash = "sha256:1545352931a8a186f3e977b1e1a4542d7d434796e274c3c62efd0210b5ea76dc", size = 13876276, upload-time = "2025-07-22T19:26:35.164Z" }, + { url = "https://files.pythonhosted.org/packages/1a/f5/bd57bf21fdcc4e500cc406ed2c296e626ddd160f0fee2a4932256e5d62d8/botocore-1.40.18-py3-none-any.whl", hash = 
"sha256:57025c46ca00cf8cec25de07a759521bfbfb3036a0f69b272654a354615dc45f", size = 14039935, upload-time = "2025-08-26T19:21:19.085Z" }, ] [[package]] @@ -1684,6 +1684,7 @@ dependencies = [ { name = "rich" }, { name = "segy" }, { name = "tqdm" }, + { name = "universal-pathlib" }, { name = "xarray" }, { name = "zarr" }, ] @@ -1744,6 +1745,7 @@ requires-dist = [ { name = "s3fs", marker = "extra == 'cloud'", specifier = ">=2025.7.0" }, { name = "segy", specifier = ">=0.4.2,<0.5.0" }, { name = "tqdm", specifier = ">=4.67.1,<5.0.0" }, + { name = "universal-pathlib", specifier = ">=0.2.6" }, { name = "xarray", specifier = ">=2025.7.1" }, { name = "zarr", specifier = ">=3.1.2,<4.0.0" }, { name = "zfpy", marker = "extra == 'lossy'", specifier = ">=1.0.1,<2.0.0" }, @@ -2535,7 +2537,7 @@ crypto = [ [[package]] name = "pytest" -version = "8.4.1" +version = "8.4.2" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "colorama", marker = "sys_platform == 'win32'" }, @@ -2544,9 +2546,9 @@ dependencies = [ { name = "pluggy" }, { name = "pygments" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/08/ba/45911d754e8eba3d5a841a5ce61a65a685ff1798421ac054f85aa8747dfb/pytest-8.4.1.tar.gz", hash = "sha256:7c67fd69174877359ed9371ec3af8a3d2b04741818c51e5e99cc1742251fa93c", size = 1517714, upload-time = "2025-06-18T05:48:06.109Z" } +sdist = { url = "https://files.pythonhosted.org/packages/a3/5c/00a0e072241553e1a7496d638deababa67c5058571567b92a7eaa258397c/pytest-8.4.2.tar.gz", hash = "sha256:86c0d0b93306b961d58d62a4db4879f27fe25513d4b969df351abdddb3c30e01", size = 1519618, upload-time = "2025-09-04T14:34:22.711Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/29/16/c8a903f4c4dffe7a12843191437d7cd8e32751d5de349d45d3fe69544e87/pytest-8.4.1-py3-none-any.whl", hash = "sha256:539c70ba6fcead8e78eebbf1115e8b589e7565830d7d006a8723f19ac8a0afb7", size = 365474, upload-time = "2025-06-18T05:48:03.955Z" }, + { url = "https://files.pythonhosted.org/packages/a8/a4/20da314d277121d6534b3a980b29035dcd51e6744bd79075a6ce8fa4eb8d/pytest-8.4.2-py3-none-any.whl", hash = "sha256:872f880de3fc3a5bdc88a11b39c9710c3497a547cfa9320bc3c5e62fbf272e79", size = 365750, upload-time = "2025-09-04T14:34:20.226Z" }, ] [[package]] @@ -2953,28 +2955,28 @@ wheels = [ [[package]] name = "ruff" -version = "0.12.11" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/de/55/16ab6a7d88d93001e1ae4c34cbdcfb376652d761799459ff27c1dc20f6fa/ruff-0.12.11.tar.gz", hash = "sha256:c6b09ae8426a65bbee5425b9d0b82796dbb07cb1af045743c79bfb163001165d", size = 5347103, upload-time = "2025-08-28T13:59:08.87Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/d6/a2/3b3573e474de39a7a475f3fbaf36a25600bfeb238e1a90392799163b64a0/ruff-0.12.11-py3-none-linux_armv6l.whl", hash = "sha256:93fce71e1cac3a8bf9200e63a38ac5c078f3b6baebffb74ba5274fb2ab276065", size = 11979885, upload-time = "2025-08-28T13:58:26.654Z" }, - { url = "https://files.pythonhosted.org/packages/76/e4/235ad6d1785a2012d3ded2350fd9bc5c5af8c6f56820e696b0118dfe7d24/ruff-0.12.11-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:b8e33ac7b28c772440afa80cebb972ffd823621ded90404f29e5ab6d1e2d4b93", size = 12742364, upload-time = "2025-08-28T13:58:30.256Z" }, - { url = "https://files.pythonhosted.org/packages/2c/0d/15b72c5fe6b1e402a543aa9d8960e0a7e19dfb079f5b0b424db48b7febab/ruff-0.12.11-py3-none-macosx_11_0_arm64.whl", hash = "sha256:d69fb9d4937aa19adb2e9f058bc4fbfe986c2040acb1a4a9747734834eaa0bfd", size = 
11920111, upload-time = "2025-08-28T13:58:33.677Z" }, - { url = "https://files.pythonhosted.org/packages/3e/c0/f66339d7893798ad3e17fa5a1e587d6fd9806f7c1c062b63f8b09dda6702/ruff-0.12.11-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:411954eca8464595077a93e580e2918d0a01a19317af0a72132283e28ae21bee", size = 12160060, upload-time = "2025-08-28T13:58:35.74Z" }, - { url = "https://files.pythonhosted.org/packages/03/69/9870368326db26f20c946205fb2d0008988aea552dbaec35fbacbb46efaa/ruff-0.12.11-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:6a2c0a2e1a450f387bf2c6237c727dd22191ae8c00e448e0672d624b2bbd7fb0", size = 11799848, upload-time = "2025-08-28T13:58:38.051Z" }, - { url = "https://files.pythonhosted.org/packages/25/8c/dd2c7f990e9b3a8a55eee09d4e675027d31727ce33cdb29eab32d025bdc9/ruff-0.12.11-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8ca4c3a7f937725fd2413c0e884b5248a19369ab9bdd850b5781348ba283f644", size = 13536288, upload-time = "2025-08-28T13:58:40.046Z" }, - { url = "https://files.pythonhosted.org/packages/7a/30/d5496fa09aba59b5e01ea76775a4c8897b13055884f56f1c35a4194c2297/ruff-0.12.11-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:4d1df0098124006f6a66ecf3581a7f7e754c4df7644b2e6704cd7ca80ff95211", size = 14490633, upload-time = "2025-08-28T13:58:42.285Z" }, - { url = "https://files.pythonhosted.org/packages/9b/2f/81f998180ad53445d403c386549d6946d0748e536d58fce5b5e173511183/ruff-0.12.11-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5a8dd5f230efc99a24ace3b77e3555d3fbc0343aeed3fc84c8d89e75ab2ff793", size = 13888430, upload-time = "2025-08-28T13:58:44.641Z" }, - { url = "https://files.pythonhosted.org/packages/87/71/23a0d1d5892a377478c61dbbcffe82a3476b050f38b5162171942a029ef3/ruff-0.12.11-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4dc75533039d0ed04cd33fb8ca9ac9620b99672fe7ff1533b6402206901c34ee", size = 12913133, upload-time = "2025-08-28T13:58:47.039Z" }, - { url = "https://files.pythonhosted.org/packages/80/22/3c6cef96627f89b344c933781ed38329bfb87737aa438f15da95907cbfd5/ruff-0.12.11-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4fc58f9266d62c6eccc75261a665f26b4ef64840887fc6cbc552ce5b29f96cc8", size = 13169082, upload-time = "2025-08-28T13:58:49.157Z" }, - { url = "https://files.pythonhosted.org/packages/05/b5/68b3ff96160d8b49e8dd10785ff3186be18fd650d356036a3770386e6c7f/ruff-0.12.11-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:5a0113bd6eafd545146440225fe60b4e9489f59eb5f5f107acd715ba5f0b3d2f", size = 13139490, upload-time = "2025-08-28T13:58:51.593Z" }, - { url = "https://files.pythonhosted.org/packages/59/b9/050a3278ecd558f74f7ee016fbdf10591d50119df8d5f5da45a22c6afafc/ruff-0.12.11-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:0d737b4059d66295c3ea5720e6efc152623bb83fde5444209b69cd33a53e2000", size = 11958928, upload-time = "2025-08-28T13:58:53.943Z" }, - { url = "https://files.pythonhosted.org/packages/f9/bc/93be37347db854806904a43b0493af8d6873472dfb4b4b8cbb27786eb651/ruff-0.12.11-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:916fc5defee32dbc1fc1650b576a8fed68f5e8256e2180d4d9855aea43d6aab2", size = 11764513, upload-time = "2025-08-28T13:58:55.976Z" }, - { url = "https://files.pythonhosted.org/packages/7a/a1/1471751e2015a81fd8e166cd311456c11df74c7e8769d4aabfbc7584c7ac/ruff-0.12.11-py3-none-musllinux_1_2_i686.whl", hash = "sha256:c984f07d7adb42d3ded5be894fb4007f30f82c87559438b4879fe7aa08c62b39", 
size = 12745154, upload-time = "2025-08-28T13:58:58.16Z" }, - { url = "https://files.pythonhosted.org/packages/68/ab/2542b14890d0f4872dd81b7b2a6aed3ac1786fae1ce9b17e11e6df9e31e3/ruff-0.12.11-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:e07fbb89f2e9249f219d88331c833860489b49cdf4b032b8e4432e9b13e8a4b9", size = 13227653, upload-time = "2025-08-28T13:59:00.276Z" }, - { url = "https://files.pythonhosted.org/packages/22/16/2fbfc61047dbfd009c58a28369a693a1484ad15441723be1cd7fe69bb679/ruff-0.12.11-py3-none-win32.whl", hash = "sha256:c792e8f597c9c756e9bcd4d87cf407a00b60af77078c96f7b6366ea2ce9ba9d3", size = 11944270, upload-time = "2025-08-28T13:59:02.347Z" }, - { url = "https://files.pythonhosted.org/packages/08/a5/34276984705bfe069cd383101c45077ee029c3fe3b28225bf67aa35f0647/ruff-0.12.11-py3-none-win_amd64.whl", hash = "sha256:a3283325960307915b6deb3576b96919ee89432ebd9c48771ca12ee8afe4a0fd", size = 13046600, upload-time = "2025-08-28T13:59:04.751Z" }, - { url = "https://files.pythonhosted.org/packages/84/a8/001d4a7c2b37623a3fd7463208267fb906df40ff31db496157549cfd6e72/ruff-0.12.11-py3-none-win_arm64.whl", hash = "sha256:bae4d6e6a2676f8fb0f98b74594a048bae1b944aab17e9f5d504062303c6dbea", size = 12135290, upload-time = "2025-08-28T13:59:06.933Z" }, +version = "0.12.12" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/a8/f0/e0965dd709b8cabe6356811c0ee8c096806bb57d20b5019eb4e48a117410/ruff-0.12.12.tar.gz", hash = "sha256:b86cd3415dbe31b3b46a71c598f4c4b2f550346d1ccf6326b347cc0c8fd063d6", size = 5359915, upload-time = "2025-09-04T16:50:18.273Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/09/79/8d3d687224d88367b51c7974cec1040c4b015772bfbeffac95face14c04a/ruff-0.12.12-py3-none-linux_armv6l.whl", hash = "sha256:de1c4b916d98ab289818e55ce481e2cacfaad7710b01d1f990c497edf217dafc", size = 12116602, upload-time = "2025-09-04T16:49:18.892Z" }, + { url = "https://files.pythonhosted.org/packages/c3/c3/6e599657fe192462f94861a09aae935b869aea8a1da07f47d6eae471397c/ruff-0.12.12-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:7acd6045e87fac75a0b0cdedacf9ab3e1ad9d929d149785903cff9bb69ad9727", size = 12868393, upload-time = "2025-09-04T16:49:23.043Z" }, + { url = "https://files.pythonhosted.org/packages/e8/d2/9e3e40d399abc95336b1843f52fc0daaceb672d0e3c9290a28ff1a96f79d/ruff-0.12.12-py3-none-macosx_11_0_arm64.whl", hash = "sha256:abf4073688d7d6da16611f2f126be86523a8ec4343d15d276c614bda8ec44edb", size = 12036967, upload-time = "2025-09-04T16:49:26.04Z" }, + { url = "https://files.pythonhosted.org/packages/e9/03/6816b2ed08836be272e87107d905f0908be5b4a40c14bfc91043e76631b8/ruff-0.12.12-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:968e77094b1d7a576992ac078557d1439df678a34c6fe02fd979f973af167577", size = 12276038, upload-time = "2025-09-04T16:49:29.056Z" }, + { url = "https://files.pythonhosted.org/packages/9f/d5/707b92a61310edf358a389477eabd8af68f375c0ef858194be97ca5b6069/ruff-0.12.12-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:42a67d16e5b1ffc6d21c5f67851e0e769517fb57a8ebad1d0781b30888aa704e", size = 11901110, upload-time = "2025-09-04T16:49:32.07Z" }, + { url = "https://files.pythonhosted.org/packages/9d/3d/f8b1038f4b9822e26ec3d5b49cf2bc313e3c1564cceb4c1a42820bf74853/ruff-0.12.12-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b216ec0a0674e4b1214dcc998a5088e54eaf39417327b19ffefba1c4a1e4971e", size = 13668352, upload-time = "2025-09-04T16:49:35.148Z" }, + { url = 
"https://files.pythonhosted.org/packages/98/0e/91421368ae6c4f3765dd41a150f760c5f725516028a6be30e58255e3c668/ruff-0.12.12-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:59f909c0fdd8f1dcdbfed0b9569b8bf428cf144bec87d9de298dcd4723f5bee8", size = 14638365, upload-time = "2025-09-04T16:49:38.892Z" }, + { url = "https://files.pythonhosted.org/packages/74/5d/88f3f06a142f58ecc8ecb0c2fe0b82343e2a2b04dcd098809f717cf74b6c/ruff-0.12.12-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9ac93d87047e765336f0c18eacad51dad0c1c33c9df7484c40f98e1d773876f5", size = 14060812, upload-time = "2025-09-04T16:49:42.732Z" }, + { url = "https://files.pythonhosted.org/packages/13/fc/8962e7ddd2e81863d5c92400820f650b86f97ff919c59836fbc4c1a6d84c/ruff-0.12.12-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:01543c137fd3650d322922e8b14cc133b8ea734617c4891c5a9fccf4bfc9aa92", size = 13050208, upload-time = "2025-09-04T16:49:46.434Z" }, + { url = "https://files.pythonhosted.org/packages/53/06/8deb52d48a9a624fd37390555d9589e719eac568c020b27e96eed671f25f/ruff-0.12.12-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2afc2fa864197634e549d87fb1e7b6feb01df0a80fd510d6489e1ce8c0b1cc45", size = 13311444, upload-time = "2025-09-04T16:49:49.931Z" }, + { url = "https://files.pythonhosted.org/packages/2a/81/de5a29af7eb8f341f8140867ffb93f82e4fde7256dadee79016ac87c2716/ruff-0.12.12-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:0c0945246f5ad776cb8925e36af2438e66188d2b57d9cf2eed2c382c58b371e5", size = 13279474, upload-time = "2025-09-04T16:49:53.465Z" }, + { url = "https://files.pythonhosted.org/packages/7f/14/d9577fdeaf791737ada1b4f5c6b59c21c3326f3f683229096cccd7674e0c/ruff-0.12.12-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:a0fbafe8c58e37aae28b84a80ba1817f2ea552e9450156018a478bf1fa80f4e4", size = 12070204, upload-time = "2025-09-04T16:49:56.882Z" }, + { url = "https://files.pythonhosted.org/packages/77/04/a910078284b47fad54506dc0af13839c418ff704e341c176f64e1127e461/ruff-0.12.12-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:b9c456fb2fc8e1282affa932c9e40f5ec31ec9cbb66751a316bd131273b57c23", size = 11880347, upload-time = "2025-09-04T16:49:59.729Z" }, + { url = "https://files.pythonhosted.org/packages/df/58/30185fcb0e89f05e7ea82e5817b47798f7fa7179863f9d9ba6fd4fe1b098/ruff-0.12.12-py3-none-musllinux_1_2_i686.whl", hash = "sha256:5f12856123b0ad0147d90b3961f5c90e7427f9acd4b40050705499c98983f489", size = 12891844, upload-time = "2025-09-04T16:50:02.591Z" }, + { url = "https://files.pythonhosted.org/packages/21/9c/28a8dacce4855e6703dcb8cdf6c1705d0b23dd01d60150786cd55aa93b16/ruff-0.12.12-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:26a1b5a2bf7dd2c47e3b46d077cd9c0fc3b93e6c6cc9ed750bd312ae9dc302ee", size = 13360687, upload-time = "2025-09-04T16:50:05.8Z" }, + { url = "https://files.pythonhosted.org/packages/c8/fa/05b6428a008e60f79546c943e54068316f32ec8ab5c4f73e4563934fbdc7/ruff-0.12.12-py3-none-win32.whl", hash = "sha256:173be2bfc142af07a01e3a759aba6f7791aa47acf3604f610b1c36db888df7b1", size = 12052870, upload-time = "2025-09-04T16:50:09.121Z" }, + { url = "https://files.pythonhosted.org/packages/85/60/d1e335417804df452589271818749d061b22772b87efda88354cf35cdb7a/ruff-0.12.12-py3-none-win_amd64.whl", hash = "sha256:e99620bf01884e5f38611934c09dd194eb665b0109104acae3ba6102b600fd0d", size = 13178016, upload-time = "2025-09-04T16:50:12.559Z" }, + { url = 
"https://files.pythonhosted.org/packages/28/7e/61c42657f6e4614a4258f1c3b0c5b93adc4d1f8575f5229d1906b483099b/ruff-0.12.12-py3-none-win_arm64.whl", hash = "sha256:2a8199cab4ce4d72d158319b63370abf60991495fb733db96cd923a34c52d093", size = 12256762, upload-time = "2025-09-04T16:50:15.737Z" }, ] [[package]] @@ -3458,6 +3460,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/37/87/1f677586e8ac487e29672e4b17455758fce261de06a0d086167bb760361a/uc_micro_py-1.0.3-py3-none-any.whl", hash = "sha256:db1dffff340817673d7b466ec86114a9dc0e9d4d9b5ba229d9d60e5c12600cd5", size = 6229, upload-time = "2024-02-09T16:52:00.371Z" }, ] +[[package]] +name = "universal-pathlib" +version = "0.2.6" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "fsspec" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/eb/21/dd871495af3933e585261adce42678dcdf1168c9d6fa0a8f7b6565e54472/universal_pathlib-0.2.6.tar.gz", hash = "sha256:50817aaeaa9f4163cb1e76f5bdf84207fa05ce728b23fd779479b3462e5430ac", size = 175427, upload-time = "2024-12-13T00:58:27.514Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e5/4d/2e577f6db7aa0f932d19f799c18f604b2b302c65f733419b900ec07dbade/universal_pathlib-0.2.6-py3-none-any.whl", hash = "sha256:700dec2b58ef34b87998513de6d2ae153b22f083197dfafb8544744edabd1b18", size = 50087, upload-time = "2024-12-13T00:58:24.582Z" }, +] + [[package]] name = "urllib3" version = "2.5.0" @@ -3659,16 +3673,16 @@ wheels = [ [[package]] name = "xarray" -version = "2025.8.0" +version = "2025.9.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "numpy" }, { name = "packaging" }, { name = "pandas" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/d2/55/18055bc943029d25fb8f260b7e3b1485c30646ccf503a5e4a744d31a3b78/xarray-2025.8.0.tar.gz", hash = "sha256:323d4169ce72d4ef849de2b0bd122f9cd2905b82c7558169930dc16070982bab", size = 3034425, upload-time = "2025-08-14T16:52:13.872Z" } +sdist = { url = "https://files.pythonhosted.org/packages/4e/0b/bbb76e05c8e2099baf90e259c29cafe6a525524b1d1da8bfbc39577c043e/xarray-2025.9.0.tar.gz", hash = "sha256:7dd6816fe0062c49c5e9370dd483843bc13e5ed80a47a9ff10baff2b51e070fb", size = 3040318, upload-time = "2025-09-04T04:20:26.296Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/e6/c8/0f8db9d9478de8d70cbcae2056588401e26168e269d6d9919bf2ecb01f78/xarray-2025.8.0-py3-none-any.whl", hash = "sha256:1c454f32b38c93df68e450238c9473fe21248b8572d42ddd58c5170bb30934ee", size = 1342279, upload-time = "2025-08-14T16:52:10.956Z" }, + { url = "https://files.pythonhosted.org/packages/8d/f0/73c24457c941b8b08f7d090853e40f4b2cdde88b5da721f3f28e98df77c9/xarray-2025.9.0-py3-none-any.whl", hash = "sha256:79f0e25fb39571f612526ee998ee5404d8725a1db3951aabffdb287388885df0", size = 1349595, upload-time = "2025-09-04T04:20:24.36Z" }, ] [[package]]