TGSAI · tasansal · Aug 8, 2025 · Jul 17, 2025 · Jul 17, 2025 · Jul 17, 2025
diff --git a/src/mdio/api/convenience.py b/src/mdio/api/convenience.py
@@ -79,7 +79,11 @@ def copy_mdio(  # noqa: PLR0913
 
         writer.live_mask[:] = reader.live_mask[:]
 
-        iterator = ChunkIterator(reader._traces, chunk_samples=False)
+        shape = reader._traces.shape
+        chunks = reader._traces.chunks
+        chunks = chunks[:-1] + (shape[-1],)  # don't chunk samples
+
+        iterator = ChunkIterator(shape=shape, chunks=chunks)
         progress = tqdm(iterator, unit="block")
         progress.set_description(desc=f"Copying data for '{access_pattern=}'")
         for slice_ in progress:
@@ -177,7 +181,10 @@ def create_rechunk_plan(
 
     n_dimension = len(data_array.shape)
     dummy_array = zarr.empty(shape=data_array.shape, chunks=(MAX_BUFFER,) * n_dimension)
-    iterator = ChunkIterator(dummy_array)
+
+    shape = dummy_array.shape
+    chunks = dummy_array.chunks
+    iterator = ChunkIterator(shape=shape, chunks=chunks)
 
     return metadata_arrs, data_arrs, live_mask, iterator
 

diff --git a/src/mdio/converters/numpy.py b/src/mdio/converters/numpy.py
@@ -7,7 +7,6 @@
 import numpy as np
 
 from mdio.api.accessor import MDIOWriter
-from mdio.converters.segy import get_compressor
 from mdio.core.dimension import Dimension
 from mdio.core.factory import MDIOCreateConfig
 from mdio.core.factory import MDIOVariableConfig
@@ -137,6 +136,11 @@ def numpy_to_mdio(  # noqa: PLR0913
     suffix = [str(idx) for idx, value in enumerate(suffix) if value is not None]
     suffix = "".join(suffix)
 
+    # TODO(Dmitriy Repin): Implement Numpy converter in MDIO v1
+    # https://github.com/TGSAI/mdio-python/issues/596
+    def get_compressor(lossless: bool, tolerance: float) -> list[str]:
+        pass
+
     compressors = get_compressor(lossless, compression_tolerance)
     mdio_var = MDIOVariableConfig(
         name=f"chunked_{suffix}",

diff --git a/src/mdio/converters/segy.py b/src/mdio/converters/segy.py
diff --git a/src/mdio/converters/type_converter.py b/src/mdio/converters/type_converter.py
@@ -0,0 +1,85 @@
+"""A module for converting numpy dtypes to MDIO scalar and structured types."""
+
+from numpy import dtype as np_dtype
+
+from mdio.schemas.dtype import ScalarType
+from mdio.schemas.dtype import StructuredField
+from mdio.schemas.dtype import StructuredType
+
+
+def to_scalar_type(data_type: np_dtype) -> ScalarType:
+    """Translate a numpy scalar dtype into the matching MDIO ScalarType.
+
+    The match is made on the dtype's name, so of the 24 built-in numpy scalar types
+    (see https://numpy.org/doc/stable/reference/arrays.dtypes.html) only these convert:
+        int8 <-> ScalarType.INT8
+        int16 <-> ScalarType.INT16
+        int32 <-> ScalarType.INT32
+        int64 <-> ScalarType.INT64
+        uint8 <-> ScalarType.UINT8
+        uint16 <-> ScalarType.UINT16
+        uint32 <-> ScalarType.UINT32
+        uint64 <-> ScalarType.UINT64
+        float32 <-> ScalarType.FLOAT32
+        float64 <-> ScalarType.FLOAT64
+        complex64 <-> ScalarType.COMPLEX64
+        complex128 <-> ScalarType.COMPLEX128
+        bool <-> ScalarType.BOOL
+
+    Args:
+        data_type: numpy dtype to translate
+
+    Returns:
+        ScalarType: MDIO scalar type looked up from the dtype's name
+
+    Raises:
+        ValueError: if the dtype's name is not accepted by ScalarType
+    """
+    dtype_name = data_type.name
+    try:
+        return ScalarType(dtype_name)
+    except ValueError as exc:
+        err = f"Unsupported numpy dtype '{dtype_name}' for conversion to ScalarType."
+        raise ValueError(err) from exc
+
+
+def to_structured_type(data_type: np_dtype) -> StructuredType:
+    """Build an MDIO StructuredType from a numpy structured dtype.
+
+    Every named field of the dtype becomes one StructuredField, in the order of
+    `data_type.names`, with its format produced by `to_scalar_type`. As a consequence
+    only a limited subset of structured dtypes is supported:
+    Nested structured types are not supported.
+    Field dtypes must be one of the scalar types `to_scalar_type` accepts.
+
+    Args:
+        data_type: numpy structured dtype to convert
+
+    Returns:
+        StructuredType: MDIO structured type holding the converted fields
+
+    Raises:
+        ValueError: if the dtype is None or has no named fields, or if a field's dtype
+            is rejected by `to_scalar_type`
+    """
+    if data_type is None or not data_type.names:
+        err = "None or empty dtype provided, cannot convert to StructuredType."
+        raise ValueError(err)
+
+    # numpy maps each field name to a tuple whose first item is the field's dtype.
+    field_table = data_type.fields
+    converted = [
+        StructuredField(name=name, format=to_scalar_type(field_table[name][0]))
+        for name in data_type.names
+    ]
+    return StructuredType(fields=converted)
+
+
+def to_numpy_dtype(data_type: ScalarType | StructuredType) -> np_dtype:
+    """Convert an MDIO ScalarType or StructuredType into the equivalent numpy dtype."""
+    if isinstance(data_type, ScalarType):
+        return np_dtype(data_type.value)
+    if not isinstance(data_type, StructuredType):
+        msg = f"Expected ScalarType or StructuredType, got '{type(data_type).__name__}'"
+        raise ValueError(msg)
+    return np_dtype([(field.name, field.format.value) for field in data_type.fields])
diff --git a/src/mdio/core/indexing.py b/src/mdio/core/indexing.py
@@ -4,78 +4,83 @@
 from math import ceil
 
 import numpy as np
-from zarr import Array
 
 
 class ChunkIterator:
-    """Iterator for traversing a Zarr array in chunks.
+    """Chunk iterator for multi-dimensional arrays.
 
-    This iterator yields tuples of slices corresponding to the chunk boundaries of a Zarr array.
-    It supports chunking all dimensions or taking the full extent of the last dimension.
+    This iterator takes an array shape and chunks and every time it is iterated, it returns
+    a dictionary (if dimensions are provided) or a tuple of slices that align with
+    chunk boundaries. When dimensions are provided, they are used as the dictionary keys.
 
     Args:
-        array: The Zarr array to iterate, providing shape and chunk sizes.
-        chunk_samples: If True, chunks all dimensions. If False, takes the full extent of the
-            last dimension. Defaults to True.
-
-
-    Example:
-        >>> import zarr
-        >>> arr = zarr.array(np.zeros((10, 20)), chunks=(3, 4))
-        >>> it = ChunkIterator(arr)
-        >>> for slices in it:
-        ...     print(slices)
-        (slice(0, 3, None), slice(0, 4, None))
-        (slice(0, 3, None), slice(4, 8, None))
-        ...
-        >>> it = ChunkIterator(arr, chunk_samples=False)
-        >>> for slices in it:
-        ...     print(slices)
-        (slice(0, 3, None), slice(0, 20, None))
-        (slice(3, 6, None), slice(0, 20, None))
-        ...
+        shape: The shape of the array.
+        chunks: The chunk sizes for each dimension.
+        dim_names: The names of the array dimensions, to be used with DataArray.isel().
+                   If the dim_names are not provided, a tuple of the slices will be returned.
+
+    Attributes:             # noqa: DOC602
+        arr_shape: Shape of the array.
+        len_chunks: Length of chunks in each dimension.
+        dim_chunks: Number of chunks in each dimension.
+        num_chunks: Total number of chunks.
+
+    Examples:
+        >> chunks = (3, 4, 5)
+        >> shape = (5, 11, 19)
+        >> dims = ["inline", "crossline", "depth"]
+        >>
+        >> iter = ChunkIterator(shape=shape, chunks=chunks, dim_names=dims)
+        >> for i in range(13):
+        >>    region = iter.__next__()
+        >> print(region)
+        { "inline": slice(3,6, None), "crossline": slice(0,4, None), "depth": slice(0,5, None) }
+
+        >> iter = ChunkIterator(shape=shape, chunks=chunks, dim_names=None)
+        >> for i in range(13):
+        >>    region = iter.__next__()
+        >> print(region)
+        (slice(3,6,None), slice(0,4,None), slice(0,5,None))
     """
 
-    def __init__(self, array: Array, chunk_samples: bool = True):
-        self.arr_shape = array.shape
-        self.len_chunks = array.chunks
-
-        # If chunk_samples is False, set the last dimension's chunk size to its full extent
-        if not chunk_samples:
-            self.len_chunks = self.len_chunks[:-1] + (self.arr_shape[-1],)
-
-        # Calculate the number of chunks per dimension
-        self.dim_chunks = [
-            ceil(len_dim / chunk)
-            for len_dim, chunk in zip(self.arr_shape, self.len_chunks, strict=True)
-        ]
+    def __init__(
+        self, shape: tuple[int, ...], chunks: tuple[int, ...], dim_names: tuple[str, ...] = None
+    ):
+        self.arr_shape = tuple(shape)  # Deep copy to ensure immutability
+        self.len_chunks = tuple(chunks)  # Deep copy to ensure immutability
+        self.dims = dim_names
+
+        # Compute number of chunks per dimension, and total number of chunks
+        self.dim_chunks = tuple(
+            [
+                ceil(len_dim / chunk)
+                for len_dim, chunk in zip(self.arr_shape, self.len_chunks, strict=True)
+            ]
+        )
         self.num_chunks = np.prod(self.dim_chunks)
 
-        # Set up chunk index combinations using ranges for each dimension
+        # Under the hood stuff for the iterator. This generates C-ordered
+        # permutation of chunk indices.
         dim_ranges = [range(dim_len) for dim_len in self.dim_chunks]
         self._ranges = itertools.product(*dim_ranges)
         self._idx = 0
 
     def __iter__(self) -> "ChunkIterator":
-        """Return the iterator object itself."""
+        """Iteration context."""
         return self
 
     def __len__(self) -> int:
-        """Return the total number of chunks."""
+        """Get total number of chunks."""
         return self.num_chunks
 
-    def __next__(self) -> tuple[slice, ...]:
-        """Yield the next set of chunk slices.
-
-        Returns:
-            A tuple of slice objects for each dimension.
-
-        Raises:
-            StopIteration: When all chunks have been iterated over.
-        """
-        if self._idx < self.num_chunks:
+    def __next__(self) -> dict[str, slice] | tuple[slice, ...]:
+        """Return the next chunk region; raise StopIteration once every chunk was yielded."""
+        if self._idx < self.num_chunks:
+            # We build slices here. It is dimension agnostic
             current_start = next(self._ranges)
 
+            # TODO (Dmitriy Repin): Enhance ChunkIterator to make the last slice, if needed, smaller
+            # https://github.com/TGSAI/mdio-python/issues/586
             start_indices = tuple(
                 dim * chunk for dim, chunk in zip(current_start, self.len_chunks, strict=True)
             )
@@ -88,7 +93,17 @@ def __next__(self) -> tuple[slice, ...]:
                 slice(start, stop) for start, stop in zip(start_indices, stop_indices, strict=True)
             )
 
+            if self.dims:  # noqa SIM108
+                # Example
+                # {"inline":slice(3,6,None), "crossline":slice(0,4,None), "depth":slice(0,5,None)}
+                result = dict(zip(self.dims, slices, strict=False))
+            else:
+                # Example
+                # (slice(3,6,None), slice(0,4,None), slice(0,5,None))
+                result = slices
+
             self._idx += 1
-            return slices
+
+            return result
 
         raise StopIteration
diff --git a/src/mdio/core/storage_location.py b/src/mdio/core/storage_location.py
@@ -0,0 +1,87 @@
+"""StorageLocation class for managing local and cloud storage locations."""
+
+from pathlib import Path
+from typing import Any
+
+import fsspec
+
+
+# TODO(Dmitriy Repin): Reuse fsspec functions for some methods we implemented here
+# https://github.com/TGSAI/mdio-python/issues/597
+class StorageLocation:
+    """A local or cloud storage location for SEG-Y or MDIO files.
+
+    Accepts plain local paths, 'file://' URIs and remote URIs (e.g., S3, GCS). Local paths are
+    stored absolute and resolved; any other 'scheme://' URI is stored unchanged. fsspec is used
+    to check existence. This is deliberately not a dataclass because the uri and the options
+    must be read-only properties.
+
+    Args:
+        uri: The URI of the storage location (e.g., '/path/to/file', 'file:///path/to/file',
+            's3://bucket/path', 'gs://bucket/path').
+        options: Optional dictionary of fsspec filesystem options, such as credentials.
+    """
+
+    def __init__(self, uri: str = "", options: dict[str, Any] | None = None):
+        self._uri = uri
+        self._options = dict(options or {})  # own copy: later caller mutations must not leak in
+        self._fs = None  # fsspec filesystem, created on first use by `_filesystem`
+
+        if uri.startswith("file://"):
+            self._uri = uri.removeprefix("file://")
+        elif "://" in uri:
+            # Every other scheme (s3, gs, az, https, ...) is remote and kept verbatim;
+            # Path.resolve() would mangle it into a bogus local path.
+            return
+        # For local paths, ensure they are absolute and resolved
+        self._uri = str(Path(self._uri).resolve())
+
+    @property
+    def uri(self) -> str:
+        """Get the URI (read-only)."""
+        return self._uri
+
+    @property
+    def options(self) -> dict[str, Any]:
+        """Get the options (read-only)."""
+        # Return a copy to prevent external modification
+        return self._options.copy()
+
+    @property
+    def _filesystem(self) -> fsspec.AbstractFileSystem:
+        """Get the fsspec filesystem instance for this storage location."""
+        if self._fs is None:
+            self._fs = fsspec.filesystem(self._protocol, **self._options)
+        return self._fs
+
+    @property
+    def _path(self) -> str:
+        """Extract the path portion from the URI."""
+        if "://" in self._uri:
+            return self._uri.split("://", 1)[1]
+        return self._uri  # For local paths without file:// prefix
+
+    @property
+    def _protocol(self) -> str:
+        """Extract the protocol/scheme from the URI."""
+        if "://" in self._uri:
+            return self._uri.split("://", 1)[0]
+        return "file"  # Default to file protocol
+
+    def exists(self) -> bool:
+        """Check if the storage location exists using fsspec."""
+        try:
+            return self._filesystem.exists(self._path)
+        except Exception as e:
+            # Log the error and return False for safety
+            # In a production environment, you might want to use proper logging
+            print(f"Error checking existence of {self._uri}: {e}")
+            return False
+
+    def __str__(self) -> str:
+        """String representation of the storage location."""
+        return self._uri
+
+    def __repr__(self) -> str:
+        """Developer representation of the storage location."""
+        return f"StorageLocation(uri='{self._uri}', options={self._options})"