Merged
Changes from 1 commit
Commits
88 commits
8eeddea
update project metadata and deps
tasansal Apr 29, 2025
293a714
update project metadata and deps
tasansal Apr 29, 2025
faacbaa
add schemas
tasansal Apr 29, 2025
1a1e743
Relocate quickstart notebook to tutorials directory
tasansal Apr 29, 2025
3c8ea4d
update docs dependencies
tasansal Apr 29, 2025
53ecaa9
add new docs
tasansal Apr 29, 2025
da09c4b
remove incorrect exclude
tasansal Apr 29, 2025
0d59c95
remove duplicate doc directive
tasansal Apr 29, 2025
793f3f1
fix creation notebook
tasansal Apr 29, 2025
0e64e09
Add basic unit test for v1 dataset schema validation
tasansal Apr 29, 2025
a1a312e
update lockfile
tasansal Apr 30, 2025
dfe7751
fix broken creation nb
tasansal May 7, 2025
f6b2c34
update lockfile
tasansal May 7, 2025
dbec58e
lint v1 files
tasansal May 7, 2025
ecb69d6
update lock file
tasansal May 27, 2025
9dd9fbc
schema_v1-dataset_builder-add_dimension
dmitriyrepin Jun 24, 2025
5816f83
V1 schema review (#553)
BrianMichell Jun 26, 2025
f88531e
Merge remote-tracking branch 'upstream/v1' into v1
dmitriyrepin Jun 26, 2025
1358f95
First take on add_dimension(), add_coordinate(), add_variable()
dmitriyrepin Jun 27, 2025
e5261cb
Finished add_dimension, add_coordinate, add_variable
dmitriyrepin Jun 28, 2025
95c01d8
Work on build
dmitriyrepin Jun 30, 2025
f391e23
Merge branch 'main' into v1
tasansal Jul 1, 2025
46f82f0
Generalize _to_dictionary()
dmitriyrepin Jul 1, 2025
0dc7cc8
build
dmitriyrepin Jul 1, 2025
fe4af2b
[v1] Update dependencies to latest (#567)
tasansal Jul 2, 2025
79863ac
Dataset Build - pass one
dmitriyrepin Jul 2, 2025
ec480f1
Merge the latest TGSAI/mdio-python:v1 branch
dmitriyrepin Jul 2, 2025
4062a77
unpin zarr because breaking bug fixed (#569)
tasansal Jul 7, 2025
fa81ea2
Merge branch 'v1' into v1
tasansal Jul 7, 2025
4b2b163
Revert .container changes
dmitriyrepin Jul 7, 2025
c532c3b
PR review: remove DEVELOPER_NOTES.md
dmitriyrepin Jul 7, 2025
08798cd
PR Review: add_coordinate() should accept only data_type: ScalarType
dmitriyrepin Jul 7, 2025
e8febe4
PR review: add_variable() data_type remove default
dmitriyrepin Jul 7, 2025
0a4be3f
PR review: do not add dimension variable
dmitriyrepin Jul 8, 2025
7b25d6b
PR Review: get api version from the package version
dmitriyrepin Jul 8, 2025
7ca3ed8
PR Review: remove add_dimension_coordinate
dmitriyrepin Jul 9, 2025
4d1ec9c
PR Review: add_coordinate() remove data_type default value
dmitriyrepin Jul 9, 2025
99fcf43
PR Review: improve unit tests by extracting common functionality in v…
dmitriyrepin Jul 9, 2025
0778fdd
Remove the Dockerfile changes. They are not supposed to be a part of …
dmitriyrepin Jul 9, 2025
7e74567
PR Review: run ruff
dmitriyrepin Jul 9, 2025
0aaa5f6
PR Review: fix pre-commit errors
dmitriyrepin Jul 10, 2025
1904dee
remove some noqa overrides
tasansal Jul 10, 2025
90d31a1
Implement MDIO Dataset builder to create in-memory instance of schema…
dmitriyrepin Jul 10, 2025
4c7c833
Writing XArray / Zarr
dmitriyrepin Jul 10, 2025
4b39ffa
gitignore
dmitriyrepin Jul 10, 2025
e772a4f
Merge remote-tracking branch 'upstream/v1' into v1
dmitriyrepin Jul 11, 2025
cea7308
to_zarr() fix compression
dmitriyrepin Jul 11, 2025
850135e
Fix precommit issues
dmitriyrepin Jul 11, 2025
82f1960
Use only make_campos_3d_acceptance_dataset
dmitriyrepin Jul 11, 2025
b5ee31e
PR Review: address the review comments
dmitriyrepin Jul 14, 2025
7b3ba70
Update _get_fill_value for StructuredType
dmitriyrepin Jul 14, 2025
a4ff4a9
Fix fill type issue for the Structured Types
dmitriyrepin Jul 16, 2025
81bfa76
Improve code coverage
dmitriyrepin Jul 16, 2025
0447659
Fix spelling
dmitriyrepin Jul 17, 2025
d08e2c4
Revert "Fix spelling"
dmitriyrepin Jul 17, 2025
657d2cf
extend per-file ignores for PLR2004 and remove noqa overrides in spec…
tasansal Jul 17, 2025
bfab1d7
Refactor tests: clarify Zarr-related test names, fix type hints, and …
tasansal Jul 17, 2025
9a033de
merge main into v1
tasansal Jul 17, 2025
5878e97
MDIO v1 Templates and Template Registry (#573)
dmitriyrepin Jul 22, 2025
aaf8fc6
Merge branch 'main' into v1
tasansal Aug 6, 2025
bc35bdf
update deps
tasansal Aug 6, 2025
3b62d8f
Merge branch 'main' into v1
tasansal Aug 6, 2025
aae5f51
Merge branch 'main' into v1
tasansal Aug 6, 2025
d21278c
address issues with VS Code dev containers (see issue 559) (#576)
dmitriyrepin Aug 7, 2025
d3a7da2
segy_to_mdio_v1 (#577)
dmitriyrepin Aug 8, 2025
21f64c0
Make some integration tests work with new `segy_to_mdio` (#599)
dmitriyrepin Aug 12, 2025
4762b7d
remove developer tests
tasansal Aug 12, 2025
db0d564
Serialize text and binary headers (#600)
dmitriyrepin Aug 12, 2025
f5ee136
shot_point (#602)
dmitriyrepin Aug 13, 2025
a8b12f3
Add template: Offset + Azimuth binned CDP gathers (COCA) (#605)
tasansal Aug 15, 2025
0ceccc1
Eager memory allocation fix (#609)
BrianMichell Aug 25, 2025
4116473
Fix memory and core utilization regressions
BrianMichell Aug 26, 2025
fcffad8
Merge pull request #615 from BrianMichell/memory_regression
BrianMichell Aug 26, 2025
faeb616
Export functionality for MDIO v1 ingested files (#611)
dmitriyrepin Sep 3, 2025
db33c2a
v1 implementation of AutoChannelWrap grid override (#632)
dmitriyrepin Sep 5, 2025
379c2f5
Move to Zarr v3 as default for on disk storage format (#630)
tasansal Sep 6, 2025
b8dbb82
fix cloud i/o issue (#637)
tasansal Sep 6, 2025
22bed5c
snake-case to camelCase (#638)
tasansal Sep 6, 2025
9d38e1f
Fix output URI handling for remote stores (#639)
tasansal Sep 6, 2025
4365892
allow legacy v2 support (#640)
tasansal Sep 7, 2025
6562594
Reorganize code and simplify schemas and logic everywhere (#642)
tasansal Sep 7, 2025
642721c
First pass review and alignment of templates (#643)
tasansal Sep 7, 2025
0abe5ac
Fix ingestion of coordinates without full dimensions (#644)
tasansal Sep 8, 2025
b85cc35
Disable unimplemented tests (#647)
tasansal Sep 8, 2025
8931573
Merge branch 'main' into v1
tasansal Sep 8, 2025
e16640b
remove todo, it has correct behaviour, also rename .build_dataset `he…
tasansal Sep 8, 2025
219d354
set version to 1.0.0
tasansal Sep 8, 2025
bef28f8
unpin hardcoded version from tests
tasansal Sep 8, 2025
v1 implementation of AutoChannelWrap grid override (#632)
* AutoChannelWrap over updated-v1

* Fix test

* rename function for new behaviour and improve type hint for grid_overrides

* simplify metadata handling

* lint

* gridOverride is not required

* remove unnecessary byte order change, handled upstream.

* remove rtol adds, tests pass.

* remove expected behaviour comment

* clean up tests

* use grouped assignments to fix PLR915

* add comments to clarify

---------

Co-authored-by: Altay Sansal <tasansal@users.noreply.github.com>
dmitriyrepin and tasansal authored Sep 5, 2025
commit db33c2aa6bc08efeb1bdb1534a28baa516f31945
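
A minimal sketch of how a caller might thread the new `grid_overrides` argument into `segy_to_mdio`. The import path and keyword names follow the diff below; the `"AutoChannelWrap"` key spelling, the wrapper function, and the placeholder parameters are illustrative assumptions rather than part of this commit.

```python
from __future__ import annotations

from typing import Any


def ingest_with_channel_wrap(
    segy_spec: Any,        # a segy.schema.SegySpec
    mdio_template: Any,    # an AbstractDatasetTemplate instance
    input_location: Any,   # StorageLocation of the input SEG-Y file
    output_location: Any,  # StorageLocation for the MDIO v1 output
) -> None:
    """Run a v1 SEG-Y ingestion with a grid override enabled (illustrative)."""
    from mdio.converters.segy import segy_to_mdio

    # The override key spelling is an assumption for illustration only.
    overrides: dict[str, Any] = {"AutoChannelWrap": True}
    segy_to_mdio(
        segy_spec=segy_spec,
        mdio_template=mdio_template,
        input_location=input_location,
        output_location=output_location,
        overwrite=True,
        grid_overrides=overrides,
    )
```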
55 changes: 33 additions & 22 deletions src/mdio/converters/segy.py
@@ -26,6 +26,8 @@
from mdio.segy.utilities import get_grid_plan

if TYPE_CHECKING:
from typing import Any

from segy.arrays import HeaderArray as SegyHeaderArray
from segy.schema import SegySpec
from xarray import Dataset as xr_Dataset
@@ -113,25 +115,30 @@ def grid_density_qc(grid: Grid, num_traces: int) -> None:


def _scan_for_headers(
segy_file: SegyFile, template: AbstractDatasetTemplate
segy_file: SegyFile,
template: AbstractDatasetTemplate,
grid_overrides: dict[str, Any] | None = None,
) -> tuple[list[Dimension], SegyHeaderArray]:
"""Extract trace dimensions and index headers from the SEG-Y file.

This is an expensive operation.
It scans the SEG-Y file in chunks by using ProcessPoolExecutor
"""
# TODO(Dmitriy): implement grid overrides
# https://github.com/TGSAI/mdio-python/issues/585
# The 'grid_chunksize' is used only for grid_overrides
# While we do not support grid override, we can set it to None
grid_chunksize = None
segy_dimensions, chunksize, segy_headers = get_grid_plan(
full_chunk_size = template.full_chunk_size
segy_dimensions, chunk_size, segy_headers = get_grid_plan(
segy_file=segy_file,
return_headers=True,
template=template,
chunksize=grid_chunksize,
grid_overrides=None,
chunksize=full_chunk_size,
grid_overrides=grid_overrides,
)
if full_chunk_size != chunk_size:
# TODO(Dmitriy): implement grid overrides
# https://github.com/TGSAI/mdio-python/issues/585
# The returned 'chunksize' is used only for grid_overrides. We will need to use it when full
# support for grid overrides is implemented
err = "Support for changing full_chunk_size in grid overrides is not yet implemented"
raise NotImplementedError(err)
return segy_dimensions, segy_headers


@@ -278,7 +285,7 @@ def _populate_coordinates(
return dataset, drop_vars_delayed


def _add_text_binary_headers(dataset: Dataset, segy_file: SegyFile) -> None:
def _add_segy_ingest_attributes(dataset: Dataset, segy_file: SegyFile, grid_overrides: dict[str, Any] | None) -> None:
text_header = segy_file.text_header.splitlines()
# Validate:
# text_header this should be a 40-items array of strings with width of 80 characters.
@@ -301,23 +308,27 @@ def _add_text_binary_headers(dataset: Dataset, segy_file: SegyFile) -> None:

# Handle case where it may not have any metadata yet
if dataset.metadata.attributes is None:
dataset.attrs["attributes"] = {}
dataset.metadata.attributes = {}

segy_attributes = {
"textHeader": text_header,
"binaryHeader": segy_file.binary_header.to_dict(),
}

if grid_overrides is not None:
segy_attributes["gridOverrides"] = grid_overrides

# Update the attributes with the text and binary headers.
dataset.metadata.attributes.update(
{
"textHeader": text_header,
"binaryHeader": segy_file.binary_header.to_dict(),
}
)
dataset.metadata.attributes.update(segy_attributes)


def segy_to_mdio(
def segy_to_mdio( # noqa PLR0913
segy_spec: SegySpec,
mdio_template: AbstractDatasetTemplate,
input_location: StorageLocation,
output_location: StorageLocation,
overwrite: bool = False,
grid_overrides: dict[str, Any] | None = None,
) -> None:
"""A function that converts a SEG-Y file to an MDIO v1 file.

@@ -329,6 +340,7 @@ def segy_to_mdio(
input_location: The storage location of the input SEG-Y file.
output_location: The storage location for the output MDIO v1 file.
overwrite: Whether to overwrite the output file if it already exists. Defaults to False.
grid_overrides: Option to add grid overrides.

Raises:
FileExistsError: If the output location already exists and overwrite is False.
@@ -340,12 +352,11 @@ def segy_to_mdio(
segy_settings = SegySettings(storage_options=input_location.options)
segy_file = SegyFile(url=input_location.uri, spec=segy_spec, settings=segy_settings)

# Scan the SEG-Y file for headers
segy_dimensions, segy_headers = _scan_for_headers(segy_file, mdio_template)
segy_dimensions, segy_headers = _scan_for_headers(segy_file, mdio_template, grid_overrides)

grid = _build_and_check_grid(segy_dimensions, segy_file, segy_headers)

dimensions, non_dim_coords = _get_coordinates(grid, segy_headers, mdio_template)
_, non_dim_coords = _get_coordinates(grid, segy_headers, mdio_template)
# TODO(Altay): Turn this dtype into packed representation
# https://github.com/TGSAI/mdio-python/issues/601
headers = to_structured_type(segy_spec.trace.header.dtype)
@@ -358,7 +369,7 @@ def segy_to_mdio(
headers=headers,
)

_add_text_binary_headers(dataset=mdio_ds, segy_file=segy_file)
_add_segy_ingest_attributes(dataset=mdio_ds, segy_file=segy_file, grid_overrides=grid_overrides)

xr_dataset: xr_Dataset = to_xarray_dataset(mdio_ds=mdio_ds)

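The segy.py changes above record the SEG-Y text header, binary header, and any grid overrides on the dataset metadata. A self-contained sketch of that attribute plumbing, with a plain dict standing in for `dataset.metadata.attributes` and fabricated header values:

```python
# A plain dict stands in for dataset.metadata.attributes; the text-header cards
# and binary-header keys below are fabricated for illustration.
attributes: dict = {}

grid_overrides = {"AutoChannelWrap": True}  # assumed key spelling

segy_attributes = {
    "textHeader": [f"C{row + 1:02d}".ljust(80) for row in range(40)],  # 40 cards, 80 chars each
    "binaryHeader": {"sample_interval": 1000, "samples_per_trace": 1501},
}
if grid_overrides is not None:
    segy_attributes["gridOverrides"] = grid_overrides

attributes.update(segy_attributes)

assert len(attributes["textHeader"]) == 40
assert all(len(card) == 80 for card in attributes["textHeader"])
assert "gridOverrides" in attributes
```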
5 changes: 5 additions & 0 deletions src/mdio/schemas/v1/templates/abstract_dataset_template.py
@@ -120,6 +120,11 @@ def coordinate_names(self) -> list[str]:
"""Returns the names of the coordinates."""
return copy.deepcopy(self._coord_names)

@property
def full_chunk_size(self) -> list[int]:
"""Returns the chunk size for the variables."""
return copy.deepcopy(self._var_chunk_shape)

@property
@abstractmethod
def _name(self) -> str:
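The new `full_chunk_size` property gives `_scan_for_headers` a template-defined chunk shape to compare against the one `get_grid_plan` returns. A toy sketch of the same pattern, with an invented template class and chunk shape:

```python
import copy


class ToyTemplate:
    """Invented template showing the read-only chunk-size property pattern."""

    def __init__(self) -> None:
        self._var_chunk_shape = [8, 8, 512]  # made-up chunk shape

    @property
    def full_chunk_size(self) -> list[int]:
        """Returns the chunk size for the variables."""
        return copy.deepcopy(self._var_chunk_shape)


template = ToyTemplate()
requested = template.full_chunk_size
returned_by_grid_plan = [8, 8, 512]  # stands in for the chunk size get_grid_plan() returns

# Mirrors the guard added to _scan_for_headers: overrides that change the
# chunk size are not supported yet.
if requested != returned_by_grid_plan:
    err = "Support for changing full_chunk_size in grid overrides is not yet implemented"
    raise NotImplementedError(err)
```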
63 changes: 41 additions & 22 deletions tests/integration/conftest.py
@@ -19,8 +19,8 @@
from segy.schema import SegySpec


def _segy_spec_mock_4d() -> SegySpec:
"""Create a mock SEG-Y spec for 4D data."""
def get_segy_mock_4d_spec() -> SegySpec:
"""Create a mock 4D SEG-Y specification."""
trace_header_fields = [
HeaderField(name="field_rec_no", byte=9, format="int32"),
HeaderField(name="channel", byte=13, format="int32"),
@@ -31,6 +31,13 @@ def _segy_spec_mock_4d() -> SegySpec:
HeaderField(name="shot_line", byte=133, format="int16"),
HeaderField(name="cable", byte=137, format="int16"),
HeaderField(name="gun", byte=171, format="int16"),
HeaderField(name="coordinate_scalar", byte=71, format="int16"),
HeaderField(name="source_coord_x", byte=73, format="int32"),
HeaderField(name="source_coord_y", byte=77, format="int32"),
HeaderField(name="group_coord_x", byte=81, format="int32"),
HeaderField(name="group_coord_y", byte=85, format="int32"),
HeaderField(name="cdp_x", byte=181, format="int32"),
HeaderField(name="cdp_y", byte=185, format="int32"),
]
rev1_spec = get_segy_standard(1.0)
spec = rev1_spec.customize(trace_header_fields=trace_header_fields)
@@ -83,33 +90,45 @@ def create_segy_mock_4d( # noqa: PLR0913
channel_headers = np.tile(channel_headers, shot_count)

factory = SegyFactory(
spec=_segy_spec_mock_4d(),
spec=get_segy_mock_4d_spec(),
sample_interval=1000,
samples_per_trace=num_samples,
)

headers = factory.create_trace_header_template(trace_count)
samples = factory.create_trace_sample_template(trace_count)

for trc_idx in range(trace_count):
shot = shot_headers[trc_idx]
gun = gun_headers[trc_idx]
cable = cable_headers[trc_idx]
channel = channel_headers[trc_idx]
shot_line = 1
offset = 0

if index_receivers is False:
channel, gun, shot_line = 0, 0, 0

header_data = (shot, channel, shot, offset, shot_line, cable, gun)

fields = list(headers.dtype.names)
fields.remove("samples_per_trace")
fields.remove("sample_interval")

headers[fields][trc_idx] = header_data
samples[trc_idx] = np.linspace(start=shot, stop=shot + 1, num=num_samples)
start_x = 700000
start_y = 4000000
step_x = 100
step_y = 100

for trc_shot_idx in range(shot_count):
for trc_chan_idx in range(total_chan):
trc_idx = trc_shot_idx * total_chan + trc_chan_idx

shot = shot_headers[trc_idx]
gun = gun_headers[trc_idx]
cable = cable_headers[trc_idx]
channel = channel_headers[trc_idx]
shot_line = 1
offset = 0

if index_receivers is False:
channel, gun, shot_line = 0, 0, 0

# Assign dimension coordinate fields with calculated mock data
header_fields = ["field_rec_no", "channel", "shot_point", "offset", "shot_line", "cable", "gun"]
headers[header_fields][trc_idx] = (shot, channel, shot, offset, shot_line, cable, gun)

# Assign coordinate fields with mock data
x = start_x + step_x * trc_shot_idx
y = start_y + step_y * trc_chan_idx
headers["coordinate_scalar"][trc_idx] = -100
coord_fields = ["source_coord_x", "source_coord_y", "group_coord_x", "group_coord_y", "cdp_x", "cdp_y"]
headers[coord_fields][trc_idx] = (x, y) * 3

samples[trc_idx] = np.linspace(start=shot, stop=shot + 1, num=num_samples)

with segy_path.open(mode="wb") as fp:
fp.write(factory.create_textual_header())
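The fixture above derives mock source, group, and CDP coordinates from a fixed origin and step, and writes a `coordinate_scalar` of -100. A small sketch of that arithmetic with an illustrative 2 x 3 grid, including how a reader would apply the negative scalar (divide the stored integer by its absolute value, per the SEG-Y convention):

```python
# Origin, step, and scalar match the fixture above; the grid shape is illustrative.
start_x, start_y = 700_000, 4_000_000
step_x, step_y = 100, 100
coordinate_scalar = -100  # negative scalar: stored integer / |scalar| gives real-world units

shot_count, total_chan = 2, 3
for trc_shot_idx in range(shot_count):
    for trc_chan_idx in range(total_chan):
        raw_x = start_x + step_x * trc_shot_idx  # written to source/group/cdp x fields
        raw_y = start_y + step_y * trc_chan_idx  # written to source/group/cdp y fields
        real_x = raw_x / abs(coordinate_scalar)  # e.g. 700000 -> 7000.0
        real_y = raw_y / abs(coordinate_scalar)  # e.g. 4000000 -> 40000.0
        print((trc_shot_idx, trc_chan_idx), (raw_x, raw_y), (real_x, real_y))
```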