diff --git a/src/mdio/converters/segy.py b/src/mdio/converters/segy.py
index 84393b264..946d78a68 100644
--- a/src/mdio/converters/segy.py
+++ b/src/mdio/converters/segy.py
@@ -14,7 +14,6 @@
 from mdio.api.io import _normalize_path
 from mdio.api.io import to_mdio
-from mdio.constants import UINT32_MAX
 from mdio.converters.exceptions import EnvironmentFormatError
 from mdio.converters.exceptions import GridTraceCountError
 from mdio.converters.exceptions import GridTraceSparsityError
@@ -224,18 +223,89 @@ def populate_dim_coordinates(
     return dataset, drop_vars_delayed
 
 
-def populate_non_dim_coordinates(
+def _check_dimensions_values_identical(arr: np.ndarray, axes_to_check: tuple[int, ...]) -> bool:
+    """Check if all values along the specified dimensions are identical.
+
+    The check is performed independently for each sub-array defined by the
+    remaining dimensions.
+
+    Args:
+        arr: An N-dimensional array. For example, an array of all 'cdp_x' SEG-Y
+            header values for coordinates "inline", "crossline", "offset", "azimuth".
+        axes_to_check: A tuple of integers representing the axes to check for
+            identical values. For example, (2, 3) would check the "offset", "azimuth"
+            dimensions.
+
+    Returns:
+        True if all values along the dimensions selected by axes_to_check
+        are identical, and False otherwise.
+    """
+    # Create a slicing tuple to get the first element along the axes to check
+    full_slice = [slice(None)] * arr.ndim
+    for axis in axes_to_check:
+        full_slice[axis] = 0
+
+    # Broadcast the first element along the specified axes for comparison
+    first_element_slice = arr[tuple(full_slice)]
+
+    # Add new axes to the slice to enable broadcasting against the original array
+    for axis in axes_to_check:
+        first_element_slice = np.expand_dims(first_element_slice, axis)
+
+    # Compare the array with the broadcasted slice and use np.all()
+    # to collapse the dimensions being checked
+    identical = np.all(np.isclose(arr, first_element_slice, rtol=1e-05, atol=1e-08), axis=axes_to_check)
+    return np.all(identical).item()
+
+
+def _populate_non_dim_coordinates(
     dataset: xr_Dataset,
     grid: Grid,
-    coordinates: dict[str, SegyHeaderArray],
+    coordinate_headers: dict[str, SegyHeaderArray],
     drop_vars_delayed: list[str],
 ) -> tuple[xr_Dataset, list[str]]:
     """Populate the xarray dataset with coordinate variables."""
-    not_null = grid.map[:] != UINT32_MAX
-    for c_name, c_values in coordinates.items():
-        c_tmp_array = dataset[c_name].values
-        c_tmp_array[not_null] = c_values
-        dataset[c_name][:] = c_tmp_array
+    # Note: indexing grid.map below loads the grid map values into memory.
+    for c_name, coord_headers_values in coordinate_headers.items():
+        headers_dims = grid.dim_names[:-1]  # e.g.: "inline", "crossline", "offset", "azimuth"
+        coord_dims = dataset[c_name].dims  # e.g.: "inline", "crossline"
+        axes_to_check = tuple(i for i, dim in enumerate(headers_dims) if dim not in coord_dims)
+        if axes_to_check == ():
+            # The coordinate has the same dimensions as the grid map
+            not_null = grid.map[:] != grid.map.fill_value
+            tmp_coord_values = dataset[c_name].values
+            tmp_coord_values[not_null] = coord_headers_values
+        else:
+            # In the case of Coca and some other templates, the coordinate header values,
+            # coord_headers_values, have a full set of dimensions (e.g. a 4-tuple of "inline",
+            # "crossline", "offset", "azimuth"), while the non-dimensional coordinates (e.g.
+            # dataset["cdp_x"]) are defined over only a subset of the dimensions (e.g. a 2-tuple
+            # of "inline", "crossline").
+            # Thus, every coordinate 2-tuple has multiple duplicate values of the "cdp_x"
+            # coordinate stored in coord_headers_values; those need to be reduced to a single value.
+            # We assume (and check) that all the duplicate values are (near) identical.
+            #
+            # The following will create a temporary array in memory with the same shape as the
+            # coordinate defined in the dataset. Since the coordinate variable has not yet been
+            # populated, the temporary array will be populated with _FillValue from the current
+            # coordinate values.
+            tmp_coord_values = dataset[c_name].values
+            # Create slices for all grid dimensions that are also coordinate dimensions.
+            # For the other dimensions, select the first element (index 0).
+            slices = tuple(slice(None) if name in coord_dims else 0 for name in headers_dims)
+            # Create a boolean mask for the live trace values with the dimensions of the coordinate
+            not_null = grid.map[slices] != grid.map.fill_value
+            ch_reshaped = coord_headers_values.reshape(grid.map.shape)
+            # Select the subset of the coordinate headers that holds the unique values
+            # and save the unique coordinate values for live traces only
+            tmp_coord_values[not_null] = ch_reshaped[slices].ravel()
+
+            # Validate that all reduced dimensions have identical values
+            if not _check_dimensions_values_identical(ch_reshaped, axes_to_check):
+                err = f"Coordinate '{c_name}' has non-identical values along reduced dimensions."
+                raise ValueError(err)
+
+        dataset[c_name][:] = tmp_coord_values
         drop_vars_delayed.append(c_name)
 
     return dataset, drop_vars_delayed
@@ -274,15 +344,16 @@ def _populate_coordinates(
         coords: The non-dim coordinates to populate.
 
     Returns:
-        Xarray dataset with filled coordinates and updated variables to drop after writing
+        A tuple of the Xarray dataset with filled coordinates and the updated list of
+        variables to drop after writing.
     """
     drop_vars_delayed = []
     # Populate the dimension coordinate variables (1-D arrays)
-    dataset, vars_to_drop_later = populate_dim_coordinates(dataset, grid, drop_vars_delayed=drop_vars_delayed)
+    dataset, drop_vars_delayed = populate_dim_coordinates(dataset, grid, drop_vars_delayed=drop_vars_delayed)
 
     # Populate the non-dimension coordinate variables (N-dim arrays)
-    dataset, vars_to_drop_later = populate_non_dim_coordinates(
-        dataset, grid, coordinates=coords, drop_vars_delayed=drop_vars_delayed
+    dataset, drop_vars_delayed = _populate_non_dim_coordinates(
+        dataset, grid, coordinate_headers=coords, drop_vars_delayed=drop_vars_delayed
     )
 
     return dataset, drop_vars_delayed
diff --git a/src/mdio/schemas/v1/templates/seismic_3d_prestack_coca.py b/src/mdio/schemas/v1/templates/seismic_3d_prestack_coca.py
index 808b849d1..d66413237 100644
--- a/src/mdio/schemas/v1/templates/seismic_3d_prestack_coca.py
+++ b/src/mdio/schemas/v1/templates/seismic_3d_prestack_coca.py
@@ -12,8 +12,8 @@ class Seismic3DPreStackCocaTemplate(AbstractDatasetTemplate):
 
     def __init__(self, domain: str):
         super().__init__(domain=domain)
-        self._coord_dim_names = ("inline", "crossline", "offset", "azimuth")
-        self._dim_names = (*self._coord_dim_names, self._trace_domain)
+        self._coord_dim_names = ("inline", "crossline")
+        self._dim_names = (*self._coord_dim_names, "offset", "azimuth", self._trace_domain)
         self._coord_names = ("cdp_x", "cdp_y")
         self._var_chunk_shape = (8, 8, 32, 1, 1024)
 
@@ -64,13 +64,13 @@ def _add_coordinates(self) -> None:
         # Add non-dimension coordinates
         self._builder.add_coordinate(
             "cdp_x",
-            dimensions=("inline", "crossline"),
+            dimensions=self._coord_dim_names,
             data_type=ScalarType.FLOAT64,
             metadata_info=[self._horizontal_coord_unit],
         )
         self._builder.add_coordinate(
             "cdp_y",
-            dimensions=("inline", "crossline"),
+            dimensions=self._coord_dim_names,
             data_type=ScalarType.FLOAT64,
             metadata_info=[self._horizontal_coord_unit],
         )
diff --git a/tests/integration/test_segy_import_export_masked.py b/tests/integration/test_segy_import_export_masked.py
index 9290dae80..083d6e9f1 100644
--- a/tests/integration/test_segy_import_export_masked.py
+++ b/tests/integration/test_segy_import_export_masked.py
@@ -200,16 +200,30 @@ def mock_nd_segy(path: str, grid_conf: GridConfig, segy_factory_conf: SegyFactor
 
     # Fill coordinates (e.g. {SRC-REC-CDP}-X/Y
     headers["coordinate_scalar"] = -100
-    for field in ["cdp_x", "source_coord_x", "group_coord_x"]:
-        start = 700000
-        step = 100
-        stop = start + step * (trace_numbers.size - 0)
-        headers[field] = np.arange(start=start, stop=stop, step=step)
-    for field in ["cdp_y", "source_coord_y", "group_coord_y"]:
-        start = 4000000
-        step = 100
-        stop = start + step * (trace_numbers.size - 0)
-        headers[field] = np.arange(start=start, stop=stop, step=step)
+    if grid_conf.name == "3d_coca":
+        coords = []
+        for name in ["inline", "crossline", "offset", "azimuth"]:
+            dim = next(dim for dim in grid_conf.dims if dim.name == name)
+            coords.append(np.arange(start=dim.start, stop=dim.start + dim.size * dim.step, step=dim.step))
+        inline, crossline, _, _ = np.meshgrid(*coords, indexing="ij")
+        # Make sure the duplicated cdp_x and cdp_y values are identical for each (il, xl)
+        cdp_x = 700000 + 100 * inline + crossline
+        cdp_y = 4000000 + inline + 1000 * crossline
+        for field in ["cdp_x", "source_coord_x", "group_coord_x"]:
+            headers[field] = cdp_x.ravel()
+        for field in ["cdp_y", "source_coord_y", "group_coord_y"]:
+            headers[field] = cdp_y.ravel()
+    else:
+        for field in ["cdp_x", "source_coord_x", "group_coord_x"]:
+            start = 700000
+            step = 100
+            stop = start + step * (trace_numbers.size - 0)
+            headers[field] = np.arange(start=start, stop=stop, step=step)
+        for field in ["cdp_y", "source_coord_y", "group_coord_y"]:
+            start = 4000000
+            step = 100
+            stop = start + step * (trace_numbers.size - 0)
+            headers[field] = np.arange(start=start, stop=stop, step=step)
 
     # Array filled with repeating sequence
     sequence = np.array([1, 2, 3])
@@ -285,8 +299,8 @@ def test_import(self, test_conf: MaskedExportConfig, export_masked_path: Path) -
                 template_name = "PreStackShotGathers2D" + domain
             case "3d_streamer":
                 template_name = "PreStackShotGathers3D" + domain
-            # case "3d_coca":
-            #     templateName = "PostStack3D" + domain
+            case "3d_coca":
+                template_name = "PreStackCocaGathers3D" + domain
             case _:
                 err = f"Unsupported test configuration: {grid_conf.name}"
                 raise ValueError(err)
@@ -337,11 +351,28 @@ def test_ingested_mdio(self, test_conf: MaskedExportConfig, export_masked_path:
         assert headers.shape == live_mask.shape
         assert set(headers.dims) == {dim.name for dim in grid_conf.dims}
         # Validate header values
-        trace_index = 0
-        trace_header = headers.values.flatten()[trace_index]
-        expected_x = 700000 + trace_index * 100
-        expected_y = 4000000 + trace_index * 100
+        if grid_conf.name == "3d_coca":
+            # Select the index of the trace whose header to check:
+            il, xl, of, az = 0, 0, 0, 0
+            trace_index = np.ravel_multi_index((il, xl, of, az), dims=live_mask.shape)
+            # Get the trace header
+            trace_header = headers.values[il][xl][of][az]
+            # Calculate the expected values
+            coords = {}
+            for name in ["inline", "crossline", "offset", "azimuth"]:
+                dim = next(dim for dim in grid_conf.dims if dim.name == name)
+                coords[name] = np.arange(start=dim.start, stop=dim.start + dim.size * dim.step, step=dim.step)
+            inline = coords["inline"][il]
+            crossline = coords["crossline"][xl]
+            expected_x = 700000 + 100 * inline + crossline
+            expected_y = 4000000 + inline + 1000 * crossline
+        else:
+            trace_index = 0
+            trace_header = headers.values.flatten()[trace_index]
+            expected_x = 700000 + trace_index * 100
+            expected_y = 4000000 + trace_index * 100
         expected_gun = 1 + (trace_index % 3)
+        # Validate
         assert trace_header["coordinate_scalar"] == -100
         assert trace_header["source_coord_x"] == expected_x
         assert trace_header["source_coord_y"] == expected_y
diff --git a/tests/unit/v1/converters/test_segy_to_mdio.py b/tests/unit/v1/converters/test_segy_to_mdio.py
new file mode 100644
index 000000000..6c06d6ac0
--- /dev/null
+++ b/tests/unit/v1/converters/test_segy_to_mdio.py
@@ -0,0 +1,89 @@
+"""Tests for segy_to_mdio convenience functions."""
+
+import numpy as np
+import pytest
+import xarray as xr
+from segy.arrays import HeaderArray as SegyHeaderArray
+
+from mdio.converters.segy import _populate_non_dim_coordinates
+from mdio.core.dimension import Dimension
+from mdio.core.grid import Grid
+
+
+def test__populate_non_dim_coordinates() -> None:
+    """Test population of non-dimensional coordinates."""
+    dim_in, dim_xl, dim_of, dim_az, dim_tm = 2, 3, 4, 5, 1
+    il = Dimension(name="inline", coords=np.arange(dim_in))
+    xl = Dimension(name="crossline", coords=np.arange(dim_xl))
+    of = Dimension(name="offset", coords=np.arange(dim_of))
+    az = Dimension(name="azimuth", coords=np.arange(dim_az))
+    tm = Dimension(name="time", coords=np.arange(dim_tm))
+
+    r = np.random.rand(dim_in, dim_xl, dim_of, dim_az)
+
+    il_, xl_, of_, az_ = np.meshgrid(il.coords, xl.coords, of.coords, az.coords, indexing="ij")
+    diff = 1000 * il_ + 100 * xl_ + 10 * of_ + 1 * az_  # Different values for the same (il, xl)
+    same = 1000 * il_ + 100 * xl_  # Same values for the same (il, xl)
+    near = 1000 * il_ + 100 * xl_ + 1e-09 * r  # Near-identical values for the same (il, xl)
+    # NOTE: near[1][1][1][1]: np.float64(1100.0000000003308)
+
+    data_type = [
+        ("inline", "
 None:
     t = Seismic3DPreStackCocaTemplate(domain="time")
 
     # Template attributes
-    assert t._coord_dim_names == ("inline", "crossline", "offset", "azimuth")
+    assert t._coord_dim_names == ("inline", "crossline")
     assert t._dim_names == ("inline", "crossline", "offset", "azimuth", "time")
     assert t._coord_names == ("cdp_x", "cdp_y")
     assert t._var_chunk_shape == (8, 8, 32, 1, 1024)
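
The reduction done in _populate_non_dim_coordinates and the broadcast comparison done in _check_dimensions_values_identical can be illustrated outside the patch. The sketch below is not part of the change set: it uses plain NumPy, made-up toy shapes, and a hypothetical cdp_x pattern that mimics how gather trace headers repeat a 2-D coordinate over the offset and azimuth axes.

# Illustrative sketch only, not part of the patch; toy shapes and cdp_x pattern are made up.
import numpy as np

n_inline, n_crossline, n_offset, n_azimuth = 2, 3, 4, 5

# A cdp_x that depends only on (inline, crossline), replicated over offset/azimuth
# the way SEG-Y trace headers repeat it for every trace in a gather.
il, xl = np.meshgrid(np.arange(n_inline), np.arange(n_crossline), indexing="ij")
cdp_x_2d = 700000 + 100 * il + xl
cdp_x_4d = np.broadcast_to(
    cdp_x_2d[:, :, None, None], (n_inline, n_crossline, n_offset, n_azimuth)
)

# Reduce back to 2-D by taking index 0 along the non-coordinate axes (here 2 and 3),
# mirroring the slices tuple built in _populate_non_dim_coordinates.
slices = (slice(None), slice(None), 0, 0)
reduced = cdp_x_4d[slices]

# Validate that the reduction lost nothing: compare the full array against its
# index-0 slice broadcast back along axes (2, 3), as _check_dimensions_values_identical does.
first = np.expand_dims(np.expand_dims(cdp_x_4d[slices], 2), 3)
all_identical = bool(np.all(np.all(np.isclose(cdp_x_4d, first), axis=(2, 3))))

assert np.array_equal(reduced, cdp_x_2d)
assert all_identical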