From 9fad798fb244333db04415edbc06080a1919f222 Mon Sep 17 00:00:00 2001 From: Dmitriy Repin Date: Wed, 27 Aug 2025 15:48:07 +0000 Subject: [PATCH 1/7] Coca template take 2 --- src/mdio/converters/segy.py | 97 ++++++++++++++++--- .../v1/templates/seismic_3d_prestack_coca.py | 8 +- .../test_segy_import_export_masked.py | 4 +- tests/unit/v1/converters/test_segy_to_mdio.py | 78 +++++++++++++++ .../test_seismic_3d_prestack_coca.py | 2 +- 5 files changed, 171 insertions(+), 18 deletions(-) create mode 100644 tests/unit/v1/converters/test_segy_to_mdio.py diff --git a/src/mdio/converters/segy.py b/src/mdio/converters/segy.py index 84393b264..b367965cb 100644 --- a/src/mdio/converters/segy.py +++ b/src/mdio/converters/segy.py @@ -224,20 +224,89 @@ def populate_dim_coordinates( return dataset, drop_vars_delayed -def populate_non_dim_coordinates( +def _check_dimensions_values_identical(arr: np.ndarray, axes_to_check: tuple[int, ...]) -> np.ndarray: + """ + Check if all values along specified dimensions are identical for each + sub-array defined by the other dimensions. + + Args: + arr (np.ndarray): an N-dimensional array. For example, an array of all 'cdp_x' segy + header values for coordinates "inline", "crossline", "offset", "azimuth". + axes_to_check (tuple): A tuple of integers representing the axes to check for + identical values. For example, (2, 3) would check the "offset", "azimuth" + dimensions. + + Returns: + bool: True indicates the all values in the dimensions selected by axes_to_check + are identical, and False otherwise. + """ + + # Create a slicing tuple to get the first element along the axes to check + full_slice = [slice(None)] * arr.ndim + for axis in axes_to_check: + full_slice[axis] = 0 + + # Broadcast the first element along the specified axes for comparison + first_element_slice = arr[tuple(full_slice)] + + # Add new axes to the slice to enable broadcasting against the original array + for axis in axes_to_check: + first_element_slice = np.expand_dims(first_element_slice, axis) + + # Compare the array with the broadcasted slice and use np.all() + # to collapse the dimensions being checked + identical = np.all(np.isclose(arr, first_element_slice), axis=axes_to_check) + return np.all(identical).item() + + +def _populate_non_dim_coordinates( dataset: xr_Dataset, grid: Grid, - coordinates: dict[str, SegyHeaderArray], + coordinate_headers: dict[str, SegyHeaderArray], drop_vars_delayed: list[str], ) -> tuple[xr_Dataset, list[str]]: """Populate the xarray dataset with coordinate variables.""" - not_null = grid.map[:] != UINT32_MAX - for c_name, c_values in coordinates.items(): - c_tmp_array = dataset[c_name].values - c_tmp_array[not_null] = c_values - dataset[c_name][:] = c_tmp_array + coord_is_good = {} + # Load the grid map values into memory. + # Q: Should we be using the trace_mask boolean array instead of grid map? + grid_map_values = grid.map[:] + for c_name, coord_headers_values in coordinate_headers.items(): + + # In the case of Coca and some other templates, the coordinate header values, + # coord_headers_values, have a full set of dimensions (e.g. a 4-tuple of "inline", + # "crossline", "offset", "azimuth"), while the non-dimensional coordinates, (e.g., + # dataset["cdp_x"]) are defined over only a subset of the dimensions (e.g. 2-tuple of + # "inline", "crossline"). + # Thus, every coordinate 2-tuple has multiple duplicate values of the "cdp_x" coordinates + # stored in coord_headers_values. Those needs to be reduced to a unique value. 
+ # We assume (and check) that all the duplicate values are (near) identical. + headers_dims = grid.dim_names[:-1] # e.g.: "inline", "crossline", "offset", "azimuth" + coord_dims = dataset[c_name].dims # e.g.: "inline", "crossline" + # This will create a temporary array in memory with the same shape as the coordinate + # defined in the dataset. Since the coordinate variable has not yet been populated, + # the temporary array will be populated with NaN from the current coordinate values. + tmp_nd_coord_values = dataset[c_name].values + # Create slices for the all grid dimensions that are also the coordinate dimensions. + # For other dimension, select the first element (with index 0) + slices = tuple(slice(None) if name in coord_dims else 0 for name in headers_dims) + # Create a boolean mask for the live trace values with the dimensions of the coordinate + # Q: Should we be using the trace_mask boolean array instead of grid map? + not_null = grid_map_values[slices] != UINT32_MAX + + ch_reshaped = coord_headers_values.reshape(grid_map_values.shape) + # Select a subset of the coordinate_headers that have unique values + cr_reduced = ch_reshaped[slices] + + # Validate the all reduced dimensions had identical values + axes_to_check = tuple(i for i, dim in enumerate(headers_dims) if dim not in coord_dims) + coord_is_good[c_name] = _check_dimensions_values_identical(ch_reshaped, axes_to_check) + + # Save the unique coordinate values for live traces only + tmp_nd_coord_values[not_null] = cr_reduced.ravel() + dataset[c_name][:] = tmp_nd_coord_values + drop_vars_delayed.append(c_name) - return dataset, drop_vars_delayed + return coord_is_good, dataset, drop_vars_delayed def _get_horizontal_coordinate_unit(segy_headers: list[Dimension]) -> AllUnits | None: @@ -278,13 +347,19 @@ def _populate_coordinates( """ drop_vars_delayed = [] # Populate the dimension coordinate variables (1-D arrays) - dataset, vars_to_drop_later = populate_dim_coordinates(dataset, grid, drop_vars_delayed=drop_vars_delayed) + dataset, drop_vars_delayed = populate_dim_coordinates(dataset, grid, drop_vars_delayed=drop_vars_delayed) # Populate the non-dimension coordinate variables (N-dim arrays) - dataset, vars_to_drop_later = populate_non_dim_coordinates( - dataset, grid, coordinates=coords, drop_vars_delayed=drop_vars_delayed + is_good, dataset, drop_vars_delayed = _populate_non_dim_coordinates( + dataset, grid, coordinate_headers=coords, drop_vars_delayed=drop_vars_delayed ) + if not all(is_good.values()): + failed_dims = [key for key, value in is_good.items() if not value] + err = f"Non-dim coordinate(s) {failed_dims} have non-identical values " + \ + f"along reduced dimensions." 
+ raise ValueError(err) + dataset = dataset.drop_vars(drop_vars_delayed) return dataset, drop_vars_delayed diff --git a/src/mdio/schemas/v1/templates/seismic_3d_prestack_coca.py b/src/mdio/schemas/v1/templates/seismic_3d_prestack_coca.py index 808b849d1..5b6a49759 100644 --- a/src/mdio/schemas/v1/templates/seismic_3d_prestack_coca.py +++ b/src/mdio/schemas/v1/templates/seismic_3d_prestack_coca.py @@ -12,8 +12,8 @@ class Seismic3DPreStackCocaTemplate(AbstractDatasetTemplate): def __init__(self, domain: str): super().__init__(domain=domain) - self._coord_dim_names = ("inline", "crossline", "offset", "azimuth") - self._dim_names = (*self._coord_dim_names, self._trace_domain) + self._coord_dim_names = ("inline", "crossline") # This could be a dictionary specific to every coord + self._dim_names = (*self._coord_dim_names, "offset", "azimuth", self._trace_domain) self._coord_names = ("cdp_x", "cdp_y") self._var_chunk_shape = (8, 8, 32, 1, 1024) @@ -64,13 +64,13 @@ def _add_coordinates(self) -> None: # Add non-dimension coordinates self._builder.add_coordinate( "cdp_x", - dimensions=("inline", "crossline"), + dimensions=self._coord_dim_names, data_type=ScalarType.FLOAT64, metadata_info=[self._horizontal_coord_unit], ) self._builder.add_coordinate( "cdp_y", - dimensions=("inline", "crossline"), + dimensions=self._coord_dim_names, data_type=ScalarType.FLOAT64, metadata_info=[self._horizontal_coord_unit], ) diff --git a/tests/integration/test_segy_import_export_masked.py b/tests/integration/test_segy_import_export_masked.py index 9290dae80..9fd1485b0 100644 --- a/tests/integration/test_segy_import_export_masked.py +++ b/tests/integration/test_segy_import_export_masked.py @@ -285,8 +285,8 @@ def test_import(self, test_conf: MaskedExportConfig, export_masked_path: Path) - template_name = "PreStackShotGathers2D" + domain case "3d_streamer": template_name = "PreStackShotGathers3D" + domain - # case "3d_coca": - # templateName = "PostStack3D" + domain + case "3d_coca": + template_name = "PreStackCocaGathers3D" + domain case _: err = f"Unsupported test configuration: {grid_conf.name}" raise ValueError(err) diff --git a/tests/unit/v1/converters/test_segy_to_mdio.py b/tests/unit/v1/converters/test_segy_to_mdio.py new file mode 100644 index 000000000..e0d983899 --- /dev/null +++ b/tests/unit/v1/converters/test_segy_to_mdio.py @@ -0,0 +1,78 @@ +import math +import random +import time +import numpy as np +import xarray as xr + +from segy.arrays import HeaderArray as SegyHeaderArray + +from mdio.converters.segy import _populate_non_dim_coordinates +from mdio.core.dimension import Dimension +from mdio.core.grid import Grid + +def test__populate_non_dim_coordinates() ->None: + + dim_in, dim_xl, dim_of, dim_az, dim_tm = 2, 3, 4, 5, 1 + il = Dimension(name="inline", coords=np.arange(dim_in)) + xl = Dimension(name="crossline", coords=np.arange(dim_xl)) + of = Dimension(name="offset", coords=np.arange(dim_of)) + az = Dimension(name="azimuth", coords=np.arange(dim_az)) + tm = Dimension(name="time", coords=np.arange(dim_tm)) + + r = np.random.rand(dim_in, dim_xl, dim_of, dim_az) + + il_, xl_, of_, az_ = np.meshgrid(il.coords, xl.coords, of.coords, az.coords, indexing='ij') + diff = 1000*il_ + 100*xl_ + 10*of_ + 1*az_ # Different values for the same (il, xl) + same = 1000*il_ + 100*xl_ # Same values for the same (il, xl) + near = 1000*il_ + 100*xl_ + 1e-09 * r # Near same values for the same (il, xl) + # NOTE: near[1][1][1][1]: np.float64(1100.0000000003308) + + data_type=[('inline', ' None: t = 
Seismic3DPreStackCocaTemplate(domain="time") # Template attributes - assert t._coord_dim_names == ("inline", "crossline", "offset", "azimuth") + assert t._coord_dim_names == ("inline", "crossline") assert t._dim_names == ("inline", "crossline", "offset", "azimuth", "time") assert t._coord_names == ("cdp_x", "cdp_y") assert t._var_chunk_shape == (8, 8, 32, 1, 1024) From f385a721c3a34e94ec9d86c180bc2cdc6890324d Mon Sep 17 00:00:00 2001 From: Dmitriy Repin Date: Wed, 27 Aug 2025 16:16:17 +0000 Subject: [PATCH 2/7] Fix the normal templates --- src/mdio/converters/segy.py | 70 +++++++++++++++++++++---------------- 1 file changed, 39 insertions(+), 31 deletions(-) diff --git a/src/mdio/converters/segy.py b/src/mdio/converters/segy.py index b367965cb..521ceb9d3 100644 --- a/src/mdio/converters/segy.py +++ b/src/mdio/converters/segy.py @@ -271,40 +271,48 @@ def _populate_non_dim_coordinates( # Q: Should we be using the trace_mask boolean array instead of grid map? grid_map_values = grid.map[:] for c_name, coord_headers_values in coordinate_headers.items(): - - # In the case of Coca and some other templates, the coordinate header values, - # coord_headers_values, have a full set of dimensions (e.g. a 4-tuple of "inline", - # "crossline", "offset", "azimuth"), while the non-dimensional coordinates, (e.g., - # dataset["cdp_x"]) are defined over only a subset of the dimensions (e.g. 2-tuple of - # "inline", "crossline"). - # Thus, every coordinate 2-tuple has multiple duplicate values of the "cdp_x" coordinates - # stored in coord_headers_values. Those needs to be reduced to a unique value. - # We assume (and check) that all the duplicate values are (near) identical. headers_dims = grid.dim_names[:-1] # e.g.: "inline", "crossline", "offset", "azimuth" coord_dims = dataset[c_name].dims # e.g.: "inline", "crossline" - # This will create a temporary array in memory with the same shape as the coordinate - # defined in the dataset. Since the coordinate variable has not yet been populated, - # the temporary array will be populated with NaN from the current coordinate values. - tmp_nd_coord_values = dataset[c_name].values - # Create slices for the all grid dimensions that are also the coordinate dimensions. - # For other dimension, select the first element (with index 0) - slices = tuple(slice(None) if name in coord_dims else 0 for name in headers_dims) - # Create a boolean mask for the live trace values with the dimensions of the coordinate - # Q: Should we be using the trace_mask boolean array instead of grid map? 
- not_null = grid_map_values[slices] != UINT32_MAX - - ch_reshaped = coord_headers_values.reshape(grid_map_values.shape) - # Select a subset of the coordinate_headers that have unique values - cr_reduced = ch_reshaped[slices] - - # Validate the all reduced dimensions had identical values axes_to_check = tuple(i for i, dim in enumerate(headers_dims) if dim not in coord_dims) - coord_is_good[c_name] = _check_dimensions_values_identical(ch_reshaped, axes_to_check) - - # Save the unique coordinate values for live traces only - tmp_nd_coord_values[not_null] = cr_reduced.ravel() - dataset[c_name][:] = tmp_nd_coord_values - + if axes_to_check == (): + # In case the coordinate has the same dimensions as grid map + coord_is_good[c_name] = True + not_null = grid_map_values != UINT32_MAX + tmp_coord_values = dataset[c_name].values + tmp_coord_values[not_null] = coord_headers_values + else: + # In the case of Coca and some other templates, the coordinate header values, + # coord_headers_values, have a full set of dimensions (e.g. a 4-tuple of "inline", + # "crossline", "offset", "azimuth"), while the non-dimensional coordinates, (e.g., + # dataset["cdp_x"]) are defined over only a subset of the dimensions (e.g. 2-tuple of + # "inline", "crossline"). + # Thus, every coordinate 2-tuple has multiple duplicate values of the "cdp_x" coordinates + # stored in coord_headers_values. Those needs to be reduced to a unique value. + # We assume (and check) that all the duplicate values are (near) identical. + # + # The following will create a temporary array in memory with the same shape as the + # coordinate defined in the dataset. Since the coordinate variable has not yet been + # populated, the temporary array will be populated with NaN from the current coordinate + # values. + tmp_coord_values = dataset[c_name].values + # Create slices for the all grid dimensions that are also the coordinate dimensions. + # For other dimension, select the first element (with index 0) + slices = tuple(slice(None) if name in coord_dims else 0 for name in headers_dims) + # Create a boolean mask for the live trace values with the dimensions of the coordinate + # Q: Should we be using the trace_mask boolean array instead of grid map? + not_null = grid_map_values[slices] != UINT32_MAX + + ch_reshaped = coord_headers_values.reshape(grid_map_values.shape) + # Select a subset of the coordinate_headers that have unique values + cr_reduced = ch_reshaped[slices] + + # Validate the all reduced dimensions had identical values + coord_is_good[c_name] = _check_dimensions_values_identical(ch_reshaped, axes_to_check) + + # Save the unique coordinate values for live traces only + tmp_coord_values[not_null] = cr_reduced.ravel() + + dataset[c_name][:] = tmp_coord_values drop_vars_delayed.append(c_name) return coord_is_good, dataset, drop_vars_delayed From eb43121facc81ab77474e7dec936d17f7f1dcf76 Mon Sep 17 00:00:00 2001 From: Dmitriy Repin Date: Wed, 27 Aug 2025 19:36:20 +0000 Subject: [PATCH 3/7] Update tests --- src/mdio/converters/segy.py | 1 - .../test_segy_import_export_masked.py | 59 ++++++++++++++----- 2 files changed, 45 insertions(+), 15 deletions(-) diff --git a/src/mdio/converters/segy.py b/src/mdio/converters/segy.py index 521ceb9d3..7dba2021e 100644 --- a/src/mdio/converters/segy.py +++ b/src/mdio/converters/segy.py @@ -367,7 +367,6 @@ def _populate_coordinates( f"along reduced dimensions." 
raise ValueError(err) - dataset = dataset.drop_vars(drop_vars_delayed) return dataset, drop_vars_delayed diff --git a/tests/integration/test_segy_import_export_masked.py b/tests/integration/test_segy_import_export_masked.py index 9fd1485b0..1a9b134ed 100644 --- a/tests/integration/test_segy_import_export_masked.py +++ b/tests/integration/test_segy_import_export_masked.py @@ -200,16 +200,30 @@ def mock_nd_segy(path: str, grid_conf: GridConfig, segy_factory_conf: SegyFactor # Fill coordinates (e.g. {SRC-REC-CDP}-X/Y headers["coordinate_scalar"] = -100 - for field in ["cdp_x", "source_coord_x", "group_coord_x"]: - start = 700000 - step = 100 - stop = start + step * (trace_numbers.size - 0) - headers[field] = np.arange(start=start, stop=stop, step=step) - for field in ["cdp_y", "source_coord_y", "group_coord_y"]: - start = 4000000 - step = 100 - stop = start + step * (trace_numbers.size - 0) - headers[field] = np.arange(start=start, stop=stop, step=step) + if grid_conf.name == "3d_coca": + coords = [] + for name in ["inline", "crossline", "offset", "azimuth"]: + dim = next(dim for dim in grid_conf.dims if dim.name == name) + coords.append(np.arange(start=dim.start, stop=dim.start + dim.size * dim.step, step=dim.step)) + inline, crossline, _, _ = np.meshgrid(*coords, indexing='ij') + # Make sure that multiple cdp_x and cdp_y values are the same for each (il, xl) + cdp_x = 700000 + 100*inline + crossline + cdp_y = 4000000 + inline + 1000*crossline + for field in ["cdp_x", "source_coord_x", "group_coord_x"]: + headers[field] = cdp_x.ravel() + for field in ["cdp_y", "source_coord_y", "group_coord_y"]: + headers[field] = cdp_y.ravel() + else: + for field in ["cdp_x", "source_coord_x", "group_coord_x"]: + start = 700000 + step = 100 + stop = start + step * (trace_numbers.size - 0) + headers[field] = np.arange(start=start, stop=stop, step=step) + for field in ["cdp_y", "source_coord_y", "group_coord_y"]: + start = 4000000 + step = 100 + stop = start + step * (trace_numbers.size - 0) + headers[field] = np.arange(start=start, stop=stop, step=step) # Array filled with repeating sequence sequence = np.array([1, 2, 3]) @@ -337,11 +351,28 @@ def test_ingested_mdio(self, test_conf: MaskedExportConfig, export_masked_path: assert headers.shape == live_mask.shape assert set(headers.dims) == {dim.name for dim in grid_conf.dims} # Validate header values - trace_index = 0 - trace_header = headers.values.flatten()[trace_index] - expected_x = 700000 + trace_index * 100 - expected_y = 4000000 + trace_index * 100 + if grid_conf.name == "3d_coca": + # Select an index of the trace, which header to check: + il, xl, of, az = 0, 0, 0, 0 + trace_index = np.ravel_multi_index((il, xl, of, az), dims=live_mask.shape) + # Get the trace header + trace_header = headers.values[il][xl][of][az] + # calculate the expected values + coords = {} + for name in ["inline", "crossline", "offset", "azimuth"]: + dim = next(dim for dim in grid_conf.dims if dim.name == name) + coords[name] = np.arange(start=dim.start, stop=dim.start + dim.size * dim.step, step=dim.step) + inline = coords["inline"][il] + crossline = coords["crossline"][xl] + expected_x = 700000 + 100*inline + crossline + expected_y = 4000000 + inline + 1000*crossline + else: + trace_index = 0 + trace_header = headers.values.flatten()[trace_index] + expected_x = 700000 + trace_index * 100 + expected_y = 4000000 + trace_index * 100 expected_gun = 1 + (trace_index % 3) + # validate assert trace_header["coordinate_scalar"] == -100 assert trace_header["source_coord_x"] == expected_x 
assert trace_header["source_coord_y"] == expected_y From 36af1679afc5183e3508781867460bb59e921636 Mon Sep 17 00:00:00 2001 From: Dmitriy Repin Date: Wed, 27 Aug 2025 20:06:24 +0000 Subject: [PATCH 4/7] Fix pre-commit warnings --- src/mdio/converters/segy.py | 45 +++++----- .../test_segy_import_export_masked.py | 6 +- tests/unit/v1/converters/test_segy_to_mdio.py | 89 ++++++++++--------- 3 files changed, 75 insertions(+), 65 deletions(-) diff --git a/src/mdio/converters/segy.py b/src/mdio/converters/segy.py index 7dba2021e..ac42016ec 100644 --- a/src/mdio/converters/segy.py +++ b/src/mdio/converters/segy.py @@ -224,23 +224,23 @@ def populate_dim_coordinates( return dataset, drop_vars_delayed -def _check_dimensions_values_identical(arr: np.ndarray, axes_to_check: tuple[int, ...]) -> np.ndarray: - """ - Check if all values along specified dimensions are identical for each - sub-array defined by the other dimensions. +def _check_dimensions_values_identical(arr: np.ndarray, axes_to_check: tuple[int, ...]) -> bool: + """Check if all values along specified dimensions are identical. + + Check if all values along specified dimensions are identical for each sub-array + defined by the other dimensions. Args: - arr (np.ndarray): an N-dimensional array. For example, an array of all 'cdp_x' segy + arr: an N-dimensional array. For example, an array of all 'cdp_x' segy header values for coordinates "inline", "crossline", "offset", "azimuth". - axes_to_check (tuple): A tuple of integers representing the axes to check for + axes_to_check: A tuple of integers representing the axes to check for identical values. For example, (2, 3) would check the "offset", "azimuth" dimensions. Returns: - bool: True indicates the all values in the dimensions selected by axes_to_check + True indicates the all values in the dimensions selected by axes_to_check are identical, and False otherwise. """ - # Create a slicing tuple to get the first element along the axes to check full_slice = [slice(None)] * arr.ndim for axis in axes_to_check: @@ -272,7 +272,7 @@ def _populate_non_dim_coordinates( grid_map_values = grid.map[:] for c_name, coord_headers_values in coordinate_headers.items(): headers_dims = grid.dim_names[:-1] # e.g.: "inline", "crossline", "offset", "azimuth" - coord_dims = dataset[c_name].dims # e.g.: "inline", "crossline" + coord_dims = dataset[c_name].dims # e.g.: "inline", "crossline" axes_to_check = tuple(i for i, dim in enumerate(headers_dims) if dim not in coord_dims) if axes_to_check == (): # In case the coordinate has the same dimensions as grid map @@ -281,18 +281,18 @@ def _populate_non_dim_coordinates( tmp_coord_values = dataset[c_name].values tmp_coord_values[not_null] = coord_headers_values else: - # In the case of Coca and some other templates, the coordinate header values, - # coord_headers_values, have a full set of dimensions (e.g. a 4-tuple of "inline", - # "crossline", "offset", "azimuth"), while the non-dimensional coordinates, (e.g., - # dataset["cdp_x"]) are defined over only a subset of the dimensions (e.g. 2-tuple of - # "inline", "crossline"). + # In the case of Coca and some other templates, the coordinate header values, + # coord_headers_values, have a full set of dimensions (e.g. a 4-tuple of "inline", + # "crossline", "offset", "azimuth"), while the non-dimensional coordinates, (e.g., + # dataset["cdp_x"]) are defined over only a subset of the dimensions (e.g. 2-tuple of + # "inline", "crossline"). 
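+            # For example, with dimension sizes inline=2, crossline=3, offset=4, azimuth=5 (the
+            # sizes used in the unit test), the "cdp_x" headers arrive as a 2 x 3 x 4 x 5 array
+            # while dataset["cdp_x"] is only 2 x 3, so each (inline, crossline) cell carries 20
+            # duplicate copies of the same cdp_x value.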
# Thus, every coordinate 2-tuple has multiple duplicate values of the "cdp_x" coordinates - # stored in coord_headers_values. Those needs to be reduced to a unique value. + # stored in coord_headers_values. Those needs to be reduced to a unique value. # We assume (and check) that all the duplicate values are (near) identical. # - # The following will create a temporary array in memory with the same shape as the - # coordinate defined in the dataset. Since the coordinate variable has not yet been - # populated, the temporary array will be populated with NaN from the current coordinate + # The following will create a temporary array in memory with the same shape as the + # coordinate defined in the dataset. Since the coordinate variable has not yet been + # populated, the temporary array will be populated with NaN from the current coordinate # values. tmp_coord_values = dataset[c_name].values # Create slices for the all grid dimensions that are also the coordinate dimensions. @@ -350,8 +350,12 @@ def _populate_coordinates( grid: The grid object containing the grid map. coords: The non-dim coordinates to populate. + Raises: + ValueError: If coordinate headers have non-identical values for the reduced dimensions. + Returns: - Xarray dataset with filled coordinates and updated variables to drop after writing + A tuple of the Xarray dataset with filled coordinates and updated variables to drop + after writing """ drop_vars_delayed = [] # Populate the dimension coordinate variables (1-D arrays) @@ -363,8 +367,7 @@ def _populate_coordinates( ) if not all(is_good.values()): failed_dims = [key for key, value in is_good.items() if not value] - err = f"Non-dim coordinate(s) {failed_dims} have non-identical values " + \ - f"along reduced dimensions." + err = f"Non-dim coordinate(s) {failed_dims} have non-identical values " + "along reduced dimensions." 
raise ValueError(err) return dataset, drop_vars_delayed diff --git a/tests/integration/test_segy_import_export_masked.py b/tests/integration/test_segy_import_export_masked.py index 1a9b134ed..083d6e9f1 100644 --- a/tests/integration/test_segy_import_export_masked.py +++ b/tests/integration/test_segy_import_export_masked.py @@ -205,10 +205,10 @@ def mock_nd_segy(path: str, grid_conf: GridConfig, segy_factory_conf: SegyFactor for name in ["inline", "crossline", "offset", "azimuth"]: dim = next(dim for dim in grid_conf.dims if dim.name == name) coords.append(np.arange(start=dim.start, stop=dim.start + dim.size * dim.step, step=dim.step)) - inline, crossline, _, _ = np.meshgrid(*coords, indexing='ij') + inline, crossline, _, _ = np.meshgrid(*coords, indexing="ij") # Make sure that multiple cdp_x and cdp_y values are the same for each (il, xl) - cdp_x = 700000 + 100*inline + crossline - cdp_y = 4000000 + inline + 1000*crossline + cdp_x = 700000 + 100 * inline + crossline + cdp_y = 4000000 + inline + 1000 * crossline for field in ["cdp_x", "source_coord_x", "group_coord_x"]: headers[field] = cdp_x.ravel() for field in ["cdp_y", "source_coord_y", "group_coord_y"]: diff --git a/tests/unit/v1/converters/test_segy_to_mdio.py b/tests/unit/v1/converters/test_segy_to_mdio.py index e0d983899..1d1c4fe0c 100644 --- a/tests/unit/v1/converters/test_segy_to_mdio.py +++ b/tests/unit/v1/converters/test_segy_to_mdio.py @@ -1,40 +1,47 @@ -import math -import random -import time +"""Tests for segY-to_mdio methods.""" + import numpy as np import xarray as xr - from segy.arrays import HeaderArray as SegyHeaderArray from mdio.converters.segy import _populate_non_dim_coordinates from mdio.core.dimension import Dimension from mdio.core.grid import Grid -def test__populate_non_dim_coordinates() ->None: +def test__populate_non_dim_coordinates() -> None: + """Test population of non-dimensional coordinates.""" dim_in, dim_xl, dim_of, dim_az, dim_tm = 2, 3, 4, 5, 1 il = Dimension(name="inline", coords=np.arange(dim_in)) xl = Dimension(name="crossline", coords=np.arange(dim_xl)) of = Dimension(name="offset", coords=np.arange(dim_of)) az = Dimension(name="azimuth", coords=np.arange(dim_az)) tm = Dimension(name="time", coords=np.arange(dim_tm)) - + r = np.random.rand(dim_in, dim_xl, dim_of, dim_az) - il_, xl_, of_, az_ = np.meshgrid(il.coords, xl.coords, of.coords, az.coords, indexing='ij') - diff = 1000*il_ + 100*xl_ + 10*of_ + 1*az_ # Different values for the same (il, xl) - same = 1000*il_ + 100*xl_ # Same values for the same (il, xl) - near = 1000*il_ + 100*xl_ + 1e-09 * r # Near same values for the same (il, xl) + il_, xl_, of_, az_ = np.meshgrid(il.coords, xl.coords, of.coords, az.coords, indexing="ij") + diff = 1000 * il_ + 100 * xl_ + 10 * of_ + 1 * az_ # Different values for the same (il, xl) + same = 1000 * il_ + 100 * xl_ # Same values for the same (il, xl) + near = 1000 * il_ + 100 * xl_ + 1e-09 * r # Near same values for the same (il, xl) # NOTE: near[1][1][1][1]: np.float64(1100.0000000003308) - data_type=[('inline', 'None: ds = xr.Dataset( data_vars={ - 'amplitude': (["inline", "crossline", "offset", "azimuth", "time"], - np.zeros((dim_in, dim_xl, dim_of, dim_az, dim_tm), dtype=np.float32)), + "amplitude": ( + ["inline", "crossline", "offset", "azimuth", "time"], + np.zeros((dim_in, dim_xl, dim_of, dim_az, dim_tm), dtype=np.float32), + ), }, coords={ # Define coordinates with their dimensions and values - 'inline': il.coords, - 'crossline': xl.coords, - 'offset': of.coords, - 'azimuth': az.coords, - 'time': 
tm.coords, - 'cdp_x': (["inline", "crossline"], np.zeros((dim_in, dim_xl), dtype=np.float64)), - 'cdp_y': (["inline", "crossline"], np.zeros((dim_in, dim_xl), dtype=np.float64)), - 'cdp_z': (["inline", "crossline"], np.zeros((dim_in, dim_xl), dtype=np.float64)) - } + "inline": il.coords, + "crossline": xl.coords, + "offset": of.coords, + "azimuth": az.coords, + "time": tm.coords, + "cdp_x": (["inline", "crossline"], np.zeros((dim_in, dim_xl), dtype=np.float64)), + "cdp_y": (["inline", "crossline"], np.zeros((dim_in, dim_xl), dtype=np.float64)), + "cdp_z": (["inline", "crossline"], np.zeros((dim_in, dim_xl), dtype=np.float64)), + }, ) coordinate_headers: dict[str, SegyHeaderArray] = { - 'cdp_x': segy_headers['cdp_x'], - 'cdp_y': segy_headers['cdp_y'], - 'cdp_z': segy_headers['cdp_z'] + "cdp_x": segy_headers["cdp_x"], + "cdp_y": segy_headers["cdp_y"], + "cdp_z": segy_headers["cdp_z"], } is_good, ds_populated, _ = _populate_non_dim_coordinates(ds, grid, coordinate_headers, []) - assert is_good['cdp_x'] is False - assert is_good['cdp_y'] is True - assert is_good['cdp_z'] is True - expected_values = np.array([[0., 100., 200.], [1000., 1100., 1200.]], dtype=np.float32) - assert np.allclose(ds_populated['cdp_x'].values, expected_values) - assert np.allclose(ds_populated['cdp_y'].values, expected_values) - assert np.allclose(ds_populated['cdp_z'].values, expected_values) + assert is_good["cdp_x"] is False + assert is_good["cdp_y"] is True + assert is_good["cdp_z"] is True + expected_values = np.array([[0.0, 100.0, 200.0], [1000.0, 1100.0, 1200.0]], dtype=np.float32) + assert np.allclose(ds_populated["cdp_x"].values, expected_values) + assert np.allclose(ds_populated["cdp_y"].values, expected_values) + assert np.allclose(ds_populated["cdp_z"].values, expected_values) # NOTE: ds_populated['cdp_z'].values[1][1]: np.float64(1100.0000000008847) - - pass From d07a118ad73d5c882b265ef40c954f287dcf866d Mon Sep 17 00:00:00 2001 From: Dmitriy Repin Date: Thu, 28 Aug 2025 20:51:11 +0000 Subject: [PATCH 5/7] Address PR review issues --- src/mdio/converters/segy.py | 39 ++++++----------- .../v1/templates/seismic_3d_prestack_coca.py | 2 +- tests/unit/v1/converters/test_segy_to_mdio.py | 43 +++++++++++-------- 3 files changed, 38 insertions(+), 46 deletions(-) diff --git a/src/mdio/converters/segy.py b/src/mdio/converters/segy.py index ac42016ec..cdd819d38 100644 --- a/src/mdio/converters/segy.py +++ b/src/mdio/converters/segy.py @@ -255,7 +255,7 @@ def _check_dimensions_values_identical(arr: np.ndarray, axes_to_check: tuple[int # Compare the array with the broadcasted slice and use np.all() # to collapse the dimensions being checked - identical = np.all(np.isclose(arr, first_element_slice), axis=axes_to_check) + identical = np.all(np.isclose(arr, first_element_slice, rtol=1e-05, atol=1e-08), axis=axes_to_check) return np.all(identical).item() @@ -266,18 +266,14 @@ def _populate_non_dim_coordinates( drop_vars_delayed: list[str], ) -> tuple[xr_Dataset, list[str]]: """Populate the xarray dataset with coordinate variables.""" - coord_is_good = {} # Load the grid map values into memory. - # Q: Should we be using the trace_mask boolean array instead of grid map? 
- grid_map_values = grid.map[:] for c_name, coord_headers_values in coordinate_headers.items(): headers_dims = grid.dim_names[:-1] # e.g.: "inline", "crossline", "offset", "azimuth" coord_dims = dataset[c_name].dims # e.g.: "inline", "crossline" axes_to_check = tuple(i for i, dim in enumerate(headers_dims) if dim not in coord_dims) if axes_to_check == (): # In case the coordinate has the same dimensions as grid map - coord_is_good[c_name] = True - not_null = grid_map_values != UINT32_MAX + not_null = grid.map[:] != UINT32_MAX tmp_coord_values = dataset[c_name].values tmp_coord_values[not_null] = coord_headers_values else: @@ -292,29 +288,27 @@ def _populate_non_dim_coordinates( # # The following will create a temporary array in memory with the same shape as the # coordinate defined in the dataset. Since the coordinate variable has not yet been - # populated, the temporary array will be populated with NaN from the current coordinate - # values. + # populated, the temporary array will be populated with _FillValue from the current + # coordinate values. tmp_coord_values = dataset[c_name].values # Create slices for the all grid dimensions that are also the coordinate dimensions. # For other dimension, select the first element (with index 0) slices = tuple(slice(None) if name in coord_dims else 0 for name in headers_dims) # Create a boolean mask for the live trace values with the dimensions of the coordinate - # Q: Should we be using the trace_mask boolean array instead of grid map? - not_null = grid_map_values[slices] != UINT32_MAX - - ch_reshaped = coord_headers_values.reshape(grid_map_values.shape) + not_null = grid.map[slices] != UINT32_MAX + ch_reshaped = coord_headers_values.reshape(grid.map.shape) # Select a subset of the coordinate_headers that have unique values - cr_reduced = ch_reshaped[slices] + # and save the unique coordinate values for live traces only + tmp_coord_values[not_null] = ch_reshaped[slices].ravel() # Validate the all reduced dimensions had identical values - coord_is_good[c_name] = _check_dimensions_values_identical(ch_reshaped, axes_to_check) - - # Save the unique coordinate values for live traces only - tmp_coord_values[not_null] = cr_reduced.ravel() + if not _check_dimensions_values_identical(ch_reshaped, axes_to_check): + err = f"Coordinate '{c_name}' has non-identical values along reduced dimensions." + raise ValueError(err) dataset[c_name][:] = tmp_coord_values drop_vars_delayed.append(c_name) - return coord_is_good, dataset, drop_vars_delayed + return dataset, drop_vars_delayed def _get_horizontal_coordinate_unit(segy_headers: list[Dimension]) -> AllUnits | None: @@ -350,9 +344,6 @@ def _populate_coordinates( grid: The grid object containing the grid map. coords: The non-dim coordinates to populate. - Raises: - ValueError: If coordinate headers have non-identical values for the reduced dimensions. 
- Returns: A tuple of the Xarray dataset with filled coordinates and updated variables to drop after writing @@ -362,13 +353,9 @@ def _populate_coordinates( dataset, drop_vars_delayed = populate_dim_coordinates(dataset, grid, drop_vars_delayed=drop_vars_delayed) # Populate the non-dimension coordinate variables (N-dim arrays) - is_good, dataset, drop_vars_delayed = _populate_non_dim_coordinates( + dataset, drop_vars_delayed = _populate_non_dim_coordinates( dataset, grid, coordinate_headers=coords, drop_vars_delayed=drop_vars_delayed ) - if not all(is_good.values()): - failed_dims = [key for key, value in is_good.items() if not value] - err = f"Non-dim coordinate(s) {failed_dims} have non-identical values " + "along reduced dimensions." - raise ValueError(err) return dataset, drop_vars_delayed diff --git a/src/mdio/schemas/v1/templates/seismic_3d_prestack_coca.py b/src/mdio/schemas/v1/templates/seismic_3d_prestack_coca.py index 5b6a49759..d66413237 100644 --- a/src/mdio/schemas/v1/templates/seismic_3d_prestack_coca.py +++ b/src/mdio/schemas/v1/templates/seismic_3d_prestack_coca.py @@ -12,7 +12,7 @@ class Seismic3DPreStackCocaTemplate(AbstractDatasetTemplate): def __init__(self, domain: str): super().__init__(domain=domain) - self._coord_dim_names = ("inline", "crossline") # This could be a dictionary specific to every coord + self._coord_dim_names = ("inline", "crossline") self._dim_names = (*self._coord_dim_names, "offset", "azimuth", self._trace_domain) self._coord_names = ("cdp_x", "cdp_y") self._var_chunk_shape = (8, 8, 32, 1, 1024) diff --git a/tests/unit/v1/converters/test_segy_to_mdio.py b/tests/unit/v1/converters/test_segy_to_mdio.py index 1d1c4fe0c..61f87288b 100644 --- a/tests/unit/v1/converters/test_segy_to_mdio.py +++ b/tests/unit/v1/converters/test_segy_to_mdio.py @@ -1,6 +1,7 @@ -"""Tests for segY-to_mdio methods.""" +"""Tests for segy-to_mdio convenience functions.""" import numpy as np +import pytest import xarray as xr from segy.arrays import HeaderArray as SegyHeaderArray @@ -31,9 +32,9 @@ def test__populate_non_dim_coordinates() -> None: ("crossline", " None: "offset": of.coords, "azimuth": az.coords, "time": tm.coords, - "cdp_x": (["inline", "crossline"], np.zeros((dim_in, dim_xl), dtype=np.float64)), - "cdp_y": (["inline", "crossline"], np.zeros((dim_in, dim_xl), dtype=np.float64)), - "cdp_z": (["inline", "crossline"], np.zeros((dim_in, dim_xl), dtype=np.float64)), + "cdp_diff": (["inline", "crossline"], np.zeros((dim_in, dim_xl), dtype=np.float64)), + "cdp_same": (["inline", "crossline"], np.zeros((dim_in, dim_xl), dtype=np.float64)), + "cdp_near": (["inline", "crossline"], np.zeros((dim_in, dim_xl), dtype=np.float64)), }, ) + # "cdp_diff" has different values for the same (il, xl) coordinate_headers: dict[str, SegyHeaderArray] = { - "cdp_x": segy_headers["cdp_x"], - "cdp_y": segy_headers["cdp_y"], - "cdp_z": segy_headers["cdp_z"], + "cdp_diff": segy_headers["cdp_diff"], } + with pytest.raises(ValueError) as exc_info: + ds_populated, _ = _populate_non_dim_coordinates(ds, grid, coordinate_headers, []) + err = str(exc_info.value) + assert "Coordinate 'cdp_diff' have non-identical values along reduced dimensions." 
in err - is_good, ds_populated, _ = _populate_non_dim_coordinates(ds, grid, coordinate_headers, []) - - assert is_good["cdp_x"] is False - assert is_good["cdp_y"] is True - assert is_good["cdp_z"] is True + # "cdp_same" has identical values for the same (il, xl) + # "cdp_near" has near identical values for the same (il, xl) + coordinate_headers: dict[str, SegyHeaderArray] = { + "cdp_same": segy_headers["cdp_same"], + "cdp_near": segy_headers["cdp_near"], + } + ds_populated, _ = _populate_non_dim_coordinates(ds, grid, coordinate_headers, []) expected_values = np.array([[0.0, 100.0, 200.0], [1000.0, 1100.0, 1200.0]], dtype=np.float32) - assert np.allclose(ds_populated["cdp_x"].values, expected_values) - assert np.allclose(ds_populated["cdp_y"].values, expected_values) - assert np.allclose(ds_populated["cdp_z"].values, expected_values) - # NOTE: ds_populated['cdp_z'].values[1][1]: np.float64(1100.0000000008847) + assert np.allclose(ds_populated["cdp_same"].values, expected_values) + assert np.allclose(ds_populated["cdp_near"].values, expected_values) + # NOTE: ds_populated['cdp_near'].values[1][1]: np.float64(1100.0000000008847) From 30e597fbc1261bc79b2beffb514c7d9c1ae3e763 Mon Sep 17 00:00:00 2001 From: Dmitriy Repin Date: Tue, 2 Sep 2025 15:36:15 +0000 Subject: [PATCH 6/7] Fix PR review comment --- src/mdio/converters/segy.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/mdio/converters/segy.py b/src/mdio/converters/segy.py index cdd819d38..3ef84ae65 100644 --- a/src/mdio/converters/segy.py +++ b/src/mdio/converters/segy.py @@ -273,7 +273,7 @@ def _populate_non_dim_coordinates( axes_to_check = tuple(i for i, dim in enumerate(headers_dims) if dim not in coord_dims) if axes_to_check == (): # In case the coordinate has the same dimensions as grid map - not_null = grid.map[:] != UINT32_MAX + not_null = grid.map[:] != grid.map.fill_value tmp_coord_values = dataset[c_name].values tmp_coord_values[not_null] = coord_headers_values else: @@ -295,7 +295,7 @@ def _populate_non_dim_coordinates( # For other dimension, select the first element (with index 0) slices = tuple(slice(None) if name in coord_dims else 0 for name in headers_dims) # Create a boolean mask for the live trace values with the dimensions of the coordinate - not_null = grid.map[slices] != UINT32_MAX + not_null = grid.map[slices] != grid.map.fill_value ch_reshaped = coord_headers_values.reshape(grid.map.shape) # Select a subset of the coordinate_headers that have unique values # and save the unique coordinate values for live traces only From 8e18fe5cd25fc082656d0ef3125aa58db6cbebf1 Mon Sep 17 00:00:00 2001 From: Dmitriy Repin Date: Tue, 2 Sep 2025 15:42:31 +0000 Subject: [PATCH 7/7] Fix pre-commit error --- src/mdio/converters/segy.py | 1 - tests/unit/v1/converters/test_segy_to_mdio.py | 5 ++--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/src/mdio/converters/segy.py b/src/mdio/converters/segy.py index 3ef84ae65..946d78a68 100644 --- a/src/mdio/converters/segy.py +++ b/src/mdio/converters/segy.py @@ -14,7 +14,6 @@ from mdio.api.io import _normalize_path from mdio.api.io import to_mdio -from mdio.constants import UINT32_MAX from mdio.converters.exceptions import EnvironmentFormatError from mdio.converters.exceptions import GridTraceCountError from mdio.converters.exceptions import GridTraceSparsityError diff --git a/tests/unit/v1/converters/test_segy_to_mdio.py b/tests/unit/v1/converters/test_segy_to_mdio.py index 61f87288b..6c06d6ac0 100644 --- a/tests/unit/v1/converters/test_segy_to_mdio.py 
+++ b/tests/unit/v1/converters/test_segy_to_mdio.py @@ -72,10 +72,9 @@ def test__populate_non_dim_coordinates() -> None: coordinate_headers: dict[str, SegyHeaderArray] = { "cdp_diff": segy_headers["cdp_diff"], } - with pytest.raises(ValueError) as exc_info: + expected_err = "Coordinate 'cdp_diff' has non-identical values along reduced dimensions." + with pytest.raises(ValueError, match=expected_err): ds_populated, _ = _populate_non_dim_coordinates(ds, grid, coordinate_headers, []) - err = str(exc_info.value) - assert "Coordinate 'cdp_diff' have non-identical values along reduced dimensions." in err # "cdp_same" has identical values for the same (il, xl) # "cdp_near" has near identical values for the same (il, xl)
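
For reference, below is a minimal, self-contained NumPy sketch of the reduce-and-validate idea the series converges on (from patch 5 onward the converter raises per coordinate instead of collecting a coord_is_good dict). The function and variable names are illustrative only, not part of the mdio API, and the tolerances are simply the np.isclose defaults that patch 5 makes explicit.

    # Hedged sketch of the duplicate-coordinate reduction; illustrative names, not the mdio API.
    import numpy as np

    def all_identical_along(arr: np.ndarray, axes: tuple[int, ...]) -> bool:
        """Return True if arr is (near) constant along the given axes."""
        index = [slice(None)] * arr.ndim
        for axis in axes:
            index[axis] = slice(0, 1)  # keep the axis so the slice broadcasts against arr
        reference = arr[tuple(index)]
        return bool(np.all(np.isclose(arr, reference, rtol=1e-05, atol=1e-08)))

    # cdp_x is recorded once per (inline, crossline, offset, azimuth) trace header ...
    inline, crossline, offset, azimuth = np.meshgrid(
        np.arange(2), np.arange(3), np.arange(4), np.arange(5), indexing="ij"
    )
    cdp_x_headers = 700000.0 + 100.0 * inline + crossline  # constant over (offset, azimuth)

    # ... but the dataset coordinate is defined over ("inline", "crossline") only:
    # validate the collapsed axes first, then keep a single copy per (inline, crossline).
    axes_to_reduce = (2, 3)  # "offset", "azimuth"
    if not all_identical_along(cdp_x_headers, axes_to_reduce):
        raise ValueError("cdp_x has non-identical values along reduced dimensions.")
    cdp_x = cdp_x_headers[:, :, 0, 0]  # shape (2, 3), one value per (inline, crossline) cell

Failing loudly here mirrors the design choice in patch 5: a coordinate that genuinely varies along the collapsed dimensions is treated as an ingestion error rather than being averaged away silently.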