95 changes: 83 additions & 12 deletions src/mdio/converters/segy.py
@@ -14,7 +14,6 @@

from mdio.api.io import _normalize_path
from mdio.api.io import to_mdio
from mdio.constants import UINT32_MAX
from mdio.converters.exceptions import EnvironmentFormatError
from mdio.converters.exceptions import GridTraceCountError
from mdio.converters.exceptions import GridTraceSparsityError
@@ -224,18 +223,89 @@ def populate_dim_coordinates(
return dataset, drop_vars_delayed


def populate_non_dim_coordinates(
def _check_dimensions_values_identical(arr: np.ndarray, axes_to_check: tuple[int, ...]) -> bool:
"""Check if all values along specified dimensions are identical.

Check if all values along specified dimensions are identical for each sub-array
defined by the other dimensions.

Args:
arr: An N-dimensional array. For example, an array of all 'cdp_x' SEG-Y
header values for coordinates "inline", "crossline", "offset", "azimuth".
axes_to_check: A tuple of integers representing the axes to check for
identical values. For example, (2, 3) would check the "offset", "azimuth"
dimensions.

Returns:
True if all values along the dimensions selected by axes_to_check are
identical, False otherwise.
"""
# Create a slicing tuple to get the first element along the axes to check
full_slice = [slice(None)] * arr.ndim
for axis in axes_to_check:
full_slice[axis] = 0

# Broadcast the first element along the specified axes for comparison
first_element_slice = arr[tuple(full_slice)]

# Add new axes to the slice to enable broadcasting against the original array
for axis in axes_to_check:
first_element_slice = np.expand_dims(first_element_slice, axis)

# Compare the array with the broadcasted slice and use np.all()
# to collapse the dimensions being checked
identical = np.all(np.isclose(arr, first_element_slice, rtol=1e-05, atol=1e-08), axis=axes_to_check)
return np.all(identical).item()
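
For reference, here is a minimal usage sketch of this helper (an editorial illustration, not part of the diff; it assumes the function is importable from mdio.converters.segy, where this change adds it):

import numpy as np

from mdio.converters.segy import _check_dimensions_values_identical

# 4-D header values over ("inline", "crossline", "offset", "azimuth").
il, xl, of, _ = np.meshgrid(np.arange(2), np.arange(3), np.arange(4), np.arange(5), indexing="ij")
same = 1000 * il + 100 * xl            # constant along the "offset" and "azimuth" axes
diff = 1000 * il + 100 * xl + 10 * of  # varies along the "offset" axis

assert _check_dimensions_values_identical(same, axes_to_check=(2, 3)) is True
assert _check_dimensions_values_identical(diff, axes_to_check=(2, 3)) is False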


def _populate_non_dim_coordinates(
dataset: xr_Dataset,
grid: Grid,
coordinates: dict[str, SegyHeaderArray],
coordinate_headers: dict[str, SegyHeaderArray],
drop_vars_delayed: list[str],
) -> tuple[xr_Dataset, list[str]]:
"""Populate the xarray dataset with coordinate variables."""
not_null = grid.map[:] != UINT32_MAX
for c_name, c_values in coordinates.items():
c_tmp_array = dataset[c_name].values
c_tmp_array[not_null] = c_values
dataset[c_name][:] = c_tmp_array
# Load the grid map values into memory.
for c_name, coord_headers_values in coordinate_headers.items():
headers_dims = grid.dim_names[:-1] # e.g.: "inline", "crossline", "offset", "azimuth"
coord_dims = dataset[c_name].dims # e.g.: "inline", "crossline"
axes_to_check = tuple(i for i, dim in enumerate(headers_dims) if dim not in coord_dims)
if axes_to_check == ():
# In case the coordinate has the same dimensions as the grid map
not_null = grid.map[:] != grid.map.fill_value
tmp_coord_values = dataset[c_name].values
tmp_coord_values[not_null] = coord_headers_values
else:
# In the case of Coca and some other templates, the coordinate header values,
# coord_headers_values, have a full set of dimensions (e.g. a 4-tuple of "inline",
# "crossline", "offset", "azimuth"), while the non-dimensional coordinates (e.g.,
# dataset["cdp_x"]) are defined over only a subset of the dimensions (e.g. 2-tuple of
# "inline", "crossline").
# Thus, every coordinate 2-tuple has multiple duplicate values of the "cdp_x" coordinates
# stored in coord_headers_values. These need to be reduced to a single value.
# We assume (and check) that all the duplicate values are (near) identical.
#
# The following will create a temporary array in memory with the same shape as the
# coordinate defined in the dataset. Since the coordinate variable has not yet been
# populated, the temporary array will be populated with _FillValue from the current
# coordinate values.
tmp_coord_values = dataset[c_name].values
# Create full slices for the grid dimensions that are also coordinate dimensions.
# For the other dimensions, select the first element (index 0).
slices = tuple(slice(None) if name in coord_dims else 0 for name in headers_dims)
# Create a boolean mask for the live trace values with the dimensions of the coordinate
not_null = grid.map[slices] != grid.map.fill_value
ch_reshaped = coord_headers_values.reshape(grid.map.shape)
# Select one representative value per coordinate tuple from the coordinate headers
# and save those values for live traces only
tmp_coord_values[not_null] = ch_reshaped[slices].ravel()

# Validate that all reduced dimensions had identical values
if not _check_dimensions_values_identical(ch_reshaped, axes_to_check):
err = f"Coordinate '{c_name}' has non-identical values along reduced dimensions."
raise ValueError(err)

dataset[c_name][:] = tmp_coord_values
drop_vars_delayed.append(c_name)
return dataset, drop_vars_delayed
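
To make the reduction above concrete, here is a small NumPy-only sketch of the slicing step (an editorial illustration with made-up sizes, not part of the diff): a 4-D block of header values that is constant over the reduced dimensions collapses to one representative value per (inline, crossline).

import numpy as np

headers_dims = ("inline", "crossline", "offset", "azimuth")
coord_dims = ("inline", "crossline")  # dims of e.g. dataset["cdp_x"]

# Header values for every trace, constant over ("offset", "azimuth") for each (il, xl).
n_il, n_xl, n_of, n_az = 2, 3, 4, 5
il, xl, _, _ = np.meshgrid(np.arange(n_il), np.arange(n_xl), np.arange(n_of), np.arange(n_az), indexing="ij")
cdp_x_headers = 700000.0 + 100.0 * il + xl  # shape (n_il, n_xl, n_of, n_az)

# Full slices along the coordinate dims, index 0 along the dims being reduced.
slices = tuple(slice(None) if name in coord_dims else 0 for name in headers_dims)
cdp_x = cdp_x_headers[slices]  # shape (n_il, n_xl): one value per (inline, crossline)
assert cdp_x.shape == (n_il, n_xl)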

@@ -274,15 +344,16 @@ def _populate_coordinates(
coords: The non-dim coordinates to populate.

Returns:
Xarray dataset with filled coordinates and updated variables to drop after writing
A tuple of the Xarray dataset with filled coordinates and updated variables to drop
after writing
"""
drop_vars_delayed = []
# Populate the dimension coordinate variables (1-D arrays)
dataset, vars_to_drop_later = populate_dim_coordinates(dataset, grid, drop_vars_delayed=drop_vars_delayed)
dataset, drop_vars_delayed = populate_dim_coordinates(dataset, grid, drop_vars_delayed=drop_vars_delayed)

# Populate the non-dimension coordinate variables (N-dim arrays)
dataset, vars_to_drop_later = populate_non_dim_coordinates(
dataset, grid, coordinates=coords, drop_vars_delayed=drop_vars_delayed
dataset, drop_vars_delayed = _populate_non_dim_coordinates(
dataset, grid, coordinate_headers=coords, drop_vars_delayed=drop_vars_delayed
)

return dataset, drop_vars_delayed
8 changes: 4 additions & 4 deletions src/mdio/schemas/v1/templates/seismic_3d_prestack_coca.py
Contributor review comment: This implementation is not valid. As I said before, the coordinates don't always share the same dimensions; we have to handle this more explicitly.

@@ -12,8 +12,8 @@ class Seismic3DPreStackCocaTemplate(AbstractDatasetTemplate):
def __init__(self, domain: str):
super().__init__(domain=domain)

self._coord_dim_names = ("inline", "crossline", "offset", "azimuth")
self._dim_names = (*self._coord_dim_names, self._trace_domain)
self._coord_dim_names = ("inline", "crossline")
self._dim_names = (*self._coord_dim_names, "offset", "azimuth", self._trace_domain)
self._coord_names = ("cdp_x", "cdp_y")
self._var_chunk_shape = (8, 8, 32, 1, 1024)

@@ -64,13 +64,13 @@ def _add_coordinates(self) -> None:
# Add non-dimension coordinates
self._builder.add_coordinate(
"cdp_x",
dimensions=("inline", "crossline"),
dimensions=self._coord_dim_names,
data_type=ScalarType.FLOAT64,
metadata_info=[self._horizontal_coord_unit],
)
self._builder.add_coordinate(
"cdp_y",
dimensions=("inline", "crossline"),
dimensions=self._coord_dim_names,
data_type=ScalarType.FLOAT64,
metadata_info=[self._horizontal_coord_unit],
)
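
With this change, cdp_x and cdp_y are declared over only ("inline", "crossline") while the data variable keeps all five dimensions. A rough xarray sketch of the resulting layout (an editorial illustration with arbitrary sizes; it mirrors the dataset built in the new unit test below):

import numpy as np
import xarray as xr

n_il, n_xl, n_of, n_az, n_t = 4, 5, 6, 2, 10  # arbitrary sizes for illustration
ds = xr.Dataset(
    data_vars={
        "amplitude": (
            ["inline", "crossline", "offset", "azimuth", "time"],
            np.zeros((n_il, n_xl, n_of, n_az, n_t), dtype=np.float32),
        ),
    },
    coords={
        # Non-dimension coordinates defined over a subset of the data dimensions.
        "cdp_x": (["inline", "crossline"], np.zeros((n_il, n_xl))),
        "cdp_y": (["inline", "crossline"], np.zeros((n_il, n_xl))),
    },
)
assert ds["cdp_x"].dims == ("inline", "crossline")
assert ds["amplitude"].dims == ("inline", "crossline", "offset", "azimuth", "time")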
63 changes: 47 additions & 16 deletions tests/integration/test_segy_import_export_masked.py
@@ -200,16 +200,30 @@ def mock_nd_segy(path: str, grid_conf: GridConfig, segy_factory_conf: SegyFactor

# Fill coordinates (e.g. {SRC-REC-CDP}-X/Y)
headers["coordinate_scalar"] = -100
for field in ["cdp_x", "source_coord_x", "group_coord_x"]:
start = 700000
step = 100
stop = start + step * (trace_numbers.size - 0)
headers[field] = np.arange(start=start, stop=stop, step=step)
for field in ["cdp_y", "source_coord_y", "group_coord_y"]:
start = 4000000
step = 100
stop = start + step * (trace_numbers.size - 0)
headers[field] = np.arange(start=start, stop=stop, step=step)
if grid_conf.name == "3d_coca":
coords = []
for name in ["inline", "crossline", "offset", "azimuth"]:
dim = next(dim for dim in grid_conf.dims if dim.name == name)
coords.append(np.arange(start=dim.start, stop=dim.start + dim.size * dim.step, step=dim.step))
inline, crossline, _, _ = np.meshgrid(*coords, indexing="ij")
# Make sure that multiple cdp_x and cdp_y values are the same for each (il, xl)
cdp_x = 700000 + 100 * inline + crossline
cdp_y = 4000000 + inline + 1000 * crossline
for field in ["cdp_x", "source_coord_x", "group_coord_x"]:
headers[field] = cdp_x.ravel()
for field in ["cdp_y", "source_coord_y", "group_coord_y"]:
headers[field] = cdp_y.ravel()
else:
for field in ["cdp_x", "source_coord_x", "group_coord_x"]:
start = 700000
step = 100
stop = start + step * (trace_numbers.size - 0)
headers[field] = np.arange(start=start, stop=stop, step=step)
for field in ["cdp_y", "source_coord_y", "group_coord_y"]:
start = 4000000
step = 100
stop = start + step * (trace_numbers.size - 0)
headers[field] = np.arange(start=start, stop=stop, step=step)

# Array filled with repeating sequence
sequence = np.array([1, 2, 3])
@@ -285,8 +299,8 @@ def test_import(self, test_conf: MaskedExportConfig, export_masked_path: Path) -
template_name = "PreStackShotGathers2D" + domain
case "3d_streamer":
template_name = "PreStackShotGathers3D" + domain
# case "3d_coca":
# templateName = "PostStack3D" + domain
case "3d_coca":
template_name = "PreStackCocaGathers3D" + domain
case _:
err = f"Unsupported test configuration: {grid_conf.name}"
raise ValueError(err)
@@ -337,11 +351,28 @@ def test_ingested_mdio(self, test_conf: MaskedExportConfig, export_masked_path:
assert headers.shape == live_mask.shape
assert set(headers.dims) == {dim.name for dim in grid_conf.dims}
# Validate header values
trace_index = 0
trace_header = headers.values.flatten()[trace_index]
expected_x = 700000 + trace_index * 100
expected_y = 4000000 + trace_index * 100
if grid_conf.name == "3d_coca":
# Select the index of the trace whose header to check:
il, xl, of, az = 0, 0, 0, 0
trace_index = np.ravel_multi_index((il, xl, of, az), dims=live_mask.shape)
# Get the trace header
trace_header = headers.values[il][xl][of][az]
# Calculate the expected values
coords = {}
for name in ["inline", "crossline", "offset", "azimuth"]:
dim = next(dim for dim in grid_conf.dims if dim.name == name)
coords[name] = np.arange(start=dim.start, stop=dim.start + dim.size * dim.step, step=dim.step)
inline = coords["inline"][il]
crossline = coords["crossline"][xl]
expected_x = 700000 + 100 * inline + crossline
expected_y = 4000000 + inline + 1000 * crossline
else:
trace_index = 0
trace_header = headers.values.flatten()[trace_index]
expected_x = 700000 + trace_index * 100
expected_y = 4000000 + trace_index * 100
expected_gun = 1 + (trace_index % 3)
# validate
assert trace_header["coordinate_scalar"] == -100
assert trace_header["source_coord_x"] == expected_x
assert trace_header["source_coord_y"] == expected_y
89 changes: 89 additions & 0 deletions tests/unit/v1/converters/test_segy_to_mdio.py
@@ -0,0 +1,89 @@
"""Tests for SEG-Y to MDIO conversion convenience functions."""

import numpy as np
import pytest
import xarray as xr
from segy.arrays import HeaderArray as SegyHeaderArray

from mdio.converters.segy import _populate_non_dim_coordinates
from mdio.core.dimension import Dimension
from mdio.core.grid import Grid


def test__populate_non_dim_coordinates() -> None:
"""Test population of non-dimensional coordinates."""
dim_in, dim_xl, dim_of, dim_az, dim_tm = 2, 3, 4, 5, 1
il = Dimension(name="inline", coords=np.arange(dim_in))
xl = Dimension(name="crossline", coords=np.arange(dim_xl))
of = Dimension(name="offset", coords=np.arange(dim_of))
az = Dimension(name="azimuth", coords=np.arange(dim_az))
tm = Dimension(name="time", coords=np.arange(dim_tm))

r = np.random.rand(dim_in, dim_xl, dim_of, dim_az)

il_, xl_, of_, az_ = np.meshgrid(il.coords, xl.coords, of.coords, az.coords, indexing="ij")
diff = 1000 * il_ + 100 * xl_ + 10 * of_ + 1 * az_ # Different values for the same (il, xl)
same = 1000 * il_ + 100 * xl_ # Same values for the same (il, xl)
near = 1000 * il_ + 100 * xl_ + 1e-09 * r # Near same values for the same (il, xl)
# NOTE: near[1][1][1][1]: np.float64(1100.0000000003308)

data_type = [
("inline", "<i4"),
("crossline", "<i4"),
("offset", "<i4"),
("azimuth", "<i4"),
("cdp_diff", "<f8"),
("cdp_same", "<f8"),
("cdp_near", "<f8"),
]
data_list = [
(i, j, k, m, diff[i, j, k, m], same[i, j, k, m], near[i, j, k, m])
for i in range(dim_in)
for j in range(dim_xl)
for k in range(dim_of)
for m in range(dim_az)
]
segy_headers = SegyHeaderArray(np.array(data_list, dtype=data_type))

grid = Grid(dims=[il, xl, of, az, tm])
grid.build_map(segy_headers)

ds = xr.Dataset(
data_vars={
"amplitude": (
["inline", "crossline", "offset", "azimuth", "time"],
np.zeros((dim_in, dim_xl, dim_of, dim_az, dim_tm), dtype=np.float32),
),
},
coords={
# Define coordinates with their dimensions and values
"inline": il.coords,
"crossline": xl.coords,
"offset": of.coords,
"azimuth": az.coords,
"time": tm.coords,
"cdp_diff": (["inline", "crossline"], np.zeros((dim_in, dim_xl), dtype=np.float64)),
"cdp_same": (["inline", "crossline"], np.zeros((dim_in, dim_xl), dtype=np.float64)),
"cdp_near": (["inline", "crossline"], np.zeros((dim_in, dim_xl), dtype=np.float64)),
},
)

# "cdp_diff" has different values for the same (il, xl)
coordinate_headers: dict[str, SegyHeaderArray] = {
"cdp_diff": segy_headers["cdp_diff"],
}
expected_err = "Coordinate 'cdp_diff' has non-identical values along reduced dimensions."
with pytest.raises(ValueError, match=expected_err):
ds_populated, _ = _populate_non_dim_coordinates(ds, grid, coordinate_headers, [])

# "cdp_same" has identical values for the same (il, xl)
# "cdp_near" has near identical values for the same (il, xl)
coordinate_headers: dict[str, SegyHeaderArray] = {
"cdp_same": segy_headers["cdp_same"],
"cdp_near": segy_headers["cdp_near"],
}
ds_populated, _ = _populate_non_dim_coordinates(ds, grid, coordinate_headers, [])
expected_values = np.array([[0.0, 100.0, 200.0], [1000.0, 1100.0, 1200.0]], dtype=np.float32)
assert np.allclose(ds_populated["cdp_same"].values, expected_values)
assert np.allclose(ds_populated["cdp_near"].values, expected_values)
# NOTE: ds_populated['cdp_near'].values[1][1]: np.float64(1100.0000000008847)
2 changes: 1 addition & 1 deletion tests/unit/v1/templates/test_seismic_3d_prestack_coca.py
@@ -117,7 +117,7 @@ def test_configuration_time(self) -> None:
t = Seismic3DPreStackCocaTemplate(domain="time")

# Template attributes
assert t._coord_dim_names == ("inline", "crossline", "offset", "azimuth")
assert t._coord_dim_names == ("inline", "crossline")
assert t._dim_names == ("inline", "crossline", "offset", "azimuth", "time")
assert t._coord_names == ("cdp_x", "cdp_y")
assert t._var_chunk_shape == (8, 8, 32, 1, 1024)