Skip to content
55 changes: 33 additions & 22 deletions src/mdio/converters/segy.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@
from mdio.segy.utilities import get_grid_plan

if TYPE_CHECKING:
from typing import Any

from segy.arrays import HeaderArray as SegyHeaderArray
from segy.schema import SegySpec
from xarray import Dataset as xr_Dataset
Expand Down Expand Up @@ -113,25 +115,30 @@ def grid_density_qc(grid: Grid, num_traces: int) -> None:


def _scan_for_headers(
segy_file: SegyFile, template: AbstractDatasetTemplate
segy_file: SegyFile,
template: AbstractDatasetTemplate,
grid_overrides: dict[str, Any] | None = None,
) -> tuple[list[Dimension], SegyHeaderArray]:
"""Extract trace dimensions and index headers from the SEG-Y file.

This is an expensive operation.
It scans the SEG-Y file in chunks by using ProcessPoolExecutor
"""
# TODO(Dmitriy): implement grid overrides
# https://github.com/TGSAI/mdio-python/issues/585
# The 'grid_chunksize' is used only for grid_overrides
# While we do not support grid override, we can set it to None
grid_chunksize = None
segy_dimensions, chunksize, segy_headers = get_grid_plan(
full_chunk_size = template.full_chunk_size
segy_dimensions, chunk_size, segy_headers = get_grid_plan(
segy_file=segy_file,
return_headers=True,
template=template,
chunksize=grid_chunksize,
grid_overrides=None,
chunksize=full_chunk_size,
grid_overrides=grid_overrides,
)
if full_chunk_size != chunk_size:
# TODO(Dmitriy): implement grid overrides
# https://github.com/TGSAI/mdio-python/issues/585
# The returned 'chunksize' is used only for grid_overrides. We will need to use it when full
# support for grid overrides is implemented
err = "Support for changing full_chunk_size in grid overrides is not yet implemented"
raise NotImplementedError(err)
return segy_dimensions, segy_headers


Expand Down Expand Up @@ -278,7 +285,7 @@ def _populate_coordinates(
return dataset, drop_vars_delayed


def _add_text_binary_headers(dataset: Dataset, segy_file: SegyFile) -> None:
def _add_segy_ingest_attributes(dataset: Dataset, segy_file: SegyFile, grid_overrides: dict[str, Any] | None) -> None:
text_header = segy_file.text_header.splitlines()
# Validate:
# text_header this should be a 40-items array of strings with width of 80 characters.
Expand All @@ -301,23 +308,27 @@ def _add_text_binary_headers(dataset: Dataset, segy_file: SegyFile) -> None:

# Handle case where it may not have any metadata yet
if dataset.metadata.attributes is None:
dataset.attrs["attributes"] = {}
dataset.metadata.attributes = {}

segy_attributes = {
"textHeader": text_header,
"binaryHeader": segy_file.binary_header.to_dict(),
}

if grid_overrides is not None:
segy_attributes["gridOverrides"] = grid_overrides

# Update the attributes with the text and binary headers.
dataset.metadata.attributes.update(
{
"textHeader": text_header,
"binaryHeader": segy_file.binary_header.to_dict(),
}
)
dataset.metadata.attributes.update(segy_attributes)


def segy_to_mdio(
def segy_to_mdio( # noqa PLR0913
segy_spec: SegySpec,
mdio_template: AbstractDatasetTemplate,
input_location: StorageLocation,
output_location: StorageLocation,
overwrite: bool = False,
grid_overrides: dict[str, Any] | None = None,
) -> None:
"""A function that converts a SEG-Y file to an MDIO v1 file.

Expand All @@ -329,6 +340,7 @@ def segy_to_mdio(
input_location: The storage location of the input SEG-Y file.
output_location: The storage location for the output MDIO v1 file.
overwrite: Whether to overwrite the output file if it already exists. Defaults to False.
grid_overrides: Option to add grid overrides.

Raises:
FileExistsError: If the output location already exists and overwrite is False.
Expand All @@ -340,12 +352,11 @@ def segy_to_mdio(
segy_settings = SegySettings(storage_options=input_location.options)
segy_file = SegyFile(url=input_location.uri, spec=segy_spec, settings=segy_settings)

# Scan the SEG-Y file for headers
segy_dimensions, segy_headers = _scan_for_headers(segy_file, mdio_template)
segy_dimensions, segy_headers = _scan_for_headers(segy_file, mdio_template, grid_overrides)

grid = _build_and_check_grid(segy_dimensions, segy_file, segy_headers)

dimensions, non_dim_coords = _get_coordinates(grid, segy_headers, mdio_template)
_, non_dim_coords = _get_coordinates(grid, segy_headers, mdio_template)
# TODO(Altay): Turn this dtype into packed representation
# https://github.com/TGSAI/mdio-python/issues/601
headers = to_structured_type(segy_spec.trace.header.dtype)
Expand All @@ -358,7 +369,7 @@ def segy_to_mdio(
headers=headers,
)

_add_text_binary_headers(dataset=mdio_ds, segy_file=segy_file)
_add_segy_ingest_attributes(dataset=mdio_ds, segy_file=segy_file, grid_overrides=grid_overrides)

xr_dataset: xr_Dataset = to_xarray_dataset(mdio_ds=mdio_ds)

Expand Down
5 changes: 5 additions & 0 deletions src/mdio/schemas/v1/templates/abstract_dataset_template.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,11 @@ def coordinate_names(self) -> list[str]:
"""Returns the names of the coordinates."""
return copy.deepcopy(self._coord_names)

@property
def full_chunk_size(self) -> list[int]:
"""Returns the chunk size for the variables."""
return copy.deepcopy(self._var_chunk_shape)

@property
@abstractmethod
def _name(self) -> str:
Expand Down
63 changes: 41 additions & 22 deletions tests/integration/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@
from segy.schema import SegySpec


def _segy_spec_mock_4d() -> SegySpec:
"""Create a mock SEG-Y spec for 4D data."""
def get_segy_mock_4d_spec() -> SegySpec:
"""Create a mock 4D SEG-Y specification."""
trace_header_fields = [
HeaderField(name="field_rec_no", byte=9, format="int32"),
HeaderField(name="channel", byte=13, format="int32"),
Expand All @@ -31,6 +31,13 @@ def _segy_spec_mock_4d() -> SegySpec:
HeaderField(name="shot_line", byte=133, format="int16"),
HeaderField(name="cable", byte=137, format="int16"),
HeaderField(name="gun", byte=171, format="int16"),
HeaderField(name="coordinate_scalar", byte=71, format="int16"),
HeaderField(name="source_coord_x", byte=73, format="int32"),
HeaderField(name="source_coord_y", byte=77, format="int32"),
HeaderField(name="group_coord_x", byte=81, format="int32"),
HeaderField(name="group_coord_y", byte=85, format="int32"),
HeaderField(name="cdp_x", byte=181, format="int32"),
HeaderField(name="cdp_y", byte=185, format="int32"),
]
rev1_spec = get_segy_standard(1.0)
spec = rev1_spec.customize(trace_header_fields=trace_header_fields)
Expand Down Expand Up @@ -83,33 +90,45 @@ def create_segy_mock_4d( # noqa: PLR0913
channel_headers = np.tile(channel_headers, shot_count)

factory = SegyFactory(
spec=_segy_spec_mock_4d(),
spec=get_segy_mock_4d_spec(),
sample_interval=1000,
samples_per_trace=num_samples,
)

headers = factory.create_trace_header_template(trace_count)
samples = factory.create_trace_sample_template(trace_count)

for trc_idx in range(trace_count):
shot = shot_headers[trc_idx]
gun = gun_headers[trc_idx]
cable = cable_headers[trc_idx]
channel = channel_headers[trc_idx]
shot_line = 1
offset = 0

if index_receivers is False:
channel, gun, shot_line = 0, 0, 0

header_data = (shot, channel, shot, offset, shot_line, cable, gun)

fields = list(headers.dtype.names)
fields.remove("samples_per_trace")
fields.remove("sample_interval")

headers[fields][trc_idx] = header_data
samples[trc_idx] = np.linspace(start=shot, stop=shot + 1, num=num_samples)
start_x = 700000
start_y = 4000000
step_x = 100
step_y = 100

for trc_shot_idx in range(shot_count):
for trc_chan_idx in range(total_chan):
trc_idx = trc_shot_idx * total_chan + trc_chan_idx

shot = shot_headers[trc_idx]
gun = gun_headers[trc_idx]
cable = cable_headers[trc_idx]
channel = channel_headers[trc_idx]
shot_line = 1
offset = 0

if index_receivers is False:
channel, gun, shot_line = 0, 0, 0

# Assign dimension coordinate fields with calculated mock data
header_fields = ["field_rec_no", "channel", "shot_point", "offset", "shot_line", "cable", "gun"]
headers[header_fields][trc_idx] = (shot, channel, shot, offset, shot_line, cable, gun)

# Assign coordinate fields with mock data
x = start_x + step_x * trc_shot_idx
y = start_y + step_y * trc_chan_idx
headers["coordinate_scalar"][trc_idx] = -100
coord_fields = ["source_coord_x", "source_coord_y", "group_coord_x", "group_coord_y", "cdp_x", "cdp_y"]
headers[coord_fields][trc_idx] = (x, y) * 3

samples[trc_idx] = np.linspace(start=shot, stop=shot + 1, num=num_samples)

with segy_path.open(mode="wb") as fp:
fp.write(factory.create_textual_header())
Expand Down
Loading
Loading