Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
117 commits
Select commit Hold shift + click to select a range
1bd92e8
Nearly-working impl
janbridley Dec 20, 2024
da87f8c
Full working example
janbridley Dec 20, 2024
b28cc05
Clean up layout
janbridley Dec 20, 2024
2ae2d31
Further cleanup
janbridley Dec 20, 2024
a05d8b8
Lint OO
janbridley Dec 20, 2024
5bc5299
Run pre-commit on _errors.py
janbridley Dec 21, 2024
06fabc7
Add oo.py temp implementation
janbridley Dec 21, 2024
727f604
Undo changes to sample data
janbridley Dec 21, 2024
2752632
Lint oo.py
janbridley Dec 21, 2024
b643d5f
Remove change to sample data
janbridley Dec 21, 2024
c949095
Add oo to init.py
janbridley Dec 21, 2024
04280cc
Handle edge cases
janbridley Dec 21, 2024
0d53a30
Test parsing real files
janbridley Dec 21, 2024
e7855f1
Improve robustness of table reader
janbridley Dec 21, 2024
f7a9d75
Lint oo and conftest
janbridley Dec 21, 2024
9c63222
Clean up text and remove comments
janbridley Dec 21, 2024
aa78a68
Port initial test to new style
janbridley Dec 21, 2024
7d07b59
Port remaining key tests
janbridley Dec 21, 2024
56a4d50
Minor fixes
janbridley Dec 21, 2024
16f1655
Clean up test_key_reader.py
janbridley Dec 21, 2024
5e92141
Progress toward transition to recarray
janbridley Dec 21, 2024
a44a925
Increase tests and fix memory layout bug
janbridley Dec 22, 2024
b020262
Fixes to memory layout
janbridley Dec 22, 2024
ccb2aea
Convert table_reader tests
janbridley Dec 22, 2024
1e81654
Linting and doc fixes
janbridley Dec 22, 2024
97e0cbd
Clean up docs
janbridley Dec 22, 2024
33134e8
Finish porting tests
janbridley Dec 23, 2024
7bc9f86
Lints
janbridley Dec 23, 2024
c80a86a
Fix for scalar array inputs
janbridley Dec 24, 2024
e23762e
Remove unnecessary filterwarning
janbridley Dec 24, 2024
917685f
Expand on tests
janbridley Dec 24, 2024
933d4b9
Clean up unitcells
janbridley Dec 24, 2024
42151fa
Lint tests
janbridley Dec 24, 2024
8a5e73f
Finalize lints
janbridley Dec 24, 2024
12b73c5
Restructure patterns
janbridley Dec 24, 2024
8f945fd
Update test_patterns
janbridley Dec 24, 2024
0d50bc8
Lint and clean up
janbridley Dec 24, 2024
d39ce71
Final lint
janbridley Dec 24, 2024
4bfe1b3
Improve a few tests
janbridley Dec 24, 2024
cded448
Add symops to example cif
janbridley Dec 24, 2024
faf9816
Remove package-unitcells deprecated docs
janbridley Dec 24, 2024
efb80e8
Fix link in package-parse
janbridley Dec 24, 2024
86faca3
Update quickstart tutorial
janbridley Dec 24, 2024
867a37d
Move oo.py to parsnip.py
janbridley Dec 24, 2024
31a5b6c
Update README
janbridley Dec 24, 2024
9ceb2f9
Update Unitcells test imports
janbridley Dec 24, 2024
8ef7150
Lint
janbridley Dec 24, 2024
2dba4be
Lazily load file
janbridley Dec 24, 2024
65d6890
Remove unused files
janbridley Dec 24, 2024
8b6f99f
Skip bad_cif test
janbridley Dec 25, 2024
05895ea
Clean up tests
janbridley Dec 25, 2024
305a37a
Lint
janbridley Dec 25, 2024
3592f1e
Add tests for table_labels and cast_numerics
janbridley Dec 26, 2024
0cc212e
Clean up tests
janbridley Dec 26, 2024
79a1c85
Lint
janbridley Dec 26, 2024
a16c7e8
Clean up docstrings
janbridley Dec 26, 2024
716cc4c
Lint and update docstrings
janbridley Dec 27, 2024
f211007
Further docs
janbridley Dec 27, 2024
def08df
Codespell
janbridley Dec 27, 2024
a001b48
Tests for cell
janbridley Dec 27, 2024
a40fa7e
Lint
janbridley Dec 27, 2024
cf83de1
Update errors for read_unit_cell
janbridley Dec 27, 2024
ab12074
Clean up tests and todos
janbridley Dec 28, 2024
46d5577
More TODOs
janbridley Dec 28, 2024
b1e4388
Lint
janbridley Dec 28, 2024
e27c00b
Add test for cell property
janbridley Dec 28, 2024
4ced6ed
Remove modindex from sidebar
janbridley Dec 28, 2024
b8eb804
Consolidate logic for nonsimple data
janbridley Dec 28, 2024
7369f84
Lint
janbridley Dec 28, 2024
8b8a0b5
Fix type annotation in cast_array function
janbridley Dec 28, 2024
d6b4da4
Add more-itertools as official dependency
janbridley Dec 28, 2024
343150d
Clean up dependency documentation
janbridley Dec 28, 2024
30d7776
Add index for ase backward compatibility
janbridley Dec 28, 2024
e46fdf4
Change wording in development.rst
janbridley Dec 28, 2024
1871e50
Replace index specification
janbridley Dec 28, 2024
66701ff
Disable ASE test on python3.7
janbridley Dec 28, 2024
67e6b22
Fix version check
janbridley Dec 28, 2024
31eb83d
Add additional lints
janbridley Dec 28, 2024
31a9907
Document additional rules in pyproject.toml
janbridley Dec 28, 2024
2828a3b
Move PATTERNS dict to end of docs
janbridley Dec 28, 2024
54f316c
Clean up development.rst
janbridley Dec 28, 2024
dadb0b4
Expand with tests from additional databases
janbridley Dec 28, 2024
8672046
Disable lint that causes warning
janbridley Dec 28, 2024
08b35c1
Fix for multiline data entries
janbridley Dec 29, 2024
10a60fa
Progress toward multiline string parsing
janbridley Dec 29, 2024
f7402d4
Working impl that fails for blocks containing a semicolon
janbridley Dec 29, 2024
58339c5
Clean up
janbridley Dec 29, 2024
993f698
Messy working impl
janbridley Dec 29, 2024
88c3a1a
Clean up
janbridley Dec 29, 2024
602edf9
Retain newlines
janbridley Dec 29, 2024
0230b52
Lint
janbridley Dec 29, 2024
82f31c6
Add TODO
janbridley Dec 29, 2024
f029663
Add missing multiline keys
janbridley Dec 29, 2024
e648ce4
Wrap accumulator into a function
janbridley Dec 29, 2024
d643fad
Clean up _accumulate_nonsimple_data
janbridley Dec 29, 2024
85c39be
Clean up unused comments
janbridley Dec 29, 2024
a7a2468
Update changelog.rst
janbridley Dec 30, 2024
67b59d4
Fix version headings in changelog
janbridley Dec 30, 2024
0c14506
Update README to reflect correct CIF2.0 status
janbridley Dec 31, 2024
af0325d
Add CIFTEST data to gitignore
janbridley Dec 31, 2024
19a8173
Escape dash in regex and allow forward slash in data name
janbridley Dec 31, 2024
2fec769
Swap namedtuple to dataclass and clean up provided keys
janbridley Dec 31, 2024
5eb4bbf
Auto detect cif keys
janbridley Dec 31, 2024
6f30930
Allow pdb matrix keys
janbridley Dec 31, 2024
0ff9e9a
Generalize nonsimple data delimiters
janbridley Dec 31, 2024
bb143d3
Add architecture.md
janbridley Dec 31, 2024
c383131
Update table tests and fix regex for nonsimple data in tabs
janbridley Dec 31, 2024
e84e94a
Add pycifrw to test reqs
janbridley Dec 31, 2024
327d3f1
Lint tests
janbridley Dec 31, 2024
65a0015
Verify all table content
janbridley Jan 1, 2025
fdf1e21
Lint
janbridley Jan 1, 2025
f6051c9
import annotations
janbridley Jan 1, 2025
8b8df48
Remove unused pattern
janbridley Jan 1, 2025
d031882
Rename tables to loops
janbridley Jan 1, 2025
55e8817
Remove extra character from regex
janbridley Jan 1, 2025
f6517b0
Clean up table reader
janbridley Jan 1, 2025
357615a
Clean up
janbridley Jan 6, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Finish porting tests
  • Loading branch information
janbridley committed Dec 23, 2024
commit 33134e8f3e1faa7293bfc9d7019d50b7e7e8761f
159 changes: 155 additions & 4 deletions parsnip/oo.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@
from numpy.lib.recfunctions import structured_to_unstructured

from parsnip._errors import ParseWarning
from parsnip.parse import _parsed_line_generator

from parsnip.unitcells import _matrix_from_lengths_and_angles
from parsnip.parse import cast_array_to_float, _safe_eval, _write_debug_output
NONTABLE_LINE_PREFIXES = ("_", "#")


Expand Down Expand Up @@ -162,7 +162,7 @@ def _find_slice_in_tables(self, index: str):
if index in table.dtype.names:
return table[index]

def get_from_tables(self, index: str | list[str]):
def get_from_tables(self, index: str | Iterable[str]):
"""Return a column or columns from the matching table in :meth:`~.self.tables`.

If index is a single string, a single column will be returned from the matching
Expand Down Expand Up @@ -229,7 +229,7 @@ def get_from_tables(self, index: str | list[str]):
"space_delimited_data": r"(\'[^\']*\'|\"[^\"]*\"]|[^\'\" \t]*)[ | \t]*",
}

def __getitem__(self, key: str | list[str]):
def __getitem__(self, key: str | Iterable[str]):
"""Return an item from the dictionary of key-value pairs.

Indexing with a string returns the value from the :meth:`~.pairs` dict. Indexing
Expand All @@ -241,6 +241,157 @@ def __getitem__(self, key: str | list[str]):

return self.pairs[key]

def read_symmetry_operations(self):
    r"""Extract the symmetry operations from a CIF file.

    Both the ``_symmetry_equiv_pos_as_xyz`` and the
    ``_space_group_symop_operation_xyz`` keys are requested from
    :meth:`get_from_tables` in a single call.

    Returns:
        :math:`(N,)` :class:`numpy.ndarray[str]`:
            Symmetry operations as strings.
    """
    symmetry_keys = (
        "_symmetry_equiv_pos_as_xyz",
        "_space_group_symop_operation_xyz",
    )

    # Only one of the two keys will be matched. We can safely ignore that warning.
    # The filter is installed inside ``catch_warnings`` so it is removed again when
    # the lookup finishes: a bare ``filterwarnings`` call would modify the global
    # filter list and keep hiding matching ParseWarnings for the rest of the process.
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", "Keys {'_", category=ParseWarning)
        return self.get_from_tables(symmetry_keys)

def read_wyckoff_positions(self):
    r"""Extract symmetry-irreducible, fractional X,Y,Z coordinates from a CIF file.

    The ``_atom_site_fract_x``, ``_atom_site_fract_y`` and ``_atom_site_fract_z``
    columns are looked up with :meth:`get_from_tables` and the result is passed
    through ``cast_array_to_float`` with ``dtype=np.float64``.

    Returns:
    --------
        :math:`(N, 3)` :class:`numpy.ndarray`:
            Fractional X,Y,Z coordinates of the unit cell.
    """
    fractional_columns = self.get_from_tables(
        ("_atom_site_fract_x", "_atom_site_fract_y", "_atom_site_fract_z")
    )
    return cast_array_to_float(arr=fractional_columns, dtype=np.float64)

def read_cell_params(self, degrees: bool = True, mmcif: bool = False):
    r"""Read the cell lengths and angles from a CIF file.

    Parameters:
    -----------
        degrees (bool, optional):
            When True, angles are returned in degrees (as per the cif spec). When False,
            angles are converted to radians.
            Default value = ``True``
        mmcif (bool, optional):
            When False, the standard CIF key naming is used (e.g. _cell_angle_alpha).
            When True, the mmCIF standard is used instead (e.g. _cell.angle_alpha).
            Default value = ``False``

    Returns:
    --------
        tuple:
            The box vector lengths and angles in degrees or radians
            :math:`(L_1, L_2, L_3, \alpha, \beta, \gamma)`.

    Raises:
    -------
        AssertionError:
            If any of the six values is ``None``, or if an angle does not satisfy
            ``0 < angle < 180`` (checked in degrees, before any conversion).
    """
    if mmcif:
        angle_keys = ("_cell.angle_alpha", "_cell.angle_beta", "_cell.angle_gamma")
        box_keys = ("_cell.length_a", "_cell.length_b", "_cell.length_c") + angle_keys
    else:
        angle_keys = ("_cell_angle_alpha", "_cell_angle_beta", "_cell_angle_gamma")
        box_keys = ("_cell_length_a", "_cell_length_b", "_cell_length_c") + angle_keys
    cell_data = cast_array_to_float(arr=self[box_keys], dtype=np.float64)

    # Validation is written as explicit raises rather than ``assert`` statements so
    # that it still runs under ``python -O``. AssertionError is kept as the exception
    # type so existing callers observe the same failure as before.
    if any(value is None for value in cell_data):
        raise AssertionError("One or more cell parameters could not be read.")
    if not all(0 < angle < 180 for angle in cell_data[3:]):
        raise AssertionError(
            "Read cell params were not in the expected range (0 < angle < 180 degrees)."
        )

    if not degrees:
        cell_data[3:] = np.deg2rad(cell_data[3:])

    return tuple(cell_data)

def extract_atomic_positions(
    self,
    fractional: bool = True,
    n_decimal_places: int = 4,
    verbose: bool = False,
):
    """Reconstruct atomic positions from Wyckoff sites and symmetry operations.

    Every symmetry operation is applied to every Wyckoff site, the images are wrapped
    into the unit cell, and duplicates are removed twice: once in fractional
    coordinates and once more after mapping to real space.

    .. warning::

        Reconstructing positions requires several floating point calculations that can
        be impacted by low-precision data in CIF files. Typically, at least four decimal
        places are required to accurately reconstruct complicated unit cells: less
        precision than this can yield cells with duplicate or missing positions.

    Args:
        fractional (bool, optional):
            Whether to return fractional or absolute coordinates.
            Default value = ``True``
        n_decimal_places (int, optional):
            The number of decimal places to round each position to for the uniqueness
            comparison. Values higher than 4 may not work for all CIF files.
            Default value = ``4``
        verbose (bool, optional):
            Whether to print debug information about the uniqueness checks.
            Default value = ``False``

    Returns:
        :math:`(N, 3)` :class:`numpy.ndarray`:
            The full unit cell of the crystal structure.
    """

    def _unique_rows(points):
        # First-occurrence index and multiplicity of each distinct (rounded) row.
        _, first_seen, multiplicity = np.unique(
            points.round(n_decimal_places),
            return_index=True,
            return_counts=True,
            axis=0,
        )
        return first_seen, multiplicity

    wyckoff_sites = self.read_wyckoff_positions()

    # Cell lengths and angles (in radians) are converted to a matrix of basis vectors
    cell_matrix = _matrix_from_lengths_and_angles(
        *self.read_cell_params(degrees=False, mmcif=False)
    )

    symops_str = np.array2string(
        self.read_symmetry_operations(),
        separator=",",  # Comma between entries, required for the later evaluation
        threshold=np.inf,  # Never summarize: every operation must appear in the string
        floatmode="unique",  # Floats are printed so that each is uniquely represented
    )

    # Apply all symmetry operations to each Wyckoff site, then stack the images
    frac = np.vstack([_safe_eval(symops_str, *site) for site in wyckoff_sites])
    frac %= 1  # Wrap particles into the box

    # First pass: drop duplicates in fractional space. This takes some time, but makes
    # the method faster overall
    keep, counts = _unique_rows(frac)
    if verbose:
        _write_debug_output(keep, counts, frac, check="Initial")
    frac = frac[keep]

    # Second pass: map to real space and check for duplicates again
    cartesian = frac @ cell_matrix
    keep, counts = _unique_rows(cartesian)
    if verbose:
        _write_debug_output(keep, counts, frac, check="Secondary")

    if fractional:
        return frac[keep]
    return cartesian[keep]



def _parse(self):
"""Parse the cif file into python objects."""
data_iter = peekable(self._data.split("\n"))
Expand Down
65 changes: 65 additions & 0 deletions parsnip/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@

"""

from __future__ import annotations

import re
import warnings

Expand Down Expand Up @@ -317,3 +319,66 @@ def read_key_value_pairs(
)

return data



def _safe_eval(str_input: str, x: int | float, y: int | float, z: int | float):
    """Attempt to safely evaluate a string of symmetry equivalent positions.

    Python's ``eval`` is notoriously unsafe. While we could evaluate the entire list at
    once, doing so carries some risk. The typical alternative, ``ast.literal_eval``,
    does not work because we need to evaluate mathematical operations.

    Every ``x``, ``y`` or ``z`` character in the input is first replaced by the
    corresponding coordinate, written in fixed-point notation with 20 decimal places.

    Once the values are substituted, we should have a string version of a list
    containing only numerics and math operators. We strip every other character, then
    perform one final check. If it passes, we evaluate the list. Note that
    ``__builtins__`` is set to ``{}``, meaning importing functions is not possible. The
    locals mapping is also ``{}``, so no variables are accessible in the evaluation.

    I cannot guarantee this is fully safe, but it at the very least makes it extremely
    difficult to do any funny business.

    Args:
        str_input (str): String to be evaluated.
        x (int|float): Fractional coordinate in :math:`x`.
        y (int|float): Fractional coordinate in :math:`y`.
        z (int|float): Fractional coordinate in :math:`z`.

    Returns:
        list[list[int|float,int|float,int|float]]:
            :math:`(N,3)` list of fractional coordinates.

    Raises:
        AssertionError:
            If a character outside ``,.0123456789+-/*[]`` survives the filtering step.
    """
    # Substitute in a single pass with a callback. Going through ``str.format`` twice
    # would treat any brace already present in the input as a format field and fail
    # before the filtering below gets a chance to strip it.
    coordinates = {"x": f"{x:.20f}", "y": f"{y:.20f}", "z": f"{z:.20f}"}
    substituted_string = re.sub(
        r"[xyz]", lambda match: coordinates[match.group()], str_input
    )

    # Remove any unexpected characters from the string.
    safe_string = re.sub(r"[^\d\[\]\,\+\-\/\*\.]", "", substituted_string)

    # Double check to be sure. ``\d`` also matches non-ASCII digits, so this check can
    # genuinely fail. It is an explicit raise (not ``assert``) so that the guard in
    # front of ``eval`` is not stripped when Python runs with ``-O``.
    if not all(char in ",.0123456789+-/*[]" for char in safe_string):
        raise AssertionError(
            "Evaluation aborted. Check that symmetry operation string only contains "
            "numerics or characters in { [],.+-/ } and adjust `regex_filter` param "
            "accordingly."
        )

    # NOTE(review): ``*`` is permitted, so ``**`` passes the filter as well. Chained
    # exponents from an untrusted file could be very expensive to evaluate; consider
    # rejecting ``**`` if inputs are not trusted.
    return eval(safe_string, {"__builtins__": {}}, {})  # noqa: S307

def _write_debug_output(unique_indices, unique_counts, pos, check="Initial"):
    """Print a summary of a uniqueness check to stdout.

    Args:
        unique_indices: Indices into ``pos`` of the points that were kept.
        unique_counts: Number of occurrences of each kept point, in the same order as
            ``unique_indices``.
        pos: Array of points that was checked; it is indexed with ``unique_indices``.
        check (str, optional):
            Label printed in the header line.
            Default value = ``"Initial"``
    """
    print(f"{check} uniqueness check:")
    if len(unique_indices) == len(pos):
        print("... all points are unique (within tolerance).")
    else:
        print("(duplicate point, number of occurences)")
        # Plain loop: printing is a side effect, so no throwaway list is built.
        for pt, count in zip(pos[unique_indices], unique_counts):
            if count > 1:
                print(pt, count)

    print()
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,8 @@ optional-dependencies = {tests = { file = ["tests/requirements.in"] }, doc = { f
# Add percentage progress bar to the pytest console output
console_output_style = "progress"
# Specify the tests folder to speed up collection.
testpaths = ["tests"]
testpaths = ["parsnip", "tests"]
addopts = "--doctest-modules"

[tool.ruff]
include = ["*.py", "*.ipynb"]
Expand Down
3 changes: 2 additions & 1 deletion tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import pytest

from parsnip.oo import CifFile
from parsnip._errors import ParseWarning

# ruff: noqa: N816. Allow mixed-case global variables

Expand Down Expand Up @@ -149,7 +150,7 @@ def random_keys_mark(n_samples=10):
file=CifFile(data_file_path + "COD_1540955_aP16.cif"),
)

with warnings.catch_warnings(record=False):
with pytest.warns(ParseWarning, match="cannot be resolved into a table"):
pdb_4INS = CifData(
filename=data_file_path + "PDB_4INS_head.cif",
symop_keys=("_pdbx_struct_oper_list.symmetry_operation",),
Expand Down
18 changes: 7 additions & 11 deletions tests/test_unitcells.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,20 +26,18 @@ def test_read_wyckoff_positions(cif_data):
if "PDB_4INS_head.cif" in cif_data.filename:
return
keys = ("_atom_site_fract_x", "_atom_site_fract_y", "_atom_site_fract_z")
parsnip_data = cif_data.file.get_from_tables(keys)
# parsnip_data = read_wyckoff_positions(filename=cif_data.filename)
parsnip_data = cif_data.file.read_wyckoff_positions()
gemmi_data = _gemmi_read_table(cif_data.filename, keys)
# gemmi_data = [[cif.as_number(val) for val in row] for row in gemmi_data]
gemmi_data = [[cif.as_number(val) for val in row] for row in gemmi_data]
np.testing.assert_array_equal(parsnip_data, gemmi_data)


@cif_files_mark
def test_read_cell_params(cif_data, keys=box_keys):
mmcif = "PDB_4INS_head.cif" in cif_data.filename
# parsnip_data = read_cell_params(filename=cif_data.filename, mmcif=mmcif)
if mmcif:
keys = (key[0] + key[1:].replace("_", ".", 1) for key in keys)
parsnip_data = cif_data.file[keys]
parsnip_data = cif_data.file.read_cell_params(mmcif=mmcif)
gemmi_data = _gemmi_read_keys(cif_data.filename, keys)
np.testing.assert_array_equal(parsnip_data, gemmi_data)

Expand All @@ -49,10 +47,8 @@ def test_read_symmetry_operations(cif_data):
if "PDB_4INS_head.cif" in cif_data.filename:
return

parsnip_data = read_symmetry_operations(filename=cif_data.filename)
parsnip_data = cif_data.file.read_symmetry_operations()
gemmi_data = _gemmi_read_table(filename=cif_data.filename, keys=cif_data.symop_keys)
# We clean up the data for easier processing: apply the same transformation to gemmi
gemmi_data = [[item.replace(" ", "") for item in row] for row in gemmi_data]
np.testing.assert_array_equal(parsnip_data, gemmi_data)


Expand All @@ -69,8 +65,8 @@ def test_extract_atomic_positions(cif_data, n_decimal_places):
if "PDB_4INS_head.cif" in cif_data.filename:
pytest.skip("Function not compatible with PDB data.")

parsnip_positions = extract_atomic_positions(
filename=cif_data.filename, n_decimal_places=n_decimal_places, fractional=False
parsnip_positions = cif_data.file.extract_atomic_positions(
n_decimal_places=n_decimal_places, fractional=False
)

# Read the structure, then extract to Python builtin types. Then, wrap into the box
Expand All @@ -87,6 +83,6 @@ def test_extract_atomic_positions(cif_data, n_decimal_places):

parsnip_minmax = [parsnip_positions.min(axis=0), parsnip_positions.max(axis=0)]
ase_minmax = [ase_positions.min(axis=0), ase_positions.max(axis=0)]
np.testing.assert_allclose(parsnip_minmax, ase_minmax, atol=1e-6)
np.testing.assert_allclose(parsnip_minmax, ase_minmax, atol=1e-12)

np.testing.assert_allclose(parsnip_positions, ase_positions, atol=1e-12)
Loading