Finish porting tests

glotzerlab · janbridley · Jan 13, 2025 · Dec 20, 2024 · Dec 20, 2024 · Dec 20, 2024
commit 33134e8f3e1faa7293bfc9d7019d50b7e7e8761f
diff --git a/parsnip/oo.py b/parsnip/oo.py
@@ -12,8 +12,8 @@
 from numpy.lib.recfunctions import structured_to_unstructured
 
 from parsnip._errors import ParseWarning
-from parsnip.parse import _parsed_line_generator
-
+from parsnip.unitcells import _matrix_from_lengths_and_angles
+from parsnip.parse import cast_array_to_float, _safe_eval, _write_debug_output
 NONTABLE_LINE_PREFIXES = ("_", "#")
 
 
@@ -162,7 +162,7 @@ def _find_slice_in_tables(self, index: str):
             if index in table.dtype.names:
                 return table[index]
 
-    def get_from_tables(self, index: str | list[str]):
+    def get_from_tables(self, index: str | Iterable[str]):
         """Return a column or columns from the matching table in :meth:`~.self.tables`.
 
         If index is a single string, a single column will be returned from the matching
@@ -229,7 +229,7 @@ def get_from_tables(self, index: str | list[str]):
         "space_delimited_data": r"(\'[^\']*\'|\"[^\"]*\"]|[^\'\" \t]*)[ | \t]*",
     }
 
-    def __getitem__(self, key: str | list[str]):
+    def __getitem__(self, key: str | Iterable[str]):
         """Return an item from the dictionary of key-value pairs.
 
         Indexing with a string returns the value from the :meth:`~.pairs` dict. Indexing
@@ -241,6 +241,157 @@ def __getitem__(self, key: str | list[str]):
 
         return self.pairs[key]
 
+    def read_symmetry_operations(self):
+        r"""Extract the symmetry operations from a CIF file.
+
+        This method takes no arguments; the operations are read from the tables of
+        this object via :meth:`get_from_tables`.
+
+        Returns:
+            :math:`(N,)` :class:`numpy.ndarray[str]`:
+                Symmetry operations as strings.
+        """
+        symmetry_keys = (
+            "_symmetry_equiv_pos_as_xyz",
+            "_space_group_symop_operation_xyz",
+        )
+
+        # Only one of the two keys will be matched. We can safely ignore that warning.
+        warnings.filterwarnings("ignore", "Keys {'_", category=ParseWarning)
+        return self.get_from_tables(symmetry_keys)
+
+    def read_wyckoff_positions(self):
+        r"""Extract symmetry-irreducible, fractional X,Y,Z coordinates from a CIF file.
+
+        This method takes no arguments; the coordinates are read from the
+        ``_atom_site_fract_x``, ``_atom_site_fract_y``, and ``_atom_site_fract_z``
+        columns of this object's tables.
+
+        Returns:
+        --------
+            :math:`(N, 3)` :class:`numpy.ndarray[np.float64]`:
+                Fractional X,Y,Z coordinates of the unit cell.
+        """
+        xyz_keys = ("_atom_site_fract_x", "_atom_site_fract_y", "_atom_site_fract_z")
+        xyz_data = self.get_from_tables(xyz_keys)
+        xyz_data = cast_array_to_float(arr=xyz_data, dtype=np.float64)
+
+        return xyz_data
+
+    def read_cell_params(self, degrees: bool = True, mmcif: bool = False):
+        r"""Read the cell lengths and angles from a CIF file.
+
+        Parameters:
+        ----------
+            degrees (bool, optional):
+                When True, angles are returned in degrees (as per the cif spec). When False,
+                angles are converted to radians.
+                Default value = ``True``
+            mmcif (bool, optional):
+                When False, the standard CIF key naming is used (e.g. _cell_angle_alpha).
+                When True, the mmCIF standard is used instead (e.g. cell.angle_alpha).
+                Default value = ``False``
+
+        Returns:
+        --------
+            tuple:
+                The box vector lengths and angles in degrees or radians
+                :math:`(L_1, L_2, L_3, \alpha, \beta, \gamma)`.
+        """
+        if mmcif:
+            angle_keys = ("_cell.angle_alpha", "_cell.angle_beta", "_cell.angle_gamma")
+            box_keys = ("_cell.length_a", "_cell.length_b", "_cell.length_c") + angle_keys
+        else:
+            angle_keys = ("_cell_angle_alpha", "_cell_angle_beta", "_cell_angle_gamma")
+            box_keys = ("_cell_length_a", "_cell_length_b", "_cell_length_c") + angle_keys
+        cell_data = cast_array_to_float(arr=self[box_keys], dtype=np.float64)
+
+        assert all(value is not None for value in cell_data)
+        assert all(
+            0 < key < 180 for key in cell_data[3:]
+        ), "Read cell params were not in the expected range (0 < angle < 180 degrees)."
+
+        if not degrees:
+            cell_data[3:] = np.deg2rad(cell_data[3:])
+
+        return tuple(cell_data)
+
+    def extract_atomic_positions(
+        self,
+        fractional: bool = True,
+        n_decimal_places: int = 4,
+        verbose: bool = False,
+    ):
+        """Reconstruct atomic positions from Wyckoff sites and symmetry operations.
+
+        .. warning::
+
+            Reconstructing positions requires several floating point calculations that can
+            be impacted by low-precision data in CIF files. Typically, at least four decimal
+            places are required to accurately reconstruct complicated unit cells: less
+            precision than this can yield cells with duplicate or missing positions.
+
+        Args:
+            fractional (bool, optional):
+                Whether to return fractional or absolute coordinates.
+                Default value = ``True``
+            n_decimal_places (int, optional):
+                The number of decimal places to round each position to for the uniqueness
+                comparison. Values higher than 4 may not work for all CIF files.
+                Default value = ``4``
+            verbose (bool, optional):
+                Whether to print debug information about the uniqueness checks.
+                Default value = ``False``
+
+        Returns:
+            :math:`(N, 3)` :class:`numpy.ndarray[np.float32]`:
+                The full unit cell of the crystal structure.
+        """
+        fractional_positions = self.read_wyckoff_positions()
+
+        # Read the cell params and convert to a matrix of basis vectors
+        cell = self.read_cell_params(degrees=False, mmcif=False)
+        cell_matrix = _matrix_from_lengths_and_angles(*cell)
+
+        symops = self.read_symmetry_operations()
+        symops_str = np.array2string(
+            symops,
+            separator=",",  # Place a comma after each line in the array. Required for eval
+            threshold=np.inf,  # Ensure that every line is included in the string
+            floatmode="unique",  # Ensures strings can uniquely represent each float number
+        )
+
+        all_frac_positions = [_safe_eval(symops_str, *xyz) for xyz in fractional_positions]
+
+        pos = np.vstack(all_frac_positions)
+        pos %= 1  # Wrap particles into the box
+
+        # Filter unique points. This takes some time, but makes the method faster overall
+        _, unique_indices, unique_counts = np.unique(
+            pos.round(n_decimal_places), return_index=True, return_counts=True, axis=0
+        )
+
+        if verbose:
+            _write_debug_output(unique_indices, unique_counts, pos, check="Initial")
+
+        # Remove initial duplicates, then map to real space for a second check
+        pos = pos[unique_indices]
+        real_space_positions = pos @ cell_matrix
+
+        _, unique_indices, unique_counts = np.unique(
+            real_space_positions.round(n_decimal_places),
+            return_index=True,
+            return_counts=True,
+            axis=0,
+        )
+
+        if verbose:
+            _write_debug_output(unique_indices, unique_counts, pos, check="Secondary")
+
+        return pos[unique_indices] if fractional else real_space_positions[unique_indices]
+
+
+
     def _parse(self):
         """Parse the cif file into python objects."""
         data_iter = peekable(self._data.split("\n"))

diff --git a/parsnip/parse.py b/parsnip/parse.py
@@ -46,6 +46,8 @@
 
 """
 
+from __future__ import annotations
+
 import re
 import warnings
 
@@ -317,3 +319,66 @@ def read_key_value_pairs(
         )
 
     return data
+
+
+
+def _safe_eval(str_input: str, x: int | float, y: int | float, z: int | float):
+    """Attempt to safely evaluate a string of symmetry equivalent positions.
+
+    Python's ``eval`` is notoriously unsafe. While we could evaluate the entire list at
+    once, doing so carries some risk. The typical alternative, ``ast.literal_eval``,
+    does not work because we need to evaluate mathematical operations.
+
+    We first replace the x,y,z values with ordered fstring inputs, to simplify the input
+    of fractional coordinate data. This is done for convenience more than security.
+
+    Once we substitute in the x,y,z values, we should have a string version of a list
+    containing only numerics and math operators. We apply a substitution to ensure this
+    is the case, then perform one final check. If it passes, we evaluate the list. Note
+    that __builtins__ is set to {}, meaning importing functions is not possible. The
+    __locals__ dict is also set to {}, so no variables are accessible in the evaluation.
+
+    I cannot guarantee this is fully safe, but it at the very least makes it extremely
+    difficult to do any funny business.
+
+    Args:
+        str_input (str): String to be evaluated.
+        x (int|float): Fractional coordinate in :math:`x`.
+        y (int|float): Fractional coordinate in :math:`y`.
+        z (int|float): Fractional coordinate in :math:`z`.
+
+    Returns:
+        list[list[int|float,int|float,int|float]]:
+            :math:`(N,3)` list of fractional coordinates.
+
+    """
+    ordered_inputs = {"x": "{0:.20f}", "y": "{1:.20f}", "z": "{2:.20f}"}
+    # Replace any x, y, or z with the same character surrounded by curly braces. Then,
+    # perform substitutions to insert the actual values.
+    substituted_string = (
+        re.sub(r"([xyz])", r"{\1}", str_input).format(**ordered_inputs).format(x, y, z)
+    )
+
+    # Remove any unexpected characters from the string.
+    safe_string = re.sub(r"[^\d\[\]\,\+\-\/\*\.]", "", substituted_string)
+    # Double check to be sure:
+    assert all(char in ",.0123456789+-/*[]" for char in safe_string), (
+        "Evaluation aborted. Check that symmetry operation string only contains "
+        "numerics or characters in { [],.+-/ } and adjust `regex_filter` param "
+        "accordingly."
+    )
+    return eval(safe_string, {"__builtins__": {}}, {})  # noqa: S307
+
+def _write_debug_output(unique_indices, unique_counts, pos, check="Initial"):
+    print(f"{check} uniqueness check:")
+    if len(unique_indices) == len(pos):
+        print("... all points are unique (within tolerance).")
+    else:
+        print("(duplicate point, number of occurences)")
+        [
+            print(pt, count)
+            for pt, count in zip(pos[unique_indices], unique_counts)
+            if count > 1
+        ]
+
+    print()
diff --git a/pyproject.toml b/pyproject.toml
@@ -38,7 +38,8 @@ optional-dependencies = {tests = { file = ["tests/requirements.in"] }, doc = { f
 # Add percentage progress bar to the pytest console output
 console_output_style = "progress"
 # Specify the tests folder to speed up collection.
-testpaths = ["tests"]
+testpaths = ["parsnip", "tests"]
+addopts = "--doctest-modules"
 
 [tool.ruff]
 include = ["*.py", "*.ipynb"]

diff --git a/tests/conftest.py b/tests/conftest.py
@@ -6,6 +6,7 @@
 import pytest
 
 from parsnip.oo import CifFile
+from parsnip._errors import ParseWarning
 
 # ruff: noqa: N816. Allow mixed-case global variables
 
@@ -149,7 +150,7 @@ def random_keys_mark(n_samples=10):
     file=CifFile(data_file_path + "COD_1540955_aP16.cif"),
 )
 
-with warnings.catch_warnings(record=False):
+with pytest.warns(ParseWarning, match="cannot be resolved into a table"):
     pdb_4INS = CifData(
         filename=data_file_path + "PDB_4INS_head.cif",
         symop_keys=("_pdbx_struct_oper_list.symmetry_operation",),

diff --git a/tests/test_unitcells.py b/tests/test_unitcells.py
@@ -26,20 +26,18 @@ def test_read_wyckoff_positions(cif_data):
     if "PDB_4INS_head.cif" in cif_data.filename:
         return
     keys = ("_atom_site_fract_x", "_atom_site_fract_y", "_atom_site_fract_z")
-    parsnip_data = cif_data.file.get_from_tables(keys)
-    # parsnip_data = read_wyckoff_positions(filename=cif_data.filename)
+    parsnip_data = cif_data.file.read_wyckoff_positions()
     gemmi_data = _gemmi_read_table(cif_data.filename, keys)
-    # gemmi_data = [[cif.as_number(val) for val in row] for row in gemmi_data]
+    gemmi_data = [[cif.as_number(val) for val in row] for row in gemmi_data]
     np.testing.assert_array_equal(parsnip_data, gemmi_data)
 
 
 @cif_files_mark
 def test_read_cell_params(cif_data, keys=box_keys):
     mmcif = "PDB_4INS_head.cif" in cif_data.filename
-    # parsnip_data = read_cell_params(filename=cif_data.filename, mmcif=mmcif)
     if mmcif:
         keys = (key[0] + key[1:].replace("_", ".", 1) for key in keys)
-    parsnip_data = cif_data.file[keys]
+    parsnip_data = cif_data.file.read_cell_params(mmcif=mmcif)
     gemmi_data = _gemmi_read_keys(cif_data.filename, keys)
     np.testing.assert_array_equal(parsnip_data, gemmi_data)
 
@@ -49,10 +47,8 @@ def test_read_symmetry_operations(cif_data):
     if "PDB_4INS_head.cif" in cif_data.filename:
         return
 
-    parsnip_data = read_symmetry_operations(filename=cif_data.filename)
+    parsnip_data = cif_data.file.read_symmetry_operations()
     gemmi_data = _gemmi_read_table(filename=cif_data.filename, keys=cif_data.symop_keys)
-    # We clean up the data for easier processing: apply the same transformation to gemmi
-    gemmi_data = [[item.replace(" ", "") for item in row] for row in gemmi_data]
     np.testing.assert_array_equal(parsnip_data, gemmi_data)
 
 
@@ -69,8 +65,8 @@ def test_extract_atomic_positions(cif_data, n_decimal_places):
     if "PDB_4INS_head.cif" in cif_data.filename:
         pytest.skip("Function not compatible with PDB data.")
 
-    parsnip_positions = extract_atomic_positions(
-        filename=cif_data.filename, n_decimal_places=n_decimal_places, fractional=False
+    parsnip_positions = cif_data.file.extract_atomic_positions(
+        n_decimal_places=n_decimal_places, fractional=False
     )
 
     # Read the structure, then extract to Python builtin types. Then, wrap into the box
@@ -87,6 +83,6 @@ def test_extract_atomic_positions(cif_data, n_decimal_places):
 
     parsnip_minmax = [parsnip_positions.min(axis=0), parsnip_positions.max(axis=0)]
     ase_minmax = [ase_positions.min(axis=0), ase_positions.max(axis=0)]
-    np.testing.assert_allclose(parsnip_minmax, ase_minmax, atol=1e-6)
+    np.testing.assert_allclose(parsnip_minmax, ase_minmax, atol=1e-12)
 
     np.testing.assert_allclose(parsnip_positions, ase_positions, atol=1e-12)