Restructure patterns

glotzerlab · janbridley · Jan 13, 2025 · Dec 20, 2024 · Dec 20, 2024 · Dec 20, 2024
commit 12b73c5d61ed7a6e6276495f0d64b0c6ec40ba91
diff --git a/parsnip/oo.py b/parsnip/oo.py
@@ -1,5 +1,49 @@
 # Copyright (c) 2024, Glotzer Group
 # This file is from the parsnip project, released under the BSD 3-Clause License.
+r"""An interface for reading CIF files in Python.
+
+.. include:: ../../README.rst
+    :start-after: .. _parse:
+    :end-before: .. _installing:
+
+.. admonition:: The CIF Format
+
+    This is an example of a simple CIF file. A `key`_ (data name or tag) must start with
+    an underscore, and is separated from the data value with whitespace characters.
+    A `table`_ begins with the ``loop_`` keyword, and contains a header block and a data
+    block. The vertical position of a tag in the table headings corresponds with the
+    horizontal position of the associated column in the table values.
+
+    .. code-block:: text
+
+        # Key-value pairs describing the unit cell:
+        _cell_length_a  5.40
+        _cell_length_b  3.43
+        _cell_length_c  5.08
+        _cell_angle_alpha  90.0
+        _cell_angle_beta  132.3
+        _cell_angle_gamma  90.0
+
+        # A table with two columns and eight rows:
+        loop_
+        _symmetry_equiv_pos_site_id
+        _symmetry_equiv_pos_as_xyz
+        1  x,y,z
+        2  -x,y,-z
+        3  -x,-y,-z
+        4  x,-y,z
+        5  x+1/2,y+1/2,z
+        6  -x+1/2,y+1/2,-z
+        7  -x+1/2,-y+1/2,-z
+        8  x+1/2,-y+1/2,z
+
+        _symmetry_space_group_name_H-M  'C2 / m' # One more key-value pair
+
+
+.. _key: https://www.iucr.org/resources/cif/spec/version1.1/cifsyntax#definitions
+.. _table: https://www.iucr.org/resources/cif/spec/version1.1/cifsyntax#onelevel
+
+"""
 
 from __future__ import annotations
 
@@ -12,69 +56,23 @@
 from numpy.lib.recfunctions import structured_to_unstructured
 
 from parsnip._errors import ParseWarning
-from parsnip.parse import (
-    _parsed_line_generator,
+from parsnip.patterns import (
+    _dtype_from_int,
+    _is_data,
+    _is_key,
+    _line_is_continued,
     _safe_eval,
+    _semicolon_to_string,
+    _strip_comments,
     _write_debug_output,
     cast_array_to_float,
 )
 from parsnip.unitcells import _matrix_from_lengths_and_angles
+# from parsnip.patterns import 
 
 NONTABLE_LINE_PREFIXES = ("_", "#")
 
 
-def _is_key(line: str | None):
-    return line is not None and line.strip()[:1] == "_"
-
-
-def _is_data(line: str | None):
-    return line is not None and line.strip()[:1] != "_" and line.strip()[:5] != "loop_"
-
-
-def _strip_comments(s: str):
-    return s.split("#")[0].strip()
-
-
-def _strip_quotes(s: str):
-    return s.replace("'", "").replace('"', "")
-
-
-def _dtype_from_int(i: int):
-    return f"<U{i}"
-
-
-def _semicolon_to_string(line: str):
-    if "'" in line and '"' in line:
-        warnings.warn(
-            (
-                "String contains single and double quotes - "
-                "line may be parsed incorrectly"
-            ),
-            stacklevel=2,
-        )
-    # WARNING: because we split our string, we strip "\n" implicitly
-    # This is technically against spec, but is almost never meaningful
-    return line.replace(";", "'" if "'" not in line else '"')
-
-
-def _line_is_continued(line: str | None):
-    return line is not None and line.strip()[:1] == ";"
-
-
-def _try_cast_to_numeric(s: str):
-    """Attempt to cast a string to a number, returning the original string if invalid.
-
-    This method attempts to convert to a float first, followed by an int. Precision
-    measurements and indicators of significant digits are stripped.
-    """
-    parsed = re.match(r"(\d+\.?\d*)", s.strip())
-    if parsed is None or re.search(r"[^0-9\.\(\)]", s):
-        return s
-    elif "." in parsed.group(0):
-        return float(parsed.group(0))
-    else:
-        return int(parsed.group(0))
-
 
 class CifFile:
     """Parser for CIF files."""

diff --git a/parsnip/patterns.py b/parsnip/patterns.py
@@ -8,10 +8,76 @@
 of string data extracted from CIF files by methods in ``parsnip.parse``.
 
 """
+from __future__ import annotations
+
 import re
+import warnings
 
 import numpy as np
 
+from parsnip._errors import ParseWarning
+
+def _safe_eval(str_input: str, x: int | float, y: int | float, z: int | float):
+    """Attempt to safely evaluate a string of symmetry equivalent positions.
+
+    Python's ``eval`` is notoriously unsafe. While we could evaluate the entire list at
+    once, doing so carries some risk. The typical alternative, ``ast.literal_eval``,
+    does not work because we need to evaluate mathematical operations.
+
+    We first replace the x,y,z values with ordered fstring inputs, to simplify the input
+    of fractional coordinate data. This is done for convenience more than security.
+
+    Once we substitute in the x,y,z values, we should have a string version of a list
+    containing only numerics and math operators. We apply a substitution to ensure this
+    is the case, then perform one final check. If it passes, we evaluate the list. Note
+    that __builtins__ is set to {}, meaning importing functions is not possible. The
+    locals dict is also set to {}, so no variables are accessible in the evaluation.
+
+    I cannot guarantee this is fully safe, but it at the very least makes it extremely
+    difficult to do any funny business.
+
+    Args:
+        str_input (str): String to be evaluated.
+        x (int|float): Fractional coordinate in :math:`x`.
+        y (int|float): Fractional coordinate in :math:`y`.
+        z (int|float): Fractional coordinate in :math:`z`.
+
+    Returns:
+        list[list[int|float,int|float,int|float]]:
+            :math:`(N,3)` list of fractional coordinates.
+
+    """
+    ordered_inputs = {"x": "{0:.20f}", "y": "{1:.20f}", "z": "{2:.20f}"}
+    # Replace any x, y, or z with the same character surrounded by curly braces. Then,
+    # perform substitutions to insert the actual values.
+    substituted_string = (
+        re.sub(r"([xyz])", r"{\1}", str_input).format(**ordered_inputs).format(x, y, z)
+    )
+
+    # Remove any unexpected characters from the string.
+    safe_string = re.sub(r"[^\d\[\]\,\+\-\/\*\.]", "", substituted_string)
+    # Double check to be sure:
+    assert all(char in ",.0123456789+-/*[]" for char in safe_string), (
+        "Evaluation aborted. Check that symmetry operation string only contains "
+        "numerics or characters in { [],.+-/ } and adjust `regex_filter` param "
+        "accordingly."
+    )
+    return eval(safe_string, {"__builtins__": {}}, {})  # noqa: S307
+
+
+def _write_debug_output(unique_indices, unique_counts, pos, check="Initial"):
+    print(f"{check} uniqueness check:")
+    if len(unique_indices) == len(pos):
+        print("... all points are unique (within tolerance).")
+    else:
+        print("(duplicate point, number of occurences)")
+        [
+            print(pt, count)
+            for pt, count in zip(pos[unique_indices], unique_counts)
+            if count > 1
+        ]
+
+    print()
 
 def cast_array_to_float(arr: np.ndarray, dtype: type = np.float32):
     """Cast a Numpy array to a dtype, pruning significant digits from numerical values.
@@ -100,3 +166,56 @@ def __call__(self, line: str):
         for pattern, replacement in zip(self.patterns, self.replacements):
             line = pattern.sub(replacement, line)
         return line
+
+def _is_key(line: str | None):
+    return line is not None and line.strip()[:1] == "_"
+
+
+def _is_data(line: str | None):
+    return line is not None and line.strip()[:1] != "_" and line.strip()[:5] != "loop_"
+
+
+def _strip_comments(s: str):
+    return s.split("#")[0].strip()
+
+
+def _strip_quotes(s: str):
+    return s.replace("'", "").replace('"', "")
+
+
+def _dtype_from_int(i: int):
+    return f"<U{i}"
+
+
+def _semicolon_to_string(line: str):
+    if "'" in line and '"' in line:
+        warnings.warn(
+            (
+                "String contains single and double quotes - "
+                "line may be parsed incorrectly"
+            ),
+            ParseWarning,
+            stacklevel=2,
+        )
+    # WARNING: because we split our string, we strip "\n" implicitly
+    # This is technically against spec, but is almost never meaningful
+    return line.replace(";", "'" if "'" not in line else '"')
+
+
+def _line_is_continued(line: str | None):
+    return line is not None and line.strip()[:1] == ";"
+
+
+def _try_cast_to_numeric(s: str):
+    """Attempt to cast a string to a number, returning the original string if invalid.
+
+    This method attempts to convert to a float first, followed by an int. Precision
+    measurements and indicators of significant digits are stripped.
+    """
+    parsed = re.match(r"(\d+\.?\d*)", s.strip())
+    if parsed is None or re.search(r"[^0-9\.\(\)]", s):
+        return s
+    elif "." in parsed.group(0):
+        return float(parsed.group(0))
+    else:
+        return int(parsed.group(0))