Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
117 commits
Select commit Hold shift + click to select a range
1bd92e8
Nearly-working impl
janbridley Dec 20, 2024
da87f8c
Full working example
janbridley Dec 20, 2024
b28cc05
Clean up layout
janbridley Dec 20, 2024
2ae2d31
Further cleanup
janbridley Dec 20, 2024
a05d8b8
Lint OO
janbridley Dec 20, 2024
5bc5299
Run pre-commit on _errors.py
janbridley Dec 21, 2024
06fabc7
Add oo.py temp implementation
janbridley Dec 21, 2024
727f604
Undo changes to sample data
janbridley Dec 21, 2024
2752632
Lint oo.py
janbridley Dec 21, 2024
b643d5f
Remove change to sample data
janbridley Dec 21, 2024
c949095
Add oo to init.py
janbridley Dec 21, 2024
04280cc
Handle edge cases
janbridley Dec 21, 2024
0d53a30
Test parsing real files
janbridley Dec 21, 2024
e7855f1
Improve robustness of table reader
janbridley Dec 21, 2024
f7a9d75
Lint oo and conftest
janbridley Dec 21, 2024
9c63222
Clean up text and remove comments
janbridley Dec 21, 2024
aa78a68
Port initial test to new style
janbridley Dec 21, 2024
7d07b59
Port remaining key tests
janbridley Dec 21, 2024
56a4d50
Minor fixes
janbridley Dec 21, 2024
16f1655
Clean up test_key_reader.py
janbridley Dec 21, 2024
5e92141
Progress toward transition to recarray
janbridley Dec 21, 2024
a44a925
Increase tests and fix memory layout bug
janbridley Dec 22, 2024
b020262
Fixes to memory layout
janbridley Dec 22, 2024
ccb2aea
Convert table_reader tests
janbridley Dec 22, 2024
1e81654
Linting and doc fixes
janbridley Dec 22, 2024
97e0cbd
Clean up docs
janbridley Dec 22, 2024
33134e8
Finish porting tests
janbridley Dec 23, 2024
7bc9f86
Lints
janbridley Dec 23, 2024
c80a86a
Fix for scalar array inputs
janbridley Dec 24, 2024
e23762e
Remove unnecessary filterwarning
janbridley Dec 24, 2024
917685f
Expand on tests
janbridley Dec 24, 2024
933d4b9
Clean up unitcells
janbridley Dec 24, 2024
42151fa
Lint tests
janbridley Dec 24, 2024
8a5e73f
Finalize lints
janbridley Dec 24, 2024
12b73c5
Restructure patterns
janbridley Dec 24, 2024
8f945fd
Update test_patterns
janbridley Dec 24, 2024
0d50bc8
Lint and clean up
janbridley Dec 24, 2024
d39ce71
Final lint
janbridley Dec 24, 2024
4bfe1b3
Improve a few tests
janbridley Dec 24, 2024
cded448
Add symops to example cif
janbridley Dec 24, 2024
faf9816
Remove package-unitcells deprecated docs
janbridley Dec 24, 2024
efb80e8
Fix link in package-parse
janbridley Dec 24, 2024
86faca3
Update quickstart tutorial
janbridley Dec 24, 2024
867a37d
Move oo.py to parsnip.py
janbridley Dec 24, 2024
31a5b6c
Update README
janbridley Dec 24, 2024
9ceb2f9
Update Unitcells test imports
janbridley Dec 24, 2024
8ef7150
Lint
janbridley Dec 24, 2024
2dba4be
Lazily load file
janbridley Dec 24, 2024
65d6890
Remove unused files
janbridley Dec 24, 2024
8b6f99f
Skip bad_cif test
janbridley Dec 25, 2024
05895ea
Clean up tests
janbridley Dec 25, 2024
305a37a
Lint
janbridley Dec 25, 2024
3592f1e
Add tests for table_labels and cast_numerics
janbridley Dec 26, 2024
0cc212e
Clean up tests
janbridley Dec 26, 2024
79a1c85
Lint
janbridley Dec 26, 2024
a16c7e8
Clean up docstrings
janbridley Dec 26, 2024
716cc4c
Lint and update docstrings
janbridley Dec 27, 2024
f211007
Further docs
janbridley Dec 27, 2024
def08df
Codespell
janbridley Dec 27, 2024
a001b48
Tests for cell
janbridley Dec 27, 2024
a40fa7e
Lint
janbridley Dec 27, 2024
cf83de1
Update errors for read_unit_cell
janbridley Dec 27, 2024
ab12074
Clean up tests and todos
janbridley Dec 28, 2024
46d5577
More TODOs
janbridley Dec 28, 2024
b1e4388
Lint
janbridley Dec 28, 2024
e27c00b
Add test for cell property
janbridley Dec 28, 2024
4ced6ed
Remove modindex from sidebar
janbridley Dec 28, 2024
b8eb804
Consolidate logic for nonsimple data
janbridley Dec 28, 2024
7369f84
Lint
janbridley Dec 28, 2024
8b8a0b5
Fix type annotation in cast_array function
janbridley Dec 28, 2024
d6b4da4
Add more-itertools as official dependency
janbridley Dec 28, 2024
343150d
Clean up dependency documentation
janbridley Dec 28, 2024
30d7776
Add index for ase backward compatibility
janbridley Dec 28, 2024
e46fdf4
Change wording in development.rst
janbridley Dec 28, 2024
1871e50
Replace index specification
janbridley Dec 28, 2024
66701ff
Disable ASE test on python3.7
janbridley Dec 28, 2024
67e6b22
Fix version check
janbridley Dec 28, 2024
31eb83d
Add additional lints
janbridley Dec 28, 2024
31a9907
Document additional rules in pyproject.toml
janbridley Dec 28, 2024
2828a3b
Move PATTERNS dict to end of docs
janbridley Dec 28, 2024
54f316c
Clean up development.rst
janbridley Dec 28, 2024
dadb0b4
Expand with tests from additional databases
janbridley Dec 28, 2024
8672046
Disable lint that causes warning
janbridley Dec 28, 2024
08b35c1
Fix for multiline data entries
janbridley Dec 29, 2024
10a60fa
Progress toward multiline string parsing
janbridley Dec 29, 2024
f7402d4
Working impl that fails for blocks containing a semicolon
janbridley Dec 29, 2024
58339c5
Clean up
janbridley Dec 29, 2024
993f698
Messy working impl
janbridley Dec 29, 2024
88c3a1a
Clean up
janbridley Dec 29, 2024
602edf9
Retain newlines
janbridley Dec 29, 2024
0230b52
Lint
janbridley Dec 29, 2024
82f31c6
Add TODO
janbridley Dec 29, 2024
f029663
Add missing multiline keys
janbridley Dec 29, 2024
e648ce4
Wrap accumulator into a function
janbridley Dec 29, 2024
d643fad
Clean up _accumulate_nonsimple_data
janbridley Dec 29, 2024
85c39be
Clean up unused comments
janbridley Dec 29, 2024
a7a2468
Update changelog.rst
janbridley Dec 30, 2024
67b59d4
Fix version headings in changelog
janbridley Dec 30, 2024
0c14506
Update README to reflect correct CIF2.0 status
janbridley Dec 31, 2024
af0325d
Add CIFTEST data to gitignore
janbridley Dec 31, 2024
19a8173
Escape dash in regex and allow forward slash in data name
janbridley Dec 31, 2024
2fec769
Swap namedtuple to dataclass and clean up provided keys
janbridley Dec 31, 2024
5eb4bbf
Auto detect cif keys
janbridley Dec 31, 2024
6f30930
Allow pdb matrix keys
janbridley Dec 31, 2024
0ff9e9a
Generalize nonsimple data delimiters
janbridley Dec 31, 2024
bb143d3
Add architecture.md
janbridley Dec 31, 2024
c383131
Update table tests and fix regex for nonsimple data in tabs
janbridley Dec 31, 2024
e84e94a
Add pycifrw to test reqs
janbridley Dec 31, 2024
327d3f1
Lint tests
janbridley Dec 31, 2024
65a0015
Verify all table content
janbridley Jan 1, 2025
fdf1e21
Lint
janbridley Jan 1, 2025
f6051c9
import annotations
janbridley Jan 1, 2025
8b8df48
Remove unused pattern
janbridley Jan 1, 2025
d031882
Rename tables to loops
janbridley Jan 1, 2025
55e8817
Remove extra character from regex
janbridley Jan 1, 2025
f6517b0
Clean up table reader
janbridley Jan 1, 2025
357615a
Clean up
janbridley Jan 6, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Rename tables to loops
  • Loading branch information
janbridley committed Jan 1, 2025
commit d031882bd91301e63a9de4dbd814d1f22af58c74
12 changes: 6 additions & 6 deletions doc/source/quickstart.rst
Original file line number Diff line number Diff line change
Expand Up @@ -88,18 +88,18 @@ This segment of the table shown above contains the table data, with 6 columns an
.. _structured arrays: https://numpy.org/doc/stable/user/basics.rec.html

Now, let's read the table. `parsnip` stores data as Numpy `structured arrays`_, which
allow for a dict-like access of data columns. The :attr:`~.tables` property returns a
list of such arrays, although the :attr:`~.get_from_tables` method is often more
allow for dict-like access of data columns. The :attr:`~.loops` property returns a
list of such arrays, although the :attr:`~.get_from_loops` method is often more
convenient.


.. code-block:: python


len(cif.tables)
len(cif.loops)
... 2

cif.tables[0]
cif.loops[0]
... array(
... [[('Cu1', '0.0000000000', '0.0000000000', '0.0000000000', 'Cu', 'a')]],
... dtype=[
Expand All @@ -112,12 +112,12 @@ convenient.
... ]
... )

cif.tables[0]["_atom_site_label"]
cif.loops[0]["_atom_site_label"]
... array([['Cu1']], dtype='<U12')


# (Unstructured) slices of loops can be easily accessed!
xyz = cif.get_from_tables(["_atom_site_fract_x", "_atom_site_fract_y", "_atom_site_fract_z"])
xyz = cif.get_from_loops(["_atom_site_fract_x", "_atom_site_fract_y", "_atom_site_fract_z"])

print(xyz)
... array([['0.0000000000', '0.0000000000', '0.0000000000']], dtype='<U12')
Expand Down
104 changes: 53 additions & 51 deletions parsnip/parsnip.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,16 +88,16 @@ class CifFile:
>>> from parsnip import CifFile
>>> cif = CifFile("doc/source/example_file.cif")
>>> print(cif)
CifFile(fn=doc/source/example_file.cif) : 12 data entries, 2 data tables
CifFile(fn=doc/source/example_file.cif) : 12 data entries, 2 data loops

Data entries are accessible via the :attr:`~.pairs` and :attr:`~.tables` attributes:
Data entries are accessible via the :attr:`~.pairs` and :attr:`~.loops` attributes:

>>> cif.pairs
{'_journal_year': '1999', '_journal_page_first': '0', ...}
>>> cif.tables[0]
>>> cif.loops[0]
array([[('Cu1', '0.0000000000', '0.0000000000', '0.0000000000', 'Cu', 'a')]],
dtype=...)
>>> cif.tables[1]
>>> cif.loops[1]
array([[('1', 'x,y,z')],
[('96', 'z,y+1/2,x+1/2')],
[('118', 'z+1/2,-y,x+1/2')],
Expand All @@ -106,7 +106,7 @@ class CifFile:

.. tip::

See the docs for :attr:`__getitem__` and :attr:`get_from_tables` to query
See the docs for :attr:`__getitem__` and :attr:`get_from_loops` to query
for data by key or column label!

Parameters
Expand All @@ -121,13 +121,13 @@ class CifFile:
def __init__(self, fn: str, cast_values: bool = False):
"""Create a CifFile object from a filename.

On construction, the entire file is parsed into key-value pairs and data tables.
On construction, the entire file is parsed into key-value pairs and data loops.
Comment lines are ignored.

"""
self._fn = fn
self._pairs = {}
self._tables = []
self._loops = []

self._cpat = {k: re.compile(pattern) for (k, pattern) in self.PATTERNS.items()}
self._cast_values = cast_values
Expand All @@ -149,8 +149,8 @@ def pairs(self):
return self._pairs

@property
def tables(self):
"""A list of data tables extracted from the file.
def loops(self):
"""A list of data tables (:code:``loop_``'s) extracted from the file.

These are stored as `numpy structured arrays`_, which can be indexed by column
labels. See the :attr:`~.structured_to_unstructured` helper function below for
Expand All @@ -163,23 +163,23 @@ def tables(self):
list[:class:`numpy.ndarray[str]`]:
A list of structured arrays containing table data from the file.
"""
return self._tables
return self._loops

@property
def table_labels(self):
"""A list of column labels for each data array.

This property is equivalent to :code:`[arr.dtype.names for arr in self.tables]`.
This property is equivalent to :code:`[arr.dtype.names for arr in self.loops]`.

Returns
-------
list[list[str]]:
Column labels for :attr:`~.tables`, stored as a nested list of strings.
Column labels for :attr:`~.loops`, stored as a nested list of strings.
"""
return [arr.dtype.names for arr in self.tables]
return [arr.dtype.names for arr in self.loops]

def get_from_tables(self, index: ArrayLike):
"""Return a column or columns from the matching table in :meth:`~.self.tables`.
def get_from_loops(self, index: ArrayLike):
"""Return a column or columns from the matching table in :attr:`~.loops`.

If index is a single string, a single column will be returned from the matching
table. If index is an Iterable of strings, the corresponding table slices will
Expand All @@ -188,7 +188,7 @@ def get_from_tables(self, index: ArrayLike):

.. tip::

It is highly recommended that queries across multiple tables are provided in
It is highly recommended that queries across multiple loops are provided in
separated calls to this function. This helps ensure output data is ordered
as expected and allows for easier handling of cases where non-matching keys
are provided.
Expand All @@ -198,7 +198,7 @@ def get_from_tables(self, index: ArrayLike):
-------
Extract a single column from a single table:

>>> cif.get_from_tables("_symmetry_equiv_pos_as_xyz")
>>> cif.get_from_loops("_symmetry_equiv_pos_as_xyz")
array([['x,y,z'],
['z,y+1/2,x+1/2'],
['z+1/2,-y,x+1/2'],
Expand All @@ -207,17 +207,17 @@ def get_from_tables(self, index: ArrayLike):
Extract multiple columns from a single table:

>>> table_1_cols = ["_symmetry_equiv_pos_site_id", "_symmetry_equiv_pos_as_xyz"]
>>> cif.get_from_tables(table_1_cols)
>>> cif.get_from_loops(table_1_cols)
array([['1', 'x,y,z'],
['96', 'z,y+1/2,x+1/2'],
['118', 'z+1/2,-y,x+1/2'],
['192', 'z+1/2,y+1/2,x']], dtype='<U14')

Extract multiple columns from multiple tables:
Extract multiple columns from multiple loops:

>>> table_1_cols = ["_symmetry_equiv_pos_site_id", "_symmetry_equiv_pos_as_xyz"]
>>> table_2_cols = ["_atom_site_type_symbol", "_atom_site_Wyckoff_label"]
>>> [cif.get_from_tables(cols) for cols in (table_1_cols, table_2_cols)]
>>> [cif.get_from_loops(cols) for cols in (table_1_cols, table_2_cols)]
[array([['1', 'x,y,z'],
['96', 'z,y+1/2,x+1/2'],
['118', 'z+1/2,-y,x+1/2'],
Expand All @@ -228,14 +228,14 @@ def get_from_tables(self, index: ArrayLike):
.. caution::

Returned arrays will match the ordering of input ``index`` keys if all
indices correspond to a single table. Indices that match multiple tables
will return all possible matches, in the order of the input tables. Lists of
input that correspond with multiple tables will return data from those
tables *in the order they were read from the file.*
indices correspond to a single table. Indices that match multiple loops
will return all possible matches, in the order of the input loops. Lists of
input that correspond with multiple loops will return data from those
loops *in the order they were read from the file.*

Case where ordering of output matches the input file, not the provided keys:

>>> cif.get_from_tables([*table_1_cols, *table_2_cols])
>>> cif.get_from_loops([*table_1_cols, *table_2_cols])
[array([['Cu', 'a']], dtype='<U12'),
array([['1', 'x,y,z'],
['96', 'z,y+1/2,x+1/2'],
Expand All @@ -256,7 +256,7 @@ def get_from_tables(self, index: ArrayLike):
"""
index = np.atleast_1d(index)
result = []
for table in self.tables:
for table in self.loops:
matches = index[np.any(index[:, None] == table.dtype.names, axis=1)]
if len(matches) == 0:
continue
Expand Down Expand Up @@ -400,7 +400,7 @@ def read_symmetry_operations(self):
)

# Only one key is valid in each standard, so we only ever get one match.
return self.get_from_tables(symmetry_keys)
return self.get_from_loops(symmetry_keys)

def read_wyckoff_positions(self):
r"""Extract symmetry-irreducible, fractional x,y,z coordinates from a CIF file.
Expand All @@ -414,12 +414,13 @@ def read_wyckoff_positions(self):
"""
xyz_keys = ("_atom_site_fract_x", "_atom_site_fract_y", "_atom_site_fract_z")

return cast_array_to_float(arr=self.get_from_tables(xyz_keys), dtype=float)
return cast_array_to_float(arr=self.get_from_loops(xyz_keys), dtype=float)

def build_unit_cell(
self,
fractional: bool = True,
n_decimal_places: int = 4,
wrap_coords: bool = True, # TODO: docs
verbose: bool = False,
):
"""Reconstruct atomic positions from Wyckoff sites and symmetry operations.
Expand Down Expand Up @@ -480,7 +481,8 @@ def build_unit_cell(
]

pos = np.vstack(all_frac_positions)
pos %= 1 # Wrap particles into the box
if wrap_coords:
pos %= 1 # Wrap particles into the box

# Filter unique points. This takes some time but makes the method faster overall
_, unique_indices, unique_counts = np.unique(
Expand Down Expand Up @@ -546,7 +548,7 @@ def cell(self):
def structured_to_unstructured(cls, arr: np.ndarray):
"""Convert a structured (column-labeled) array to a standard unstructured array.

This is useful when extracting entire tables from :attr:`~.tables` for use in
This is useful when extracting entire loops from :attr:`~.loops` for use in
other programs. This classmethod simply calls
:code:`np.lib.recfunctions.structured_to_unstructured` on the input data to
ensure the resulting array is properly laid out in memory. See
Expand Down Expand Up @@ -598,81 +600,81 @@ def _parse(self, data_iter: Iterable):
)

# Build up tables by incrementing through the iterator =====================
table = re.match(self._cpat["table_delimiter"], line)
loop = re.match(self._cpat["loop_delimiter"], line)

if table is not None:
table_keys, table_data = [], []
if loop is not None:
loop_keys, loop_data = [], []

# First, extract table headers. Must be prefixed with underscore
line_groups = table.groups()
if line_groups[-1] != "": # Extract table keys from the _loop line
line_groups = loop.groups()
if line_groups[-1] != "": # Extract loop keys from the _loop line
fragment = _strip_comments(line_groups[-1].strip())
if fragment[:1] == "_":
keys = self._cpat["key_list"].findall(fragment)
table_keys.extend(keys if keys is not None else [])
loop_keys.extend(keys if keys is not None else [])
else:
continue

while _is_key(data_iter.peek(None)):
line = _accumulate_nonsimple_data(
data_iter, _strip_comments(next(data_iter))
)
table_keys.extend(self._cpat["key_list"].findall(line))
loop_keys.extend(self._cpat["key_list"].findall(line))

while _is_data(data_iter.peek(None)):
line = _accumulate_nonsimple_data(
data_iter, _strip_comments(next(data_iter))
)
parsed_line = self._cpat["space_delimited_data"].findall(line)
parsed_line = [m for m in parsed_line if m != ""]
table_data.extend([parsed_line] if parsed_line else [])
loop_data.extend([parsed_line] if parsed_line else [])

n_elements, n_cols = (
sum(len(row) for row in table_data),
len(table_keys),
sum(len(row) for row in loop_data),
len(loop_keys),
)

if n_cols == 0:
continue # Skip empty tables

if n_elements % n_cols != 0:
warnings.warn(
f"Parsed data for table {len(self.tables)+1} cannot be resolved"
f"Parsed data for table {len(self.loops)+1} cannot be resolved"
f" into a table of the expected size and will be ignored. "
f"Got n={n_elements} items, expected c={n_cols} columns: "
f"n%c={n_elements % n_cols}).",
category=ParseWarning,
stacklevel=2,
)
continue
if not all(len(key) == len(table_keys[0]) for key in table_keys):
table_data = np.array([*flatten(table_data)]).reshape(-1, n_cols)
dt = _dtype_from_int(max(max(len(s) for s in l) for l in table_data))
if not all(len(key) == len(loop_keys[0]) for key in loop_keys):
loop_data = np.array([*flatten(loop_data)]).reshape(-1, n_cols)
dt = _dtype_from_int(max(max(len(s) for s in l) for l in loop_data))

if len(set(table_keys)) < len(table_keys):
if len(set(loop_keys)) < len(loop_keys):
warnings.warn(
"Duplicate keys detected - table will not be processed.",
category=ParseWarning,
stacklevel=2,
)
continue

rectable = np.atleast_2d(table_data)
rectable.dtype = [*zip(table_keys, [dt] * n_cols)]
rectable = np.atleast_2d(loop_data)
rectable.dtype = [*zip(loop_keys, [dt] * n_cols)]
rectable = rectable.reshape(rectable.shape, order="F")
self.tables.append(rectable)
self.loops.append(rectable)

if data_iter.peek(None) is None:
break

def __repr__(self):
n_pairs = len(self.pairs)
n_tabs = len(self.tables)
return f"CifFile(fn={self._fn}) : {n_pairs} data entries, {n_tabs} data tables"
n_tabs = len(self.loops)
return f"CifFile(fn={self._fn}) : {n_pairs} data entries, {n_tabs} data loops"

PATTERNS: ClassVar = {
"key_value_general": r"^(_[\w\.\-/|\[\d\]]+)\s+([^#]+)",
"table_delimiter": r"([Ll][Oo][Oo][Pp]_)[ |\t]*([^\n]*)",
"loop_delimiter": r"([Ll][Oo][Oo][Pp]_)[ |\t]*([^\n]*)",
"block_delimiter": r"([Dd][Aa][Tt][Aa]_)[ |\t]*([^\n]*)",
"key_list": r"_[\w_\.*]+[\[\d\]]*",
"space_delimited_data": r"(\;[^\;]*\;|\'[^\']*\'|\"[^\"]*\"]|[^\'\"\;\s]*)\s*",
Expand Down
4 changes: 2 additions & 2 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,9 @@
@dataclass
class CifData:
filename: str
symop_keys: tuple[str, ...]
atom_site_keys: tuple[str, ...]
file: CifFile
symop_keys: tuple[str, ...] = ()
atom_site_keys: tuple[str, ...] = ()
failing: tuple[str, ...] = ()
"""Test cases that DO NOT read properly."""
manual_keys: tuple[str, ...] = ()
Expand Down
Loading