molstar
diff --git a/ciftools/binary/__init__.py b/ciftools/binary/__init__.py
Lines changed: 0 additions & 2 deletions
diff --git a/ciftools/binary/data.py b/ciftools/binary/data.py
Lines changed: 173 additions & 0 deletions
diff --git a/ciftools/binary/encoding/data_types.py b/ciftools/binary/data_types.py
rename from ciftools/binary/encoding/data_types.py
rename to ciftools/binary/data_types.py
Lines changed: 1 addition & 1 deletion
diff --git a/ciftools/binary/decoder.py b/ciftools/binary/decoder.py
Lines changed: 11 additions & 18 deletions
diff --git a/ciftools/binary/encoding/types.py b/ciftools/binary/encoded_data.py
rename from ciftools/binary/encoding/types.py
rename to ciftools/binary/encoded_data.py
Lines changed: 4 additions & 3 deletions
@@ -1,2 +0,0 @@
-from ciftools.binary.decoder import decode_cif_data
-from ciftools.binary.writer import BinaryCIFWriter
@@ -0,0 +1,173 @@
+from typing import Any, Dict, List, Optional, Union
+
+import numpy as np
+from ciftools.binary.decoder import decode_cif_data
+from ciftools.binary.encoded_data import EncodedCIFCategory, EncodedCIFColumn, EncodedCIFFile
+from ciftools.models.data import CIFCategory, CIFColumn, CIFDataBlock, CIFFile, CIFValuePresenceEnum
+
+
+class BinaryCIFColumn(CIFColumn):
+    """Column of a BinaryCIF category backed by already-decoded arrays.
+
+    ``values`` holds one entry per row. ``value_presence`` is an optional
+    array indexed by the same row numbers; where its entry is non-zero,
+    ``column[row]`` yields ``None`` instead of the stored value. ``None``
+    for ``value_presence`` means no row is masked.
+    """
+
+    def __init__(
+        self,
+        name: str,
+        values: np.ndarray,
+        value_presence: Optional[np.ndarray],
+    ):
+        self.name = name
+        self._values = values
+        self._value_presence = value_presence
+        self._row_count = len(values)
+
+    def get_string(self, row: int) -> str:
+        """Return the value at ``row`` converted with ``str``."""
+        return str(self._values[row])
+
+    def get_integer(self, row: int) -> int:
+        """Return the value at ``row`` converted with ``int``."""
+        return int(self._values[row])
+
+    def get_float(self, row: int) -> float:
+        """Return the value at ``row`` converted with ``float``."""
+        return float(self._values[row])
+
+    def get_value_presence(self, row: int) -> CIFValuePresenceEnum:
+        """Return the presence flag for ``row`` (``0`` when there is no mask)."""
+        # Compare against None explicitly: the truth value of a NumPy array
+        # with more than one element raises ValueError.
+        if self._value_presence is not None:
+            return self._value_presence[row]
+        return 0  # type: ignore
+
+    def are_values_equal(self, row_a: int, row_b: int) -> bool:
+        """Return whether the values stored at the two rows compare equal."""
+        # bool() turns a NumPy bool_ into the plain bool the signature promises.
+        return bool(self._values[row_a] == self._values[row_b])
+
+    def string_equals(self, row: int, value: str) -> bool:
+        """Return whether the ``str`` form of the value at ``row`` equals ``value``."""
+        return str(self._values[row]) == value
+
+    def as_ndarray(
+        self, *, dtype: Optional[Union[np.dtype, str]] = None, start: Optional[int] = None, end: Optional[int] = None
+    ) -> np.ndarray:
+        """Return the values as an array, optionally sliced and/or converted.
+
+        With no arguments the internal array itself is returned (no copy).
+        ``start``/``end`` select ``values[start:end]``; ``dtype`` converts the
+        selected rows with ``astype``.
+        """
+        if start is None and end is None:
+            selected = self._values
+        else:
+            selected = self._values[start:end]
+        return selected if dtype is None else selected.astype(dtype)
+
+    def __getitem__(self, idx: Any) -> Any:
+        """Return ``values[idx]``, or ``None`` for a masked row when ``idx`` is an ``int``."""
+        # Explicit None check: truth-testing the mask array itself is an error
+        # for any mask with more than one element.
+        if isinstance(idx, int) and self._value_presence is not None and self._value_presence[idx]:
+            return None
+        return self._values[idx]
+
+    def __len__(self):
+        """Return the number of rows."""
+        return self._row_count
+
+    @property
+    def value_presences(self) -> Optional[np.ndarray]:
+        """The raw presence mask, or ``None`` if the column has none."""
+        return self._value_presence
+
+
+def _decode_cif_column(column: EncodedCIFColumn) -> CIFColumn:
+    """Decode an encoded column's data (and mask, if any) into a ``BinaryCIFColumn``."""
+    decoded_values = decode_cif_data(column["data"])
+    encoded_mask = column["mask"]
+    if encoded_mask:
+        presence = decode_cif_data(encoded_mask)
+    else:
+        # No mask stored for this column.
+        presence = None
+    return BinaryCIFColumn(column["name"], decoded_values, presence)
+
+
+class BinaryCIFCategory(CIFCategory):
+    """Category of a BinaryCIF data block with optionally lazy column decoding.
+
+    When ``lazy`` is true, a column is decoded the first time it is requested
+    through ``category[name]`` and then cached; otherwise every column is
+    decoded in the constructor.
+    """
+
+    def __getitem__(self, name: str) -> BinaryCIFColumn:
+        """Return the decoded column called ``name``.
+
+        Raises:
+            ValueError: if the category has no column with that name.
+        """
+        if name not in self._field_cache:
+            raise ValueError(f"{name} is not a valid category name")
+
+        # Check for None rather than falsiness: a decoded column with zero
+        # rows has len() == 0 and would otherwise be decoded again on every
+        # access.
+        if self._field_cache[name] is None:
+            self._field_cache[name] = _decode_cif_column(self._columns[name])
+
+        return self._field_cache[name]  # type: ignore
+
+    def __contains__(self, key: str):
+        """Return whether the category has a column called ``key``."""
+        return key in self._columns
+
+    def __init__(self, category: EncodedCIFCategory, lazy: bool):
+        self._field_names = [c["name"] for c in category["columns"]]
+        # None marks a column that has not been decoded yet.
+        self._field_cache = {c["name"]: None if lazy else _decode_cif_column(c) for c in category["columns"]}
+        self._columns: dict[str, EncodedCIFColumn] = {c["name"]: c for c in category["columns"]}
+        self._n_columns = len(category["columns"])
+        self._n_rows = category["rowCount"]
+        # Drops the first character of the encoded name (presumably the
+        # leading "_" of CIF category names - confirm against the format).
+        self._name = category["name"][1:]
+
+    @property
+    def name(self) -> str:
+        """Category name with the first character of the encoded name removed."""
+        return self._name
+
+    @property
+    def n_rows(self) -> int:
+        """Row count taken from the encoded category's ``rowCount``."""
+        return self._n_rows
+
+    @property
+    def n_columns(self) -> int:
+        """Number of columns in the encoded category."""
+        return self._n_columns
+
+    @property
+    def field_names(self) -> List[str]:
+        """Column names in the order they appear in the encoded category."""
+        return self._field_names
+
+
+class BinaryCIFDataBlock(CIFDataBlock):
+    """Data block of a BinaryCIF file: a header plus categories keyed by name."""
+
+    def __init__(self, header: str, categories: Dict[str, BinaryCIFCategory]):
+        self._header = header
+        self._categories = categories
+
+    def __contains__(self, key: str):
+        """Return whether a category called ``key`` exists in this block."""
+        return key in self._categories
+
+    def __getitem__(self, name: str) -> CIFCategory:
+        """Return the category called ``name`` (plain dict lookup)."""
+        category = self._categories[name]
+        return category
+
+    @property
+    def categories(self) -> Dict[str, CIFCategory]:
+        """Mapping from category name to category."""
+        return self._categories  # type: ignore
+
+    @property
+    def header(self) -> str:
+        """Header string this block was created with."""
+        return self._header
+
+
+class BinaryCIFFile(CIFFile):
+    """In-memory BinaryCIF file: an ordered list of data blocks, also indexed by header."""
+
+    def __init__(self, data_blocks: List[BinaryCIFDataBlock]):
+        self._data_blocks = data_blocks
+        # Later blocks overwrite earlier ones that share a header.
+        header_index: dict[str, CIFDataBlock] = {}
+        for block in data_blocks:
+            header_index[block.header] = block
+        self._block_map = header_index
+
+    def __getitem__(self, index_or_name: Union[int, str]):
+        """Look a block up by header (``str``) or position; ``None`` if absent.
+
+        Positions outside ``0 <= index < len(data_blocks)`` give ``None``.
+        """
+        if isinstance(index_or_name, str):
+            return self._block_map.get(index_or_name)
+
+        in_range = index_or_name < len(self.data_blocks) and index_or_name >= 0
+        if not in_range:
+            return None
+        return self.data_blocks[index_or_name]
+
+    def __len__(self):
+        """Return the number of data blocks."""
+        return len(self._data_blocks)
+
+    def __contains__(self, key: str):
+        """Return whether a block with header ``key`` exists."""
+        return key in self._block_map
+
+    @staticmethod
+    def from_data(data: EncodedCIFFile, *, lazy=True) -> "BinaryCIFFile":
+        """
+        Build a file from its encoded representation.
+
+        - lazy:
+            - True: individual columns are decoded only when accessed
+            - False: decode all columns immediately
+
+        Raises ValueError when the encoded version is older than 0.3.0.
+        """
+
+        min_version = (0, 3, 0)
+        version = tuple(map(int, data["version"].split(".")))
+        if version < min_version:
+            raise ValueError(f"Invalid version {data['version']}, expected >={'.'.join(map(str, min_version))}")
+
+        decoded_blocks = []
+        for block in data["dataBlocks"]:
+            header = block["header"]
+            # Categories are keyed by their encoded name minus its first character.
+            by_name = {}
+            for category in block["categories"]:
+                by_name[category["name"][1:]] = BinaryCIFCategory(category, lazy)
+            decoded_blocks.append(BinaryCIFDataBlock(header, by_name))
+
+        return BinaryCIFFile(decoded_blocks)
+
+    @property
+    def data_blocks(self) -> List[CIFDataBlock]:
+        """Data blocks in file order."""
+        return self._data_blocks  # type: ignore
@@ -35,7 +35,7 @@ class DataType:
 
     @staticmethod
     def from_dtype(dtype: Union[np.dtype, str]) -> DataTypeEnum:
-        t = str(dtype.str)
+        t = dtype if isinstance(dtype, str) else str(dtype.str)
         if t[0] in (">", "<", "|"):
             t = t[1:]
         return DataTypeEnum(DataType.__dtypes_to_data_types[t])
 
@@ -1,8 +1,7 @@
-from typing import Union
-
 import numpy as np
-from ciftools.binary.encoding.data_types import DataType
-from ciftools.binary.encoding.encodings import (
+from ciftools.binary.data_types import DataType
+from ciftools.binary.encoded_data import EncodedCIFData
+from ciftools.binary.encoding_types import (
     ByteArrayEncoding,
     DeltaEncoding,
     FixedPointEncoding,
@@ -11,30 +10,21 @@
     RunLengthEncoding,
     StringArrayEncoding,
 )
-from ciftools.binary.encoding.types import EncodedCIFColumn, EncodedCIFData
-from ciftools.cif_format.base import CIFColumnBase
-from ciftools.cif_format.binary.column import BinaryCIFColumn
-
-
-def decode_cif_column(column: EncodedCIFColumn) -> CIFColumnBase:
-    values = decode_cif_data(column["data"])
-    value_kinds = decode_cif_data(column["mask"]) if column["mask"] else None  # type: ignore
-    return BinaryCIFColumn(column["name"], values, value_kinds)  # type: ignore
 
 
-def decode_cif_data(encoded_data: EncodedCIFData) -> Union[np.ndarray, list[str]]:
+def decode_cif_data(encoded_data: EncodedCIFData) -> np.ndarray:
     result = encoded_data["data"]
     for encoding in encoded_data["encoding"][::-1]:
         if encoding["kind"] in _decoders:
             result = _decoders[encoding["kind"]](result, encoding)  # type: ignore
         else:
             raise ValueError(f"Unsupported encoding '{encoding['kind']}'")
 
-    return result
+    return result  # type: ignore
 
 
 def _decode_byte_array(data: bytes, encoding: ByteArrayEncoding) -> np.ndarray:
-    return np.frombuffer(data, dtype="<" + DataType.to_dtype(encoding["type"]))
+    return np.frombuffer(data, dtype=f"<{str(DataType.to_dtype(encoding['type']))}")
 
 
 def _decode_fixed_point(data: np.ndarray, encoding: FixedPointEncoding) -> np.ndarray:
@@ -57,6 +47,7 @@ def _decode_delta(data: np.ndarray, encoding: DeltaEncoding) -> np.ndarray:
     return np.cumsum(result, out=result)
 
 
+# TODO: JIT
 def _decode_integer_packing_signed(data: np.ndarray, encoding: IntegerPackingEncoding) -> np.ndarray:
     upper_limit = 0x7F if encoding["byteCount"] == 1 else 0x7FFF
     lower_limit = -upper_limit - 1
@@ -78,6 +69,7 @@ def _decode_integer_packing_signed(data: np.ndarray, encoding: IntegerPackingEnc
     return output
 
 
+# TODO: JIT
 def _decode_integer_packing_unsigned(data: np.ndarray, encoding: IntegerPackingEncoding) -> np.ndarray:
     upper_limit = 0xFF if encoding["byteCount"] == 1 else 0xFFFF
     n = len(data)
@@ -107,7 +99,7 @@ def _decode_integer_packing(data: np.ndarray, encoding: IntegerPackingEncoding)
         return _decode_integer_packing_signed(data, encoding)
 
 
-def _decode_string_array(data: np.ndarray, encoding: StringArrayEncoding) -> list[str]:
+def _decode_string_array(data: np.ndarray, encoding: StringArrayEncoding) -> np.ndarray:
     offsets = decode_cif_data(EncodedCIFData(encoding=encoding["offsetEncoding"], data=encoding["offsets"]))
     indices = decode_cif_data(EncodedCIFData(encoding=encoding["dataEncoding"], data=data))
 
@@ -117,7 +109,8 @@ def _decode_string_array(data: np.ndarray, encoding: StringArrayEncoding) -> lis
     for i in range(1, len(offsets)):
         strings.append(string_data[offsets[i - 1] : offsets[i]])  # type: ignore
 
-    return [strings[i + 1] for i in indices]  # type: ignore
+    return np.array([strings[i + 1] for i in indices], dtype=np.object_)
+    # return [strings[i + 1] for i in indices]
 
 
 _decoders = {
 
@@ -1,11 +1,12 @@
-from typing import Optional, TypedDict
+from typing import Optional, TypedDict, Union
 
-from ciftools.binary.encoding.encodings import EncodingBase
+import numpy as np
+from ciftools.binary.encoding_types import EncodingBase
 
 
 class EncodedCIFData(TypedDict):
     encoding: list[EncodingBase]
-    data: bytes
+    data: Union[bytes, np.ndarray]
 
 
 class EncodedCIFColumn(TypedDict):
Original file line number	Diff line number	Diff line change
`@@ -1,2 +0,0 @@`
`1`		`-from ciftools.binary.decoder import decode_cif_data`
`2`		`-from ciftools.binary.writer import BinaryCIFWriter`