Skip to content

Commit 8604740

Browse files
authored
update stub for typing (#1896)
* update stub for typing * up * add ty type checker * update stub * up * some update * add owner to stub? * update * no print * uptime funk * mm * wtf * fix * fix more * some fixes are manual but come on * up * # type: ignore[import] * reduce the scope of ty for less changes * ups * up?
1 parent a5e03ba commit 8604740

34 files changed

+2299
-125
lines changed

.github/workflows/python.yml

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,7 @@ jobs:
116116
python -m venv .env
117117
source .env/bin/activate
118118
pip install -U pip
119-
pip install pytest requests setuptools_rust numpy pyarrow datasets
119+
pip install pytest requests setuptools_rust numpy pyarrow datasets ty
120120
pip install -e .[dev]
121121
122122
- name: Check style
@@ -125,6 +125,12 @@ jobs:
125125
source .env/bin/activate
126126
make check-style
127127
128+
- name: Type check
129+
working-directory: ./bindings/python
130+
run: |
131+
source .env/bin/activate
132+
ty check py_src tests
133+
128134
- name: Run tests
129135
working-directory: ./bindings/python
130136
run: |

bindings/python/Makefile

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,12 +10,14 @@ style:
1010
python stub.py
1111
ruff check $(check_dirs) --fix
1212
ruff format $(check_dirs)
13+
ty check py_src tests
1314

1415
# Check the source code is formatted correctly
1516
check-style:
1617
python stub.py --check
1718
ruff check $(check_dirs)
1819
ruff format --check $(check_dirs)
20+
ty check py_src tests
1921

2022
TESTS_RESOURCES = $(DATA_DIR)/small.txt $(DATA_DIR)/roberta.json
2123

bindings/python/README.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -168,3 +168,7 @@ tokenizer = Tokenizer.from_file("byte-level-bpe.tokenizer.json")
168168

169169
encoded = tokenizer.encode("I can feel the magic, can you?")
170170
```
171+
172+
### Typing support and `stub.py`
173+
174+
The compiled PyO3 extension does not expose type annotations, so without stubs, editors and type checkers see most objects as `Any`. The `stub.py` helper walks the loaded extension modules, renders `.pyi` stub files (plus minimal forwarding `__init__.py` shims), and formats them so that tools like mypy and pyright can understand the public API. Run `python stub.py` whenever you change the Python-visible surface to keep the generated stubs in sync.

bindings/python/benches/test_tiktoken.py

Lines changed: 19 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,8 @@
22
import time
33
import argparse
44
from datasets import load_dataset
5-
from tiktoken.load import load_tiktoken_bpe
6-
import tiktoken
5+
from tiktoken.load import load_tiktoken_bpe # type: ignore[import]
6+
import tiktoken # type: ignore[import]
77
from tokenizers import Tokenizer
88
from huggingface_hub import hf_hub_download
99
from typing import Tuple, List
@@ -30,7 +30,9 @@ def benchmark_batch(model: str, documents: list[str], num_threads: int, document
3030
num_bytes = sum(map(len, map(str.encode, documents)))
3131
readable_size, unit = format_byte_size(num_bytes)
3232
print(f"==============")
33-
print(f"num_threads: {num_threads}, data size: {readable_size}, documents: {len(documents)} Avg Length: {document_length:.0f}")
33+
print(
34+
f"num_threads: {num_threads}, data size: {readable_size}, documents: {len(documents)} Avg Length: {document_length:.0f}"
35+
)
3436
filename = hf_hub_download(MODEL_ID, "original/tokenizer.model")
3537
mergeable_ranks = load_tiktoken_bpe(filename)
3638
pat_str = r"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"
@@ -46,20 +48,15 @@ def benchmark_batch(model: str, documents: list[str], num_threads: int, document
4648
"<|end_header_id|>",
4749
"<|reserved_special_token_4|>",
4850
"<|eot_id|>", # end of turn
49-
] + [
50-
f"<|reserved_special_token_{i}|>"
51-
for i in range(5, num_reserved_special_tokens - 5)
52-
]
51+
] + [f"<|reserved_special_token_{i}|>" for i in range(5, num_reserved_special_tokens - 5)]
5352
num_base_tokens = len(mergeable_ranks)
54-
special_tokens = {
55-
token: num_base_tokens + i for i, token in enumerate(special_tokens)
56-
}
53+
special_tokens = {token: num_base_tokens + i for i, token in enumerate(special_tokens)}
5754
enc = tiktoken.Encoding(
58-
name=model,
59-
pat_str=pat_str,
60-
mergeable_ranks=mergeable_ranks,
61-
special_tokens=special_tokens,
62-
)
55+
name=model,
56+
pat_str=pat_str,
57+
mergeable_ranks=mergeable_ranks,
58+
special_tokens=special_tokens,
59+
)
6360
out = enc.encode("This is a test")
6461

6562
hf_enc = Tokenizer.from_pretrained(model)
@@ -74,7 +71,6 @@ def benchmark_batch(model: str, documents: list[str], num_threads: int, document
7471
readable_size, unit = format_byte_size(num_bytes / (end - start) * 1e9)
7572
print(f"tiktoken \t{readable_size} / s")
7673

77-
7874
start = time.perf_counter_ns()
7975
hf_enc.encode_batch_fast(documents)
8076
end = time.perf_counter_ns()
@@ -98,7 +94,7 @@ def test(model: str, dataset: str, dataset_config: str, threads: List[int]):
9894
else:
9995
documents.append(item["premise"]["en"])
10096
if fuse:
101-
documents=["".join(documents)]
97+
documents = ["".join(documents)]
10298

10399
document_length = sum(len(d) for d in documents) / len(documents)
104100

@@ -115,15 +111,14 @@ def test(model: str, dataset: str, dataset_config: str, threads: List[int]):
115111

116112

117113
def main():
118-
119114
parser = argparse.ArgumentParser(
120-
prog='bench_tokenizer',
121-
description='Getting a feel for speed when tokenizing',
115+
prog="bench_tokenizer",
116+
description="Getting a feel for speed when tokenizing",
122117
)
123-
parser.add_argument('-m', '--model', default=MODEL_ID, type=str)
124-
parser.add_argument('-d', '--dataset', default=DATASET, type=str)
125-
parser.add_argument('-ds', '--dataset-config', default=DATASET_CONFIG, type=str)
126-
parser.add_argument('-t', '--threads', nargs='+', default=DEFAULT_THREADS, type=int)
118+
parser.add_argument("-m", "--model", default=MODEL_ID, type=str)
119+
parser.add_argument("-d", "--dataset", default=DATASET, type=str)
120+
parser.add_argument("-ds", "--dataset-config", default=DATASET_CONFIG, type=str)
121+
parser.add_argument("-t", "--threads", nargs="+", default=DEFAULT_THREADS, type=int)
127122
args = parser.parse_args()
128123
test(args.model, args.dataset, args.dataset_config, args.threads)
129124

bindings/python/docs/pyo3.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
# PyO3 Usage Notes
2+
3+
## Why we take `self_: PyRef<'_, Self>`
4+
5+
Most of the Python-facing structs are declared with `#[pyclass(extends = ...)]`. The actual data (for example the `processor` field in `PyPostProcessor`) lives in the base class, while the derived Rust structs are often just markers so that Python sees a proper subclass. When we implement a method on the subclass, we still need to reach into the base storage without downcasting a `PyAny` or re-wrapping objects.
6+
7+
Using `self_: PyRef<'_, Self>` gives us a borrowed reference to the Python-owned value that keeps the GIL lifetime, reference counts, and the inheritance chain intact. With it we can call `self_.as_ref()` to view the base `PyPostProcessor` directly and access shared helpers like the processor getters/setters. A plain `&self`, by contrast, would only expose the zero-sized derived struct, forcing a conversion through a super type just to touch the processors — extra boilerplate that also loses the link to the Python inheritance model. In that sense `PyRef` is the PyO3 equivalent of Python's `super()`: it keeps the Rust type information while letting us operate on the underlying parent.

bindings/python/examples/example.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
from tokenizers.models import BPE, WordPiece
99
from tokenizers.normalizers import BertNormalizer
1010
from tokenizers.processors import BertProcessing
11-
from transformers import BertTokenizer, GPT2Tokenizer
11+
from transformers import BertTokenizer, GPT2Tokenizer # type: ignore[import]
1212

1313
logging.getLogger("transformers").disabled = True
1414
logging.getLogger("transformers.tokenization_utils").disabled = True

bindings/python/examples/train_with_datasets.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,9 @@
1515
# Build an iterator over this dataset
1616
def batch_iterator():
1717
batch_size = 1000
18-
for batch in dataset.iter(batch_size=batch_size):
18+
for batch in dataset.iter(batch_size=batch_size): # type: ignore[attr-defined]
1919
yield batch["text"]
2020

2121

2222
# And finally train
23-
bpe_tokenizer.train_from_iterator(batch_iterator(), length=len(dataset))
23+
bpe_tokenizer.train_from_iterator(batch_iterator(), length=len(dataset)) # type: ignore[arg-type]

bindings/python/py_src/tokenizers/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ class SplitDelimiterBehavior(Enum):
7575
CONTIGUOUS = "contiguous"
7676

7777

78-
from .tokenizers import (
78+
from .tokenizers import ( # type: ignore[import]
7979
AddedToken,
8080
Encoding,
8181
NormalizedString,

0 commit comments

Comments
 (0)