Add comments to UTF-8 error handling

Explain choice of 'replace' over 'strict' and 'ignore'
Future-House · dmcgrath19 · Aug 6, 2025 · Aug 6, 2025 · Aug 6, 2025 · Aug 7, 2025
commit 7a88d42d6c9a19bce1b442b5f701b13b29c428c5
diff --git a/src/paperqa/utils.py b/src/paperqa/utils.py
@@ -104,6 +104,10 @@ def strings_similarity(s1: str, s2: str, case_insensitive: bool = True) -> float
 
 def hexdigest(data: str | bytes) -> str:
     if isinstance(data, str):
+        # Using 'replace' so characters that cannot be encoded as UTF-8 (e.g. lone surrogates)
+        # become '?' placeholders; papers with special characters/units we cannot parse still hash.
+        # Chosen over 'strict', which raises UnicodeEncodeError and stops processing, and
+        # 'ignore', which silently drops those characters (and the units they carry) before hashing.
         return hashlib.md5(data.encode("utf-8", errors="replace")).hexdigest()  # noqa: S324
-        return hashlib.md5(data.encode("utf-8", errors="replace")).hexdigest()  # noqa: S324
+        return hashlib.md5(data.encode("utf-8", errors="strict")).hexdigest()  # noqa: S324
-        return hashlib.md5(data.encode("utf-8", errors="replace")).hexdigest()  # noqa: S324
+        return hashlib.md5(data.encode("utf-8", errors="strict")).hexdigest()  # noqa: S324
     return hashlib.md5(data).hexdigest()  # noqa: S324