fix: BM25/Embedding index filename mismatch with --from-conv/--to-conv slicing

When using sliced runs (e.g. --from-conv 234 --to-conv 264), the index files were being saved with sequential indices (0, 1, 2...) but search was looking them up with global conversation IDs (234, 235, ..., 263), causing 'BM25 index not found' errors.

Changes:
- stage2_index_building.py: Use conversation_ids to name index files with extracted numeric IDs (e.g., 'bm25_index_conv_234.pkl')
- evermemos_adapter.py:
  - Pass conversation_ids to stage2 for proper file naming
  - Fix conv_id_to_index mapping to map conversation_id -> extracted numeric ID (not sequential index)
  - Update _check_missing_indexes to use proper file naming
  - Save conversation_index_mapping.json for debugging

This ensures index files and search lookups use consistent IDs.
EverMind-AI · Jah-yee · Mar 9, 2026 · Mar 11, 2026 · Mar 12, 2026 · Mar 11, 2026
commit ccc28a739e0763590e4d9ff4efe4b535ea9232a1
diff --git a/evaluation/src/adapters/evermemos/config.py b/evaluation/src/adapters/evermemos/config.py
@@ -96,3 +96,7 @@ class ExperimentConfig:
 
     max_retries: int = 5
     max_concurrent_requests: int = 10
+
+    # Conversation IDs for index building (needed for --from-conv/--to-conv slicing)
+    # This maps sequential indices (0, 1, 2...) to actual conversation IDs
+    conversation_ids: list = []
diff --git a/evaluation/src/adapters/evermemos/stage2_index_building.py b/evaluation/src/adapters/evermemos/stage2_index_building.py
@@ -126,8 +126,19 @@ def build_bm25_index(
 
     print(f"Reading data from: {data_dir}")
 
+    # Get conversation IDs for proper file naming
+    # If conversation_ids is provided, use them; otherwise fall back to sequential indices
+    conversation_ids = getattr(config, 'conversation_ids', [])
+
     for i in range(config.num_conv):
-        file_path = data_dir / f"memcell_list_conv_{i}.json"
+        # Use conversation_id if available, otherwise use sequential index
+        if conversation_ids and i < len(conversation_ids):
+            # Extract numeric ID from conversation_id (e.g., "locomo_234" -> "234")
+            conv_id = conversation_ids[i].split("_")[-1] if "_" in conversation_ids[i] else conversation_ids[i]
+        else:
+            conv_id = str(i)
+
+        file_path = data_dir / f"memcell_list_conv_{conv_id}.json"
         if not file_path.exists():
             print(f"Warning: File not found, skipping: {file_path}")
             continue
@@ -161,7 +172,7 @@ def build_bm25_index(
         # --- Saving the Index ---
         index_data = {"bm25": bm25, "docs": original_docs}
 
-        output_path = bm25_save_dir / f"bm25_index_conv_{i}.pkl"
+        output_path = bm25_save_dir / f"bm25_index_conv_{conv_id}.pkl"
         print(f"Saving index to: {output_path}")
         with open(output_path, "wb") as f:
             pickle.dump(index_data, f)
@@ -190,8 +201,19 @@ async def build_emb_index(config: ExperimentConfig, data_dir: Path, emb_save_dir
 
     import time  # For performance statistics
 
+    # Get conversation IDs for proper file naming
+    # If conversation_ids is provided, use them; otherwise fall back to sequential indices
+    conversation_ids = getattr(config, 'conversation_ids', [])
+
     for i in range(config.num_conv):
-        file_path = data_dir / f"memcell_list_conv_{i}.json"
+        # Use conversation_id if available, otherwise use sequential index
+        if conversation_ids and i < len(conversation_ids):
+            # Extract numeric ID from conversation_id (e.g., "locomo_234" -> "234")
+            conv_id = conversation_ids[i].split("_")[-1] if "_" in conversation_ids[i] else conversation_ids[i]
+        else:
+            conv_id = str(i)
+
+        file_path = data_dir / f"memcell_list_conv_{conv_id}.json"
         if not file_path.exists():
             print(f"Warning: File not found, skipping: {file_path}")
             continue
@@ -365,7 +387,7 @@ async def process_batch_with_retry(
         #     },
         #     ...
         # ]
-        output_path = emb_save_dir / f"embedding_index_conv_{i}.pkl"
+        output_path = emb_save_dir / f"embedding_index_conv_{conv_id}.pkl"
         emb_save_dir.mkdir(parents=True, exist_ok=True)
         print(f"Saving embeddings to: {output_path}")
         with open(output_path, "wb") as f:

diff --git a/evaluation/src/adapters/evermemos_adapter.py b/evaluation/src/adapters/evermemos_adapter.py
@@ -102,7 +102,8 @@ def _extract_conv_index(conversation_id: str) -> str:
         return conversation_id
 
     def _check_missing_indexes(
-        self, index_dir: Path, num_conv: int, index_type: str = "bm25"
+        self, index_dir: Path, num_conv: int, index_type: str = "bm25",
+        conversation_ids: List[str] = None
     ) -> List[int]:
         """
         Check for missing index files.
@@ -111,17 +112,24 @@ def _check_missing_indexes(
             index_dir: Index directory
             num_conv: Total number of conversations
             index_type: Index type ("bm25" or "embedding")
+            conversation_ids: List of conversation IDs for proper file naming
 
         Returns:
             List of conversation indices with missing indexes
         """
         missing_indexes = []
 
         for i in range(num_conv):
+            # Use extracted numeric ID for file naming if conversation_ids provided
+            if conversation_ids and i < len(conversation_ids):
+                conv_id = self._extract_conv_index(conversation_ids[i])
+            else:
+                conv_id = str(i)
+
             if index_type == "bm25":
-                index_file = index_dir / f"bm25_index_conv_{i}.pkl"
+                index_file = index_dir / f"bm25_index_conv_{conv_id}.pkl"
             else:  # embedding
-                index_file = index_dir / f"embedding_index_conv_{i}.pkl"
+                index_file = index_dir / f"embedding_index_conv_{conv_id}.pkl"
 
             if not index_file.exists():
                 missing_indexes.append(i)
@@ -363,10 +371,14 @@ async def run_with_completion(conv_id, task):
         # Call stage2 implementation to build indexes
         exp_config = self._convert_config_to_experiment_config()
         exp_config.num_conv = len(conversations)  # Set conversation count
+        # Pass conversation IDs for proper index file naming (supports --from-conv/--to-conv slicing)
+        conversation_ids_list = [conv.conversation_id for conv in conversations]
+        exp_config.conversation_ids = conversation_ids_list
 
         # Smart skip logic: check existing index files
         bm25_need_build = self._check_missing_indexes(
-            index_dir=bm25_index_dir, num_conv=len(conversations), index_type="bm25"
+            index_dir=bm25_index_dir, num_conv=len(conversations), index_type="bm25",
+            conversation_ids=conversation_ids_list
         )
 
         emb_need_build = []
@@ -376,6 +388,7 @@ async def run_with_completion(conv_id, task):
                 index_dir=emb_index_dir,
                 num_conv=len(conversations),
                 index_type="embedding",
+                conversation_ids=conversation_ids_list
             )
 
         # Statistics
@@ -424,12 +437,29 @@ async def run_with_completion(conv_id, task):
 
         # ========== Plan A: Return index metadata (lazy loading) ==========
         # Don't load indexes into memory, only return paths and metadata
+
+        # Build mapping from conversation_id to extracted numeric ID
+        # This is needed because when using --from-conv/--to-conv slicing:
+        # - Index files are saved with extracted numeric IDs (e.g., "234", "235"...)
+        # - But conversation_ids still contain original IDs (e.g., "locomo_exp_user_234")
+        # - We need to map conversation_id -> extracted numeric ID (not sequential index!)
+        conv_id_to_index = {
+            conv.conversation_id: self._extract_conv_index(conv.conversation_id) 
+            for idx, conv in enumerate(conversations)
+        }
+
+        # Save mapping to a JSON file for persistence across stages
+        mapping_file = output_dir / "conversation_index_mapping.json"
+        with open(mapping_file, "w") as f:
+            json.dump(conv_id_to_index, f, indent=2)
+
         index_metadata = {
             "type": "lazy_load",  # Mark as lazy loading
             "memcells_dir": str(memcells_dir),
             "bm25_index_dir": str(bm25_index_dir),
             "emb_index_dir": str(emb_index_dir),
             "conversation_ids": [conv.conversation_id for conv in conversations],
+            "conv_id_to_index": conv_id_to_index,  # Add mapping for search stage
             "use_hybrid_search": use_hybrid,
             "total_conversations": len(conversations),
         }
@@ -454,16 +484,29 @@ async def search(
         Search stage: Retrieve relevant MemCells.
 
         Lazy loading: Load indexes from files on demand (memory-friendly).
+
+        Fix for --from-conv/--to-conv slicing:
+        - Index files are named with the extracted numeric ID (e.g., "234"), not a sequential index (0, 1, 2...)
+        - conversation_id still contains the original ID (e.g., "locomo_234")
+        - Use the mapping (conv_id_to_index) to look up the extracted numeric ID used in the index file name
         """
         # Lazy loading - read indexes from files
         bm25_index_dir = Path(index["bm25_index_dir"])
         emb_index_dir = Path(index["emb_index_dir"])
 
-        # Extract numeric index from conversation_id to find index files
-        # Example: conversation_id = "locomo_0" -> conv_index = "0"
-        conv_index = self._extract_conv_index(conversation_id)
+        # Get the extracted numeric ID from the mapping
+        # This mapping was created in add() stage and maps conversation_id -> extracted numeric ID
+        conv_id_to_index = index.get("conv_id_to_index", {})
+
+        if conversation_id in conv_id_to_index:
+            # Use the mapping to get the extracted numeric ID
+            conv_index = conv_id_to_index[conversation_id]
+        else:
+            # Fallback: extract index from conversation_id (legacy behavior)
+            # This handles cases where the mapping is not available (e.g., old index files)
+            conv_index = self._extract_conv_index(conversation_id)
 
-        # Load BM25 index on demand (using numeric index)
+        # Load BM25 index on demand (using extracted numeric ID)
         bm25_file = bm25_index_dir / f"bm25_index_conv_{conv_index}.pkl"
         if not bm25_file.exists():
             return SearchResult(
@@ -682,12 +725,21 @@ def build_lazy_index(
         Returns:
             Index metadata dict
         """
+        # Build mapping from conversation_id to extracted numeric ID
+        # This is needed for --from-conv/--to-conv slicing support
+        # Index files are named with extracted numeric IDs (e.g., "234", not sequential 0)
+        conv_id_to_index = {
+            conv.conversation_id: self._extract_conv_index(conv.conversation_id)
+            for idx, conv in enumerate(conversations)
+        }
+
         return {
             "type": "lazy_load",
             "memcells_dir": str(output_dir / "memcells"),
             "bm25_index_dir": str(output_dir / "bm25_index"),
             "emb_index_dir": str(output_dir / "vectors"),
             "conversation_ids": [conv.conversation_id for conv in conversations],
+            "conv_id_to_index": conv_id_to_index,  # Add mapping for search stage
             "use_hybrid_search": True,
             "total_conversations": len(conversations),
         }