Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
fix: BM25/Embedding index filename mismatch with --from-conv/--to-conv slicing

When using sliced runs (e.g. --from-conv 234 --to-conv 264), the index
files were saved with sequential indices (0, 1, 2, ...) while search
looked them up by global conversation ID (234, 235, ..., 263), causing
'BM25 index not found' errors.

Changes:
- stage2_index_building.py: Use conversation_ids to name index files with
  extracted numeric IDs (e.g., 'bm25_index_conv_234.pkl')
- evermemos_adapter.py:
  - Pass conversation_ids to stage2 for proper file naming
  - Fix conv_id_to_index mapping to map conversation_id -> extracted
    numeric ID (not sequential index)
  - Update _check_missing_indexes to use proper file naming
  - Save conversation_index_mapping.json for debugging

This ensures index files and search lookups use consistent IDs.
  • Loading branch information
OpenClaw Assistant committed Mar 11, 2026
commit ccc28a739e0763590e4d9ff4efe4b535ea9232a1
4 changes: 4 additions & 0 deletions evaluation/src/adapters/evermemos/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,3 +96,7 @@ class ExperimentConfig:

max_retries: int = 5
max_concurrent_requests: int = 10

# Conversation IDs for index building (needed for --from-conv/--to-conv slicing)
# This maps sequential indices (0, 1, 2...) to actual conversation IDs
conversation_ids: list = []
30 changes: 26 additions & 4 deletions evaluation/src/adapters/evermemos/stage2_index_building.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,8 +126,19 @@ def build_bm25_index(

print(f"Reading data from: {data_dir}")

# Get conversation IDs for proper file naming
# If conversation_ids is provided, use them; otherwise fall back to sequential indices
conversation_ids = getattr(config, 'conversation_ids', [])

for i in range(config.num_conv):
file_path = data_dir / f"memcell_list_conv_{i}.json"
# Use conversation_id if available, otherwise use sequential index
if conversation_ids and i < len(conversation_ids):
# Extract numeric ID from conversation_id (e.g., "locomo_234" -> "234")
conv_id = conversation_ids[i].split("_")[-1] if "_" in conversation_ids[i] else conversation_ids[i]
else:
conv_id = str(i)

file_path = data_dir / f"memcell_list_conv_{conv_id}.json"
if not file_path.exists():
print(f"Warning: File not found, skipping: {file_path}")
continue
Expand Down Expand Up @@ -161,7 +172,7 @@ def build_bm25_index(
# --- Saving the Index ---
index_data = {"bm25": bm25, "docs": original_docs}

output_path = bm25_save_dir / f"bm25_index_conv_{i}.pkl"
output_path = bm25_save_dir / f"bm25_index_conv_{conv_id}.pkl"
print(f"Saving index to: {output_path}")
with open(output_path, "wb") as f:
pickle.dump(index_data, f)
Expand Down Expand Up @@ -190,8 +201,19 @@ async def build_emb_index(config: ExperimentConfig, data_dir: Path, emb_save_dir

import time # For performance statistics

# Get conversation IDs for proper file naming
# If conversation_ids is provided, use them; otherwise fall back to sequential indices
conversation_ids = getattr(config, 'conversation_ids', [])

for i in range(config.num_conv):
file_path = data_dir / f"memcell_list_conv_{i}.json"
# Use conversation_id if available, otherwise use sequential index
if conversation_ids and i < len(conversation_ids):
# Extract numeric ID from conversation_id (e.g., "locomo_234" -> "234")
conv_id = conversation_ids[i].split("_")[-1] if "_" in conversation_ids[i] else conversation_ids[i]
else:
conv_id = str(i)

file_path = data_dir / f"memcell_list_conv_{conv_id}.json"
if not file_path.exists():
print(f"Warning: File not found, skipping: {file_path}")
continue
Expand Down Expand Up @@ -365,7 +387,7 @@ async def process_batch_with_retry(
# },
# ...
# ]
output_path = emb_save_dir / f"embedding_index_conv_{i}.pkl"
output_path = emb_save_dir / f"embedding_index_conv_{conv_id}.pkl"
emb_save_dir.mkdir(parents=True, exist_ok=True)
print(f"Saving embeddings to: {output_path}")
with open(output_path, "wb") as f:
Expand Down
68 changes: 60 additions & 8 deletions evaluation/src/adapters/evermemos_adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,8 @@ def _extract_conv_index(conversation_id: str) -> str:
return conversation_id

def _check_missing_indexes(
self, index_dir: Path, num_conv: int, index_type: str = "bm25"
self, index_dir: Path, num_conv: int, index_type: str = "bm25",
conversation_ids: List[str] = None
) -> List[int]:
"""
Check for missing index files.
Expand All @@ -111,17 +112,24 @@ def _check_missing_indexes(
index_dir: Index directory
num_conv: Total number of conversations
index_type: Index type ("bm25" or "embedding")
conversation_ids: List of conversation IDs for proper file naming

Returns:
List of conversation indices with missing indexes
"""
missing_indexes = []

for i in range(num_conv):
# Use extracted numeric ID for file naming if conversation_ids provided
if conversation_ids and i < len(conversation_ids):
conv_id = self._extract_conv_index(conversation_ids[i])
else:
conv_id = str(i)

if index_type == "bm25":
index_file = index_dir / f"bm25_index_conv_{i}.pkl"
index_file = index_dir / f"bm25_index_conv_{conv_id}.pkl"
else: # embedding
index_file = index_dir / f"embedding_index_conv_{i}.pkl"
index_file = index_dir / f"embedding_index_conv_{conv_id}.pkl"

if not index_file.exists():
missing_indexes.append(i)
Expand Down Expand Up @@ -363,10 +371,14 @@ async def run_with_completion(conv_id, task):
# Call stage2 implementation to build indexes
exp_config = self._convert_config_to_experiment_config()
exp_config.num_conv = len(conversations) # Set conversation count
# Pass conversation IDs for proper index file naming (supports --from-conv/--to-conv slicing)
conversation_ids_list = [conv.conversation_id for conv in conversations]
exp_config.conversation_ids = conversation_ids_list

# Smart skip logic: check existing index files
bm25_need_build = self._check_missing_indexes(
index_dir=bm25_index_dir, num_conv=len(conversations), index_type="bm25"
index_dir=bm25_index_dir, num_conv=len(conversations), index_type="bm25",
conversation_ids=conversation_ids_list
)

emb_need_build = []
Expand All @@ -376,6 +388,7 @@ async def run_with_completion(conv_id, task):
index_dir=emb_index_dir,
num_conv=len(conversations),
index_type="embedding",
conversation_ids=conversation_ids_list
)

# Statistics
Expand Down Expand Up @@ -424,12 +437,29 @@ async def run_with_completion(conv_id, task):

# ========== Plan A: Return index metadata (lazy loading) ==========
# Don't load indexes into memory, only return paths and metadata

# Build mapping from conversation_id to extracted numeric ID
# This is needed because when using --from-conv/--to-conv slicing:
# - Index files are saved with extracted numeric IDs (e.g., "234", "235"...)
# - But conversation_ids still contain original IDs (e.g., "locomo_exp_user_234")
# - We need to map conversation_id -> extracted numeric ID (not sequential index!)
conv_id_to_index = {
conv.conversation_id: self._extract_conv_index(conv.conversation_id)
for idx, conv in enumerate(conversations)
}

# Save mapping to a JSON file for persistence across stages
mapping_file = output_dir / "conversation_index_mapping.json"
with open(mapping_file, "w") as f:
json.dump(conv_id_to_index, f, indent=2)

index_metadata = {
"type": "lazy_load", # Mark as lazy loading
"memcells_dir": str(memcells_dir),
"bm25_index_dir": str(bm25_index_dir),
"emb_index_dir": str(emb_index_dir),
"conversation_ids": [conv.conversation_id for conv in conversations],
"conv_id_to_index": conv_id_to_index, # Add mapping for search stage
"use_hybrid_search": use_hybrid,
"total_conversations": len(conversations),
}
Expand All @@ -454,16 +484,29 @@ async def search(
Search stage: Retrieve relevant MemCells.

Lazy loading: Load indexes from files on demand (memory-friendly).

Fix for --from-conv/--to-conv slicing:
- When building indexes, files are named with the numeric ID extracted from each conversation_id (e.g., "234"), not a sequential index (0, 1, 2...)
- But conversation_id still contains the original ID (e.g., "locomo_234")
- Use the mapping (conv_id_to_index) to find the extracted numeric ID used in the index file name
"""
# Lazy loading - read indexes from files
bm25_index_dir = Path(index["bm25_index_dir"])
emb_index_dir = Path(index["emb_index_dir"])

# Extract numeric index from conversation_id to find index files
# Example: conversation_id = "locomo_0" -> conv_index = "0"
conv_index = self._extract_conv_index(conversation_id)
# Get the extracted numeric ID from the mapping
# This mapping was created in add() stage and maps conversation_id -> extracted numeric ID
conv_id_to_index = index.get("conv_id_to_index", {})

if conversation_id in conv_id_to_index:
# Use the mapping to get the extracted numeric ID
conv_index = conv_id_to_index[conversation_id]
else:
# Fallback: extract index from conversation_id (legacy behavior)
# This handles cases where the mapping is not available (e.g., old index files)
conv_index = self._extract_conv_index(conversation_id)

# Load BM25 index on demand (using numeric index)
# Load BM25 index on demand (using extracted numeric ID)
bm25_file = bm25_index_dir / f"bm25_index_conv_{conv_index}.pkl"
if not bm25_file.exists():
return SearchResult(
Expand Down Expand Up @@ -682,12 +725,21 @@ def build_lazy_index(
Returns:
Index metadata dict
"""
# Build mapping from conversation_id to extracted numeric ID
# This is needed for --from-conv/--to-conv slicing support
# Index files are named with extracted numeric IDs (e.g., "234", not sequential 0)
conv_id_to_index = {
conv.conversation_id: self._extract_conv_index(conv.conversation_id)
for idx, conv in enumerate(conversations)
}

return {
"type": "lazy_load",
"memcells_dir": str(output_dir / "memcells"),
"bm25_index_dir": str(output_dir / "bm25_index"),
"emb_index_dir": str(output_dir / "vectors"),
"conversation_ids": [conv.conversation_id for conv in conversations],
"conv_id_to_index": conv_id_to_index, # Add mapping for search stage
"use_hybrid_search": True,
"total_conversations": len(conversations),
}