From 3e41b7c5f0375fcaae5a2ceadb7e1d41166ccaec Mon Sep 17 00:00:00 2001 From: Antonio Cheong Date: Fri, 5 Sep 2025 20:24:33 +0100 Subject: [PATCH] fix: Better error messages --- src/lib.rs | 47 ++++++++++++++----- ...rockspec => tiktoken_core-0.2.5-1.rockspec | 4 +- 2 files changed, 37 insertions(+), 14 deletions(-) rename tiktoken_core-0.2.4-1.rockspec => tiktoken_core-0.2.5-1.rockspec (92%) diff --git a/src/lib.rs b/src/lib.rs index 20f2988..0fc4289 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -215,19 +215,32 @@ fn new( ) { let mut encoder: HashMap<Vec<u8>, usize> = HashMap::default(); // Read the encoder file each line is a base64 encoded token and rank separated by a space - let file = File::open(encoder_path).unwrap(); + let file = File::open(&encoder_path) + .map_err(|e| format!("Failed to open encoder file '{}': {}", encoder_path, e)) + .unwrap(); let reader = BufReader::new(file); for line in reader.lines() { - let line = line.unwrap(); + let line = line + .map_err(|e| format!("Failed to read line from encoder file: {}", e)) + .unwrap(); let mut parts = line.split_whitespace(); + let token_b64 = parts.next() + .ok_or_else(|| format!("Invalid encoder file format: missing token in line '{}'", line)) + .unwrap(); let token = BASE64_STANDARD - .decode(parts.next().unwrap().as_bytes()) + .decode(token_b64.as_bytes()) + .map_err(|e| format!("Failed to decode base64 token '{}': {}", token_b64, e)) + .unwrap(); + let rank_str = parts.next() + .ok_or_else(|| format!("Invalid encoder file format: missing rank in line '{}'", line)) + .unwrap(); + let rank = rank_str.parse() + .map_err(|e| format!("Failed to parse rank '{}': {}", rank_str, e)) .unwrap(); - let rank = parts.next().unwrap().parse().unwrap(); encoder.insert(token, rank); } let regex = Regex::new(&pattern) - .map_err(|e| mlua::Error::external(e)) + .map_err(|e| format!("Failed to compile main regex pattern '{}': {}", pattern, e)) .unwrap(); let special_regex = { let _parts = special_tokens_encoder @@ -235,7 +248,7 
@@ fn new( .map(|s| fancy_regex::escape(s)) .collect::<Vec<_>>(); Regex::new(&_parts.join("|")) - .map_err(|e| mlua::Error::external(e)) + .map_err(|e| format!("Failed to compile special tokens regex: {}", e)) .unwrap() }; let special_tokens_decoder: HashMap<usize, Vec<u8>> = special_tokens_encoder @@ -244,7 +257,9 @@ fn new( .collect(); let mut sorted_token_bytes: Vec<Vec<u8>> = encoder.keys().cloned().collect(); sorted_token_bytes.sort(); - let mut core_bpe_lock = state.core_bpe.lock().unwrap(); + let mut core_bpe_lock = state.core_bpe.lock() + .map_err(|e| format!("Failed to acquire lock on core_bpe: {}", e)) + .unwrap(); *core_bpe_lock = Some(CoreBPENative { encoder, special_tokens_encoder, @@ -267,9 +282,9 @@ fn encode(state: &State, text: mlua::String) -> LuaResult<(Vec<usize>, usize, usize) Ok(state .core_bpe .lock() - .unwrap() + .map_err(|e| mlua::Error::external(format!("Failed to acquire lock on core_bpe: {}", e)))? .as_ref() - .unwrap() + .ok_or_else(|| mlua::Error::external("Core BPE not initialized"))? ._encode_native(&encoded_str, &allowed_special, max_tokens)) } @@ -313,7 +328,10 @@ impl CoreBPENative { let regex = self._get_tl_regex(); let mut ret = vec![]; for mat in regex.find_iter(text) { - let piece = mat.unwrap().as_str().as_bytes(); + let piece = mat + .map_err(|e| format!("Regex matching failed: {}", e)) + .unwrap() + .as_str().as_bytes(); if let Some(token) = self.encoder.get(piece) { ret.push(*token); continue; } @@ -341,7 +359,9 @@ impl CoreBPENative { let mut start_find = start; loop { // Find the next allowed special token, if any - next_special = special_regex.find_from_pos(text, start_find).unwrap(); + next_special = special_regex.find_from_pos(text, start_find) + .map_err(|e| format!("Special regex matching failed at position {}: {}", start_find, e)) + .unwrap(); match next_special { Some(m) => { if allowed_special.contains(&text[m.start()..m.end()]) { @@ -356,7 +376,10 @@ impl CoreBPENative { // Okay, here we go, compare this logic to _encode_ordinary_native for mat in 
regex.find_iter(&text[start..end]) { - let piece = mat.unwrap().as_str().as_bytes(); + let piece = mat + .map_err(|e| format!("Regex matching failed in text slice: {}", e)) + .unwrap() + .as_str().as_bytes(); if let Some(token) = self.encoder.get(piece) { last_piece_token_len = 1; ret.push(*token); diff --git a/tiktoken_core-0.2.4-1.rockspec b/tiktoken_core-0.2.5-1.rockspec similarity index 92% rename from tiktoken_core-0.2.4-1.rockspec rename to tiktoken_core-0.2.5-1.rockspec index 24292a0..417272c 100644 --- a/tiktoken_core-0.2.4-1.rockspec +++ b/tiktoken_core-0.2.5-1.rockspec @@ -1,9 +1,9 @@ package = "tiktoken_core" -version = "0.2.4-1" +version = "0.2.5-1" source = { url = "git+https://github.com/gptlang/lua-tiktoken", - tag = "v0.2.4", + tag = "v0.2.5", } description = {