Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
7fa4686
import llguidance modules
mmoskal Nov 6, 2024
1dcca91
llg constraint types
mmoskal Nov 6, 2024
eaf52be
integrate llguidance
mmoskal Nov 6, 2024
cc82722
fix handling of stop tokens
mmoskal Nov 6, 2024
fb83b5c
update toktrie
mmoskal Nov 6, 2024
00c36dd
remove submodules
mmoskal Nov 6, 2024
cab564b
fix version conflicts
mmoskal Nov 6, 2024
d1becbd
tok_trie -> tok_env rename
mmoskal Nov 6, 2024
f70b3de
update to latest llguidance
mmoskal Nov 30, 2024
3d22f2b
Merge branch 'master' into llg_cleanup
mmoskal Nov 30, 2024
ca9e346
sync lock
mmoskal Nov 30, 2024
7b3ae50
bump llg (lazy_static fix)
mmoskal Nov 30, 2024
146bc4c
update to latest llguidance, fix conflicts
mmoskal Nov 30, 2024
6019fee
import toktrie via llguidance
mmoskal Nov 30, 2024
dd35965
n=1
mmoskal Dec 1, 2024
3ac55ed
test with llama1b
mmoskal Dec 1, 2024
8ca7514
remove aici folder (no longer used)
mmoskal Dec 1, 2024
7967d42
use more specific type for llg grammars
mmoskal Dec 1, 2024
9a919d8
update python APIs to support json schema and llg
mmoskal Dec 1, 2024
0ad0948
update example to use lark not yacc
mmoskal Dec 1, 2024
816ac8f
rename example
mmoskal Dec 1, 2024
5fc906b
remove testing scripts
mmoskal Dec 1, 2024
5e9cbd2
re-export llguidance for easier LlguidanceGrammar construction
mmoskal Dec 2, 2024
ffcdd2a
fix formatting
mmoskal Dec 2, 2024
fda20fe
fix clippy
mmoskal Dec 2, 2024
b5add20
Merge branch 'master' into llg_cleanup
mmoskal Dec 2, 2024
2c59224
add python samples
mmoskal Dec 2, 2024
976092c
add server samples
mmoskal Dec 2, 2024
ac7c35d
add rust samples
mmoskal Dec 2, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
integrate llguidance
  • Loading branch information
mmoskal committed Nov 6, 2024
commit eaf52be02e0a4ed8c992acc8b4b5ebbb1e33ddbc
455 changes: 450 additions & 5 deletions Cargo.lock

Large diffs are not rendered by default.

7 changes: 7 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@ members = [
"mistralrs-bench",
"mistralrs-vision",
"mistralrs-quant",
"external/toktrie/core",
"external/toktrie/hf_tokenizers",
"external/llguidance/parser",
]
exclude = [
"mistralrs-paged_attn",
Expand Down Expand Up @@ -50,3 +53,7 @@ url = "2.5.2"
data-url = "0.3.1"
buildstructor = "0.5.4"
float8 = "0.1.1"

[patch.'https://github.com/microsoft/toktrie']
toktrie = { path = "external/toktrie/core" }
toktrie_hf_tokenizers = { path = "external/toktrie/hf_tokenizers" }
2 changes: 1 addition & 1 deletion external/toktrie
3 changes: 3 additions & 0 deletions mistralrs-core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,9 @@ safetensors = "0.4.5"
serde_plain = "1.0.2"
as-any = "0.3.1"
float8.workspace = true
llguidance_parser = { path = "../external/llguidance/parser" }
toktrie = { path = "../external/toktrie/core" }
toktrie_hf_tokenizers = { path = "../external/toktrie/hf_tokenizers" }

[features]
pyo3_macros = ["pyo3"]
Expand Down
43 changes: 24 additions & 19 deletions mistralrs-core/src/engine/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,13 @@ use std::{
time::{Instant, SystemTime, UNIX_EPOCH},
};
use tokio::sync::{mpsc::Receiver, Mutex};
use toktrie::TokEnv;

use crate::{
aici::{cfg::CfgParser, recognizer::StackRecognizer, rx::RecRx},
pipeline::{
text_models_inputs_processor::PagedAttentionMeta, AdapterInstruction, CacheBackendMetadata,
CacheInstruction,
llg::{constraint_from_llg_grammar, llg_grammar_from_constraint},
text_models_inputs_processor::PagedAttentionMeta,
AdapterInstruction, CacheBackendMetadata, CacheInstruction,
},
request::NormalRequest,
response::CompletionChoice,
Expand Down Expand Up @@ -455,15 +456,19 @@ impl Engine {
}
}

fn build_sequence_recognizer(constraint: &Constraint) -> anyhow::Result<SequenceRecognizer> {
let recognizer = match constraint {
Constraint::Regex(rx) => {
SequenceRecognizer::Regex(StackRecognizer::from(RecRx::from_rx(rx, None)?).into())
}
Constraint::Yacc(cfg) => SequenceRecognizer::Cfg(CfgParser::from_yacc(cfg)?.into()),
Constraint::None => SequenceRecognizer::None,
};
Ok(recognizer)
/// Builds the per-sequence recognizer used for constrained decoding.
///
/// If the request carries a constraint, it is compiled into a llguidance
/// grammar and paired with the pipeline's token environment; otherwise
/// `SequenceRecognizer::None` is returned.
///
/// # Errors
/// Fails if the constraint cannot be compiled, or if a constraint is present
/// but no token environment is available (`tok_env` is `None`).
fn build_sequence_recognizer(
tok_env: &Option<TokEnv>,
constraint: &Constraint,
) -> anyhow::Result<SequenceRecognizer> {
if let Some(grm) = llg_grammar_from_constraint(constraint)? {
// A token environment is mandatory once a grammar constraint is in play.
let tok_env = tok_env
.as_ref()
.ok_or_else(|| anyhow::anyhow!("No token environment found."))?;
let llg = constraint_from_llg_grammar(tok_env.clone(), grm)?;
Ok(SequenceRecognizer::Llguidance(Box::new(llg)))
} else {
Ok(SequenceRecognizer::None)
}
}

async fn handle_request(&mut self, request: Request) {
Expand Down Expand Up @@ -668,6 +673,7 @@ impl Engine {
for id in i {
// We can't use ` ` (space) as a stop token because other tokens like ` moon` start with a space.
if let Some(tok_trie) = tok_trie.as_ref() {
let tok_trie = tok_trie.tok_trie();
if tok_trie.has_extensions(tok_trie.token(*id)) {
request
.response
Expand Down Expand Up @@ -712,6 +718,7 @@ impl Engine {

if toks.len() == 1 {
if tok_trie.as_ref().is_some_and(|tok_trie| {
let tok_trie = tok_trie.tok_trie();
tok_trie.has_extensions(tok_trie.token(toks[0]))
}) {
stop_strings.push(stop_txt.clone());
Expand Down Expand Up @@ -766,7 +773,11 @@ impl Engine {

// Add sequences
for response_index in 0..request.sampling_params.n_choices {
let recognizer = match Self::build_sequence_recognizer(&request.constraint) {
let trie = get_mut_arcmutex!(self.pipeline)
.get_metadata()
.tok_trie
.clone();
let recognizer = match Self::build_sequence_recognizer(&trie, &request.constraint) {
Ok(recognizer) => recognizer,
Err(err) => {
request
Expand All @@ -785,11 +796,6 @@ impl Engine {
.cache_config
.clone()
.map(|conf| conf.block_size);
let trie = get_mut_arcmutex!(self.pipeline)
.get_metadata()
.tok_trie
.as_ref()
.map(|x| (**x).clone());
let seq = Sequence::new_waiting(
prompt_tokens.clone(),
prompt_text.clone(),
Expand All @@ -816,7 +822,6 @@ impl Engine {
request.adapters.clone(),
images.clone(),
block_size,
trie,
matcher.clone(),
image_generation_format,
seq_step_type,
Expand Down
1 change: 0 additions & 1 deletion mistralrs-core/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@ use std::{
};
use tokio::sync::mpsc::{channel, Sender};

mod aici;
mod cuda;
mod device_map;
mod engine;
Expand Down
1 change: 0 additions & 1 deletion mistralrs-core/src/pipeline/amoe.rs
Original file line number Diff line number Diff line change
Expand Up @@ -568,7 +568,6 @@ fn new_dummy_seq(
None, // TODO incorrect for PagedAttention
None,
None,
None,
SeqStepType::PromptAndDecode,
None,
)
Expand Down
5 changes: 2 additions & 3 deletions mistralrs-core/src/pipeline/ggml.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,9 @@ use super::{
AdapterActivationMixin, AnyMoePipelineMixin, CacheManagerMixin, ForwardInputsResult,
IsqPipelineMixin, MetadataMixin, ModelCategory, PreProcessingMixin,
};
use crate::aici::bintokens::build_tok_trie;
use crate::aici::toktree::TokTrie;
use crate::lora::Ordering;
use crate::pipeline::chat_template::{calculate_eos_tokens, GenerationConfig};
use super::llg::build_tok_env;
use crate::pipeline::sampling::sample_and_add_toks;
use crate::pipeline::{get_chat_template, Cache};
use crate::pipeline::{ChatTemplate, LocalModelPaths};
Expand Down Expand Up @@ -356,7 +355,7 @@ impl Loader for GGMLLoader {
Model::Llama(ref l) => l.max_seq_len,
Model::XLoraLlama(ref xl) => xl.max_seq_len,
};
let tok_trie: Arc<TokTrie> = build_tok_trie(tokenizer.clone()).into();
let tok_trie = build_tok_env(tokenizer.clone()).into();
let num_hidden_layers = match model {
Model::Llama(ref model) => model.cache.lock().len(),
Model::XLoraLlama(ref model) => model.cache.lock().len(),
Expand Down
5 changes: 2 additions & 3 deletions mistralrs-core/src/pipeline/gguf.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
use super::cache_manager::DefaultCacheManager;
use super::llg::build_tok_env;
use super::{
get_model_paths, get_xlora_paths, text_models_inputs_processor::ModelInputs, AdapterKind,
CacheManager, GeneralMetadata, Loader, ModelKind, ModelPaths, PrettyName, QuantizationKind,
Expand All @@ -8,8 +9,6 @@ use super::{
AdapterActivationMixin, AnyMoePipelineMixin, CacheManagerMixin, ForwardInputsResult,
IsqPipelineMixin, MetadataMixin, ModelCategory, PreProcessingMixin,
};
use crate::aici::bintokens::build_tok_trie;
use crate::aici::toktree::TokTrie;
use crate::gguf::{
get_gguf_chat_template, {convert_gguf_to_hf_tokenizer, GgufTokenizerConversion},
};
Expand Down Expand Up @@ -484,7 +483,7 @@ impl Loader for GGUFLoader {
Model::Starcoder2(ref p) => p.max_seq_len,
Model::Qwen2(ref p) => p.max_seq_len,
};
let tok_trie: Arc<TokTrie> = build_tok_trie(tokenizer.clone()).into();
let tok_trie = build_tok_env(tokenizer.clone()).into();
let num_hidden_layers = match model {
Model::Llama(ref model) => model.cache.lock().len(),
Model::Phi2(ref model) => model.cache.lock().len(),
Expand Down
52 changes: 52 additions & 0 deletions mistralrs-core/src/pipeline/llg.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
use std::sync::Arc;

use anyhow::Result;
use llguidance_parser::{
api::{ParserLimits, RegexNode, TopLevelGrammar},
lark::{lark_to_llguidance, parse_lark},
JsonCompileOptions, TokenParser,
};
use tokenizers::Tokenizer;
use toktrie::{InferenceCapabilities, TokEnv};

use crate::Constraint;

/// Builds a llguidance token environment (`TokEnv`) from a Hugging Face
/// `Tokenizer`, by wrapping it in toktrie's byte-level tokenizer adapter.
///
/// # Panics
/// Panics if the tokenizer cannot be converted to a `ByteTokenizer`, or if
/// the `ByteTokenizerEnv` cannot be constructed from it.
pub fn build_tok_env(tokenizer: Tokenizer) -> TokEnv {
let bt = toktrie_hf_tokenizers::ByteTokenizer::from_tokenizer(tokenizer)
.expect("Failed to create ByteTokenizer from Tokenizer");
// NOTE(review): the second argument (`None`) is presumably an optional
// EOS/vocab-size override — confirm against toktrie_hf_tokenizers docs.
let env = toktrie_hf_tokenizers::ByteTokenizerEnv::new(bt, None)
.expect("Failed to create ByteTokenizerEnv");
// `TokEnv` is an `Arc`-based shared handle; wrap the env for sharing.
Arc::new(env)
}

/// Converts a user-facing `Constraint` into a llguidance `TopLevelGrammar`.
///
/// Returns `Ok(None)` for `Constraint::None` (no constrained decoding) and
/// `Ok(Some(grammar))` for regex, Lark, JSON-schema, and raw llguidance
/// constraints. Parse/compile errors are propagated via `?`.
pub fn llg_grammar_from_constraint(constraint: &Constraint) -> Result<Option<TopLevelGrammar>> {
let grm = match constraint {
Constraint::Regex(regex) => {
TopLevelGrammar::from_regex(RegexNode::Regex(regex.to_string()))
}
// Lark source is parsed first, then lowered to the llguidance form.
Constraint::Lark(lark) => lark_to_llguidance(parse_lark(lark)?)?,
// NOTE(review): schema validation is skipped (`json_to_llg_no_validate`);
// confirm the JSON schema is validated elsewhere if that matters.
Constraint::JsonSchema(value) => {
JsonCompileOptions::default().json_to_llg_no_validate(value)?
}
// Raw llguidance grammar supplied directly as JSON.
Constraint::Llguidance(value) => serde_json::from_value(value.clone())?,
Constraint::None => return Ok(None),
};
Ok(Some(grm))
}

pub fn constraint_from_llg_grammar(
tok_env: TokEnv,
grm: TopLevelGrammar,
) -> Result<llguidance_parser::Constraint> {
let parser = TokenParser::from_llguidance_json(
tok_env,
grm,
llguidance_parser::Logger::new(0, 1),
InferenceCapabilities {
..Default::default()
},
ParserLimits::default(),
vec![],
)?;
Ok(llguidance_parser::Constraint::new(parser))
}
4 changes: 2 additions & 2 deletions mistralrs-core/src/pipeline/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,9 @@ mod processing;
mod sampling;
mod speculative;
mod vision;
pub(crate) mod llg;

pub use super::diffusion_models::DiffusionGenerationParams;
use crate::aici::toktree::TokTrie;
use crate::amoe::{AnyMoeConfig, AnyMoeExpertType, AnyMoeTrainingInputs, AnyMoeTrainingResult};
use crate::diffusion_models::response::send_responses;
use crate::paged_attention::{CacheConfig, CacheEngine};
Expand Down Expand Up @@ -66,7 +66,7 @@ use self::text_models_inputs_processor::PagedAttentionMeta;
pub struct GeneralMetadata {
pub max_seq_len: usize,
/// Only None if it doesn't make sense for the model
pub tok_trie: Option<Arc<TokTrie>>,
pub tok_trie: Option<toktrie::TokEnv>,
pub has_no_kv_cache: bool,
pub num_hidden_layers: usize,
pub eos_tok: Vec<u32>,
Expand Down
5 changes: 2 additions & 3 deletions mistralrs-core/src/pipeline/normal.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,7 @@ use super::{
AutoLoader, Gemma2Loader, GemmaLoader, LlamaLoader, MistralLoader, MixtralLoader,
NormalLoaderType, Phi2Loader, Phi3Loader, Phi3_5MoELoader, Qwen2Loader, Starcoder2Loader,
};
use crate::aici::bintokens::build_tok_trie;
use crate::aici::toktree::TokTrie;
use super::llg::build_tok_env;
use crate::amoe::AnyMoeExpertType;
use crate::lora::Ordering;
use crate::paged_attention::{calculate_cache_config, AttentionImplementation, CacheEngine};
Expand Down Expand Up @@ -426,7 +425,7 @@ impl Loader for NormalLoader {
};

let max_seq_len = model.max_seq_len();
let tok_trie: Arc<TokTrie> = build_tok_trie(tokenizer.clone()).into();
let tok_trie = build_tok_env(tokenizer.clone()).into();
let num_hidden_layers = model.cache().lock().len();
let eos = calculate_eos_tokens(&chat_template, gen_conf, &tokenizer);
let sliding_window = model.config().sliding_window;
Expand Down
44 changes: 16 additions & 28 deletions mistralrs-core/src/pipeline/sampling.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ use candle_core::{DType, Device, Result, Tensor};
use rand_isaac::Isaac64Rng;

use crate::{
get_bias_if_not_allowed,
prefix_cacher::PrefixCacheManager,
sampler::Logprobs,
sequence::{Sequence, SequenceRecognizer},
Expand All @@ -30,6 +29,7 @@ pub(crate) async fn finish_or_add_toks_to_seq(
"`finish_or_add_toks_to_seq` requires the pipeline to have a token trie"
.to_string(),
))?
.tok_trie()
.decode(&[logprobs.token]),
&is_done,
);
Expand Down Expand Up @@ -325,25 +325,23 @@ pub async fn sample_sequence(
};

let bias_if_not_allowed = match &mut seq.recognizer {
SequenceRecognizer::Regex(ref mut rx) => {
get_bias_if_not_allowed!(seq.tok_trie, rx.as_mut(), first_lobprobs_response.token)
}
SequenceRecognizer::Cfg(ref mut cfg) => {
get_bias_if_not_allowed!(seq.tok_trie, cfg.as_mut(), first_lobprobs_response.token)
SequenceRecognizer::Llguidance(ref mut llg) => {
let bias = llg.compute_mask().map_err(candle_core::Error::msg)?;
if let Some(mask) = &bias.sample_mask {
if mask.is_allowed(first_lobprobs_response.token) {
None
} else {
Some(mask)
}
} else {
None
}
}
SequenceRecognizer::None => None,
};
let second_logprobs_response = match bias_if_not_allowed {
Some(token_set) => {
let mut acc = vec![
-f32::INFINITY;
seq.tok_trie
.as_ref()
.ok_or(candle_core::Error::Msg(
"TokTrie must be present in pipeline if bias is calculated".to_string()
))?
.vocab_size()
];
let mut acc = vec![-f32::INFINITY; token_set.len()];
token_set.apply_to(&mut acc);
let new_logits = (logits + Tensor::from_slice(&acc, acc.len(), &Device::Cpu)?)?;

Expand Down Expand Up @@ -374,20 +372,10 @@ pub async fn sample_sequence(
None => first_lobprobs_response,
};

if add_to_trie && seq.tok_trie.is_some() {
if add_to_trie {
match seq.recognizer {
SequenceRecognizer::Regex(ref mut rx) => {
seq.tok_trie
.as_ref()
.unwrap()
.append_token(rx.as_mut(), second_logprobs_response.token)
.map_err(candle_core::Error::msg)?;
}
SequenceRecognizer::Cfg(ref mut cfg) => {
seq.tok_trie
.as_ref()
.unwrap()
.append_token(cfg.as_mut(), second_logprobs_response.token)
SequenceRecognizer::Llguidance(ref mut llg) => {
llg.commit_token(Some(second_logprobs_response.token))
.map_err(candle_core::Error::msg)?;
}
SequenceRecognizer::None => {}
Expand Down
22 changes: 2 additions & 20 deletions mistralrs-core/src/pipeline/speculative.rs
Original file line number Diff line number Diff line change
Expand Up @@ -579,26 +579,8 @@ impl Pipeline for SpeculativePipeline {
)
.await?;
match seq.recognizer {
SequenceRecognizer::Regex(ref mut rx) => {
get_mut_arcmutex!(self.target)
.get_metadata()
.tok_trie
.as_ref()
.ok_or(candle_core::Error::Msg(
"`SpeculativePipeline::step` requires a token trie".to_string(),
))?
.append_token(rx.as_mut(), accepted.token)
.map_err(candle_core::Error::msg)?;
}
SequenceRecognizer::Cfg(ref mut cfg) => {
get_mut_arcmutex!(self.target)
.get_metadata()
.tok_trie
.as_ref()
.ok_or(candle_core::Error::Msg(
"`SpeculativePipeline::step` requires a token trie".to_string(),
))?
.append_token(cfg.as_mut(), accepted.token)
SequenceRecognizer::Llguidance(ref mut llg) => {
llg.commit_token(Some(accepted.token))
.map_err(candle_core::Error::msg)?;
}
SequenceRecognizer::None => {}
Expand Down
Loading