Merge branch 'master' into llg_cleanup

EricLBuehler · EricLBuehler · Dec 2, 2024 · Nov 6, 2024 · Nov 6, 2024 · Nov 6, 2024
commit 3d22f2b18a6d933edc28f7a2f5186319b4deabf9
diff --git a/mistralrs-core/src/engine/mod.rs b/mistralrs-core/src/engine/mod.rs
@@ -14,9 +14,9 @@ use toktrie::TokEnv;
 
 use crate::{
     pipeline::{
+        text_models_inputs_processor::PagedAttentionMeta, AdapterInstruction, CacheBackendMetadata,
+        CacheInstruction, EitherCache, NormalCache,
         llg::{constraint_from_llg_grammar, llg_grammar_from_constraint},
-        text_models_inputs_processor::PagedAttentionMeta,
-        AdapterInstruction, CacheBackendMetadata, CacheInstruction,
     },
     request::{DetokenizationRequest, NormalRequest, TokenizationRequest},
     response::CompletionChoice,
@@ -826,6 +826,56 @@ impl Engine {
                 .cache_config
                 .clone()
                 .map(|conf| conf.block_size);
+            let trie = get_mut_arcmutex!(self.pipeline)
+                .get_metadata()
+                .tok_trie
+                .as_ref()
+                .map(|x| (**x).clone());
+
+            let cache = get_mut_arcmutex!(self.pipeline).cache().clone();
+            let seq_preallocated_cache = if let EitherCache::Normal(_cache) = cache {
+                let metadata = get_mut_arcmutex!(self.pipeline).get_metadata();
+                let model_metadata = metadata
+                    .model_metadata
+                    .as_ref()
+                    .expect("If a model has a NormalCache it must have a model metadata");
+                let n_tokens = prompt_tokens.len();
+                let required_blocks = n_tokens.div_ceil(NormalCache::CACHE_GROW_SIZE);
+                let max_seq_len = required_blocks * NormalCache::CACHE_GROW_SIZE;
+                let kv_shape = (
+                    1usize,
+                    model_metadata.num_kv_heads(),
+                    max_seq_len,
+                    model_metadata.head_dim(),
+                );
+                let dtype = get_mut_arcmutex!(self.pipeline)
+                    .get_metadata()
+                    .activation_dtype;
+                let seq_cache =
+                    Tensor::zeros(kv_shape, dtype, &get_mut_arcmutex!(self.pipeline).device());
+                let seq_cache = match seq_cache {
+                    Ok(x) => x,
+                    Err(_) => {
+                        request
+                            .response
+                            .send(Response::InternalError(
+                                "Failed to allocate preallocated KV cache."
+                                    .to_string()
+                                    .into(),
+                            ))
+                            .await
+                            .expect("Expected receiver.");
+                        return;
+                    }
+                };
+                Some(seq_cache)
+            } else {
+                None
+            };
+
+            let now = SystemTime::now()
+                .duration_since(UNIX_EPOCH)
+                .expect("Time travel has occurred!");
             let seq = Sequence::new_waiting(
                 prompt_tokens.clone(),
                 prompt_text.clone(),

diff --git a/mistralrs-core/src/pipeline/ggml.rs b/mistralrs-core/src/pipeline/ggml.rs
@@ -1,4 +1,5 @@
 use super::cache_manager::FullCacheManager;
+use super::llg::build_tok_env;
 use super::{
     get_model_paths, get_xlora_paths, text_models_inputs_processor::ModelInputs, AdapterKind,
     CacheManager, GeneralMetadata, Loader, ModelKind, ModelPaths, QuantizationKind, TokenSource,
@@ -10,7 +11,7 @@ use super::{
 };
 use crate::lora::Ordering;
 use crate::pipeline::chat_template::{calculate_eos_tokens, GenerationConfig};
-use super::llg::build_tok_env;
+use crate::pipeline::get_chat_template;
 use crate::pipeline::sampling::sample_and_add_toks;
 use crate::pipeline::{ChatTemplate, LocalModelPaths};
 use crate::prefix_cacher::PrefixCacheManager;

diff --git a/mistralrs-core/src/pipeline/gguf.rs b/mistralrs-core/src/pipeline/gguf.rs
@@ -1,4 +1,4 @@
-use super::cache_manager::DefaultCacheManager;
+use super::cache_manager::{DefaultCacheManager, FullCacheManager, NormalCacheManager};
 use super::llg::build_tok_env;
 use super::{
     get_model_paths, get_xlora_paths, text_models_inputs_processor::ModelInputs, AdapterKind,

diff --git a/mistralrs-core/src/pipeline/normal.rs b/mistralrs-core/src/pipeline/normal.rs
@@ -528,8 +528,11 @@ impl Loader for NormalLoader {
         };
 
         let max_seq_len = model.max_seq_len();
-        let tok_env = build_tok_env(tokenizer.clone()).into();
-        let num_hidden_layers = model.cache().lock().len();
+        let tok_trie: Arc<TokTrie> = build_tok_trie(tokenizer.clone()).into();
+        let num_hidden_layers = match model.cache() {
+            EitherCache::Full(full) => full.lock().len(),
+            EitherCache::Normal(normal) => normal.lock().unwrap().0.len(),
+        };
         let eos = calculate_eos_tokens(&chat_template, gen_conf, &tokenizer);
         let sliding_window = model.config().sliding_window;
         let model_metadata = Arc::new(model.config().clone());

diff --git a/mistralrs-core/src/pipeline/vision.rs b/mistralrs-core/src/pipeline/vision.rs
@@ -1,6 +1,8 @@
 use super::cache_manager::DefaultCacheManager;
 use super::isq::UqffFullSer;
 use super::llg::build_tok_env;
+use super::cache_manager::{FullCacheManager, NormalCacheManager};
+use super::isq::{ImatrixDataSource, UqffFullSer};
 use super::{
     get_model_paths, get_xlora_paths, AdapterActivationMixin, AnyMoePipelineMixin, CacheManager,
     CacheManagerMixin, EitherCache, ForwardInputsResult, GeneralMetadata, IsqPipelineMixin, Loader,
@@ -12,6 +14,8 @@ use super::{
     Idefics2Loader, Idefics3Loader, LLaVALoader, LLaVANextLoader, Phi3VLoader, VisionLoaderType,
 };
 use super::{Idefics2Loader, LLaVALoader, LLaVANextLoader, Phi3VLoader, VisionLoaderType};
+use crate::aici::bintokens::build_tok_trie;
+use crate::aici::toktree::TokTrie;
 use crate::paged_attention::{calculate_cache_config, AttentionImplementation, CacheEngine};
 use crate::pipeline::chat_template::{calculate_eos_tokens, GenerationConfig};
 use crate::pipeline::sampling::sample_and_add_toks;
@@ -363,8 +367,11 @@ impl Loader for VisionLoader {
         };
 
         let max_seq_len = model.max_seq_len();
-        let tok_env = build_tok_env(tokenizer.clone()).into();
-        let num_hidden_layers = model.cache().lock().len();
+        let tok_trie: Arc<TokTrie> = build_tok_trie(tokenizer.clone()).into();
+        let num_hidden_layers = match model.cache() {
+            EitherCache::Full(full) => full.lock().len(),
+            EitherCache::Normal(normal) => normal.lock().unwrap().0.len(),
+        };
         let eos = calculate_eos_tokens(&chat_template, gen_conf, &tokenizer);
         let sliding_window = model.config().sliding_window;
         let model_metadata = Arc::new(model.config().clone());