Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
29 commits
Select commit. Hold Shift + click to select a range.
7fa4686
import llguidance modules
mmoskal Nov 6, 2024
1dcca91
llg constraint types
mmoskal Nov 6, 2024
eaf52be
integrate llguidance
mmoskal Nov 6, 2024
cc82722
fix handling of stop tokens
mmoskal Nov 6, 2024
fb83b5c
update toktrie
mmoskal Nov 6, 2024
00c36dd
remove submodules
mmoskal Nov 6, 2024
cab564b
fix version conflicts
mmoskal Nov 6, 2024
d1becbd
tok_trie -> tok_env rename
mmoskal Nov 6, 2024
f70b3de
update to latest llguidance
mmoskal Nov 30, 2024
3d22f2b
Merge branch 'master' into llg_cleanup
mmoskal Nov 30, 2024
ca9e346
sync lock
mmoskal Nov 30, 2024
7b3ae50
bump llg (lazy_static fix)
mmoskal Nov 30, 2024
146bc4c
update to latest llguidance, fix conflicts
mmoskal Nov 30, 2024
6019fee
import toktrie via llguidance
mmoskal Nov 30, 2024
dd35965
n=1
mmoskal Dec 1, 2024
3ac55ed
test with llama1b
mmoskal Dec 1, 2024
8ca7514
remove aici folder (no longer used)
mmoskal Dec 1, 2024
7967d42
use more specific type for llg grammars
mmoskal Dec 1, 2024
9a919d8
update python APIs to support json schema and llg
mmoskal Dec 1, 2024
0ad0948
update example to use lark not yacc
mmoskal Dec 1, 2024
816ac8f
rename example
mmoskal Dec 1, 2024
5fc906b
remove testing scripts
mmoskal Dec 1, 2024
5e9cbd2
re-export llguidance for easier LlguidanceGrammar construction
mmoskal Dec 2, 2024
ffcdd2a
fix formatting
mmoskal Dec 2, 2024
fda20fe
fix clippy
mmoskal Dec 2, 2024
b5add20
Merge branch 'master' into llg_cleanup
mmoskal Dec 2, 2024
2c59224
add python samples
mmoskal Dec 2, 2024
976092c
add server samples
mmoskal Dec 2, 2024
ac7c35d
add rust samples
mmoskal Dec 2, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Previous commit
Next commit
Merge branch 'master' into llg_cleanup
  • Loading branch information
mmoskal committed Nov 30, 2024
commit 3d22f2b18a6d933edc28f7a2f5186319b4deabf9
54 changes: 52 additions & 2 deletions mistralrs-core/src/engine/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,9 @@ use toktrie::TokEnv;

use crate::{
pipeline::{
text_models_inputs_processor::PagedAttentionMeta, AdapterInstruction, CacheBackendMetadata,
CacheInstruction, EitherCache, NormalCache,
llg::{constraint_from_llg_grammar, llg_grammar_from_constraint},
text_models_inputs_processor::PagedAttentionMeta,
AdapterInstruction, CacheBackendMetadata, CacheInstruction,
},
request::{DetokenizationRequest, NormalRequest, TokenizationRequest},
response::CompletionChoice,
Expand Down Expand Up @@ -826,6 +826,56 @@ impl Engine {
.cache_config
.clone()
.map(|conf| conf.block_size);
let trie = get_mut_arcmutex!(self.pipeline)
.get_metadata()
.tok_trie
.as_ref()
.map(|x| (**x).clone());

let cache = get_mut_arcmutex!(self.pipeline).cache().clone();
let seq_preallocated_cache = if let EitherCache::Normal(_cache) = cache {
let metadata = get_mut_arcmutex!(self.pipeline).get_metadata();
let model_metadata = metadata
.model_metadata
.as_ref()
.expect("If a model has a NormalCache it must have a model metadata");
let n_tokens = prompt_tokens.len();
let required_blocks = n_tokens.div_ceil(NormalCache::CACHE_GROW_SIZE);
let max_seq_len = required_blocks * NormalCache::CACHE_GROW_SIZE;
let kv_shape = (
1usize,
model_metadata.num_kv_heads(),
max_seq_len,
model_metadata.head_dim(),
);
let dtype = get_mut_arcmutex!(self.pipeline)
.get_metadata()
.activation_dtype;
let seq_cache =
Tensor::zeros(kv_shape, dtype, &get_mut_arcmutex!(self.pipeline).device());
let seq_cache = match seq_cache {
Ok(x) => x,
Err(_) => {
request
.response
.send(Response::InternalError(
"Failed to allocate preallocated KV cache."
.to_string()
.into(),
))
.await
.expect("Expected receiver.");
return;
}
};
Some(seq_cache)
} else {
None
};

let now = SystemTime::now()
.duration_since(UNIX_EPOCH)
.expect("Time travel has occurred!");
let seq = Sequence::new_waiting(
prompt_tokens.clone(),
prompt_text.clone(),
Expand Down
3 changes: 2 additions & 1 deletion mistralrs-core/src/pipeline/ggml.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
use super::cache_manager::FullCacheManager;
use super::llg::build_tok_env;
use super::{
get_model_paths, get_xlora_paths, text_models_inputs_processor::ModelInputs, AdapterKind,
CacheManager, GeneralMetadata, Loader, ModelKind, ModelPaths, QuantizationKind, TokenSource,
Expand All @@ -10,7 +11,7 @@ use super::{
};
use crate::lora::Ordering;
use crate::pipeline::chat_template::{calculate_eos_tokens, GenerationConfig};
use super::llg::build_tok_env;
use crate::pipeline::get_chat_template;
use crate::pipeline::sampling::sample_and_add_toks;
use crate::pipeline::{ChatTemplate, LocalModelPaths};
use crate::prefix_cacher::PrefixCacheManager;
Expand Down
2 changes: 1 addition & 1 deletion mistralrs-core/src/pipeline/gguf.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use super::cache_manager::DefaultCacheManager;
use super::cache_manager::{DefaultCacheManager, FullCacheManager, NormalCacheManager};
use super::llg::build_tok_env;
use super::{
get_model_paths, get_xlora_paths, text_models_inputs_processor::ModelInputs, AdapterKind,
Expand Down
7 changes: 5 additions & 2 deletions mistralrs-core/src/pipeline/normal.rs
Original file line number Diff line number Diff line change
Expand Up @@ -528,8 +528,11 @@ impl Loader for NormalLoader {
};

let max_seq_len = model.max_seq_len();
let tok_env = build_tok_env(tokenizer.clone()).into();
let num_hidden_layers = model.cache().lock().len();
let tok_trie: Arc<TokTrie> = build_tok_trie(tokenizer.clone()).into();
let num_hidden_layers = match model.cache() {
EitherCache::Full(full) => full.lock().len(),
EitherCache::Normal(normal) => normal.lock().unwrap().0.len(),
};
let eos = calculate_eos_tokens(&chat_template, gen_conf, &tokenizer);
let sliding_window = model.config().sliding_window;
let model_metadata = Arc::new(model.config().clone());
Expand Down
11 changes: 9 additions & 2 deletions mistralrs-core/src/pipeline/vision.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
use super::cache_manager::DefaultCacheManager;
use super::isq::UqffFullSer;
use super::llg::build_tok_env;
use super::cache_manager::{FullCacheManager, NormalCacheManager};
use super::isq::{ImatrixDataSource, UqffFullSer};
use super::{
get_model_paths, get_xlora_paths, AdapterActivationMixin, AnyMoePipelineMixin, CacheManager,
CacheManagerMixin, EitherCache, ForwardInputsResult, GeneralMetadata, IsqPipelineMixin, Loader,
Expand All @@ -12,6 +14,8 @@ use super::{
Idefics2Loader, Idefics3Loader, LLaVALoader, LLaVANextLoader, Phi3VLoader, VisionLoaderType,
};
use super::{Idefics2Loader, LLaVALoader, LLaVANextLoader, Phi3VLoader, VisionLoaderType};
use crate::aici::bintokens::build_tok_trie;
use crate::aici::toktree::TokTrie;
use crate::paged_attention::{calculate_cache_config, AttentionImplementation, CacheEngine};
use crate::pipeline::chat_template::{calculate_eos_tokens, GenerationConfig};
use crate::pipeline::sampling::sample_and_add_toks;
Expand Down Expand Up @@ -363,8 +367,11 @@ impl Loader for VisionLoader {
};

let max_seq_len = model.max_seq_len();
let tok_env = build_tok_env(tokenizer.clone()).into();
let num_hidden_layers = model.cache().lock().len();
let tok_trie: Arc<TokTrie> = build_tok_trie(tokenizer.clone()).into();
let num_hidden_layers = match model.cache() {
EitherCache::Full(full) => full.lock().len(),
EitherCache::Normal(normal) => normal.lock().unwrap().0.len(),
};
let eos = calculate_eos_tokens(&chat_template, gen_conf, &tokenizer);
let sliding_window = model.config().sliding_window;
let model_metadata = Arc::new(model.config().clone());
Expand Down
You are viewing a condensed version of this merge commit. You can view the full changes here.