@@ -300,6 +300,7 @@ def byref(obj: CtypesCData, offset: Optional[int] = None) -> CtypesRef[CtypesCDa
# LLAMA_VOCAB_PRE_TYPE_QWEN2 = 11,
# LLAMA_VOCAB_PRE_TYPE_OLMO = 12,
# LLAMA_VOCAB_PRE_TYPE_DBRX = 13,
+ # LLAMA_VOCAB_PRE_TYPE_SMAUG = 14,
# };
LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0
LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1
@@ -315,6 +316,7 @@ def byref(obj: CtypesCData, offset: Optional[int] = None) -> CtypesRef[CtypesCDa
LLAMA_VOCAB_PRE_TYPE_QWEN2 = 11
LLAMA_VOCAB_PRE_TYPE_OLMO = 12
LLAMA_VOCAB_PRE_TYPE_DBRX = 13
+ LLAMA_VOCAB_PRE_TYPE_SMAUG = 14


# // note: these values should be synchronized with ggml_rope
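These module-level constants mirror the commented `llama_vocab_pre_type` enum above, so the new `LLAMA_VOCAB_PRE_TYPE_SMAUG` value can be compared against whatever pre-tokenizer id a model reports. A minimal sketch, assuming the constants are imported from `llama_cpp.llama_cpp` (the module this diff patches):

```python
# Sketch: map a pre-tokenizer type id to a readable name using the constants above.
# The import path is an assumption based on the file being patched.
import llama_cpp.llama_cpp as llama_cpp

PRE_TYPE_NAMES = {
    llama_cpp.LLAMA_VOCAB_PRE_TYPE_DEFAULT: "default",
    llama_cpp.LLAMA_VOCAB_PRE_TYPE_LLAMA3: "llama3",
    llama_cpp.LLAMA_VOCAB_PRE_TYPE_QWEN2: "qwen2",
    llama_cpp.LLAMA_VOCAB_PRE_TYPE_OLMO: "olmo",
    llama_cpp.LLAMA_VOCAB_PRE_TYPE_DBRX: "dbrx",
    llama_cpp.LLAMA_VOCAB_PRE_TYPE_SMAUG: "smaug",  # new in this change
}

def pre_type_name(pre_type: int) -> str:
    """Return a human-readable name for a llama_vocab_pre_type id."""
    return PRE_TYPE_NAMES.get(pre_type, f"unknown ({pre_type})")
```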
@@ -611,17 +613,17 @@ class llama_batch(ctypes.Structure):
# };
class llama_model_kv_override_value(ctypes.Union):
    _fields_ = [
-         ("int_value", ctypes.c_int64),
-         ("float_value", ctypes.c_double),
-         ("bool_value", ctypes.c_bool),
-         ("str_value", ctypes.c_char * 128),
+         ("val_i64", ctypes.c_int64),
+         ("val_f64", ctypes.c_double),
+         ("val_bool", ctypes.c_bool),
+         ("val_str", ctypes.c_char * 128),
    ]

    if TYPE_CHECKING:
-         int_value: int
-         float_value: float
-         bool_value: bool
-         str_value: bytes
+         val_i64: int
+         val_f64: float
+         val_bool: bool
+         val_str: bytes


class llama_model_kv_override(ctypes.Structure):
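Only the union itself is changed in this hunk, so here is a minimal sketch of how the renamed fields would be used when building a metadata override. The `tag`, `key`, and `value` field names on `llama_model_kv_override` and the `LLAMA_KV_OVERRIDE_TYPE_INT` constant are assumptions taken from the surrounding bindings, and the metadata key is purely hypothetical:

```python
from llama_cpp.llama_cpp import (
    llama_model_kv_override,     # struct shown as context above
    LLAMA_KV_OVERRIDE_TYPE_INT,  # assumed constant selecting an integer override
)

override = llama_model_kv_override()
override.key = b"some.metadata.key"        # hypothetical GGUF metadata key
override.tag = LLAMA_KV_OVERRIDE_TYPE_INT
override.value.val_i64 = 4                 # previously override.value.int_value
```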
@@ -718,6 +720,8 @@ class llama_model_params(ctypes.Structure):
    ]


+ # // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
+ # // https://github.com/ggerganov/llama.cpp/pull/7544
# struct llama_context_params {
#     uint32_t seed;  // RNG seed, -1 for random
#     uint32_t n_ctx; // text context, 0 = from model
@@ -744,15 +748,14 @@ class llama_model_params(ctypes.Structure):
#     ggml_backend_sched_eval_callback cb_eval;
#     void * cb_eval_user_data;

- #     enum ggml_type type_k; // data type for K cache
- #     enum ggml_type type_v; // data type for V cache
+ #     enum ggml_type type_k; // data type for K cache [EXPERIMENTAL]
+ #     enum ggml_type type_v; // data type for V cache [EXPERIMENTAL]

#     // Keep the booleans together to avoid misalignment during copy-by-value.
#     bool logits_all;  // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
#     bool embeddings;  // if true, extract embeddings (together with logits)
#     bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
- #     bool flash_attn; // whether to use flash attention
-
+ #     bool flash_attn; // whether to use flash attention [EXPERIMENTAL]

#     // Abort callback
#     // if it returns true, execution of llama_decode() will be aborted
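The newly [EXPERIMENTAL]-tagged fields correspond to attributes of the `llama_context_params` ctypes structure. A minimal sketch of toggling them, assuming `llama_context_default_params()` and the `GGML_TYPE_Q8_0` constant exposed by these bindings (neither is shown in this hunk):

```python
import llama_cpp.llama_cpp as llama_cpp

ctx_params = llama_cpp.llama_context_default_params()
ctx_params.n_ctx = 4096
# [EXPERIMENTAL] per the note above: non-default values may crash or give wrong results.
ctx_params.flash_attn = True
ctx_params.type_k = llama_cpp.GGML_TYPE_Q8_0  # quantized K cache
ctx_params.type_v = llama_cpp.GGML_TYPE_Q8_0  # quantized V cache
```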
@@ -2454,6 +2457,16 @@ def llama_token_is_eog(model: llama_model_p, token: Union[llama_token, int], /)
    ...


+ # // Identify if Token Id is a control token or a render-able token
+ # LLAMA_API bool llama_token_is_control(const struct llama_model * model, llama_token token);
+ @ctypes_function(
+     "llama_token_is_control", [llama_model_p_ctypes, llama_token], ctypes.c_bool
+ )
+ def llama_token_is_control(model: llama_model_p, token: Union[llama_token, int], /) -> bool:
+     """Identify if Token Id is a control token or a render-able token"""
+     ...
+
+

# // Special tokens


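As a usage note, here is a minimal sketch of filtering control tokens out of a decoded sequence with the new binding; the model handle and token list are placeholders you would obtain from your own loading and decoding code:

```python
from llama_cpp.llama_cpp import llama_token_is_control

def renderable_tokens(model, tokens):
    """Drop control tokens, keeping only tokens meant to be rendered as text."""
    return [tok for tok in tokens if not llama_token_is_control(model, tok)]
```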