@@ -468,11 +468,13 @@ def byref(obj: CtypesCData, offset: Optional[int] = None) -> CtypesRef[CtypesCDa
#     LLAMA_POOLING_TYPE_NONE = 0,
#     LLAMA_POOLING_TYPE_MEAN = 1,
#     LLAMA_POOLING_TYPE_CLS = 2,
#     LLAMA_POOLING_TYPE_LAST = 3,
# };
# Pooling strategies for sequence embeddings; values mirror the C
# `enum llama_pooling_type` in llama.h and must stay in sync with it.
LLAMA_POOLING_TYPE_UNSPECIFIED = -1  # let the model/context decide
LLAMA_POOLING_TYPE_NONE = 0
LLAMA_POOLING_TYPE_MEAN = 1
LLAMA_POOLING_TYPE_CLS = 2
LLAMA_POOLING_TYPE_LAST = 3

477479# enum llama_split_mode {
478480# LLAMA_SPLIT_MODE_NONE = 0, // single GPU
@@ -761,7 +763,6 @@ class llama_model_params(ctypes.Structure):
761763
762764# enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
763765# enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id
764- # // (ignored if no pooling layer)
765766
766767# // ref: https://github.com/ggerganov/llama.cpp/pull/2054
767768# float rope_freq_base; // RoPE base frequency, 0 = from model
@@ -2316,6 +2317,16 @@ def llama_n_threads_batch(ctx: llama_context_p, /) -> int:
23162317 ...
23172318
23182319
# // Set whether the model is in embeddings mode or not
# // If true, embeddings will be returned but logits will not
# LLAMA_API void llama_set_embeddings(struct llama_context * ctx, bool embeddings);
@ctypes_function("llama_set_embeddings", [llama_context_p_ctypes, ctypes.c_bool], None)
def llama_set_embeddings(ctx: llama_context_p, embeddings: bool, /):
    """Set whether the context is in embeddings mode or not.

    If True, subsequent decode calls return embeddings but not logits.

    Args:
        ctx: The llama context to configure.
        embeddings: True to enable embeddings-only output.
    """
    ...

2329+
23192330# // Set whether to use causal attention or not
23202331# // If set to true, the model will only attend to the past tokens
23212332# LLAMA_API void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn);