@@ -34,12 +34,21 @@ class Settings(BaseSettings):
         default=None,
         description="The alias of the model to use for generating completions.",
     )
+    seed: int = Field(default=llama_cpp.LLAMA_DEFAULT_SEED, description="Random seed. -1 for random.")
     n_ctx: int = Field(default=2048, ge=1, description="The context size.")
+    n_batch: int = Field(
+        default=512, ge=1, description="The batch size to use per eval."
+    )
     n_gpu_layers: int = Field(
         default=0,
         ge=0,
         description="The number of layers to put on the GPU. The rest will be on the CPU.",
     )
+    main_gpu: int = Field(
+        default=0,
+        ge=0,
+        description="Main GPU to use.",
+    )
     tensor_split: Optional[List[float]] = Field(
         default=None,
         description="Split layers across multiple GPUs in proportion.",
@@ -50,35 +59,45 @@ class Settings(BaseSettings):
     rope_freq_scale: float = Field(
         default=1.0, description="RoPE frequency scaling factor"
     )
-    seed: int = Field(default=1337, description="Random seed. -1 for random.")
-    n_batch: int = Field(
-        default=512, ge=1, description="The batch size to use per eval."
+    low_vram: bool = Field(
+        default=False,
+        description="Whether to use less VRAM. This will reduce performance.",
     )
-    n_threads: int = Field(
-        default=max(multiprocessing.cpu_count() // 2, 1),
-        ge=1,
-        description="The number of threads to use.",
+    mul_mat_q: bool = Field(
+        default=True, description="if true, use experimental mul_mat_q kernels"
     )
     f16_kv: bool = Field(default=True, description="Whether to use f16 key/value.")
-    use_mlock: bool = Field(
-        default=llama_cpp.llama_mlock_supported(),
-        description="Use mlock.",
+    logits_all: bool = Field(default=True, description="Whether to return logits.")
+    vocab_only: bool = Field(
+        default=False, description="Whether to only return the vocabulary."
     )
     use_mmap: bool = Field(
         default=llama_cpp.llama_mmap_supported(),
         description="Use mmap.",
     )
+    use_mlock: bool = Field(
+        default=llama_cpp.llama_mlock_supported(),
+        description="Use mlock.",
+    )
     embedding: bool = Field(default=True, description="Whether to use embeddings.")
-    low_vram: bool = Field(
-        default=False,
-        description="Whether to use less VRAM. This will reduce performance.",
+    n_threads: int = Field(
+        default=max(multiprocessing.cpu_count() // 2, 1),
+        ge=1,
+        description="The number of threads to use.",
     )
     last_n_tokens_size: int = Field(
         default=64,
         ge=0,
         description="Last n tokens to keep for repeat penalty calculation.",
     )
-    logits_all: bool = Field(default=True, description="Whether to return logits.")
+    lora_base: Optional[str] = Field(
+        default=None,
+        description="Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model."
+    )
+    lora_path: Optional[str] = Field(
+        default=None,
+        description="Path to a LoRA file to apply to the model.",
+    )
     cache: bool = Field(
         default=False,
         description="Use a cache to reduce processing times for evaluated prompts.",
@@ -91,9 +110,6 @@ class Settings(BaseSettings):
         default=2 << 30,
         description="The size of the cache in bytes. Only used if cache is True.",
     )
-    vocab_only: bool = Field(
-        default=False, description="Whether to only return the vocabulary."
-    )
     verbose: bool = Field(
         default=True, description="Whether to print debug information."
     )
@@ -103,18 +119,6 @@ class Settings(BaseSettings):
         default=True,
         description="Whether to interrupt requests when a new request is received.",
     )
-    n_gqa: Optional[int] = Field(
-        default=None,
-        description="TEMPORARY: Set to 8 for Llama2 70B",
-    )
-    rms_norm_eps: Optional[float] = Field(
-        default=None,
-        description="TEMPORARY",
-    )
-    mul_mat_q: Optional[bool] = Field(
-        default=None,
-        description="TEMPORARY",
-    )
 
 
 class ErrorResponse(TypedDict):
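Because `Settings` extends pydantic's `BaseSettings`, every field above (including the new `n_batch`, `main_gpu`, and `lora_path`) can also be supplied through an environment variable of the same name, matched case-insensitively, with no code changes. A minimal sketch, assuming this file is `llama_cpp/server/app.py` as in the diff header; the model path and values are hypothetical:

```python
import os

# Hypothetical values for illustration; each Settings field name maps to
# an environment variable of the same name (case-insensitive).
os.environ["MODEL"] = "./models/7B/model.bin"  # required field on Settings
os.environ["N_BATCH"] = "256"                  # overrides the 512 default
os.environ["MAIN_GPU"] = "1"                   # use GPU 1 as the main GPU

from llama_cpp.server.app import Settings

settings = Settings()
assert settings.n_batch == 256 and settings.main_gpu == 1
```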
@@ -334,24 +338,27 @@ def create_app(settings: Optional[Settings] = None):
     global llama
     llama = llama_cpp.Llama(
         model_path=settings.model,
+        seed=settings.seed,
+        n_ctx=settings.n_ctx,
+        n_batch=settings.n_batch,
         n_gpu_layers=settings.n_gpu_layers,
+        main_gpu=settings.main_gpu,
         tensor_split=settings.tensor_split,
         rope_freq_base=settings.rope_freq_base,
         rope_freq_scale=settings.rope_freq_scale,
-        seed=settings.seed,
+        low_vram=settings.low_vram,
+        mul_mat_q=settings.mul_mat_q,
         f16_kv=settings.f16_kv,
-        use_mlock=settings.use_mlock,
+        logits_all=settings.logits_all,
+        vocab_only=settings.vocab_only,
         use_mmap=settings.use_mmap,
+        use_mlock=settings.use_mlock,
         embedding=settings.embedding,
-        logits_all=settings.logits_all,
         n_threads=settings.n_threads,
-        n_batch=settings.n_batch,
-        n_ctx=settings.n_ctx,
         last_n_tokens_size=settings.last_n_tokens_size,
-        vocab_only=settings.vocab_only,
+        lora_base=settings.lora_base,
+        lora_path=settings.lora_path,
         verbose=settings.verbose,
-        n_gqa=settings.n_gqa,
-        rms_norm_eps=settings.rms_norm_eps,
     )
     if settings.cache:
         if settings.cache_type == "disk":
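For reference, a minimal sketch of exercising the reworked constructor call end to end through `create_app`, assuming `llama-cpp-python` and `uvicorn` are installed; the model path is a stand-in and must point at a real model file, since `create_app` loads it immediately:

```python
import uvicorn

from llama_cpp.server.app import Settings, create_app

# Hypothetical values for illustration; seed=-1 requests a random seed
# per the field description above.
app = create_app(
    settings=Settings(
        model="./models/7B/model.bin",
        seed=-1,
        n_batch=512,
        n_gpu_layers=35,  # offload 35 layers if built with GPU support
    )
)

if __name__ == "__main__":
    uvicorn.run(app, host="127.0.0.1", port=8000)
```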