Merged
Changes from 1 commit
170 commits
af94012
added files
Vasilije1990 Jul 26, 2025
84ba0b2
fix cognify
Vasilije1990 Aug 4, 2025
1a15669
Merge branch 'dev' into add_cli
Vasilije1990 Aug 4, 2025
8590194
feat: add top_k input to SearchView and send to backend
Raj2604 Aug 5, 2025
e26c216
Merge branch 'dev' into feature/top_k-input
Raj2604 Aug 5, 2025
ff1614d
format
Vasilije1990 Aug 12, 2025
1de81c8
added fixes
Vasilije1990 Aug 12, 2025
b226595
added fix
Vasilije1990 Aug 12, 2025
e3bd474
git merge
Vasilije1990 Aug 12, 2025
a15cad0
Merge branch 'dev' into add_cli
Vasilije1990 Aug 12, 2025
ed555a7
feat: adds new Exception classes
hajdul88 Aug 13, 2025
0555dd9
Update __init__.py
hajdul88 Aug 13, 2025
1b3898d
feat: updates rel_db exceptions with the new error classes
hajdul88 Aug 13, 2025
b40dbf3
feat: adds new error classes to litellm instructor
hajdul88 Aug 13, 2025
623148e
feat: adds new errors to litellm client
hajdul88 Aug 13, 2025
7bd2660
feat: setting base classes of data exceptions to the new ones
hajdul88 Aug 13, 2025
6870bba
feat: adds new error to delete
hajdul88 Aug 13, 2025
ebd4403
adds new error classes to keyword extraction error and defines new error
hajdul88 Aug 13, 2025
657c775
feat: adds configattributeerror
hajdul88 Aug 13, 2025
5bc00f1
feat: adds new search classes to search.py
hajdul88 Aug 13, 2025
9fb9f68
adds new base errors to retrieval exceptions
hajdul88 Aug 13, 2025
91b9c11
changes base class for vectordb exceptions
hajdul88 Aug 13, 2025
6dcd59c
feat: Adds changes to cognee graph part
hajdul88 Aug 13, 2025
32996aa
feat: adds new error classes to llm and databases + introduces loglev…
hajdul88 Aug 13, 2025
bed523a
feat: api error handling restruct
hajdul88 Aug 13, 2025
38329da
Merge branch 'dev' into feature/cog-2717-add-better-error-management-…
hajdul88 Aug 13, 2025
544e089
feat: removing invalidValueErrors
hajdul88 Aug 13, 2025
885f7c3
chore: fixing graph elements tests
hajdul88 Aug 13, 2025
d1bfeaa
fix: fixes search unit test error expectation
hajdul88 Aug 13, 2025
fabbd63
chore: renaming error
hajdul88 Aug 13, 2025
59c9204
Update exceptions.py
hajdul88 Aug 13, 2025
68327d3
chore: Changing base classes for exceptions that were already defined
hajdul88 Aug 13, 2025
da40365
ruff formatting
hajdul88 Aug 13, 2025
7af4775
chore: changed pypdf error base class
hajdul88 Aug 13, 2025
66d2c75
chore: changing docstring
hajdul88 Aug 13, 2025
8bd2416
chore: fixing delete exception import in dataset router
hajdul88 Aug 13, 2025
d14d31a
chore: updating delete_by_id test
hajdul88 Aug 13, 2025
748e9fa
feat: adds s3 file system not found error to ingestion
hajdul88 Aug 13, 2025
dbb967f
chore: updates neptune exception base + ruff
hajdul88 Aug 13, 2025
ca21122
Merge branch 'dev' into feature/cog-2717-add-better-error-management-…
hajdul88 Aug 14, 2025
c75f017
Merge branch 'dev' into feature/cog-2717-add-better-error-management-…
hajdul88 Aug 14, 2025
c99b453
feat: adds WrongDataDocumentError to classify documents
hajdul88 Aug 14, 2025
df3a3df
feat: adds errors to classify, and chunking top level
hajdul88 Aug 14, 2025
9f965c4
feat: adds input checks for extract graph from data
hajdul88 Aug 14, 2025
63d071f
feat: adds input checks for add datapoints and summarization tasks
hajdul88 Aug 14, 2025
affbc55
chore: ruff formatting
hajdul88 Aug 14, 2025
a7d7e12
ruff fix
hajdul88 Aug 14, 2025
dd6e26f
Merge remote-tracking branch 'origin/dev' into feature/cog-2717-add-b…
hajdul88 Aug 14, 2025
d884cc4
Merge branch 'dev' into feature/cog-2717-add-better-error-management-…
hajdul88 Aug 15, 2025
f5eecba
feat: add top_k-input and send it to backend
Raj2604 Aug 10, 2025
c606273
Refactor CI workflows to replace Poetry with uv for dependency manage…
daukadolt Aug 14, 2025
cb5590b
Update installation instructions and replace Poetry with uv across do…
daukadolt Aug 14, 2025
a1edfc9
docs: update README.md with new supported python versions
daukadolt Aug 14, 2025
1ab3328
fix: uv uninstalls rest of packages in some workflows
daukadolt Aug 14, 2025
bcdbadc
fix: unintentionally uninstall required deps when "uv sync"
daukadolt Aug 14, 2025
3941e46
chore: add step to rebuild uv lockfile in CI workflow
daukadolt Aug 15, 2025
4312508
undo: keep poetry-uv change limited to CI/CD for now
daukadolt Aug 15, 2025
216ffd2
Refactor CI workflows to replace Poetry with uv (#1250)
Vasilije1990 Aug 15, 2025
81a1758
Remove NetworkX from cognee core.
misselvexu Aug 13, 2025
85c7f11
Keep networkx as core dependency
daukadolt Aug 15, 2025
50ebced
fix: natural_language_retriever.py
daukadolt Aug 15, 2025
8d045b9
chore: forgot to remove NetworkX check in NLPRetriever
daukadolt Aug 15, 2025
8825aaa
format: ruff format
daukadolt Aug 15, 2025
1ee697b
fix: Return distributed as part of Cognee build
dexters1 Aug 15, 2025
1e57ac9
fix: Return distributed as part of Cognee build (#1257)
Vasilije1990 Aug 15, 2025
271e7e9
feature: Introduces new error handling (4 base errors + specified hie…
Vasilije1990 Aug 15, 2025
86e5135
chore: Update Cognee version
dexters1 Aug 15, 2025
99a517d
chore: Update Cognee version (#1258)
dexters1 Aug 15, 2025
761bb1b
test: deprecate networkx_metrics_test.py
daukadolt Aug 15, 2025
5f7598d
test: use neo4j_metrics_test in descriptive tests instead of networkx
daukadolt Aug 15, 2025
e4e0512
feat: add reusable GitHub Action to set up Neo4j with Graph Data Scie…
daukadolt Aug 15, 2025
73e1aa7
Merge branch 'dev' into issues/1183
Vasilije1990 Aug 15, 2025
9987e05
Remove NetworkX from cognee core. (#1241)
Vasilije1990 Aug 15, 2025
b0e3f89
move to gpt5
Vasilije1990 Aug 17, 2025
c4ec679
Merge branch 'dev' into move_to_gpt5
Vasilije1990 Aug 17, 2025
1bd40f1
renamed max tokens
Vasilije1990 Aug 17, 2025
693c883
updated deepeval
Vasilije1990 Aug 17, 2025
abbce6e
fix: Move to gpt5 (#1262)
Vasilije1990 Aug 17, 2025
711c805
feat: adds cognee-user interactions to search
hajdul88 Aug 18, 2025
d53ebb2
Merge branch 'dev' into feature/cog-2734-cognee-feedbacks-interaction…
hajdul88 Aug 18, 2025
dc637f7
fix: fixes add datapoints params
hajdul88 Aug 18, 2025
fbb7d72
fix: ruff formatting
hajdul88 Aug 18, 2025
1d63da7
chore: removes duplicated func def
hajdul88 Aug 18, 2025
78fb415
chore: changes context return value in tests
hajdul88 Aug 18, 2025
b6be617
fix: fixes tests
hajdul88 Aug 18, 2025
0529d4b
fix: fixes kuzu and neo4j tests
hajdul88 Aug 18, 2025
9a46d14
chore: fix search db tests
hajdul88 Aug 18, 2025
fc43ac7
feat: adds user feedback search type
hajdul88 Aug 18, 2025
4d021ec
chore: fix openai version (#1266)
hajdul88 Aug 18, 2025
d4ace62
Merge branch 'dev' into feature/cog-2734-cognee-feedbacks-interaction…
hajdul88 Aug 18, 2025
0fbe218
chore: fixes ruff
hajdul88 Aug 18, 2025
fb51131
Merge branch 'dev' into add_cli
Vasilije1990 Aug 18, 2025
9017b63
simplify logging utils and refactor
Vasilije1990 Aug 18, 2025
9a17aa7
added fix
Vasilije1990 Aug 18, 2025
13e3e3b
added fix
Vasilije1990 Aug 18, 2025
cae173b
fix bug
Vasilije1990 Aug 18, 2025
d084d00
added tests
Vasilije1990 Aug 18, 2025
66d7fa9
fix tests
Vasilije1990 Aug 18, 2025
3152368
added fix
Vasilije1990 Aug 18, 2025
f5d702f
added fixes to integration tests
Vasilije1990 Aug 19, 2025
fe6c900
added fix
Vasilije1990 Aug 19, 2025
372181d
fix: fixes unit test
hajdul88 Aug 19, 2025
fcdee16
feat: adds kuzu and neo4j tests for feedback and interaction features
hajdul88 Aug 19, 2025
4e31ae7
chore: deletes unused var from search test
hajdul88 Aug 19, 2025
e912b6f
fix: ensure connection check before executing checkpoint in KuzuAdapter
daukadolt Aug 19, 2025
c6ec22a
feat: adds scores to Feedback node
hajdul88 Aug 19, 2025
67c7919
fix: ensure connection check before executing checkpoint in KuzuAdapt…
Vasilije1990 Aug 19, 2025
bf1970b
feat: Add top_k Input Control to Search UI for Adjustable Graph Explo…
Vasilije1990 Aug 19, 2025
4a5d5f7
feat: adds feedback weights to edges
hajdul88 Aug 19, 2025
f5d8fc6
chore: ruff ruff
hajdul88 Aug 19, 2025
67d88c4
Merge branch 'dev' into feature/cog-2734-cognee-feedbacks-interaction…
hajdul88 Aug 19, 2025
b8cac4c
feat: adds weight test at the end of test_search_db
hajdul88 Aug 19, 2025
6d9a100
feature: Introduces Cognee-user interactions feature and feedback sea…
Vasilije1990 Aug 19, 2025
d69669b
added ability to send custom prompts to cognify
Vasilije1990 Aug 22, 2025
6d438c8
feat: Add dataset and pipeline status layer
dexters1 Aug 25, 2025
6f230c5
feature: adds environment_setup_and_checks general purpose layer to c…
hajdul88 Aug 25, 2025
a805c64
Fix, remove, improve cli tests.
siillee Aug 25, 2025
573e24f
Remove duplicate test files.
siillee Aug 25, 2025
d4b23aa
feat: Add process_pipeline_check
dexters1 Aug 25, 2025
9a4e8dd
refactor: Make the authorized_user_datasets function more understandable
dexters1 Aug 25, 2025
e85fd35
Fix linter error.
siillee Aug 25, 2025
a3d5164
Fix formatting. Potential fix to invalid api key error.
siillee Aug 25, 2025
66b8bb3
Another potential fix for the invalid API key error.
siillee Aug 25, 2025
31b33a9
Yet another potential fix for the invalid API key.
siillee Aug 25, 2025
4b593aa
And another potential fix for the API error.
siillee Aug 25, 2025
454358e
Solution for the API key error.
siillee Aug 25, 2025
de7a5a1
Fixed yml files
siillee Aug 25, 2025
dce525e
Fix again.
siillee Aug 25, 2025
cf7d41c
Fix
siillee Aug 26, 2025
a6a33e9
Feature/cog 2768 inspect cli tests fix failing ones (#1286)
Vasilije1990 Aug 26, 2025
950d29a
refactor: Update typing
dexters1 Aug 26, 2025
03fc3c8
Merge branch 'dev' into cognee-pipeline-layers
dexters1 Aug 26, 2025
d91b0f6
feature: adds pipeline execution layer to cognify (#1291)
hajdul88 Aug 26, 2025
42d33fc
fix: fixes search test behaviour and adds comments to new pipeline ex…
hajdul88 Aug 26, 2025
5ea3056
refactor: Rename auth layer
dexters1 Aug 26, 2025
0d77649
Merge branch 'cognee-pipeline-layers' of github.com:topoteretes/cogne…
dexters1 Aug 26, 2025
5771b36
feat: cognee pipeline layers (#1287)
dexters1 Aug 26, 2025
ea4f58e
feat: migrate pipeline input validation to a layer (#1284)
borisarzentar Aug 26, 2025
c013499
Fix tests failing for python 3.10
siillee Aug 26, 2025
007399b
feat: migrate pipeline status reset to add pipeline (#1289)
borisarzentar Aug 26, 2025
e5f0c22
Merge branch 'dev' into add_cli
Vasilije1990 Aug 26, 2025
d6e6e87
feat: Add cli (#1197)
Vasilije1990 Aug 26, 2025
8c69653
fix: Resolve issue with Windows path
dexters1 Aug 26, 2025
65542ec
refactor: Make CI/CD faster add more OS tests
dexters1 Aug 26, 2025
229a7a1
refactor: Speed up CI/CD execution time
dexters1 Aug 26, 2025
9c31617
fix: Resolve Windows file path issue
dexters1 Aug 26, 2025
58655ca
refactor: Add proper path to test file
dexters1 Aug 26, 2025
6e5acec
refactor: make run_pipeline a high-level api for running pipelines (#…
borisarzentar Aug 27, 2025
23a46e5
Merge branch 'dev' into fix-windows-path
dexters1 Aug 27, 2025
ecc403e
fix: Add binary for python magic for windows os
dexters1 Aug 27, 2025
644116a
Merge branch 'fix-windows-path' of github.com:topoteretes/cognee into…
dexters1 Aug 27, 2025
eb65a89
fix: Resolve integration tests path issue
dexters1 Aug 27, 2025
23ea1c1
Potential fix for code scanning alert no. 187: Workflow does not cont…
dexters1 Aug 27, 2025
e4e1a54
refactor: Add read permissions only for gh token
dexters1 Aug 27, 2025
11934cd
fix: Resolve integration test issue
dexters1 Aug 27, 2025
3a3274b
Update .github/workflows/test_different_operating_systems.yml
dexters1 Aug 27, 2025
24c155b
fix: Resolve issue with Windows path (#1295)
Vasilije1990 Aug 27, 2025
62afced
feat: Added custom prompt to cognify (#1278)
Vasilije1990 Aug 27, 2025
d4ce0e1
Merge branch 'main' into merge-main-vol-6
dexters1 Aug 27, 2025
6ca46f1
refactor: ruff format
dexters1 Aug 27, 2025
328b75a
Merge main vol 6 (#1298)
Vasilije1990 Aug 27, 2025
5e6ada2
fix issue
Vasilije1990 Aug 27, 2025
792452b
added fix
Vasilije1990 Aug 27, 2025
75ea520
fix to cli
Vasilije1990 Aug 27, 2025
c933290
fixes to CLI
Vasilije1990 Aug 27, 2025
9df3466
format
Vasilije1990 Aug 27, 2025
19b59fc
Update Jupyter notebooks: added execution counts, improved logging ou…
daukadolt Aug 27, 2025
9f27402
Update Jupyter notebooks (#1299)
Vasilije1990 Aug 27, 2025
235015b
linter fix
Vasilije1990 Aug 27, 2025
cc5e684
added fix to tests
Vasilije1990 Aug 27, 2025
renamed max tokens
Vasilije1990 committed Aug 17, 2025
commit 1bd40f1401ffcfb4c1ca4256ecaef45e76ad1383
2 changes: 1 addition & 1 deletion cognee/api/v1/cognify/cognify.py
@@ -91,7 +91,7 @@ async def cognify(
- LangchainChunker: Recursive character splitting with overlap
Determines how documents are segmented for processing.
chunk_size: Maximum tokens per chunk. Auto-calculated based on LLM if None.
Formula: min(embedding_max_tokens, llm_max_tokens // 2)
Formula: min(embedding_max_completion_tokens, llm_max_completion_tokens // 2)
Default limits: ~512-8192 tokens depending on models.
Smaller chunks = more granular but potentially fragmented knowledge.
ontology_file_path: Path to RDF/OWL ontology file for domain-specific entity types.
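The renamed formula in the docstring above can be checked with a small sketch. This is a minimal reconstruction of the documented fallback, assuming the two limits are plain integers taken from the embedding and LLM configs; it is not the actual cognify implementation.

def resolve_chunk_size(
    chunk_size,
    embedding_max_completion_tokens=8191,
    llm_max_completion_tokens=16384,
):
    # Honor an explicit chunk_size; otherwise apply the documented formula.
    if chunk_size is not None:
        return chunk_size
    return min(embedding_max_completion_tokens, llm_max_completion_tokens // 2)

# With the defaults visible in this diff: min(8191, 16384 // 2) == 8191
assert resolve_chunk_size(None) == 8191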
2 changes: 1 addition & 1 deletion cognee/api/v1/responses/models.py
@@ -70,7 +70,7 @@ class ResponseRequest(InDTO):
tool_choice: Optional[Union[str, Dict[str, Any]]] = "auto"
user: Optional[str] = None
temperature: Optional[float] = 1.0
max_tokens: Optional[int] = None
max_completion_tokens: Optional[int] = None


class ToolCallOutput(BaseModel):
@@ -41,11 +41,11 @@ def __init__(
self,
model: Optional[str] = "openai/text-embedding-3-large",
dimensions: Optional[int] = 3072,
max_tokens: int = 512,
max_completion_tokens: int = 512,
):
self.model = model
self.dimensions = dimensions
self.max_tokens = max_tokens
self.max_completion_tokens = max_completion_tokens
self.tokenizer = self.get_tokenizer()
# self.retry_count = 0
self.embedding_model = TextEmbedding(model_name=model)
@@ -112,7 +112,9 @@ def get_tokenizer(self):
"""
logger.debug("Loading tokenizer for FastembedEmbeddingEngine...")

tokenizer = TikTokenTokenizer(model="gpt-4o", max_tokens=self.max_tokens)
tokenizer = TikTokenTokenizer(
model="gpt-4o", max_completion_tokens=self.max_completion_tokens
)

logger.debug("Tokenizer loaded for for FastembedEmbeddingEngine")
return tokenizer
@@ -57,15 +57,15 @@ def __init__(
api_key: str = None,
endpoint: str = None,
api_version: str = None,
max_tokens: int = 512,
max_completion_tokens: int = 512,
):
self.api_key = api_key
self.endpoint = endpoint
self.api_version = api_version
self.provider = provider
self.model = model
self.dimensions = dimensions
self.max_tokens = max_tokens
self.max_completion_tokens = max_completion_tokens
self.tokenizer = self.get_tokenizer()
self.retry_count = 0

@@ -179,20 +179,29 @@ def get_tokenizer(self):
model = self.model.split("/")[-1]

if "openai" in self.provider.lower():
tokenizer = TikTokenTokenizer(model=model, max_tokens=self.max_tokens)
tokenizer = TikTokenTokenizer(
model=model, max_completion_tokens=self.max_completion_tokens
)
elif "gemini" in self.provider.lower():
tokenizer = GeminiTokenizer(model=model, max_tokens=self.max_tokens)
tokenizer = GeminiTokenizer(
model=model, max_completion_tokens=self.max_completion_tokens
)
elif "mistral" in self.provider.lower():
tokenizer = MistralTokenizer(model=model, max_tokens=self.max_tokens)
tokenizer = MistralTokenizer(
model=model, max_completion_tokens=self.max_completion_tokens
)
else:
try:
tokenizer = HuggingFaceTokenizer(
model=self.model.replace("hosted_vllm/", ""), max_tokens=self.max_tokens
model=self.model.replace("hosted_vllm/", ""),
max_completion_tokens=self.max_completion_tokens,
)
except Exception as e:
logger.warning(f"Could not get tokenizer from HuggingFace due to: {e}")
logger.info("Switching to TikToken default tokenizer.")
tokenizer = TikTokenTokenizer(model=None, max_tokens=self.max_tokens)
tokenizer = TikTokenTokenizer(
model=None, max_completion_tokens=self.max_completion_tokens
)

logger.debug(f"Tokenizer loaded for model: {self.model}")
return tokenizer
@@ -30,7 +30,7 @@ class OllamaEmbeddingEngine(EmbeddingEngine):
Instance variables:
- model
- dimensions
- max_tokens
- max_completion_tokens
- endpoint
- mock
- huggingface_tokenizer_name
@@ -39,7 +39,7 @@ class OllamaEmbeddingEngine(EmbeddingEngine):

model: str
dimensions: int
max_tokens: int
max_completion_tokens: int
endpoint: str
mock: bool
huggingface_tokenizer_name: str
@@ -50,13 +50,13 @@ def __init__(
self,
model: Optional[str] = "avr/sfr-embedding-mistral:latest",
dimensions: Optional[int] = 1024,
max_tokens: int = 512,
max_completion_tokens: int = 512,
endpoint: Optional[str] = "http://localhost:11434/api/embeddings",
huggingface_tokenizer: str = "Salesforce/SFR-Embedding-Mistral",
):
self.model = model
self.dimensions = dimensions
self.max_tokens = max_tokens
self.max_completion_tokens = max_completion_tokens
self.endpoint = endpoint
self.huggingface_tokenizer_name = huggingface_tokenizer
self.tokenizer = self.get_tokenizer()
@@ -132,7 +132,7 @@ def get_tokenizer(self):
"""
logger.debug("Loading HuggingfaceTokenizer for OllamaEmbeddingEngine...")
tokenizer = HuggingFaceTokenizer(
model=self.huggingface_tokenizer_name, max_tokens=self.max_tokens
model=self.huggingface_tokenizer_name, max_completion_tokens=self.max_completion_tokens
)
logger.debug("Tokenizer loaded for OllamaEmbeddingEngine")
return tokenizer
4 changes: 2 additions & 2 deletions cognee/infrastructure/databases/vector/embeddings/config.py
@@ -18,7 +18,7 @@ class EmbeddingConfig(BaseSettings):
embedding_endpoint: Optional[str] = None
embedding_api_key: Optional[str] = None
embedding_api_version: Optional[str] = None
embedding_max_tokens: Optional[int] = 8191
embedding_max_completion_tokens: Optional[int] = 8191
🛠️ Refactor suggestion

Backwards compatibility for env var rename.

Renaming to embedding_max_completion_tokens will break existing .env using EMBEDDING_MAX_TOKENS. Accept both via validation alias.

-from typing import Optional
+from typing import Optional
+from pydantic import Field, AliasChoices
@@
-    embedding_max_completion_tokens: Optional[int] = 8191
+    embedding_max_completion_tokens: Optional[int] = Field(
+        8191,
+        validation_alias=AliasChoices(
+            "embedding_max_completion_tokens",
+            "embedding_max_tokens",
+            "EMBEDDING_MAX_COMPLETION_TOKENS",
+            "EMBEDDING_MAX_TOKENS",
+        ),
+    )
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
-embedding_max_completion_tokens: Optional[int] = 8191
+from typing import Optional
+from pydantic import Field, AliasChoices
+embedding_max_completion_tokens: Optional[int] = Field(
+    8191,
+    validation_alias=AliasChoices(
+        "embedding_max_completion_tokens",
+        "embedding_max_tokens",
+        "EMBEDDING_MAX_COMPLETION_TOKENS",
+        "EMBEDDING_MAX_TOKENS",
+    ),
+)
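A quick way to see why the alias matters: with a hypothetical stand-in settings class (not the real EmbeddingConfig), a legacy .env entry still populates the renamed field. The sketch below assumes pydantic-settings v2, where environment lookups are case-insensitive by default.

import os
from typing import Optional
from pydantic import Field, AliasChoices
from pydantic_settings import BaseSettings

class DemoEmbeddingConfig(BaseSettings):
    embedding_max_completion_tokens: Optional[int] = Field(
        8191,
        validation_alias=AliasChoices(
            "embedding_max_completion_tokens",
            "embedding_max_tokens",  # legacy name, keeps old .env files working
        ),
    )

os.environ["EMBEDDING_MAX_TOKENS"] = "4096"  # variable from an existing .env
print(DemoEmbeddingConfig().embedding_max_completion_tokens)  # -> 4096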

huggingface_tokenizer: Optional[str] = None
model_config = SettingsConfigDict(env_file=".env", extra="allow")

@@ -38,7 +38,7 @@ def to_dict(self) -> dict:
"embedding_endpoint": self.embedding_endpoint,
"embedding_api_key": self.embedding_api_key,
"embedding_api_version": self.embedding_api_version,
"embedding_max_tokens": self.embedding_max_tokens,
"embedding_max_completion_tokens": self.embedding_max_completion_tokens,
"huggingface_tokenizer": self.huggingface_tokenizer,
}

@@ -27,7 +27,7 @@ def get_embedding_engine() -> EmbeddingEngine:
config.embedding_provider,
config.embedding_model,
config.embedding_dimensions,
config.embedding_max_tokens,
config.embedding_max_completion_tokens,
config.embedding_endpoint,
config.embedding_api_key,
config.embedding_api_version,
@@ -41,7 +41,7 @@ def create_embedding_engine(
embedding_provider,
embedding_model,
embedding_dimensions,
embedding_max_tokens,
embedding_max_completion_tokens,
embedding_endpoint,
embedding_api_key,
embedding_api_version,
@@ -58,7 +58,7 @@ def create_embedding_engine(
'ollama', or another supported provider.
- embedding_model: The model to be used for the embedding engine.
- embedding_dimensions: The number of dimensions for the embeddings.
- embedding_max_tokens: The maximum number of tokens for the embeddings.
- embedding_max_completion_tokens: The maximum number of tokens for the embeddings.
- embedding_endpoint: The endpoint for the embedding service, relevant for certain
providers.
- embedding_api_key: API key to authenticate with the embedding service, if
@@ -81,7 +81,7 @@ def create_embedding_engine(
return FastembedEmbeddingEngine(
model=embedding_model,
dimensions=embedding_dimensions,
max_tokens=embedding_max_tokens,
max_completion_tokens=embedding_max_completion_tokens,
)

if embedding_provider == "ollama":
@@ -90,7 +90,7 @@ def create_embedding_engine(
return OllamaEmbeddingEngine(
model=embedding_model,
dimensions=embedding_dimensions,
max_tokens=embedding_max_tokens,
max_completion_tokens=embedding_max_completion_tokens,
endpoint=embedding_endpoint,
huggingface_tokenizer=huggingface_tokenizer,
)
@@ -104,5 +104,5 @@ def create_embedding_engine(
api_version=embedding_api_version,
model=embedding_model,
dimensions=embedding_dimensions,
max_tokens=embedding_max_tokens,
max_completion_tokens=embedding_max_completion_tokens,
)
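For orientation, a call with the renamed keyword might look like the sketch below. It is a hypothetical direct invocation using only the parameter names visible in this diff; in the codebase the values come from get_embedding_engine() reading EmbeddingConfig.

# Assumes create_embedding_engine has been imported from its cognee module.
engine = create_embedding_engine(
    embedding_provider="fastembed",
    embedding_model="openai/text-embedding-3-large",
    embedding_dimensions=3072,
    embedding_max_completion_tokens=8191,  # renamed from embedding_max_tokens
    embedding_endpoint=None,
    embedding_api_key=None,
    embedding_api_version=None,
    huggingface_tokenizer=None,
)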
6 changes: 3 additions & 3 deletions cognee/infrastructure/llm/config.py
@@ -18,7 +18,7 @@ class LLMConfig(BaseSettings):
- llm_api_version
- llm_temperature
- llm_streaming
- llm_max_tokens
- llm_max_completion_tokens
- transcription_model
- graph_prompt_path
- llm_rate_limit_enabled
@@ -41,7 +41,7 @@ class LLMConfig(BaseSettings):
llm_api_version: Optional[str] = None
llm_temperature: float = 0.0
llm_streaming: bool = False
llm_max_tokens: int = 16384
llm_max_completion_tokens: int = 16384

baml_llm_provider: str = "openai"
baml_llm_model: str = "gpt-5-mini"
@@ -171,7 +171,7 @@ def to_dict(self) -> dict:
"api_version": self.llm_api_version,
"temperature": self.llm_temperature,
"streaming": self.llm_streaming,
"max_tokens": self.llm_max_tokens,
"max_completion_tokens": self.llm_max_completion_tokens,
"transcription_model": self.transcription_model,
"graph_prompt_path": self.graph_prompt_path,
"rate_limit_enabled": self.llm_rate_limit_enabled,
cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/anthropic/adapter.py
@@ -23,15 +23,15 @@ class AnthropicAdapter(LLMInterface):
name = "Anthropic"
model: str

def __init__(self, max_tokens: int, model: str = None):
def __init__(self, max_completion_tokens: int, model: str = None):
import anthropic

self.aclient = instructor.patch(
create=anthropic.AsyncAnthropic().messages.create, mode=instructor.Mode.ANTHROPIC_TOOLS
)

self.model = model
self.max_tokens = max_tokens
self.max_completion_tokens = max_completion_tokens

Comment on lines +26 to 35
⚠️ Potential issue

Fix: hard-coded max_completion_tokens ignores constructor value

acreate_structured_output hard-codes 4096 and bypasses the configured max_completion_tokens. Use the instance attribute to honor caller configuration.

Apply:

@@
         return await self.aclient(
             model=self.model,
-            max_completion_tokens=4096,
+            max_completion_tokens=self.max_completion_tokens,
             max_retries=5,

Also applies to: 58-61

🤖 Prompt for AI Agents
In
cognee/infrastructure/llm/structured_output_framework/litellm_instructor/llm/anthropic/adapter.py
around lines 26-35 (and similarly at lines 58-61), the acreate_structured_output
call is hard-coded to 4096 for max_completion_tokens which ignores the
constructor parameter; replace the hard-coded literal with the instance
attribute self.max_completion_tokens so the configured value passed to __init__
is used, ensuring both occurrences use self.max_completion_tokens instead of
4096 and keeping the rest of the call intact.

@sleep_and_retry_async()
@rate_limit_async
@@ -57,7 +57,7 @@ async def acreate_structured_output(

return await self.aclient(
model=self.model,
max_tokens=4096,
max_completion_tokens=4096,
max_retries=5,
messages=[
{
@@ -34,7 +34,7 @@ def __init__(
self,
api_key: str,
model: str,
max_tokens: int,
max_completion_tokens: int,
endpoint: Optional[str] = None,
api_version: Optional[str] = None,
streaming: bool = False,
@@ -44,7 +44,7 @@ def __init__(
self.endpoint = endpoint
self.api_version = api_version
self.streaming = streaming
self.max_tokens = max_tokens
self.max_completion_tokens = max_completion_tokens

@observe(as_type="generation")
@sleep_and_retry_async()
@@ -90,7 +90,7 @@ async def acreate_structured_output(
model=f"{self.model}",
messages=messages,
api_key=self.api_key,
max_tokens=self.max_tokens,
max_completion_tokens=self.max_completion_tokens,
temperature=0.1,
response_format=response_schema,
timeout=100,
@@ -41,7 +41,7 @@ def __init__(
api_key: str,
model: str,
name: str,
max_tokens: int,
max_completion_tokens: int,
fallback_model: str = None,
fallback_api_key: str = None,
fallback_endpoint: str = None,
@@ -50,7 +50,7 @@ def __init__(
self.model = model
self.api_key = api_key
self.endpoint = endpoint
self.max_tokens = max_tokens
self.max_completion_tokens = max_completion_tokens

self.fallback_model = fallback_model
self.fallback_api_key = fallback_api_key
@@ -54,11 +54,15 @@ def get_llm_client():
# Check if max_token value is defined in liteLLM for given model
# if not use value from cognee configuration
from cognee.infrastructure.llm.utils import (
get_model_max_tokens,
get_model_max_completion_tokens,
) # imported here to avoid circular imports

model_max_tokens = get_model_max_tokens(llm_config.llm_model)
max_tokens = model_max_tokens if model_max_tokens else llm_config.llm_max_tokens
model_max_completion_tokens = get_model_max_completion_tokens(llm_config.llm_model)
max_completion_tokens = (
model_max_completion_tokens
if model_max_completion_tokens
else llm_config.llm_max_completion_tokens
)

if provider == LLMProvider.OPENAI:
if llm_config.llm_api_key is None:
@@ -74,7 +78,7 @@ def get_llm_client():
api_version=llm_config.llm_api_version,
model=llm_config.llm_model,
transcription_model=llm_config.transcription_model,
max_tokens=max_tokens,
max_completion_tokens=max_completion_tokens,
streaming=llm_config.llm_streaming,
fallback_api_key=llm_config.fallback_api_key,
fallback_endpoint=llm_config.fallback_endpoint,
@@ -94,15 +98,17 @@ def get_llm_client():
llm_config.llm_api_key,
llm_config.llm_model,
"Ollama",
max_tokens=max_tokens,
max_completion_tokens=max_completion_tokens,
)

elif provider == LLMProvider.ANTHROPIC:
from cognee.infrastructure.llm.structured_output_framework.litellm_instructor.llm.anthropic.adapter import (
AnthropicAdapter,
)

return AnthropicAdapter(max_tokens=max_tokens, model=llm_config.llm_model)
return AnthropicAdapter(
max_completion_tokens=max_completion_tokens, model=llm_config.llm_model
)

elif provider == LLMProvider.CUSTOM:
if llm_config.llm_api_key is None:
@@ -117,7 +123,7 @@ def get_llm_client():
llm_config.llm_api_key,
llm_config.llm_model,
"Custom",
max_tokens=max_tokens,
max_completion_tokens=max_completion_tokens,
fallback_api_key=llm_config.fallback_api_key,
fallback_endpoint=llm_config.fallback_endpoint,
fallback_model=llm_config.fallback_model,
@@ -134,7 +140,7 @@ def get_llm_client():
return GeminiAdapter(
api_key=llm_config.llm_api_key,
model=llm_config.llm_model,
max_tokens=max_tokens,
max_completion_tokens=max_completion_tokens,
endpoint=llm_config.llm_endpoint,
api_version=llm_config.llm_api_version,
streaming=llm_config.llm_streaming,
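The provider dispatch above resolves the token ceiling once, before constructing any adapter: prefer the model's known limit from liteLLM, fall back to the configured default otherwise. A condensed sketch of that resolution, assuming get_model_max_completion_tokens returns None (or another falsy value) for models liteLLM does not know:

from cognee.infrastructure.llm.utils import get_model_max_completion_tokens

def resolve_max_completion_tokens(model: str, configured_default: int = 16384) -> int:
    # Mirrors the fallback in get_llm_client(): a model-specific limit wins,
    # otherwise llm_config.llm_max_completion_tokens applies.
    known_limit = get_model_max_completion_tokens(model)
    return known_limit if known_limit else configured_default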