24 changes: 12 additions & 12 deletions .github/workflows/notebooks_tests.yml
@@ -4,12 +4,12 @@ on:
   workflow_call:
 
 jobs:
-  run-main-notebook:
-    name: Main Notebook Test
-    uses: ./.github/workflows/reusable_notebook.yml
-    with:
-      notebook-location: notebooks/cognee_demo.ipynb
-    secrets: inherit
+  # run-main-notebook:
+  #   name: Main Notebook Test
+  #   uses: ./.github/workflows/reusable_notebook.yml
+  #   with:
+  #     notebook-location: notebooks/cognee_demo.ipynb
+  #   secrets: inherit
 
   run-llama-index-integration:
     name: LlamaIndex Integration Notebook
@@ -32,9 +32,9 @@ jobs:
       notebook-location: notebooks/cognee_multimedia_demo.ipynb
     secrets: inherit
 
-  run-graphrag-vs-rag:
-    name: Graphrag vs Rag notebook
-    uses: ./.github/workflows/reusable_notebook.yml
-    with:
-      notebook-location: notebooks/graphrag_vs_rag.ipynb
-    secrets: inherit
+  # run-graphrag-vs-rag:
+  #   name: Graphrag vs Rag notebook
+  #   uses: ./.github/workflows/reusable_notebook.yml
+  #   with:
+  #     notebook-location: notebooks/graphrag_vs_rag.ipynb
+  #   secrets: inherit

LiteLLMEmbeddingEngine.py
@@ -10,6 +10,10 @@
 from cognee.infrastructure.llm.tokenizer.HuggingFace import HuggingFaceTokenizer
 from cognee.infrastructure.llm.tokenizer.Mistral import MistralTokenizer
 from cognee.infrastructure.llm.tokenizer.TikToken import TikTokenTokenizer
+from cognee.infrastructure.llm.embedding_rate_limiter import (
+    embedding_rate_limit_async,
+    embedding_sleep_and_retry_async,
+)
 
 litellm.set_verbose = False
 logger = get_logger("LiteLLMEmbeddingEngine")
@@ -51,17 +55,12 @@ def __init__(
         enable_mocking = str(enable_mocking).lower()
         self.mock = enable_mocking in ("true", "1", "yes")
 
+    @embedding_sleep_and_retry_async()
+    @embedding_rate_limit_async
     async def embed_text(self, text: List[str]) -> List[List[float]]:
-        async def exponential_backoff(attempt):
-            wait_time = min(10 * (2**attempt), 60)  # Max 60 seconds
-            await asyncio.sleep(wait_time)
-
         try:
             if self.mock:
                 response = {"data": [{"embedding": [0.0] * self.dimensions} for _ in text]}
-
-                self.retry_count = 0
-
                 return [data["embedding"] for data in response["data"]]
             else:
                 response = await litellm.aembedding(
@@ -72,8 +71,6 @@ async def exponential_backoff(attempt):
                     api_version=self.api_version,
                 )
 
-                self.retry_count = 0  # Reset retry count on successful call
-
                 return [data["embedding"] for data in response.data]
 
         except litellm.exceptions.ContextWindowExceededError as error:
@@ -95,15 +92,6 @@ async def exponential_backoff(attempt):
             logger.error("Context window exceeded for embedding text: %s", str(error))
             raise error
 
-        except litellm.exceptions.RateLimitError:
-            if self.retry_count >= self.MAX_RETRIES:
-                raise Exception("Rate limit exceeded and no more retries left.")
-
-            await exponential_backoff(self.retry_count)
-            self.retry_count += 1
-
-            return await self.embed_text(text)
-
         except (
             litellm.exceptions.BadRequestError,
             litellm.exceptions.NotFoundError,
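
Note: the inline exponential backoff removed above is exactly the kind of logic a sleep-and-retry decorator normally encapsulates. The implementation of embedding_sleep_and_retry_async is not part of this diff, so the following is only an illustrative sketch of such a decorator; the attempt count, backoff cap, jitter, and the choice to retry on any exception are assumptions, not cognee's behavior.

import asyncio
import functools
import random


def sleep_and_retry_async_sketch(max_attempts: int = 5, base_delay: float = 1.0, max_delay: float = 60.0):
    """Illustrative async retry decorator with capped exponential backoff and jitter."""

    def decorator(func):
        @functools.wraps(func)
        async def wrapper(*args, **kwargs):
            for attempt in range(max_attempts):
                try:
                    return await func(*args, **kwargs)
                except Exception:
                    if attempt == max_attempts - 1:
                        raise  # out of attempts, surface the original error
                    # 1s, 2s, 4s, ... capped at max_delay, plus jitter to avoid thundering herds
                    delay = min(base_delay * (2 ** attempt), max_delay)
                    await asyncio.sleep(delay + random.uniform(0, 1))

        return wrapper

    return decorator

Centralizing the policy in a decorator lets every embedding engine share one retry strategy instead of re-implementing backoff and retry counters per client, which is what the removed code above was doing by hand.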

OllamaEmbeddingEngine.py
@@ -9,6 +9,10 @@
 from cognee.infrastructure.databases.vector.embeddings.EmbeddingEngine import EmbeddingEngine
 from cognee.infrastructure.databases.exceptions.EmbeddingException import EmbeddingException
 from cognee.infrastructure.llm.tokenizer.HuggingFace import HuggingFaceTokenizer
+from cognee.infrastructure.llm.rate_limiter import (
+    embedding_rate_limit_async,
+    embedding_sleep_and_retry_async,
+)
 
 logger = get_logger("OllamaEmbeddingEngine")
 
@@ -43,6 +47,7 @@ def __init__(
         enable_mocking = str(enable_mocking).lower()
         self.mock = enable_mocking in ("true", "1", "yes")
 
+    @embedding_rate_limit_async
     async def embed_text(self, text: List[str]) -> List[List[float]]:
         """
         Given a list of text prompts, returns a list of embedding vectors.
@@ -53,6 +58,7 @@ async def embed_text(self, text: List[str]) -> List[List[float]]:
         embeddings = await asyncio.gather(*[self._get_embedding(prompt) for prompt in text])
         return embeddings
 
+    @embedding_sleep_and_retry_async()
     async def _get_embedding(self, prompt: str) -> List[float]:
         """
         Internal method to call the Ollama embeddings endpoint for a single prompt.
@@ -66,26 +72,12 @@ async def _get_embedding(self, prompt: str) -> List[float]:
         if api_key:
             headers["Authorization"] = f"Bearer {api_key}"
 
-        retries = 0
-        while retries < self.MAX_RETRIES:
-            try:
-                async with aiohttp.ClientSession() as session:
-                    async with session.post(
-                        self.endpoint, json=payload, headers=headers, timeout=60.0
-                    ) as response:
-                        data = await response.json()
-                        return data["embedding"]
-            except aiohttp.http_exceptions.HttpBadRequest as e:
-                logger.error(f"HTTP error on attempt {retries + 1}: {e}")
-                retries += 1
-                await asyncio.sleep(min(2**retries, 60))
-            except Exception as e:
-                logger.error(f"Error on attempt {retries + 1}: {e}")
-                retries += 1
-                await asyncio.sleep(min(2**retries, 60))
-        raise EmbeddingException(
-            f"Failed to embed text using model {self.model} after {self.MAX_RETRIES} retries"
-        )
+        async with aiohttp.ClientSession() as session:
+            async with session.post(
+                self.endpoint, json=payload, headers=headers, timeout=60.0
+            ) as response:
+                data = await response.json()
+                return data["embedding"]
 
     def get_vector_size(self) -> int:
         return self.dimensions
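
Similarly, embedding_rate_limit_async (now applied to embed_text above) is imported but not defined in this diff. Below is a minimal sketch of an async sliding-window limiter that such a decorator could sit on top of, sized to the 60-requests-per-60-seconds defaults introduced in LLMConfig further down; the class and function names here are illustrative, not cognee's API.

import asyncio
import functools
import time


class AsyncRequestLimiter:
    """Illustrative sliding-window limiter: at most max_requests calls per interval seconds."""

    def __init__(self, max_requests: int = 60, interval: float = 60.0):
        self.max_requests = max_requests
        self.interval = interval
        self._timestamps = []  # start times of calls inside the current window
        self._lock = asyncio.Lock()

    async def acquire(self):
        while True:
            async with self._lock:
                now = time.monotonic()
                # Drop timestamps that have aged out of the window.
                self._timestamps = [t for t in self._timestamps if now - t < self.interval]
                if len(self._timestamps) < self.max_requests:
                    self._timestamps.append(now)
                    return
                wait_time = self.interval - (now - self._timestamps[0])
            await asyncio.sleep(wait_time)  # sleep outside the lock, then re-check


def rate_limit_async_sketch(limiter: AsyncRequestLimiter):
    """Illustrative decorator factory: block until the limiter grants a slot, then call func."""

    def decorator(func):
        @functools.wraps(func)
        async def wrapper(*args, **kwargs):
            await limiter.acquire()
            return await func(*args, **kwargs)

        return wrapper

    return decorator

The real embedding_rate_limit_async is applied without parentheses, which suggests cognee wires the limiter internally rather than through a factory like this one; the sliding-window idea is the same either way.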
3 changes: 3 additions & 0 deletions cognee/infrastructure/llm/anthropic/adapter.py
@@ -5,6 +5,7 @@
 from cognee.exceptions import InvalidValueError
 from cognee.infrastructure.llm.llm_interface import LLMInterface
 from cognee.infrastructure.llm.prompts import read_query_prompt
+from cognee.infrastructure.llm.rate_limiter import rate_limit_async, sleep_and_retry_async
 
 
 class AnthropicAdapter(LLMInterface):
@@ -23,6 +24,8 @@ def __init__(self, max_tokens: int, model: str = None):
         self.model = model
         self.max_tokens = max_tokens
 
+    @sleep_and_retry_async()
+    @rate_limit_async
     async def acreate_structured_output(
         self, text_input: str, system_prompt: str, response_model: Type[BaseModel]
     ) -> BaseModel:
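
Note on decorator ordering in the adapter above: sleep_and_retry_async() is stacked outermost, so every retry attempt re-enters rate_limit_async instead of bypassing the limiter. Wiring together the illustrative sketches from the earlier notes, the same pattern would look roughly like this (names and parameters are assumptions):

limiter = AsyncRequestLimiter(max_requests=60, interval=60.0)


class ExampleAdapter:
    @sleep_and_retry_async_sketch(max_attempts=5)  # outermost: retries go back through the limiter
    @rate_limit_async_sketch(limiter)              # innermost: throttles every individual attempt
    async def acreate_structured_output(self, text_input: str) -> str:
        ...  # provider API call would go here
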
12 changes: 12 additions & 0 deletions cognee/infrastructure/llm/config.py
@@ -16,6 +16,12 @@ class LLMConfig(BaseSettings):
     llm_max_tokens: int = 16384
     transcription_model: str = "whisper-1"
     graph_prompt_path: str = "generate_graph_prompt.txt"
+    llm_rate_limit_enabled: bool = False
+    llm_rate_limit_requests: int = 60
+    llm_rate_limit_interval: int = 60  # in seconds (default is 60 requests per minute)
+    embedding_rate_limit_enabled: bool = False
+    embedding_rate_limit_requests: int = 60
+    embedding_rate_limit_interval: int = 60  # in seconds (default is 60 requests per minute)
 
     model_config = SettingsConfigDict(env_file=".env", extra="allow")
 
@@ -85,6 +91,12 @@ def to_dict(self) -> dict:
             "max_tokens": self.llm_max_tokens,
             "transcription_model": self.transcription_model,
             "graph_prompt_path": self.graph_prompt_path,
+            "rate_limit_enabled": self.llm_rate_limit_enabled,
+            "rate_limit_requests": self.llm_rate_limit_requests,
+            "rate_limit_interval": self.llm_rate_limit_interval,
+            "embedding_rate_limit_enabled": self.embedding_rate_limit_enabled,
+            "embedding_rate_limit_requests": self.embedding_rate_limit_requests,
+            "embedding_rate_limit_interval": self.embedding_rate_limit_interval,
         }
 
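
Because LLMConfig is a pydantic BaseSettings loaded from .env, the new fields can presumably be toggled through configuration without code changes, and rate limiting stays off by default. A sketch of how a decorator might consult these fields and share one limiter across calls, reusing the AsyncRequestLimiter sketch from the earlier note; get_llm_config and the overall wiring are assumptions, only the field names and defaults come from this diff.

import functools

_embedding_limiter = None  # shared across calls so the request window is global, not per-call


def _get_embedding_limiter():
    """Build the limiter lazily from config; return None while rate limiting is disabled."""
    global _embedding_limiter
    config = get_llm_config()  # hypothetical accessor returning an LLMConfig instance
    if not config.embedding_rate_limit_enabled:
        return None  # opt-in: the defaults added above leave rate limiting off
    if _embedding_limiter is None:
        _embedding_limiter = AsyncRequestLimiter(
            max_requests=config.embedding_rate_limit_requests,
            interval=config.embedding_rate_limit_interval,
        )
    return _embedding_limiter


def embedding_rate_limit_async_sketch(func):
    """Decorator sketch: wait for a slot from the shared limiter before calling func."""

    @functools.wraps(func)
    async def wrapper(*args, **kwargs):
        limiter = _get_embedding_limiter()
        if limiter is not None:
            await limiter.acquire()
        return await func(*args, **kwargs)

    return wrapper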