4 changes: 2 additions & 2 deletions cognee/api/v1/add/add_v2.py
@@ -2,7 +2,7 @@
 from cognee.modules.users.models import User
 from cognee.modules.users.methods import get_default_user
 from cognee.modules.pipelines import run_tasks, Task
-from cognee.tasks.ingestion import ingest_data_with_metadata, resolve_data_directories
+from cognee.tasks.ingestion import ingest_data, resolve_data_directories
 from cognee.infrastructure.databases.relational import (
     create_db_and_tables as create_relational_db_and_tables,
 )
@@ -22,7 +22,7 @@ async def add(
     if user is None:
         user = await get_default_user()

-    tasks = [Task(resolve_data_directories), Task(ingest_data_with_metadata, dataset_name, user)]
+    tasks = [Task(resolve_data_directories), Task(ingest_data, dataset_name, user)]

     pipeline = run_tasks(tasks, data, "add_pipeline")

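Taken together, the two hunks above mean the public add flow now builds its pipeline from ingest_data directly. A rough sketch of the resulting function, based only on the lines visible in this diff (the exact signature of add and how callers consume the run_tasks generator are assumptions):

from cognee.modules.pipelines import run_tasks, Task
from cognee.modules.users.methods import get_default_user
from cognee.tasks.ingestion import ingest_data, resolve_data_directories


async def add(data, dataset_name: str, user=None):
    # Fall back to the default user, as in the hunk above.
    if user is None:
        user = await get_default_user()

    # ingest_data_with_metadata is gone; ingest_data is the single ingestion task.
    tasks = [Task(resolve_data_directories), Task(ingest_data, dataset_name, user)]

    # run_tasks is assumed here to be an async generator of pipeline run statuses.
    pipeline = run_tasks(tasks, data, "add_pipeline")

    async for run_status in pipeline:
        yield run_status
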
4 changes: 2 additions & 2 deletions cognee/api/v1/cognify/code_graph_pipeline.py
@@ -10,7 +10,7 @@
 from cognee.shared.data_models import KnowledgeGraph, MonitoringTool
 from cognee.tasks.documents import classify_documents, extract_chunks_from_documents
 from cognee.tasks.graph import extract_graph_from_data
-from cognee.tasks.ingestion import ingest_data_with_metadata
+from cognee.tasks.ingestion import ingest_data
 from cognee.tasks.repo_processor import (
     enrich_dependency_graph,
     expand_dependency_graph,
@@ -68,7 +68,7 @@ async def run_code_graph_pipeline(repo_path, include_docs=True):
     if include_docs:
         non_code_tasks = [
             Task(get_non_py_files, task_config={"batch_size": 50}),
-            Task(ingest_data_with_metadata, dataset_name="repo_docs", user=user),
+            Task(ingest_data, dataset_name="repo_docs", user=user),
             Task(get_data_list_for_user, dataset_name="repo_docs", user=user),
             Task(classify_documents),
             Task(extract_chunks_from_documents, max_tokens=cognee_config.max_tokens),
@@ -95,7 +95,7 @@ async def exponential_backoff(attempt):

                 return await self.embed_text(text)

-        except (litellm.exceptions.BadRequestError, litellm.llms.OpenAI.openai.OpenAIError):
+        except litellm.exceptions.BadRequestError:
             raise EmbeddingException("Failed to index data points.")

Comment on lines +98 to 100
🛠️ Refactor suggestion

Enhance error handling granularity.

The current implementation might miss important OpenAI-specific errors. Consider:

  1. Restoring specific handling for OpenAIError
  2. Adding specific error messages for different failure scenarios
-        except litellm.exceptions.BadRequestError:
-            raise EmbeddingException("Failed to index data points.")
+        except litellm.exceptions.BadRequestError as e:
+            raise EmbeddingException(f"Bad request while indexing data points: {str(e)}")
+        except litellm.llms.OpenAI.openai.OpenAIError as e:
+            raise EmbeddingException(f"OpenAI service error: {str(e)}")

         except Exception as error:
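For context, the hunk above sits inside the embedding engine's retry path (note the exponential_backoff(attempt) hunk header). A self-contained sketch of that pattern with the reviewer's more granular error handling folded in; the class and function names below are illustrative, not the project's exact code:

import asyncio

import litellm


class EmbeddingException(Exception):
    """Raised when a batch of texts cannot be embedded and indexed."""


async def exponential_backoff(attempt: int, base_delay: float = 1.0) -> None:
    # Wait 1s, 2s, 4s, ... between retries (illustrative policy).
    await asyncio.sleep(base_delay * (2 ** attempt))


async def embed_text(texts: list[str], model: str, attempt: int = 0, max_retries: int = 5):
    try:
        response = await litellm.aembedding(model=model, input=texts)
        return [item["embedding"] for item in response.data]
    except litellm.exceptions.RateLimitError:
        if attempt >= max_retries:
            raise
        await exponential_backoff(attempt)
        return await embed_text(texts, model, attempt + 1, max_retries)
    except litellm.exceptions.BadRequestError as error:
        # Per the review comment: keep the underlying error message instead of a generic one.
        raise EmbeddingException(f"Bad request while indexing data points: {error}") from error
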
4 changes: 1 addition & 3 deletions cognee/modules/chunking/TextChunker.py
@@ -54,7 +54,6 @@ def read(self):
                         contains=[],
                         _metadata={
                             "index_fields": ["text"],
-                            "metadata_id": self.document.metadata_id,
                         },
                     )
                     paragraph_chunks = []
@@ -74,7 +73,6 @@ def read(self):
                         contains=[],
                         _metadata={
                             "index_fields": ["text"],
-                            "metadata_id": self.document.metadata_id,
                         },
                     )
                 except Exception as e:
@@ -95,7 +93,7 @@ def read(self):
                     chunk_index=self.chunk_index,
                     cut_type=paragraph_chunks[len(paragraph_chunks) - 1]["cut_type"],
                     contains=[],
-                    _metadata={"index_fields": ["text"], "metadata_id": self.document.metadata_id},
+                    _metadata={"index_fields": ["text"]},
                 )
             except Exception as e:
                 print(e)
Comment on lines +96 to 99
🛠️ Refactor suggestion

Improve error handling in DocumentChunk creation.

Replace print statements with proper logging:

-            except Exception as e:
-                print(e)
+            except Exception as e:
+                logger.error("Failed to create DocumentChunk: %s", str(e))

Also consider:

  1. Adding more specific exception types
  2. Including context information in the error message
  3. Potentially propagating the error up the stack

Committable suggestion skipped: line range outside the PR's diff.
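
The suggested logger.error(...) call assumes a module-level logger, which this file does not currently set up. A minimal, generic sketch of the pattern (the helper names below are illustrative, not cognee's API):

import logging

logger = logging.getLogger(__name__)


def make_chunk(chunk_data: dict) -> dict:
    # Stand-in for the DocumentChunk construction shown in the diff.
    return {"text": chunk_data["text"], "_metadata": {"index_fields": ["text"]}}


def read_chunks(paragraph_chunks: list[dict]):
    for chunk_data in paragraph_chunks:
        try:
            yield make_chunk(chunk_data)
        except Exception as error:
            # Log with context instead of print(e); re-raise here if a bad chunk should be fatal.
            logger.error("Failed to create DocumentChunk from %r: %s", chunk_data, error)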

14 changes: 3 additions & 11 deletions cognee/modules/data/models/Data.py
@@ -1,13 +1,11 @@
 from datetime import datetime, timezone
-from typing import List
 from uuid import uuid4
-from sqlalchemy import UUID, Column, DateTime, String
-from sqlalchemy.orm import Mapped, relationship
+from sqlalchemy import UUID, Column, DateTime, String, JSON
+from sqlalchemy.orm import relationship

 from cognee.infrastructure.databases.relational import Base

 from .DatasetData import DatasetData
-from .Metadata import Metadata


 class Data(Base):
@@ -21,6 +19,7 @@ class Data(Base):
     raw_data_location = Column(String)
     owner_id = Column(UUID, index=True)
     content_hash = Column(String)
+    external_metadata = Column(JSON)
     created_at = Column(DateTime(timezone=True), default=lambda: datetime.now(timezone.utc))
     updated_at = Column(DateTime(timezone=True), onupdate=lambda: datetime.now(timezone.utc))

@@ -32,13 +31,6 @@
         cascade="all, delete",
     )

-    metadata_relationship = relationship(
-        "Metadata",
-        back_populates="data",
-        lazy="noload",
-        cascade="all, delete",
-    )
-
     def to_json(self) -> dict:
         return {
             "id": str(self.id),
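With the Metadata table and its relationship removed, per-item metadata now lives directly on the Data row as a JSON column. A sketch of how a row might be built with it (session handling elided; only columns visible in this diff are used, and the helper itself is hypothetical):

from uuid import uuid4

from cognee.modules.data.models import Data


def build_data_row(raw_data_location: str, owner_id, content_hash: str, source_info: dict) -> Data:
    # external_metadata is a plain JSON column, so any JSON-serializable dict
    # can be attached without a separate Metadata table or relationship.
    return Data(
        id=uuid4(),
        raw_data_location=raw_data_location,
        owner_id=owner_id,
        content_hash=content_hash,
        external_metadata=source_info,
    )

Reading it back is just row.external_metadata; SQLAlchemy deserializes the JSON column to a dict.
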
21 changes: 0 additions & 21 deletions cognee/modules/data/models/Metadata.py

This file was deleted.

19 changes: 0 additions & 19 deletions cognee/modules/data/operations/delete_metadata.py

This file was deleted.

17 changes: 0 additions & 17 deletions cognee/modules/data/operations/get_metadata.py

This file was deleted.

65 changes: 0 additions & 65 deletions cognee/modules/data/operations/write_metadata.py

This file was deleted.

2 changes: 1 addition & 1 deletion cognee/modules/data/processing/document_types/Document.py
@@ -7,7 +7,7 @@
 class Document(DataPoint):
     name: str
     raw_data_location: str
-    metadata_id: UUID
+    external_metadata: Optional[str]
     mime_type: str
     _metadata: dict = {"index_fields": ["name"], "type": "Document"}

5 changes: 2 additions & 3 deletions cognee/tasks/documents/classify_documents.py
@@ -1,4 +1,5 @@
 from cognee.modules.data.models import Data
+import json
🛠️ Refactor suggestion

Add error handling for JSON serialization and remove unnecessary indentation.

The current implementation has two potential issues:

  1. No error handling for JSON serialization failures
  2. Unnecessary overhead from pretty-printing JSON with indent=4

Consider this implementation:

 async def classify_documents(data_documents: list[Data]) -> list[Document]:
     documents = []
     for data_item in data_documents:
+        try:
+            metadata_json = json.dumps(data_item.foreign_metadata)
+        except (TypeError, ValueError) as e:
+            logger.error(f"Failed to serialize foreign_metadata for {data_item.id}: {e}")
+            metadata_json = "{}"
+
         document = EXTENSION_TO_DOCUMENT_CLASS[data_item.extension](
             id=data_item.id,
             title=f"{data_item.name}.{data_item.extension}",
             raw_data_location=data_item.raw_data_location,
             name=data_item.name,
             mime_type=data_item.mime_type,
-            foreign_metadata=json.dumps(data_item.foreign_metadata, indent=4),
+            foreign_metadata=metadata_json,
         )

Also applies to: 61-61

 from cognee.modules.data.processing.document_types import (
     Document,
     PdfDocument,
@@ -7,7 +8,6 @@
     TextDocument,
     UnstructuredDocument,
 )
-from cognee.modules.data.operations.get_metadata import get_metadata

 EXTENSION_TO_DOCUMENT_CLASS = {
     "pdf": PdfDocument,  # Text documents
@@ -59,14 +59,13 @@ async def classify_documents(data_documents: list[Data]) -> list[Document]:
     """
     documents = []
     for data_item in data_documents:
-        metadata = await get_metadata(data_item.id)
         document = EXTENSION_TO_DOCUMENT_CLASS[data_item.extension](
             id=data_item.id,
             title=f"{data_item.name}.{data_item.extension}",
             raw_data_location=data_item.raw_data_location,
             name=data_item.name,
             mime_type=data_item.mime_type,
-            metadata_id=metadata.id,
+            external_metadata=json.dumps(data_item.external_metadata, indent=4),
         )
         documents.append(document)

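Since Document.external_metadata is an Optional[str], the dict stored on the Data row reaches the document layer as a JSON string (serialized above with json.dumps(..., indent=4)). A small sketch of the round trip a downstream consumer would perform; the helper is hypothetical and treats a missing or malformed value as "no metadata":

import json
from typing import Any, Optional


def load_external_metadata(external_metadata: Optional[str]) -> dict[str, Any]:
    # Undo the json.dumps(...) performed in classify_documents.
    if not external_metadata:
        return {}
    try:
        return json.loads(external_metadata)
    except json.JSONDecodeError:
        return {}
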
5 changes: 1 addition & 4 deletions cognee/tasks/ingestion/__init__.py
@@ -1,6 +1,3 @@
-from .ingest_data import ingest_data
-from .save_data_to_storage import save_data_to_storage
 from .save_data_item_to_storage import save_data_item_to_storage
-from .save_data_item_with_metadata_to_storage import save_data_item_with_metadata_to_storage
-from .ingest_data_with_metadata import ingest_data_with_metadata
+from .ingest_data import ingest_data
 from .resolve_data_directories import resolve_data_directories
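
For call sites outside this PR, the migration is a one-line import change:

# Before this PR:
#     from cognee.tasks.ingestion import ingest_data_with_metadata
# After:
from cognee.tasks.ingestion import ingest_data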