-
Notifications
You must be signed in to change notification settings - Fork 1k
Cog 793 metadata rework #460
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
0c7c1d7
49ad292
ab8d95c
5c17501
4196a4c
77f0b45
bd3a5a7
655ab0b
80e67b0
40c0279
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -54,7 +54,6 @@ def read(self): | |
| contains=[], | ||
| _metadata={ | ||
| "index_fields": ["text"], | ||
| "metadata_id": self.document.metadata_id, | ||
| }, | ||
| ) | ||
| paragraph_chunks = [] | ||
|
|
@@ -74,7 +73,6 @@ def read(self): | |
| contains=[], | ||
| _metadata={ | ||
| "index_fields": ["text"], | ||
| "metadata_id": self.document.metadata_id, | ||
| }, | ||
| ) | ||
| except Exception as e: | ||
|
|
@@ -95,7 +93,7 @@ def read(self): | |
| chunk_index=self.chunk_index, | ||
| cut_type=paragraph_chunks[len(paragraph_chunks) - 1]["cut_type"], | ||
| contains=[], | ||
| _metadata={"index_fields": ["text"], "metadata_id": self.document.metadata_id}, | ||
| _metadata={"index_fields": ["text"]}, | ||
| ) | ||
| except Exception as e: | ||
| print(e) | ||
|
Comment on lines
+96
to
99
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 🛠️ Refactor suggestion: Improve error handling in DocumentChunk creation. Replace print statements with proper logging: - except Exception as e:
- print(e)
+ except Exception as e:
+ logger.error("Failed to create DocumentChunk: %s", str(e))
Also consider:
|
||
This file was deleted.
This file was deleted.
This file was deleted.
This file was deleted.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,4 +1,5 @@ | ||
| from cognee.modules.data.models import Data | ||
| import json | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🛠️ Refactor suggestion Add error handling for JSON serialization and remove unnecessary indentation. The current implementation has two potential issues:
Consider this implementation: async def classify_documents(data_documents: list[Data]) -> list[Document]:
documents = []
for data_item in data_documents:
+ try:
+ metadata_json = json.dumps(data_item.foreign_metadata)
+ except (TypeError, ValueError) as e:
+ logger.error(f"Failed to serialize foreign_metadata for {data_item.id}: {e}")
+ metadata_json = "{}"
+
document = EXTENSION_TO_DOCUMENT_CLASS[data_item.extension](
id=data_item.id,
title=f"{data_item.name}.{data_item.extension}",
raw_data_location=data_item.raw_data_location,
name=data_item.name,
mime_type=data_item.mime_type,
- foreign_metadata=json.dumps(data_item.foreign_metadata, indent=4),
+ foreign_metadata=metadata_json,
)
Also applies to: 61-61 |
||
| from cognee.modules.data.processing.document_types import ( | ||
| Document, | ||
| PdfDocument, | ||
|
|
@@ -7,7 +8,6 @@ | |
| TextDocument, | ||
| UnstructuredDocument, | ||
| ) | ||
| from cognee.modules.data.operations.get_metadata import get_metadata | ||
|
|
||
| EXTENSION_TO_DOCUMENT_CLASS = { | ||
| "pdf": PdfDocument, # Text documents | ||
|
|
@@ -59,14 +59,13 @@ async def classify_documents(data_documents: list[Data]) -> list[Document]: | |
| """ | ||
| documents = [] | ||
| for data_item in data_documents: | ||
| metadata = await get_metadata(data_item.id) | ||
| document = EXTENSION_TO_DOCUMENT_CLASS[data_item.extension]( | ||
| id=data_item.id, | ||
| title=f"{data_item.name}.{data_item.extension}", | ||
| raw_data_location=data_item.raw_data_location, | ||
| name=data_item.name, | ||
| mime_type=data_item.mime_type, | ||
| metadata_id=metadata.id, | ||
| external_metadata=json.dumps(data_item.external_metadata, indent=4), | ||
| ) | ||
| documents.append(document) | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,6 +1,3 @@ | ||
| from .ingest_data import ingest_data | ||
| from .save_data_to_storage import save_data_to_storage | ||
| from .save_data_item_to_storage import save_data_item_to_storage | ||
| from .save_data_item_with_metadata_to_storage import save_data_item_with_metadata_to_storage | ||
| from .ingest_data_with_metadata import ingest_data_with_metadata | ||
| from .ingest_data import ingest_data | ||
| from .resolve_data_directories import resolve_data_directories |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
🛠️ Refactor suggestion
Enhance error handling granularity.
The current implementation might miss important OpenAI-specific errors. Consider:
📝 Committable suggestion