Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions cognee/infrastructure/files/utils/get_file_metadata.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import io
import os.path
from typing import BinaryIO, TypedDict
from typing import BinaryIO, TypedDict, Optional
from pathlib import Path

from cognee.shared.logging_utils import get_logger
Expand All @@ -27,7 +27,7 @@ class FileMetadata(TypedDict):
file_size: int


async def get_file_metadata(file: BinaryIO) -> FileMetadata:
async def get_file_metadata(file: BinaryIO, name: Optional[str] = None) -> FileMetadata:
"""
Retrieve metadata from a file object.

Expand All @@ -53,7 +53,7 @@ async def get_file_metadata(file: BinaryIO) -> FileMetadata:
except io.UnsupportedOperation as error:
logger.error(f"Error retrieving content hash for file: {file.name} \n{str(error)}\n\n")

file_type = guess_file_type(file)
file_type = guess_file_type(file, name)

file_path = getattr(file, "name", None) or getattr(file, "full_name", None)

Expand Down
24 changes: 19 additions & 5 deletions cognee/infrastructure/files/utils/guess_file_type.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
from typing import BinaryIO
import io
from pathlib import Path
from typing import BinaryIO, Optional, Any
import filetype
from .is_text_content import is_text_content
from tempfile import SpooledTemporaryFile
from filetype.types.base import Type


class FileTypeException(Exception):
Expand All @@ -22,7 +25,7 @@ def __init__(self, message: str):
self.message = message


def guess_file_type(file: BinaryIO) -> filetype.Type:
def guess_file_type(file: BinaryIO, name: Optional[str] = None) -> filetype.Type:
"""
Guess the file type from the given binary file stream.

Expand All @@ -39,12 +42,23 @@ def guess_file_type(file: BinaryIO) -> filetype.Type:

- filetype.Type: The guessed file type, represented as filetype.Type.
"""

# Note: If the file has a .txt or .text extension, treat it as plain text. filetype.guess may not detect such
# files properly because plain text contains no magic number for it to match against.
ext = None
if isinstance(file, str):
ext = Path(file).suffix
elif name is not None:
ext = Path(name).suffix

if ext in [".txt", ".text"]:
file_type = Type("text/plain", "txt")
return file_type

file_type = filetype.guess(file)

# If the file type could not be determined, treat it as a plain text file, since plain text files have no magic number
if file_type is None:
from filetype.types.base import Type

file_type = Type("text/plain", "txt")

if file_type is None:
Expand Down
1 change: 1 addition & 0 deletions cognee/infrastructure/loaders/core/audio_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ def supported_mime_types(self) -> List[str]:
"audio/wav",
"audio/amr",
"audio/aiff",
"audio/x-wav",
]

@property
Expand Down
2 changes: 1 addition & 1 deletion cognee/modules/ingestion/data_types/BinaryData.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ def get_metadata(self):

async def ensure_metadata(self):
if self.metadata is None:
self.metadata = await get_file_metadata(self.data)
self.metadata = await get_file_metadata(self.data, name=self.name)

if self.metadata["name"] is None:
self.metadata["name"] = self.name
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
[project]
name = "cognee"

version = "0.3.8"
version = "0.3.9"
description = "Cognee - is a library for enriching LLM context with a semantic layer for better understanding and reasoning."
authors = [
{ name = "Vasilije Markovic" },
Expand Down
10 changes: 9 additions & 1 deletion uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading