Skip to content
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions .github/workflows/e2e_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,10 @@ jobs:
with:
python-version: '3.11.x'

- name: Install specific S3 dependency
run: |
poetry install -E aws

- name: Run S3 Bucket Test
env:
ENV: 'dev'
Expand Down Expand Up @@ -243,6 +247,4 @@ jobs:
EMBEDDING_ENDPOINT: ${{ secrets.EMBEDDING_ENDPOINT }}
EMBEDDING_API_KEY: ${{ secrets.EMBEDDING_API_KEY }}
EMBEDDING_API_VERSION: ${{ secrets.EMBEDDING_API_VERSION }}
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
run: poetry run python ./cognee/tests/test_parallel_databases.py
60 changes: 0 additions & 60 deletions cognee/fetch_secret.py

This file was deleted.

Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import s3fs
from typing import IO, Optional
from cognee.api.v1.add.config import get_s3_config

Expand All @@ -9,6 +8,8 @@ def open_data_file(
if file_path.startswith("s3://"):
s3_config = get_s3_config()
if s3_config.aws_access_key_id is not None and s3_config.aws_secret_access_key is not None:
import s3fs

fs = s3fs.S3FileSystem(
key=s3_config.aws_access_key_id, secret=s3_config.aws_secret_access_key, anon=False
)
Expand Down
34 changes: 26 additions & 8 deletions cognee/modules/ingestion/classify.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,40 @@
from io import BufferedReader
from typing import Union, BinaryIO, Optional
from .data_types import TextData, BinaryData, S3BinaryData
from typing import Union, BinaryIO, Optional, Any
from .data_types import TextData, BinaryData
from tempfile import SpooledTemporaryFile
from s3fs.core import S3File, S3FileSystem

from cognee.modules.ingestion.exceptions import IngestionError

try:
from s3fs.core import S3File
from cognee.modules.ingestion.data_types.S3BinaryData import S3BinaryData
except ImportError:
S3File = None
S3BinaryData = None


def classify(data: Union[str, BinaryIO], filename: str = None, s3fs: Optional[S3FileSystem] = None):
def classify(data: Union[str, BinaryIO], filename: str = None, s3fs: Optional[Any] = None):
if isinstance(data, str):
return TextData(data)

if isinstance(data, BufferedReader) or isinstance(data, SpooledTemporaryFile):
return BinaryData(data, str(data.name).split("/")[-1] if data.name else filename)

if isinstance(data, S3File):
derived_filename = str(data.full_name).split("/")[-1] if data.full_name else filename
return S3BinaryData(s3_path=data.full_name, name=derived_filename, s3=s3fs)
try:
from importlib import import_module

s3core = import_module("s3fs.core")
S3File = s3core.S3File
except ImportError:
S3File = None

if S3File is not None:
from cognee.modules.ingestion.data_types.S3BinaryData import S3BinaryData

if isinstance(data, S3File):
derived_filename = str(data.full_name).split("/")[-1] if data.full_name else filename
return S3BinaryData(s3_path=data.full_name, name=derived_filename, s3=s3fs)

raise IngestionError(
message=f"Type of data sent to classify(data: Union[str, BinaryIO) not supported: {type(data)}"
message=f"Type of data sent to classify(data: Union[str, BinaryIO) not supported or s3fs is not installed: {type(data)}"
)
1 change: 0 additions & 1 deletion cognee/modules/ingestion/data_types/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
from .TextData import TextData, create_text_data
from .BinaryData import BinaryData, create_binary_data
from .S3BinaryData import S3BinaryData, create_s3_binary_data
from .IngestionData import IngestionData
2 changes: 0 additions & 2 deletions cognee/tasks/documents/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
from .translate_text import translate_text
from .detect_language import detect_language
from .classify_documents import classify_documents
from .extract_chunks_from_documents import extract_chunks_from_documents
from .check_permissions_on_dataset import check_permissions_on_dataset
39 changes: 0 additions & 39 deletions cognee/tasks/documents/detect_language.py

This file was deleted.

46 changes: 0 additions & 46 deletions cognee/tasks/documents/translate_text.py

This file was deleted.

3 changes: 2 additions & 1 deletion cognee/tasks/ingestion/ingest_data.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import dlt
import s3fs
import json
import inspect
from uuid import UUID
Expand Down Expand Up @@ -40,6 +39,8 @@ async def ingest_data(

fs = None
if s3_config.aws_access_key_id is not None and s3_config.aws_secret_access_key is not None:
import s3fs

fs = s3fs.S3FileSystem(
key=s3_config.aws_access_key_id, secret=s3_config.aws_secret_access_key, anon=False
)
Expand Down
3 changes: 2 additions & 1 deletion cognee/tasks/ingestion/resolve_data_directories.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import os
import s3fs
from typing import List, Union, BinaryIO
from urllib.parse import urlparse
from cognee.api.v1.add.config import get_s3_config
Expand Down Expand Up @@ -27,6 +26,8 @@ async def resolve_data_directories(

fs = None
if s3_config.aws_access_key_id is not None and s3_config.aws_secret_access_key is not None:
import s3fs

fs = s3fs.S3FileSystem(
key=s3_config.aws_access_key_id, secret=s3_config.aws_secret_access_key, anon=False
)
Expand Down
30 changes: 22 additions & 8 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading
Loading