Cog 505 data dataset model changes #260
Changes from 8 commits: 6bb0f3d, 0ce254b, f5b5e56, 378e7b8, 349ddfe, 813b76c, 387002d, add6730, 9ba5d49, e80377b, 1e098ae, 9429e5e, cc6fbe2, d7fa9f3
New workflow file (`@@ -0,0 +1,69 @@`):

```yaml
name: test | deduplication

on:
  workflow_dispatch:
  pull_request:
    branches:
      - main
    types: [labeled, synchronize]

concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true

env:
  RUNTIME__LOG_LEVEL: ERROR

jobs:
  get_docs_changes:
    name: docs changes
    uses: ./.github/workflows/get_docs_changes.yml

  run_deduplication_test:
    name: test
    needs: get_docs_changes
    if: needs.get_docs_changes.outputs.changes_outside_docs == 'true' && ${{ github.event.label.name == 'run-checks' }}
    runs-on: ubuntu-latest
    defaults:
      run:
        shell: bash
    services:
      postgres:
        image: pgvector/pgvector:pg17
        env:
          POSTGRES_USER: cognee
          POSTGRES_PASSWORD: cognee
          POSTGRES_DB: cognee_db
        options: >-
          --health-cmd pg_isready
          --health-interval 10s
          --health-timeout 5s
          --health-retries 5
        ports:
          - 5432:5432

    steps:
      - name: Check out
        uses: actions/checkout@master
```
**Contributor (review comment):** Security: use a specific version for the checkout action. Using `@master` for actions can be dangerous, as it may introduce breaking changes or security issues. Committable suggestion:

```diff
-        uses: actions/checkout@master
+        uses: actions/checkout@v4
```
The workflow continues:

```yaml
      - name: Setup Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11.x'

      - name: Install Poetry
        uses: snok/[email protected]
        with:
          virtualenvs-create: true
          virtualenvs-in-project: true
          installer-parallel: true

      - name: Install dependencies
        run: poetry install -E postgres --no-interaction

      - name: Run deduplication test
        env:
          ENV: 'dev'
          LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
        run: poetry run python ./cognee/tests/test_deduplication.py
```
Next file (hunk `@@ -1,7 +1,9 @@`):

```diff
 from uuid import uuid5, NAMESPACE_OID
 from .data_types import IngestionData
+from cognee.modules.users.models import User


-def identify(data: IngestionData) -> str:
-    data_id: str = data.get_identifier()
-    return uuid5(NAMESPACE_OID, data_id)
+def identify(data: IngestionData, user: User) -> str:
+    data_content_hash: str = data.get_identifier()
+    # return UUID hash of file contents + owner id
+    return uuid5(NAMESPACE_OID, f"{data_content_hash}{user.id}")
```
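The new `identify` signature makes deduplication owner-scoped: the same file contents ingested by two different users yield two different IDs, while re-ingesting the same contents under the same owner always yields the same ID. A minimal, self-contained sketch of that behaviour — here `content` and `user_id` are hypothetical stand-ins for `IngestionData.get_identifier()` and `User.id`, not cognee's actual API:

```python
import hashlib
from uuid import uuid5, NAMESPACE_OID, UUID

def identify(content: bytes, user_id: UUID) -> UUID:
    # Hash the raw contents, mirroring the content hash the diff relies on.
    data_content_hash = hashlib.md5(content).hexdigest()
    # uuid5 is deterministic: same content hash + same owner -> same UUID,
    # so a re-ingested file is recognised as a duplicate.
    return uuid5(NAMESPACE_OID, f"{data_content_hash}{user_id}")

owner = UUID("00000000-0000-0000-0000-000000000001")
other = UUID("00000000-0000-0000-0000-000000000002")

a = identify(b"hello world", owner)
b = identify(b"hello world", owner)
c = identify(b"hello world", other)

assert a == b   # same content + same owner -> same id
assert a != c   # different owner -> different id
```

Note the design choice: because the owner's ID is folded into the name string, deduplication never crosses user boundaries, which keeps one user's uploads from colliding with another's.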
Next file (hunk `@@ -1,25 +1,28 @@`):

```diff
 import string
 import random
 import os.path
+import hashlib
 from typing import BinaryIO, Union
 from cognee.base_config import get_base_config
 from cognee.infrastructure.files.storage import LocalStorage
 from .classify import classify


-def save_data_to_file(data: Union[str, BinaryIO], dataset_name: str, filename: str = None):
+def save_data_to_file(data: Union[str, BinaryIO], filename: str = None):
     base_config = get_base_config()
     data_directory_path = base_config.data_root_directory

     classified_data = classify(data, filename)

-    storage_path = data_directory_path + "/" + dataset_name.replace(".", "/")
+    storage_path = os.path.join(data_directory_path, "data")
     LocalStorage.ensure_directory_exists(storage_path)

     file_metadata = classified_data.get_metadata()
     if "name" not in file_metadata or file_metadata["name"] is None:
-        letters = string.ascii_lowercase
-        random_string = "".join(random.choice(letters) for _ in range(32))
-        file_metadata["name"] = "text_" + random_string + ".txt"
+        data_contents = classified_data.get_data().encode('utf-8')
+        hash_contents = hashlib.md5(data_contents).hexdigest()
+        file_metadata["name"] = "text_" + hash_contents + ".txt"
     file_name = file_metadata["name"]
-    LocalStorage(storage_path).store(file_name, classified_data.get_data())

+    # Don't save file if it already exists
+    if not os.path.isfile(os.path.join(storage_path, file_name)):
+        LocalStorage(storage_path).store(file_name, classified_data.get_data())

     return "file://" + storage_path + "/" + file_name
```
Next file, first hunk:

```diff
@@ -17,7 +17,7 @@ async def save_data_item_with_metadata_to_storage(
     # data is a file object coming from upload.
     elif hasattr(data_item, "file"):
         file_path = save_data_to_file(
-            data_item.file, dataset_name, filename=data_item.filename
+            data_item.file, filename=data_item.filename
         )
```
**Contributor (review comment on lines +20 to 22; also applies to line 29):** Add input validation for `data_item`. The function should validate that `data_item` is not None or empty before processing. Apply this diff:

```diff
+    if data_item is None:
+        raise ValueError("data_item cannot be None")
+
     if "llama_index" in str(type(data_item)):
         # Dynamic import is used because the llama_index module is optional.
         from .transform_data import get_data_from_llama_index
```
Second hunk (the first hunk's trailing context is `elif isinstance(data_item, str):`):

```diff
@@ -26,7 +26,7 @@ async def save_data_item_with_metadata_to_storage(
         file_path = data_item.replace("file://", "")
     # data is text
     else:
-        file_path = save_data_to_file(data_item, dataset_name)
+        file_path = save_data_to_file(data_item)
     else:
         raise IngestionError(message=f"Data type not supported: {type(data_item)}")
```
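The function above dispatches on the shape of `data_item` — an uploaded file object, a `file://` reference, or raw text — and, per this PR, no longer threads `dataset_name` through to storage. A simplified, hypothetical sketch of that dispatch (`save_to_file` is a stub standing in for `save_data_to_file`; the real function also classifies and stores the data):

```python
import os

def save_to_file(data, filename=None):
    # Stub: the real save_data_to_file hashes and stores the contents.
    return "file://" + os.path.join("/data", filename or "text_stub.txt")

def resolve_file_path(data_item):
    if data_item is None:
        raise ValueError("data_item cannot be None")
    if hasattr(data_item, "file"):
        # Uploaded file object: store its contents under its own name.
        return save_to_file(data_item.file, filename=data_item.filename)
    if isinstance(data_item, str):
        if data_item.startswith("file://"):
            # Already a file reference: strip the scheme.
            return data_item.replace("file://", "")
        # Raw text: store it as a new file.
        return save_to_file(data_item)
    raise TypeError(f"Data type not supported: {type(data_item)}")

assert resolve_file_path("file:///tmp/a.txt") == "/tmp/a.txt"
assert resolve_file_path("hello").startswith("file://")
```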
New test data file (`@@ -0,0 +1,2 @@`):

```text
Natural language processing (NLP) is an interdisciplinary subfield of computer science and information retrieval. It is primarily concerned with giving computers the ability to support and manipulate human language. It involves processing natural language datasets, such as text corpora or speech corpora, using either rule-based or probabilistic (i.e. statistical and, most recently, neural network-based) machine learning approaches. The goal is a computer capable of "understanding"[citation needed] the contents of documents, including the contextual nuances of the language within them. To this end, natural language processing often borrows ideas from theoretical linguistics. The technology can then accurately extract information and insights contained in the documents as well as categorize and organize the documents themselves.

Challenges in natural language processing frequently involve speech recognition, natural-language understanding, and natural-language generation.
```