topoteretes · 0xideas · Nov 13, 2024 · Nov 11, 2024 · Nov 11, 2024 · Nov 11, 2024
diff --git a/.github/workflows/test_python_3_10.yml b/.github/workflows/test_python_3_10.yml
@@ -50,8 +50,11 @@ jobs:
       - name: Install dependencies
         run: poetry install --no-interaction
 
-      - name: Run tests
-        run: poetry run pytest tests/
+      - name: Run unit tests
+        run: poetry run pytest cognee/tests/unit/
+
+      - name: Run integration tests
+        run: poetry run pytest cognee/tests/integration/
 
       - name: Run default basic pipeline
         env:

diff --git a/.github/workflows/test_python_3_11.yml b/.github/workflows/test_python_3_11.yml
@@ -50,8 +50,11 @@ jobs:
       - name: Install dependencies
         run: poetry install --no-interaction
 
-      - name: Run tests
-        run: poetry run pytest tests/
+      - name: Run unit tests
+        run: poetry run pytest cognee/tests/unit/
+
+      - name: Run integration tests
+        run: poetry run pytest cognee/tests/integration/
 
       - name: Run default basic pipeline
         env:

diff --git a/.github/workflows/test_python_3_9.yml b/.github/workflows/test_python_3_9.yml
@@ -50,8 +50,11 @@ jobs:
       - name: Install dependencies
         run: poetry install --no-interaction
 
-      - name: Run tests
-        run: poetry run pytest tests/
+      - name: Run unit tests
+        run: poetry run pytest cognee/tests/unit/
+
+      - name: Run integration tests
+        run: poetry run pytest cognee/tests/integration/
 
       - name: Run default basic pipeline
         env:

diff --git a/cognee/infrastructure/engine/__tests__/model_to_graph_to_model.test.py b/cognee/infrastructure/engine/__tests__/model_to_graph_to_model.test.py
diff --git a/cognee/modules/data/processing/document_types/__tests__/PdfDocument.test.py b/cognee/modules/data/processing/document_types/__tests__/PdfDocument.test.py
diff --git a/cognee/modules/data/processing/document_types/__tests__/artificial-inteligence.pdf b/cognee/modules/data/processing/document_types/__tests__/artificial-inteligence.pdf
diff --git a/cognee/modules/data/processing/document_types/__tests__/soldiers-home.pdf b/cognee/modules/data/processing/document_types/__tests__/soldiers-home.pdf
diff --git a/cognee/modules/pipelines/operations/__tests__/__init__.py b/cognee/modules/pipelines/operations/__tests__/__init__.py
diff --git a/cognee/modules/pipelines/operations/__tests__/artificial-inteligence.v1.pdf b/cognee/modules/pipelines/operations/__tests__/artificial-inteligence.v1.pdf
diff --git a/cognee/modules/pipelines/operations/__tests__/artificial-inteligence.v2.pdf b/cognee/modules/pipelines/operations/__tests__/artificial-inteligence.v2.pdf
diff --git a/cognee/modules/pipelines/operations/__tests__/get_graph_url.py b/cognee/modules/pipelines/operations/__tests__/get_graph_url.py
diff --git a/cognee/tasks/chunks/__tests__/chunk_by_paragraph.test.py b/cognee/tasks/chunks/__tests__/chunk_by_paragraph.test.py
diff --git a/cognee/tasks/documents/classify_documents.py b/cognee/tasks/documents/classify_documents.py
@@ -5,14 +5,12 @@
     "pdf": PdfDocument,
     "audio": AudioDocument,
     "image": ImageDocument,
-    "pdf": TextDocument,
     "txt": TextDocument
 }
 
 def classify_documents(data_documents: list[Data]) -> list[Document]:
     documents = [
-        EXTENSION_TO_DOCUMENT_CLASS[data_item.extension](id = data_item.id, title=f"{data_item.name}.{data_item.extension}", raw_data_location=data_item.raw_data_location)
+        EXTENSION_TO_DOCUMENT_CLASS[data_item.extension](id = data_item.id, title=f"{data_item.name}.{data_item.extension}", raw_data_location=data_item.raw_data_location, name=data_item.name)
         for data_item in data_documents
     ]
-
     return documents
diff --git a/cognee/tests/integration/run_toy_tasks/conftest.py b/cognee/tests/integration/run_toy_tasks/conftest.py
@@ -0,0 +1,24 @@
+import os
+import shutil
+
+import pytest
+
+
+@pytest.fixture(autouse=True, scope="session")
+def copy_cognee_db_to_target_location():
+    """Copy the bundled ``cognee_db`` file into ``cognee/.cognee_system/databases/``.
+
+    Runs automatically once per test session (``autouse=True``,
+    ``scope="session"``). Both paths are relative to the current working
+    directory, so pytest must be started from the directory that contains
+    the ``cognee`` package.
+
+    Raises:
+        OSError: If the source file is missing or the copy fails.
+    """
+    os.makedirs("cognee/.cognee_system/databases/", exist_ok=True)
+    # shutil.copy raises on failure and needs no external ``cp`` binary.
+    shutil.copy(
+        "cognee/tests/integration/run_toy_tasks/data/cognee_db",
+        "cognee/.cognee_system/databases/cognee_db",
+    )
diff --git a/cognee/tests/integration/run_toy_tasks/data/cognee_db b/cognee/tests/integration/run_toy_tasks/data/cognee_db
diff --git a/...ns/__tests__/run_tasks_from_queue.test.py → ...run_toy_tasks/run_task_from_queue_test.py b/...ns/__tests__/run_tasks_from_queue.test.py → ...run_toy_tasks/run_task_from_queue_test.py
@@ -1,8 +1,10 @@
 import asyncio
 from queue import Queue
+
 from cognee.modules.pipelines.operations.run_tasks import run_tasks
 from cognee.modules.pipelines.tasks.Task import Task
 
+
 async def pipeline(data_queue):
     async def queue_consumer():
         while not data_queue.is_closed:
@@ -17,20 +19,25 @@ async def add_one(num):
     async def multiply_by_two(num):
         yield num * 2
 
-    tasks_run = run_tasks([
-        Task(queue_consumer),
-        Task(add_one),
-        Task(multiply_by_two),
-    ])
+    tasks_run = run_tasks(
+        [
+            Task(queue_consumer),
+            Task(add_one),
+            Task(multiply_by_two),
+        ],
+        pipeline_name="test_run_tasks_from_queue",
+    )
 
-    results = [2, 4, 6, 8, 10, 12, 14, 16, 18]
+    results = [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]
     index = 0
     async for result in tasks_run:
-        print(result)
-        assert result == results[index]
+        assert (
+            result == results[index]
+        ), f"at {index = }: {result = } != {results[index] = }"
         index += 1
 
-async def main():
+
+async def run_queue():
     data_queue = Queue()
     data_queue.is_closed = False
 
@@ -42,5 +49,6 @@ async def queue_producer():
 
     await asyncio.gather(pipeline(data_queue), queue_producer())
 
-if __name__ == "__main__":
-    asyncio.run(main())
+
+def test_run_tasks_from_queue():
+    asyncio.run(run_queue())
diff --git a/...es/operations/__tests__/run_tasks.test.py → ...tegration/run_toy_tasks/run_tasks_test.py b/...es/operations/__tests__/run_tasks.test.py → ...tegration/run_toy_tasks/run_tasks_test.py
@@ -1,9 +1,10 @@
 import asyncio
+
 from cognee.modules.pipelines.operations.run_tasks import run_tasks
 from cognee.modules.pipelines.tasks.Task import Task
 
 
-async def main():
+async def run_and_check_tasks():
     def number_generator(num):
         for i in range(num):
             yield i + 1
@@ -18,19 +19,25 @@ async def multiply_by_two(num):
     async def add_one_single(num):
         yield num + 1
 
-    pipeline = run_tasks([
-        Task(number_generator),
-        Task(add_one, task_config = {"batch_size": 5}),
-        Task(multiply_by_two, task_config = {"batch_size": 1}),
-        Task(add_one_single),
-    ], 10)
+    pipeline = run_tasks(
+        [
+            Task(number_generator),
+            Task(add_one, task_config={"batch_size": 5}),
+            Task(multiply_by_two, task_config={"batch_size": 1}),
+            Task(add_one_single),
+        ],
+        10,
+        pipeline_name="test_run_tasks",
+    )
 
     results = [5, 7, 9, 11, 13, 15, 17, 19, 21, 23]
     index = 0
     async for result in pipeline:
-        print(result)
-        assert result == results[index]
+        assert (
+            result == results[index]
+        ), f"at {index = }: {result = } != {results[index] = }"
         index += 1
 
-if __name__ == "__main__":
-    asyncio.run(main())
+
+def test_run_tasks():
+    asyncio.run(run_and_check_tasks())
diff --git a/cognee/tests/unit/documents/PdfDocument_test.py b/cognee/tests/unit/documents/PdfDocument_test.py
@@ -0,0 +1,50 @@
+import uuid
+from itertools import islice
+from pathlib import Path
+
+from cognee.modules.data.processing.document_types.PdfDocument import PdfDocument
+
+# Expected properties of the leading chunks yielded by PdfDocument.read, in order.
+GROUND_TRUTH = [
+    {"word_count": 879, "len_text": 5622, "cut_type": "sentence_end"},
+    {"word_count": 951, "len_text": 6384, "cut_type": "sentence_end"},
+]
+
+
+def test_PdfDocument():
+    """Compare the first chunks read from the sample PDF with GROUND_TRUTH.
+
+    Only the first ``len(GROUND_TRUTH)`` chunks are read; the test fails if
+    the document yields fewer chunks than that.
+    """
+    # This file sits two directories below the tests root
+    # (<tests>/unit/documents/), so parents[2] is the tests root.
+    test_file_path = (
+        Path(__file__).resolve().parents[2]
+        / "test_data"
+        / "artificial-intelligence.pdf"
+    )
+    assert test_file_path.is_file(), f"{test_file_path = } is not a file"
+    pdf_doc = PdfDocument(
+        id=uuid.uuid4(),
+        name="Test document.pdf",
+        raw_data_location=str(test_file_path),
+    )
+
+    # zip() stops at the shorter input, so check the chunk count explicitly;
+    # otherwise a reader that yields nothing would pass vacuously.
+    paragraphs = list(islice(pdf_doc.read(chunk_size=1024), len(GROUND_TRUTH)))
+    assert len(paragraphs) == len(
+        GROUND_TRUTH
+    ), f"{len(paragraphs) = } != {len(GROUND_TRUTH) = }"
+
+    for ground_truth, paragraph_data in zip(GROUND_TRUTH, paragraphs):
+        assert (
+            ground_truth["word_count"] == paragraph_data.word_count
+        ), f'{ground_truth["word_count"] = } != {paragraph_data.word_count = }'
+        assert ground_truth["len_text"] == len(
+            paragraph_data.text
+        ), f'{ground_truth["len_text"] = } != {len(paragraph_data.text) = }'
+        assert (
+            ground_truth["cut_type"] == paragraph_data.cut_type
+        ), f'{ground_truth["cut_type"] = } != {paragraph_data.cut_type = }'