diff --git a/cognee/infrastructure/databases/hybrid/falkordb/FalkorDBAdapter.py b/cognee/infrastructure/databases/hybrid/falkordb/FalkorDBAdapter.py index fdc7db0699..324ee7bcd9 100644 --- a/cognee/infrastructure/databases/hybrid/falkordb/FalkorDBAdapter.py +++ b/cognee/infrastructure/databases/hybrid/falkordb/FalkorDBAdapter.py @@ -1,21 +1,26 @@ import asyncio # from datetime import datetime import json -from uuid import UUID from textwrap import dedent +from uuid import UUID + from falkordb import FalkorDB from cognee.exceptions import InvalidValueError -from cognee.infrastructure.engine import DataPoint -from cognee.infrastructure.databases.graph.graph_db_interface import GraphDBInterface +from cognee.infrastructure.databases.graph.graph_db_interface import \ + GraphDBInterface from cognee.infrastructure.databases.vector.embeddings import EmbeddingEngine -from cognee.infrastructure.databases.vector.vector_db_interface import VectorDBInterface +from cognee.infrastructure.databases.vector.vector_db_interface import \ + VectorDBInterface +from cognee.infrastructure.engine import DataPoint + class IndexSchema(DataPoint): text: str _metadata: dict = { - "index_fields": ["text"] + "index_fields": ["text"], + "type": "IndexSchema" } class FalkorDBAdapter(VectorDBInterface, GraphDBInterface): diff --git a/cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py b/cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py index 37d3400047..1b3fc55c34 100644 --- a/cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py +++ b/cognee/infrastructure/databases/vector/lancedb/LanceDBAdapter.py @@ -1,25 +1,29 @@ -from typing import List, Optional, get_type_hints, Generic, TypeVar import asyncio +from typing import Generic, List, Optional, TypeVar, get_type_hints from uuid import UUID + import lancedb +from lancedb.pydantic import LanceModel, Vector from pydantic import BaseModel -from lancedb.pydantic import Vector, LanceModel from cognee.exceptions import InvalidValueError from cognee.infrastructure.engine import DataPoint from cognee.infrastructure.files.storage import LocalStorage from cognee.modules.storage.utils import copy_model, get_own_properties + +from ..embeddings.EmbeddingEngine import EmbeddingEngine from ..models.ScoredResult import ScoredResult -from ..vector_db_interface import VectorDBInterface from ..utils import normalize_distances -from ..embeddings.EmbeddingEngine import EmbeddingEngine +from ..vector_db_interface import VectorDBInterface + class IndexSchema(DataPoint): id: str text: str _metadata: dict = { - "index_fields": ["text"] + "index_fields": ["text"], + "type": "IndexSchema" } class LanceDBAdapter(VectorDBInterface): diff --git a/cognee/infrastructure/databases/vector/milvus/MilvusAdapter.py b/cognee/infrastructure/databases/vector/milvus/MilvusAdapter.py index 4e5290dd14..0d4ea05d3b 100644 --- a/cognee/infrastructure/databases/vector/milvus/MilvusAdapter.py +++ b/cognee/infrastructure/databases/vector/milvus/MilvusAdapter.py @@ -4,10 +4,12 @@ import logging from typing import List, Optional from uuid import UUID + from cognee.infrastructure.engine import DataPoint -from ..vector_db_interface import VectorDBInterface -from ..models.ScoredResult import ScoredResult + from ..embeddings.EmbeddingEngine import EmbeddingEngine +from ..models.ScoredResult import ScoredResult +from ..vector_db_interface import VectorDBInterface logger = logging.getLogger("MilvusAdapter") @@ -16,7 +18,8 @@ class IndexSchema(DataPoint): text: str _metadata: dict = { - "index_fields": ["text"] + "index_fields": ["text"], + "type": "IndexSchema" } diff --git a/cognee/infrastructure/databases/vector/pgvector/PGVectorAdapter.py b/cognee/infrastructure/databases/vector/pgvector/PGVectorAdapter.py index a6b458cbd8..3f05652531 100644 --- a/cognee/infrastructure/databases/vector/pgvector/PGVectorAdapter.py +++ b/cognee/infrastructure/databases/vector/pgvector/PGVectorAdapter.py @@ -1,6 +1,7 @@ import asyncio -from uuid import UUID from typing import List, Optional, get_type_hints +from uuid import UUID + from sqlalchemy.orm import Mapped, mapped_column from sqlalchemy import JSON, Column, Table, select, delete, MetaData from sqlalchemy.ext.asyncio import create_async_engine, async_sessionmaker @@ -9,19 +10,21 @@ from cognee.infrastructure.databases.exceptions import EntityNotFoundError from cognee.infrastructure.engine import DataPoint -from .serialize_data import serialize_data +from ...relational.ModelBase import Base +from ...relational.sqlalchemy.SqlAlchemyAdapter import SQLAlchemyAdapter +from ..embeddings.EmbeddingEngine import EmbeddingEngine from ..models.ScoredResult import ScoredResult -from ..vector_db_interface import VectorDBInterface from ..utils import normalize_distances -from ..embeddings.EmbeddingEngine import EmbeddingEngine -from ...relational.sqlalchemy.SqlAlchemyAdapter import SQLAlchemyAdapter -from ...relational.ModelBase import Base +from ..vector_db_interface import VectorDBInterface +from .serialize_data import serialize_data + class IndexSchema(DataPoint): text: str _metadata: dict = { - "index_fields": ["text"] + "index_fields": ["text"], + "type": "IndexSchema" } class PGVectorAdapter(SQLAlchemyAdapter, VectorDBInterface): @@ -89,6 +92,7 @@ def __init__(self, id, payload, vector): async def create_data_points( self, collection_name: str, data_points: List[DataPoint] ): + data_point_types = get_type_hints(DataPoint) if not await self.has_collection(collection_name): await self.create_collection( collection_name = collection_name, @@ -108,7 +112,7 @@ class PGVectorDataPoint(Base): primary_key: Mapped[int] = mapped_column( primary_key=True, autoincrement=True ) - id: Mapped[type(data_points[0].id)] + id: Mapped[data_point_types["id"]] payload = Column(JSON) vector = Column(self.Vector(vector_size)) diff --git a/cognee/infrastructure/databases/vector/qdrant/QDrantAdapter.py b/cognee/infrastructure/databases/vector/qdrant/QDrantAdapter.py index d5d2a1a5ca..b63139bc58 100644 --- a/cognee/infrastructure/databases/vector/qdrant/QDrantAdapter.py +++ b/cognee/infrastructure/databases/vector/qdrant/QDrantAdapter.py @@ -1,13 +1,16 @@ import logging +from typing import Dict, List, Optional from uuid import UUID -from typing import List, Dict, Optional + from qdrant_client import AsyncQdrantClient, models from cognee.exceptions import InvalidValueError -from cognee.infrastructure.databases.vector.models.ScoredResult import ScoredResult +from cognee.infrastructure.databases.vector.models.ScoredResult import \ + ScoredResult from cognee.infrastructure.engine import DataPoint -from ..vector_db_interface import VectorDBInterface + from ..embeddings.EmbeddingEngine import EmbeddingEngine +from ..vector_db_interface import VectorDBInterface logger = logging.getLogger("QDrantAdapter") @@ -15,7 +18,8 @@ class IndexSchema(DataPoint): text: str _metadata: dict = { - "index_fields": ["text"] + "index_fields": ["text"], + "type": "IndexSchema" } # class CollectionConfig(BaseModel, extra = "forbid"): diff --git a/cognee/infrastructure/databases/vector/weaviate_db/WeaviateAdapter.py b/cognee/infrastructure/databases/vector/weaviate_db/WeaviateAdapter.py index c16f765b0b..31162b1b55 100644 --- a/cognee/infrastructure/databases/vector/weaviate_db/WeaviateAdapter.py +++ b/cognee/infrastructure/databases/vector/weaviate_db/WeaviateAdapter.py @@ -5,9 +5,10 @@ from cognee.exceptions import InvalidValueError from cognee.infrastructure.engine import DataPoint -from ..vector_db_interface import VectorDBInterface -from ..models.ScoredResult import ScoredResult + from ..embeddings.EmbeddingEngine import EmbeddingEngine +from ..models.ScoredResult import ScoredResult +from ..vector_db_interface import VectorDBInterface logger = logging.getLogger("WeaviateAdapter") @@ -15,7 +16,8 @@ class IndexSchema(DataPoint): text: str _metadata: dict = { - "index_fields": ["text"] + "index_fields": ["text"], + "type": "IndexSchema" } class WeaviateAdapter(VectorDBInterface): diff --git a/cognee/infrastructure/engine/models/DataPoint.py b/cognee/infrastructure/engine/models/DataPoint.py index abb924f2f7..e080411464 100644 --- a/cognee/infrastructure/engine/models/DataPoint.py +++ b/cognee/infrastructure/engine/models/DataPoint.py @@ -1,8 +1,10 @@ -from typing_extensions import TypedDict -from uuid import UUID, uuid4 -from typing import Optional from datetime import datetime, timezone +from typing import Optional +from uuid import UUID, uuid4 + from pydantic import BaseModel, Field +from typing_extensions import TypedDict + class MetaData(TypedDict): index_fields: list[str] @@ -13,7 +15,8 @@ class DataPoint(BaseModel): updated_at: Optional[datetime] = datetime.now(timezone.utc) topological_rank: Optional[int] = 0 _metadata: Optional[MetaData] = { - "index_fields": [] + "index_fields": [], + "type": "DataPoint" } # class Config: @@ -39,4 +42,4 @@ def get_embeddable_properties(self, data_point): @classmethod def get_embeddable_property_names(self, data_point): - return data_point._metadata["index_fields"] or [] + return data_point._metadata["index_fields"] or [] \ No newline at end of file diff --git a/cognee/modules/chunking/models/DocumentChunk.py b/cognee/modules/chunking/models/DocumentChunk.py index b5faea5600..8729596df3 100644 --- a/cognee/modules/chunking/models/DocumentChunk.py +++ b/cognee/modules/chunking/models/DocumentChunk.py @@ -1,8 +1,10 @@ from typing import List, Optional + from cognee.infrastructure.engine import DataPoint from cognee.modules.data.processing.document_types import Document from cognee.modules.engine.models import Entity + class DocumentChunk(DataPoint): __tablename__ = "document_chunk" text: str @@ -12,6 +14,7 @@ class DocumentChunk(DataPoint): is_part_of: Document contains: List[Entity] = None - _metadata: Optional[dict] = { + _metadata: dict = { "index_fields": ["text"], + "type": "DocumentChunk" } diff --git a/cognee/modules/data/processing/document_types/Document.py b/cognee/modules/data/processing/document_types/Document.py index 45441dcce2..924ffabac5 100644 --- a/cognee/modules/data/processing/document_types/Document.py +++ b/cognee/modules/data/processing/document_types/Document.py @@ -1,12 +1,17 @@ -from cognee.infrastructure.engine import DataPoint from uuid import UUID +from cognee.infrastructure.engine import DataPoint + + class Document(DataPoint): - type: str name: str raw_data_location: str metadata_id: UUID mime_type: str + _metadata: dict = { + "index_fields": ["name"], + "type": "Document" + } def read(self, chunk_size: int) -> str: - pass + pass \ No newline at end of file diff --git a/cognee/modules/engine/models/Entity.py b/cognee/modules/engine/models/Entity.py index b805d3d114..16e0ca3d81 100644 --- a/cognee/modules/engine/models/Entity.py +++ b/cognee/modules/engine/models/Entity.py @@ -10,4 +10,5 @@ class Entity(DataPoint): _metadata: dict = { "index_fields": ["name"], + "type": "Entity" } diff --git a/cognee/modules/engine/models/EntityType.py b/cognee/modules/engine/models/EntityType.py index 1c7843cfd6..d3cc543113 100644 --- a/cognee/modules/engine/models/EntityType.py +++ b/cognee/modules/engine/models/EntityType.py @@ -1,11 +1,12 @@ from cognee.infrastructure.engine import DataPoint + class EntityType(DataPoint): __tablename__ = "entity_type" name: str - type: str description: str _metadata: dict = { "index_fields": ["name"], + "type": "EntityType" } diff --git a/cognee/modules/graph/models/EdgeType.py b/cognee/modules/graph/models/EdgeType.py index f9554d25d6..998f08d8d5 100644 --- a/cognee/modules/graph/models/EdgeType.py +++ b/cognee/modules/graph/models/EdgeType.py @@ -1,11 +1,14 @@ from typing import Optional + from cognee.infrastructure.engine import DataPoint + class EdgeType(DataPoint): __tablename__ = "edge_type" relationship_name: str number_of_edges: int - _metadata: Optional[dict] = { + _metadata: dict = { "index_fields": ["relationship_name"], + "type": "EdgeType" } \ No newline at end of file diff --git a/cognee/modules/graph/utils/convert_node_to_data_point.py b/cognee/modules/graph/utils/convert_node_to_data_point.py index 292f537338..602a7ffa3d 100644 --- a/cognee/modules/graph/utils/convert_node_to_data_point.py +++ b/cognee/modules/graph/utils/convert_node_to_data_point.py @@ -2,7 +2,7 @@ def convert_node_to_data_point(node_data: dict) -> DataPoint: - subclass = find_subclass_by_name(DataPoint, node_data["type"]) + subclass = find_subclass_by_name(DataPoint, node_data._metadata["type"]) return subclass(**node_data) diff --git a/cognee/shared/CodeGraphEntities.py b/cognee/shared/CodeGraphEntities.py index 8859fd0d69..23b8879c27 100644 --- a/cognee/shared/CodeGraphEntities.py +++ b/cognee/shared/CodeGraphEntities.py @@ -1,15 +1,19 @@ from typing import List, Optional + from cognee.infrastructure.engine import DataPoint + class Repository(DataPoint): __tablename__ = "Repository" path: str - type: Optional[str] = "Repository" + _metadata: dict = { + "index_fields": ["source_code"], + "type": "Repository" + } class CodeFile(DataPoint): __tablename__ = "codefile" extracted_id: str # actually file path - type: Optional[str] = "CodeFile" source_code: Optional[str] = None part_of: Optional[Repository] = None depends_on: Optional[List["CodeFile"]] = None @@ -17,24 +21,27 @@ class CodeFile(DataPoint): contains: Optional[List["CodePart"]] = None _metadata: dict = { - "index_fields": ["source_code"] + "index_fields": ["source_code"], + "type": "CodeFile" } class CodePart(DataPoint): __tablename__ = "codepart" # part_of: Optional[CodeFile] source_code: str - type: Optional[str] = "CodePart" - + _metadata: dict = { - "index_fields": ["source_code"] + "index_fields": ["source_code"], + "type": "CodePart" } class CodeRelationship(DataPoint): source_id: str target_id: str - type: str # between files relation: str # depends on or depends directly + _metadata: dict = { + "type": "CodeRelationship" + } CodeFile.model_rebuild() CodePart.model_rebuild() diff --git a/cognee/shared/SourceCodeGraph.py b/cognee/shared/SourceCodeGraph.py index 0fc8f9487c..3de72c5fdd 100644 --- a/cognee/shared/SourceCodeGraph.py +++ b/cognee/shared/SourceCodeGraph.py @@ -1,79 +1,90 @@ -from typing import Any, List, Union, Literal, Optional +from typing import Any, List, Literal, Optional, Union + from cognee.infrastructure.engine import DataPoint + class Variable(DataPoint): id: str name: str - type: Literal["Variable"] = "Variable" description: str is_static: Optional[bool] = False default_value: Optional[str] = None data_type: str _metadata = { - "index_fields": ["name"] + "index_fields": ["name"], + "type": "Variable" } class Operator(DataPoint): id: str name: str - type: Literal["Operator"] = "Operator" description: str return_type: str + _metadata = { + "index_fields": ["name"], + "type": "Operator" + } class Class(DataPoint): id: str name: str - type: Literal["Class"] = "Class" description: str constructor_parameters: List[Variable] extended_from_class: Optional["Class"] = None has_methods: List["Function"] _metadata = { - "index_fields": ["name"] + "index_fields": ["name"], + "type": "Class" } class ClassInstance(DataPoint): id: str name: str - type: Literal["ClassInstance"] = "ClassInstance" description: str from_class: Class instantiated_by: Union["Function"] instantiation_arguments: List[Variable] _metadata = { - "index_fields": ["name"] + "index_fields": ["name"], + "type": "ClassInstance" } class Function(DataPoint): id: str name: str - type: Literal["Function"] = "Function" description: str parameters: List[Variable] return_type: str is_static: Optional[bool] = False _metadata = { - "index_fields": ["name"] + "index_fields": ["name"], + "type": "Function" } class FunctionCall(DataPoint): id: str - type: Literal["FunctionCall"] = "FunctionCall" called_by: Union[Function, Literal["main"]] function_called: Function function_arguments: List[Any] + _metadata = { + "index_fields": [], + "type": "FunctionCall" + } class Expression(DataPoint): id: str name: str - type: Literal["Expression"] = "Expression" description: str expression: str members: List[Union[Variable, Function, Operator, "Expression"]] + _metadata = { + "index_fields": ["name"], + "type": "Expression" + } class SourceCodeGraph(DataPoint): id: str @@ -89,8 +100,13 @@ class SourceCodeGraph(DataPoint): Operator, Expression, ]] + _metadata = { + "index_fields": ["name"], + "type": "SourceCodeGraph" + } + Class.model_rebuild() ClassInstance.model_rebuild() Expression.model_rebuild() FunctionCall.model_rebuild() -SourceCodeGraph.model_rebuild() +SourceCodeGraph.model_rebuild() \ No newline at end of file diff --git a/cognee/tasks/storage/index_data_points.py b/cognee/tasks/storage/index_data_points.py index 786168b58e..857e4d777c 100644 --- a/cognee/tasks/storage/index_data_points.py +++ b/cognee/tasks/storage/index_data_points.py @@ -1,6 +1,7 @@ from cognee.infrastructure.databases.vector import get_vector_engine from cognee.infrastructure.engine import DataPoint + async def index_data_points(data_points: list[DataPoint]): created_indexes = {} index_points = {} @@ -80,11 +81,20 @@ async def get_data_points_from_model(data_point: DataPoint, added_data_points = class Car(DataPoint): model: str color: str + _metadata = { + "index_fields": ["name"], + "type": "Car" + } + class Person(DataPoint): name: str age: int owns_car: list[Car] + _metadata = { + "index_fields": ["name"], + "type": "Person" + } car1 = Car(model = "Tesla Model S", color = "Blue") car2 = Car(model = "Toyota Camry", color = "Red") @@ -92,4 +102,4 @@ class Person(DataPoint): data_points = get_data_points_from_model(person) - print(data_points) + print(data_points) \ No newline at end of file diff --git a/cognee/tasks/summarization/models.py b/cognee/tasks/summarization/models.py index 6fef4fb025..add4481557 100644 --- a/cognee/tasks/summarization/models.py +++ b/cognee/tasks/summarization/models.py @@ -10,6 +10,7 @@ class TextSummary(DataPoint): _metadata: dict = { "index_fields": ["text"], + "type": "TextSummary" } @@ -20,4 +21,5 @@ class CodeSummary(DataPoint): _metadata: dict = { "index_fields": ["text"], + "type": "CodeSummary" } diff --git a/cognee/tests/unit/interfaces/graph/get_graph_from_huge_model_test.py b/cognee/tests/unit/interfaces/graph/get_graph_from_huge_model_test.py index 016f2be33f..06c74c8545 100644 --- a/cognee/tests/unit/interfaces/graph/get_graph_from_huge_model_test.py +++ b/cognee/tests/unit/interfaces/graph/get_graph_from_huge_model_test.py @@ -2,7 +2,7 @@ import random import time from typing import List -from uuid import uuid5, NAMESPACE_OID +from uuid import NAMESPACE_OID, uuid5 from cognee.infrastructure.engine import DataPoint from cognee.modules.graph.utils import get_graph_from_model @@ -11,16 +11,28 @@ class Repository(DataPoint): path: str + _metadata = { + "index_fields": [], + "type": "Repository" + } class CodeFile(DataPoint): part_of: Repository contains: List["CodePart"] = [] depends_on: List["CodeFile"] = [] source_code: str + _metadata = { + "index_fields": [], + "type": "CodeFile" + } class CodePart(DataPoint): part_of: CodeFile source_code: str + _metadata = { + "index_fields": [], + "type": "CodePart" + } CodeFile.model_rebuild() CodePart.model_rebuild() diff --git a/cognee/tests/unit/interfaces/graph/get_graph_from_model_test.py b/cognee/tests/unit/interfaces/graph/get_graph_from_model_test.py index 000d45c158..499dc9f3f8 100644 --- a/cognee/tests/unit/interfaces/graph/get_graph_from_model_test.py +++ b/cognee/tests/unit/interfaces/graph/get_graph_from_model_test.py @@ -1,25 +1,42 @@ import asyncio import random from typing import List -from uuid import uuid5, NAMESPACE_OID +from uuid import NAMESPACE_OID, uuid5 from cognee.infrastructure.engine import DataPoint from cognee.modules.graph.utils import get_graph_from_model + class Document(DataPoint): path: str + _metadata = { + "index_fields": [], + "type": "Document" + } class DocumentChunk(DataPoint): part_of: Document text: str contains: List["Entity"] = None + _metadata = { + "index_fields": ["text"], + "type": "DocumentChunk" + } class EntityType(DataPoint): name: str + _metadata = { + "index_fields": ["name"], + "type": "EntityType" + } class Entity(DataPoint): name: str is_type: EntityType + _metadata = { + "index_fields": ["name"], + "type": "Entity" + } DocumentChunk.model_rebuild()