Merged
Changes from 2 commits
8 changes: 4 additions & 4 deletions cognee/api/v1/datasets/routers/get_datasets_router.py
@@ -208,21 +208,21 @@ async def delete_dataset(dataset_id: UUID, user: User = Depends(get_authenticate
},
)

from cognee.modules.data.methods import get_dataset, delete_dataset
from cognee.modules.data.methods import delete_dataset

dataset = await get_dataset(user.id, dataset_id)
dataset = await get_authorized_existing_datasets([dataset_id], "delete", user)

if dataset is None:
raise DatasetNotFoundError(message=f"Dataset ({str(dataset_id)}) not found.")

await delete_dataset(dataset)
await delete_dataset(dataset[0])

@router.delete(
"/{dataset_id}/data/{data_id}",
response_model=None,
responses={404: {"model": ErrorResponseDTO}},
)
async def delete_data(
async def delete_data( # TODO: Is this endpoint needed? It seems redundant with data deletion elsewhere
dataset_id: UUID, data_id: UUID, user: User = Depends(get_authenticated_user)
):
"""
2 changes: 2 additions & 0 deletions cognee/infrastructure/databases/utils/__init__.py
@@ -1,2 +1,4 @@
from .get_or_create_dataset_database import get_or_create_dataset_database
from .resolve_dataset_database_connection_info import resolve_dataset_database_connection_info
from .get_graph_dataset_database_handler import get_graph_dataset_database_handler
from .get_vector_dataset_database_handler import get_vector_dataset_database_handler
10 changes: 10 additions & 0 deletions cognee/infrastructure/databases/utils/get_graph_dataset_database_handler.py
@@ -0,0 +1,10 @@
from cognee.modules.users.models.DatasetDatabase import DatasetDatabase


def get_graph_dataset_database_handler(dataset_database: DatasetDatabase) -> dict:
from cognee.infrastructure.databases.dataset_database_handler.supported_dataset_database_handlers import (
supported_dataset_database_handlers,
)

handler = supported_dataset_database_handlers[dataset_database.graph_dataset_database_handler]
return handler
10 changes: 10 additions & 0 deletions cognee/infrastructure/databases/utils/get_vector_dataset_database_handler.py
@@ -0,0 +1,10 @@
from cognee.modules.users.models.DatasetDatabase import DatasetDatabase


def get_vector_dataset_database_handler(dataset_database: DatasetDatabase) -> dict:
from cognee.infrastructure.databases.dataset_database_handler.supported_dataset_database_handlers import (
supported_dataset_database_handlers,
)

handler = supported_dataset_database_handlers[dataset_database.vector_dataset_database_handler]
return handler
cognee/infrastructure/databases/utils/resolve_dataset_database_connection_info.py
@@ -1,24 +1,12 @@
from cognee.infrastructure.databases.utils.get_graph_dataset_database_handler import (
get_graph_dataset_database_handler,
)
from cognee.infrastructure.databases.utils.get_vector_dataset_database_handler import (
get_vector_dataset_database_handler,
)
from cognee.modules.users.models.DatasetDatabase import DatasetDatabase


async def _get_vector_db_connection_info(dataset_database: DatasetDatabase) -> DatasetDatabase:
from cognee.infrastructure.databases.dataset_database_handler.supported_dataset_database_handlers import (
supported_dataset_database_handlers,
)

handler = supported_dataset_database_handlers[dataset_database.vector_dataset_database_handler]
return await handler["handler_instance"].resolve_dataset_connection_info(dataset_database)


async def _get_graph_db_connection_info(dataset_database: DatasetDatabase) -> DatasetDatabase:
from cognee.infrastructure.databases.dataset_database_handler.supported_dataset_database_handlers import (
supported_dataset_database_handlers,
)

handler = supported_dataset_database_handlers[dataset_database.graph_dataset_database_handler]
return await handler["handler_instance"].resolve_dataset_connection_info(dataset_database)


async def resolve_dataset_database_connection_info(
dataset_database: DatasetDatabase,
) -> DatasetDatabase:
@@ -31,6 +19,12 @@ async def resolve_dataset_database_connection_info(
Returns:
DatasetDatabase instance with resolved connection info
"""
dataset_database = await _get_vector_db_connection_info(dataset_database)
dataset_database = await _get_graph_db_connection_info(dataset_database)
vector_dataset_database_handler = get_vector_dataset_database_handler(dataset_database)
graph_dataset_database_handler = get_graph_dataset_database_handler(dataset_database)
dataset_database = await vector_dataset_database_handler[
"handler_instance"
].resolve_dataset_connection_info(dataset_database)
dataset_database = await graph_dataset_database_handler[
"handler_instance"
].resolve_dataset_connection_info(dataset_database)
return dataset_database
38 changes: 12 additions & 26 deletions cognee/modules/data/deletion/prune_system.py
@@ -5,6 +5,10 @@
from cognee.infrastructure.databases.vector import get_vector_engine
from cognee.infrastructure.databases.graph.get_graph_engine import get_graph_engine
from cognee.infrastructure.databases.relational import get_relational_engine
from cognee.infrastructure.databases.utils import (
get_graph_dataset_database_handler,
get_vector_dataset_database_handler,
)
from cognee.shared.cache import delete_cache
from cognee.modules.users.models import DatasetDatabase
from cognee.shared.logging_utils import get_logger
@@ -13,22 +17,13 @@


async def prune_graph_databases():

⚠️ Potential issue | 🟠 Major

Add docstring to document function purpose.

Per coding guidelines, undocumented function definitions are considered incomplete. This function should have a docstring explaining that it deletes all graph databases for all dataset_database records when backend access control is enabled.

Apply this diff:

 async def prune_graph_databases():
+    """
+    Delete all graph databases for all dataset_database records.
+    Used when backend_access_control is enabled to prune graph databases per-dataset.
+    """
     db_engine = get_relational_engine()

As per coding guidelines, undocumented functions are assumed incomplete.

🤖 Prompt for AI Agents
In cognee/modules/data/deletion/prune_system.py around line 19, the function
prune_graph_databases lacks a docstring; add a concise docstring immediately
below the async def that states the function's purpose: that it deletes all
graph databases for all dataset_database records when backend access control is
enabled, describes input (none) and behavior (side-effect deletion) and notes
any exceptions or important preconditions (e.g., requires backend access control
enabled). Ensure the docstring follows project style (single- or triple-quoted
summary) and is brief and descriptive.

async def _prune_graph_db(dataset_database: DatasetDatabase) -> dict:
from cognee.infrastructure.databases.dataset_database_handler.supported_dataset_database_handlers import (
supported_dataset_database_handlers,
)

handler = supported_dataset_database_handlers[
dataset_database.graph_dataset_database_handler
]
return await handler["handler_instance"].delete_dataset(dataset_database)

db_engine = get_relational_engine()
try:
data = await db_engine.get_all_data_from_table("dataset_database")
dataset_databases = await db_engine.get_all_data_from_table("dataset_database")
# Go through each dataset database and delete the graph database
for data_item in data:
await _prune_graph_db(data_item)
for dataset_database in dataset_databases:
handler = get_graph_dataset_database_handler(dataset_database)
await handler["handler_instance"].delete_dataset(dataset_database)
Comment on lines +22 to +26

⚠️ Potential issue | 🟡 Minor

🧩 Analysis chain

🏁 Scripts executed (condensed): the analysis chain inspected cognee/modules/data/deletion/prune_system.py, the get_all_data_from_table implementation in SqlAlchemyAdapter, the new get_graph_dataset_database_handler and get_vector_dataset_database_handler utilities, the DatasetDatabase model, the pinned SQLAlchemy version in pyproject.toml, and existing call sites of get_all_data_from_table, checking whether the rows iterated in prune_graph_databases and prune_vector_databases support the attribute access (dataset_database.graph_dataset_database_handler / .vector_dataset_database_handler) used by the new handler lookups. No functional problem was confirmed; the remaining finding is the missing docstring noted below.

Add docstring to prune_graph_databases function.

The function lacks documentation as required by project guidelines (code style guideline: "undocumented function definitions in the project's Python code are assumed incomplete").

🤖 Prompt for AI Agents
cognee/modules/data/deletion/prune_system.py lines 22-26: The function
prune_graph_databases is missing a docstring; add a concise triple-quoted
docstring immediately below the def line that states the function purpose
(prune/delete graph DBs for all dataset_database records), describes parameters
(if any, e.g., db_engine) and their types, notes return value (None) and side
effects (calls handler.delete_dataset for each dataset_database), and documents
possible exceptions or errors that may be raised; keep it short (2–4 sentences)
and follow project docstring style.
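
A possible shape for that docstring, sketched from the prompt above (the wording is illustrative, not the project's final text):

async def prune_graph_databases():
    """
    Delete the graph database for every dataset_database record.

    Takes no arguments and returns None; the work is a side effect (each
    record's graph handler delete_dataset is awaited). Used when backend
    access control is enabled so graph storage is pruned per dataset.
    OperationalError / EntityNotFoundError raised while reading the
    dataset_database table are caught and logged as a skipped prune.
    """
    ...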

Comment on lines +25 to +26

⚠️ Potential issue | 🟡 Minor

Consider validating handler dictionary structure.

The code accesses handler["handler_instance"] without verifying the key exists, which could raise a KeyError if the handler structure changes.

Consider adding defensive validation:

             handler = get_graph_dataset_database_handler(dataset_database)
+            if "handler_instance" not in handler:
+                logger.warning("Handler missing 'handler_instance' key for dataset_database")
+                continue
             await handler["handler_instance"].delete_dataset(dataset_database)

Committable suggestion skipped: line range outside the PR's diff.

🤖 Prompt for AI Agents
In cognee/modules/data/deletion/prune_system.py around lines 25-26, the code
assumes handler["handler_instance"] exists which can raise KeyError; validate
that handler is a dict and contains the "handler_instance" key (and that its
value is not None and exposes delete_dataset) before calling await
handler["handler_instance"].delete_dataset(dataset_database); if validation
fails, log or raise a clear error with context (including dataset_database) or
skip gracefully to avoid unhandled exceptions.
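
One way to apply that validation to both the graph and vector paths could be a small shared helper inside prune_system.py; _resolve_handler_instance is a hypothetical name, not an existing function, and the sketch assumes the module-level logger already defined in that file:

def _resolve_handler_instance(handler, dataset_database):
    # Return the handler instance when the expected key is present and usable;
    # otherwise log a warning and return None so the caller can skip this record.
    instance = handler.get("handler_instance") if isinstance(handler, dict) else None
    if instance is None or not hasattr(instance, "delete_dataset"):
        logger.warning(
            "Handler missing usable 'handler_instance' for dataset_database %s; skipping.",
            getattr(dataset_database, "id", dataset_database),
        )
        return None
    return instance

In the loops above this would replace the direct indexing: call instance = _resolve_handler_instance(handler, dataset_database), continue when it returns None, and otherwise await instance.delete_dataset(dataset_database).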

except (OperationalError, EntityNotFoundError) as e:
logger.debug(
"Skipping pruning of graph DB. Error when accessing dataset_database table: %s",
@@ -38,22 +33,13 @@ async def _prune_graph_db(dataset_database: DatasetDatabase) -> dict:


async def prune_vector_databases():
async def _prune_vector_db(dataset_database: DatasetDatabase) -> dict:
from cognee.infrastructure.databases.dataset_database_handler.supported_dataset_database_handlers import (
supported_dataset_database_handlers,
)

handler = supported_dataset_database_handlers[
dataset_database.vector_dataset_database_handler
]
return await handler["handler_instance"].delete_dataset(dataset_database)

db_engine = get_relational_engine()
try:
data = await db_engine.get_all_data_from_table("dataset_database")
dataset_databases = await db_engine.get_all_data_from_table("dataset_database")
# Go through each dataset database and delete the vector database
for data_item in data:
await _prune_vector_db(data_item)
for dataset_database in dataset_databases:
handler = get_vector_dataset_database_handler(dataset_database)
await handler["handler_instance"].delete_dataset(dataset_database)
except (OperationalError, EntityNotFoundError) as e:
logger.debug(
"Skipping pruning of vector DB. Error when accessing dataset_database table: %s",
24 changes: 24 additions & 0 deletions cognee/modules/data/methods/delete_dataset.py
@@ -1,8 +1,32 @@
from cognee.modules.users.models import DatasetDatabase
from sqlalchemy import select

from cognee.modules.data.models import Dataset
from cognee.infrastructure.databases.utils.get_vector_dataset_database_handler import (
get_vector_dataset_database_handler,
)
from cognee.infrastructure.databases.utils.get_graph_dataset_database_handler import (
get_graph_dataset_database_handler,
)
from cognee.infrastructure.databases.relational import get_relational_engine


async def delete_dataset(dataset: Dataset):
db_engine = get_relational_engine()

async with db_engine.get_async_session() as session:
stmt = select(DatasetDatabase).where(
DatasetDatabase.dataset_id == dataset.id,
)
dataset_database: DatasetDatabase = await session.scalar(stmt)
if dataset_database:
graph_dataset_database_handler = get_graph_dataset_database_handler(dataset_database)
vector_dataset_database_handler = get_vector_dataset_database_handler(dataset_database)
await graph_dataset_database_handler["handler_instance"].delete_dataset(
dataset_database
)
await vector_dataset_database_handler["handler_instance"].delete_dataset(
dataset_database
)

return await db_engine.delete_entity_by_id(dataset.__tablename__, dataset.id)