-
Notifications
You must be signed in to change notification settings - Fork 954
feat: Add database deletion on dataset delete #1893
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
cb16dc0
47c0278
0398663
cc72e7f
031cc46
87d8fe0
31a3410
d36fb2d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,2 +1,4 @@ | ||
| from .get_or_create_dataset_database import get_or_create_dataset_database | ||
| from .resolve_dataset_database_connection_info import resolve_dataset_database_connection_info | ||
| from .get_graph_dataset_database_handler import get_graph_dataset_database_handler | ||
| from .get_vector_dataset_database_handler import get_vector_dataset_database_handler |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,10 @@ | ||
| from cognee.modules.users.models.DatasetDatabase import DatasetDatabase | ||
|
|
||
|
|
||
| def get_graph_dataset_database_handler(dataset_database: DatasetDatabase) -> dict: | ||
| from cognee.infrastructure.databases.dataset_database_handler.supported_dataset_database_handlers import ( | ||
| supported_dataset_database_handlers, | ||
| ) | ||
|
|
||
| handler = supported_dataset_database_handlers[dataset_database.graph_dataset_database_handler] | ||
| return handler |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,10 @@ | ||
| from cognee.modules.users.models.DatasetDatabase import DatasetDatabase | ||
|
|
||
|
|
||
| def get_vector_dataset_database_handler(dataset_database: DatasetDatabase) -> dict: | ||
| from cognee.infrastructure.databases.dataset_database_handler.supported_dataset_database_handlers import ( | ||
| supported_dataset_database_handlers, | ||
| ) | ||
|
|
||
| handler = supported_dataset_database_handlers[dataset_database.vector_dataset_database_handler] | ||
| return handler |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -5,6 +5,10 @@ | |
| from cognee.infrastructure.databases.vector import get_vector_engine | ||
| from cognee.infrastructure.databases.graph.get_graph_engine import get_graph_engine | ||
| from cognee.infrastructure.databases.relational import get_relational_engine | ||
| from cognee.infrastructure.databases.utils import ( | ||
| get_graph_dataset_database_handler, | ||
| get_vector_dataset_database_handler, | ||
| ) | ||
| from cognee.shared.cache import delete_cache | ||
| from cognee.modules.users.models import DatasetDatabase | ||
| from cognee.shared.logging_utils import get_logger | ||
|
|
@@ -13,22 +17,13 @@ | |
|
|
||
|
|
||
| async def prune_graph_databases(): | ||
| async def _prune_graph_db(dataset_database: DatasetDatabase) -> dict: | ||
| from cognee.infrastructure.databases.dataset_database_handler.supported_dataset_database_handlers import ( | ||
| supported_dataset_database_handlers, | ||
| ) | ||
|
|
||
| handler = supported_dataset_database_handlers[ | ||
| dataset_database.graph_dataset_database_handler | ||
| ] | ||
| return await handler["handler_instance"].delete_dataset(dataset_database) | ||
|
|
||
| db_engine = get_relational_engine() | ||
| try: | ||
| data = await db_engine.get_all_data_from_table("dataset_database") | ||
| dataset_databases = await db_engine.get_all_data_from_table("dataset_database") | ||
| # Go through each dataset database and delete the graph database | ||
| for data_item in data: | ||
| await _prune_graph_db(data_item) | ||
| for dataset_database in dataset_databases: | ||
borisarzentar marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| handler = get_graph_dataset_database_handler(dataset_database) | ||
| await handler["handler_instance"].delete_dataset(dataset_database) | ||
|
Comment on lines
+22
to
+26
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 🧩 Analysis chain 🏁 Script executed: cat -n cognee/modules/data/deletion/prune_system.py Repository: topoteretes/cognee Length of output: 3544 🏁 Script executed: # Find the db_engine.get_all_data_from_table implementation
find . -type f -name "*.py" | xargs grep -l "def get_all_data_from_table" | head -5Repository: topoteretes/cognee Length of output: 138 🏁 Script executed: # Find get_graph_dataset_database_handler function
rg -n "def get_graph_dataset_database_handler" --type=pyRepository: topoteretes/cognee Length of output: 222 🏁 Script executed: # Find DatasetDatabase class definition
rg -n "class DatasetDatabase" --type=pyRepository: topoteretes/cognee Length of output: 280 🏁 Script executed: cat -n cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py | grep -A 20 "def get_all_data_from_table"Repository: topoteretes/cognee Length of output: 1057 🏁 Script executed: cat -n cognee/infrastructure/databases/utils/get_graph_dataset_database_handler.pyRepository: topoteretes/cognee Length of output: 569 🏁 Script executed: cat -n cognee/modules/users/models/DatasetDatabase.pyRepository: topoteretes/cognee Length of output: 2140 🏁 Script executed: cat -n cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py | sed -n '417,460p'Repository: topoteretes/cognee Length of output: 1885 🏁 Script executed: # Search for complete implementation of get_all_data_from_table
rg -A 30 "async def get_all_data_from_table" cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.pyRepository: topoteretes/cognee Length of output: 1234 🏁 Script executed: # Check how get_all_data_from_table is used elsewhere in the codebase to see if there are other examples
rg -B 2 -A 5 "get_all_data_from_table\(" --type=py | head -50Repository: topoteretes/cognee Length of output: 3949 🏁 Script executed: # Check SQLAlchemy Row behavior - does mappings() support attribute access?
python3 << 'EOF'
# Check SQLAlchemy documentation for Row behavior
import sys
try:
from sqlalchemy import Row
# Check if Row supports attribute access
print("SQLAlchemy Row class:", Row)
print("Row supports attribute access via __getattr__")
help(Row)
except Exception as e:
print(f"Error: {e}")
EOFRepository: topoteretes/cognee Length of output: 97 🏁 Script executed: # Let's check if there are any similar patterns in the codebase where Row mappings are used with attribute access
rg -B 3 -A 3 "\.mappings\(\)" --type=py cognee/infrastructure/databases/relational/sqlalchemy/Repository: topoteretes/cognee Length of output: 802 🏁 Script executed: # Check if DatasetDatabase is instantiated directly elsewhere or if Row objects are used directly
rg -B 2 -A 2 "DatasetDatabase\(" --type=py cognee/ | grep -v test | head -40Repository: topoteretes/cognee Length of output: 942 🏁 Script executed: # Check SQLAlchemy version in requirements or setup files
find . -name "requirements*.txt" -o -name "pyproject.toml" -o -name "setup.py" | xargs grep -i sqlalchemyRepository: topoteretes/cognee Length of output: 230 🏁 Script executed: # Check if there are other usages where get_all_data_from_table results are used with attribute access
rg -B 2 -A 5 "get_all_data_from_table" --type=py cognee/ | grep -A 5 "for.*in.*get_all_data_from_table"Repository: topoteretes/cognee Length of output: 44 🏁 Script executed: # Look for prune_vector_databases which has same pattern as prune_graph_databases
cat -n cognee/modules/data/deletion/prune_system.py | sed -n '35,48p'Repository: topoteretes/cognee Length of output: 825 🏁 Script executed: # Check SQLAlchemy Row attribute access support in 2.0+
# Search for any existing uses of Row attribute access in the codebase
rg "result\[" cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py | head -10Repository: topoteretes/cognee Length of output: 140 🏁 Script executed: # Look for any test that verifies the attribute access on Row objects from get_all_data_from_table
rg -B 2 -A 2 "result\[.*\]" cognee/tests/test_deduplication.py | head -20Repository: topoteretes/cognee Length of output: 1194 🏁 Script executed: # Check if get_vector_dataset_database_handler also uses attribute access like get_graph_dataset_database_handler
cat -n cognee/infrastructure/databases/utils/get_vector_dataset_database_handler.pyRepository: topoteretes/cognee Length of output: 571 🏁 Script executed: # Check if there's a typing issue - look at the actual parameter types expected by the handlers
rg -B 5 -A 10 "def get_graph_dataset_database_handler" cognee/infrastructure/databases/utils/get_graph_dataset_database_handler.pyRepository: topoteretes/cognee Length of output: 499 🏁 Script executed: # Check if Row objects from SQLAlchemy 2.0 are being used with attribute access anywhere in the codebase
rg "for .* in .*get_all_data_from_table" -A 3 --type=py cognee/ | grep -v testRepository: topoteretes/cognee Length of output: 44 🏁 Script executed: # Check if there are any Row to Model conversions happening anywhere
rg "Row\|mappings" --type=py cognee/infrastructure/databases/relational/sqlalchemy/Repository: topoteretes/cognee Length of output: 44 🏁 Script executed: # Let me verify SQLAlchemy 2.0 Row behavior by checking pyproject.toml more carefully
cat -n pyproject.toml | grep -A 5 -B 5 "sqlalchemy"Repository: topoteretes/cognee Length of output: 1354 🏁 Script executed: # Check if Row objects truly support both dict and attribute access in actual usage
# Look for any error handling or type: ignore comments related to this
rg "type: ignore|# noqa" cognee/modules/data/deletion/prune_system.pyRepository: topoteretes/cognee Length of output: 44 🏁 Script executed: # Search for any other functions that receive Row objects and use attribute access
rg -B 3 "\.graph_dataset_database_handler|\.vector_dataset_database_handler" --type=py cognee/Repository: topoteretes/cognee Length of output: 3574 🏁 Script executed: # Verify SQLAlchemy 2.0+ Row attribute access support by checking actual Row class behavior
# Look for any issues or type annotations in the codebase related to this
rg "RoomRowMapping|LegacyRow" --type=py cognee/Repository: topoteretes/cognee Length of output: 44 🏁 Script executed: # Check if there are type checking issues - look for mypy or pyright configurations
cat pyproject.toml | grep -A 10 "\[tool.mypy\]"Repository: topoteretes/cognee Length of output: 44 🏁 Script executed: # Verify the actual usage pattern - if the code works, Row objects must support attribute access
# Check if there are any integration tests that use prune_system
rg "prune_graph_databases\|prune_vector_databases" --type=py cognee/tests/Repository: topoteretes/cognee Length of output: 44 🏁 Script executed: # Final verification: Check SQLAlchemy Row documentation behavior in 2.0
# by looking at actual attribute access patterns in the codebase
rg "for .* in .*await.*\." -A 3 --type=py cognee/modules/data/ | head -30Repository: topoteretes/cognee Length of output: 44 🏁 Script executed: # Check if Row objects from mappings() actually support attribute access
# by looking for any similar patterns where Row attributes are accessed
rg "dataset_databases\[.*\]\." --type=py cognee/Repository: topoteretes/cognee Length of output: 44 🏁 Script executed: # Verify by checking actual usage in context_global_variables where similar access happens
cat -n cognee/context_global_variables.py | sed -n '1,80p'Repository: topoteretes/cognee Length of output: 5277 🏁 Script executed: # Final check: SQLAlchemy 2.0+ Row objects support both dict and attribute access
# Let's verify this by checking the actual Row implementation usage
rg "from sqlalchemy" cognee/infrastructure/databases/relational/sqlalchemy/SqlAlchemyAdapter.py | head -10Repository: topoteretes/cognee Length of output: 309 🏁 Script executed: # Check if there's any Row wrapping or conversion happening
rg "class.*Row|def.*mappings" --type=py cognee/infrastructure/databases/Repository: topoteretes/cognee Length of output: 44 🏁 Script executed: # Verify the complete picture - check what happens in a similar function that works
cat -n cognee/infrastructure/databases/utils/get_or_create_dataset_database.py | head -50Repository: topoteretes/cognee Length of output: 2232 🏁 Script executed: # Let's verify SQLAlchemy Row attribute access by examining the actual imports and checking documentation
# SQLAlchemy 2.0+ Row objects support both dict and attribute access by default
python3 << 'EOF'
# Check SQLAlchemy 2.0 Row behavior documentation
# According to SQLAlchemy 2.0 docs, Row objects returned from mappings() support attribute access
# This is a documented feature of SQLAlchemy 2.0+
# The Row class in SQLAlchemy 2.0+ implements both:
# - __getitem__ for dict-like access: row["column_name"]
# - __getattr__ for attribute access: row.column_name
print("SQLAlchemy 2.0+ Row objects support BOTH dict-like and attribute access:")
print("- row['column_name'] works")
print("- row.column_name works")
print("\nThis is documented behavior in SQLAlchemy 2.0+")
print("See: https://docs.sqlalchemy.org/en/20/orm/queryguide.html#rows-are-namedtuples")
EOFRepository: topoteretes/cognee Length of output: 313 🏁 Script executed: # Verify that the code actually works with Row attribute access by checking for any actual runtime errors
# or exception handling around this pattern
rg "AttributeError|except.*Error" cognee/modules/data/deletion/prune_system.pyRepository: topoteretes/cognee Length of output: 250 Add docstring to prune_graph_databases function. The function lacks documentation as required by project guidelines (code style guideline: "undocumented function definitions in the project's Python code are assumed incomplete"). 🤖 Prompt for AI Agents
Comment on lines
+25
to
+26
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Consider validating handler dictionary structure. The code accesses `handler["handler_instance"]` without verifying that the key exists in the handler dictionary. Consider adding defensive validation: handler = get_graph_dataset_database_handler(dataset_database)
+ if "handler_instance" not in handler:
+ logger.warning("Handler missing 'handler_instance' key for dataset_database")
+ continue
await handler["handler_instance"].delete_dataset(dataset_database)
🤖 Prompt for AI Agents |
||
| except (OperationalError, EntityNotFoundError) as e: | ||
| logger.debug( | ||
| "Skipping pruning of graph DB. Error when accessing dataset_database table: %s", | ||
|
|
@@ -38,22 +33,13 @@ async def _prune_graph_db(dataset_database: DatasetDatabase) -> dict: | |
|
|
||
|
|
||
| async def prune_vector_databases(): | ||
| async def _prune_vector_db(dataset_database: DatasetDatabase) -> dict: | ||
| from cognee.infrastructure.databases.dataset_database_handler.supported_dataset_database_handlers import ( | ||
| supported_dataset_database_handlers, | ||
| ) | ||
|
|
||
| handler = supported_dataset_database_handlers[ | ||
| dataset_database.vector_dataset_database_handler | ||
| ] | ||
| return await handler["handler_instance"].delete_dataset(dataset_database) | ||
|
|
||
| db_engine = get_relational_engine() | ||
| try: | ||
| data = await db_engine.get_all_data_from_table("dataset_database") | ||
| dataset_databases = await db_engine.get_all_data_from_table("dataset_database") | ||
| # Go through each dataset database and delete the vector database | ||
| for data_item in data: | ||
| await _prune_vector_db(data_item) | ||
| for dataset_database in dataset_databases: | ||
| handler = get_vector_dataset_database_handler(dataset_database) | ||
| await handler["handler_instance"].delete_dataset(dataset_database) | ||
| except (OperationalError, EntityNotFoundError) as e: | ||
| logger.debug( | ||
| "Skipping pruning of vector DB. Error when accessing dataset_database table: %s", | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,8 +1,34 @@ | ||
| from cognee.modules.users.models import DatasetDatabase | ||
| from sqlalchemy import select | ||
|
|
||
| from cognee.modules.data.models import Dataset | ||
| from cognee.infrastructure.databases.utils.get_vector_dataset_database_handler import ( | ||
| get_vector_dataset_database_handler, | ||
| ) | ||
| from cognee.infrastructure.databases.utils.get_graph_dataset_database_handler import ( | ||
| get_graph_dataset_database_handler, | ||
| ) | ||
| from cognee.infrastructure.databases.relational import get_relational_engine | ||
|
|
||
|
|
||
| async def delete_dataset(dataset: Dataset): | ||
| db_engine = get_relational_engine() | ||
|
|
||
| async with db_engine.get_async_session() as session: | ||
| stmt = select(DatasetDatabase).where( | ||
| DatasetDatabase.dataset_id == dataset.id, | ||
| ) | ||
| dataset_database: DatasetDatabase = await session.scalar(stmt) | ||
| if dataset_database: | ||
| graph_dataset_database_handler = get_graph_dataset_database_handler(dataset_database) | ||
| vector_dataset_database_handler = get_vector_dataset_database_handler(dataset_database) | ||
| await graph_dataset_database_handler["handler_instance"].delete_dataset( | ||
| dataset_database | ||
| ) | ||
| await vector_dataset_database_handler["handler_instance"].delete_dataset( | ||
| dataset_database | ||
| ) | ||
| # TODO: Remove dataset from pipeline_run_status in Data objects related to dataset as well | ||
| # This blocks recreation of the dataset with the same name and data after deletion as | ||
| # it's marked as completed and will be just skipped even though it's empty. | ||
| return await db_engine.delete_entity_by_id(dataset.__tablename__, dataset.id) |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,76 @@ | ||
| import os | ||
| import asyncio | ||
| import pathlib | ||
| from uuid import UUID | ||
|
|
||
| import cognee | ||
| from cognee.shared.logging_utils import setup_logging, ERROR | ||
| from cognee.modules.data.methods.delete_dataset import delete_dataset | ||
| from cognee.modules.data.methods.get_dataset import get_dataset | ||
| from cognee.modules.users.methods import get_default_user | ||
|
|
||
|
|
||
| async def main(): | ||
| # Set data and system directory paths | ||
| data_directory_path = str( | ||
| pathlib.Path( | ||
| os.path.join(pathlib.Path(__file__).parent, ".data_storage/test_dataset_delete") | ||
| ).resolve() | ||
| ) | ||
| cognee.config.data_root_directory(data_directory_path) | ||
| cognee_directory_path = str( | ||
| pathlib.Path( | ||
| os.path.join(pathlib.Path(__file__).parent, ".cognee_system/test_dataset_delete") | ||
| ).resolve() | ||
| ) | ||
| cognee.config.system_root_directory(cognee_directory_path) | ||
|
|
||
| # Create a clean slate for cognee -- reset data and system state | ||
| print("Resetting cognee data...") | ||
| await cognee.prune.prune_data() | ||
| await cognee.prune.prune_system(metadata=True) | ||
| print("Data reset complete.\n") | ||
|
|
||
| # cognee knowledge graph will be created based on this text | ||
| text = """ | ||
| Natural language processing (NLP) is an interdisciplinary | ||
| subfield of computer science and information retrieval. | ||
| """ | ||
|
|
||
| # Add the text, and make it available for cognify | ||
| await cognee.add(text, "nlp_dataset") | ||
| await cognee.add("Quantum computing is the study of quantum computers.", "quantum_dataset") | ||
|
|
||
| # Use LLMs and cognee to create knowledge graph | ||
| ret_val = await cognee.cognify() | ||
| user = await get_default_user() | ||
|
|
||
| for val in ret_val: | ||
| dataset_id = str(val) | ||
| vector_db_path = os.path.join( | ||
| cognee_directory_path, "databases", str(user.id), dataset_id + ".lance.db" | ||
| ) | ||
| graph_db_path = os.path.join( | ||
| cognee_directory_path, "databases", str(user.id), dataset_id + ".pkl" | ||
| ) | ||
|
|
||
| # Check if databases are properly created and exist before deletion | ||
| assert os.path.exists(graph_db_path), "Graph database file not found." | ||
| assert os.path.exists(vector_db_path), "Vector database file not found." | ||
|
|
||
| dataset = await get_dataset(user_id=user.id, dataset_id=UUID(dataset_id)) | ||
| await delete_dataset(dataset) | ||
|
|
||
| # Confirm databases have been deleted | ||
| assert not os.path.exists(graph_db_path), "Graph database file found." | ||
| assert not os.path.exists(vector_db_path), "Vector database file found." | ||
borisarzentar marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
|
|
||
| if __name__ == "__main__": | ||
| logger = setup_logging(log_level=ERROR) | ||
| loop = asyncio.new_event_loop() | ||
| asyncio.set_event_loop(loop) | ||
| try: | ||
| loop.run_until_complete(main()) | ||
| finally: | ||
| loop.run_until_complete(loop.shutdown_asyncgens()) | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Add docstring to document function purpose.
Per coding guidelines, undocumented function definitions are considered incomplete. This function should have a docstring explaining that it deletes all graph databases for all dataset_database records when backend access control is enabled.
Apply this diff:
As per coding guidelines, undocumented functions are assumed incomplete.
🤖 Prompt for AI Agents