-
Notifications
You must be signed in to change notification settings - Fork 963
Cog 505 data dataset model changes #260
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
6bb0f3d
0ce254b
f5b5e56
378e7b8
349ddfe
813b76c
387002d
add6730
9ba5d49
e80377b
1e098ae
9429e5e
cc6fbe2
d7fa9f3
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
| @@ -0,0 +1,69 @@ | ||||||
| name: test | deduplication | ||||||
|
|
||||||
| on: | ||||||
| workflow_dispatch: | ||||||
| pull_request: | ||||||
| branches: | ||||||
| - main | ||||||
| types: [labeled, synchronize] | ||||||
|
|
||||||
|
|
||||||
| concurrency: | ||||||
| group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} | ||||||
| cancel-in-progress: true | ||||||
|
|
||||||
| env: | ||||||
| RUNTIME__LOG_LEVEL: ERROR | ||||||
|
|
||||||
| jobs: | ||||||
| get_docs_changes: | ||||||
| name: docs changes | ||||||
| uses: ./.github/workflows/get_docs_changes.yml | ||||||
|
|
||||||
| run_deduplication_test: | ||||||
| name: test | ||||||
| needs: get_docs_changes | ||||||
| # Keep the whole condition inside one expression: text outside an embedded ${{ }} turns the value into a non-empty string, which is always true. | ||||||
| if: ${{ needs.get_docs_changes.outputs.changes_outside_docs == 'true' && github.event.label.name == 'run-checks' }} | ||||||
| runs-on: ubuntu-latest | ||||||
| defaults: | ||||||
| run: | ||||||
| shell: bash | ||||||
| services: | ||||||
| postgres: | ||||||
| image: pgvector/pgvector:pg17 | ||||||
| env: | ||||||
| POSTGRES_USER: cognee | ||||||
| POSTGRES_PASSWORD: cognee | ||||||
| POSTGRES_DB: cognee_db | ||||||
| options: >- | ||||||
| --health-cmd pg_isready | ||||||
| --health-interval 10s | ||||||
| --health-timeout 5s | ||||||
| --health-retries 5 | ||||||
| ports: | ||||||
| - 5432:5432 | ||||||
|
|
||||||
| steps: | ||||||
| - name: Check out | ||||||
| uses: actions/checkout@master | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Security: Use a specific version for the checkout action. Using @master for actions can be dangerous, as it may introduce breaking changes or security issues.
- uses: actions/checkout@master
+ uses: actions/checkout@v4
📝 Committable suggestion
Suggested change
|
||||||
|
|
||||||
| - name: Setup Python | ||||||
| uses: actions/setup-python@v5 | ||||||
| with: | ||||||
| python-version: '3.11.x' | ||||||
|
|
||||||
| - name: Install Poetry | ||||||
| uses: snok/install-poetry@v1.3.2 | ||||||
| with: | ||||||
| virtualenvs-create: true | ||||||
| virtualenvs-in-project: true | ||||||
| installer-parallel: true | ||||||
|
|
||||||
| - name: Install dependencies | ||||||
| run: poetry install -E postgres --no-interaction | ||||||
|
|
||||||
| - name: Run deduplication test | ||||||
| env: | ||||||
| ENV: 'dev' | ||||||
| LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }} | ||||||
| run: poetry run python ./cognee/tests/test_deduplication.py | ||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,7 +1,11 @@ | ||
| from uuid import uuid5, NAMESPACE_OID | ||
| from .data_types import IngestionData | ||
|
|
||
| def identify(data: IngestionData) -> str: | ||
| data_id: str = data.get_identifier() | ||
| from cognee.modules.users.models import User | ||
|
|
||
| return uuid5(NAMESPACE_OID, data_id) | ||
|
|
||
| def identify(data: IngestionData, user: User) -> str: | ||
| data_content_hash: str = data.get_identifier() | ||
|
|
||
| # return UUID hash of file contents + owner id | ||
| return uuid5(NAMESPACE_OID, f"{data_content_hash}{user.id}") |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,25 +1,28 @@ | ||
| import string | ||
| import random | ||
| import os.path | ||
| import hashlib | ||
| from typing import BinaryIO, Union | ||
| from cognee.base_config import get_base_config | ||
| from cognee.infrastructure.files.storage import LocalStorage | ||
| from .classify import classify | ||
|
|
||
| def save_data_to_file(data: Union[str, BinaryIO], dataset_name: str, filename: str = None): | ||
| def save_data_to_file(data: Union[str, BinaryIO], filename: str = None): | ||
| base_config = get_base_config() | ||
| data_directory_path = base_config.data_root_directory | ||
|
|
||
| classified_data = classify(data, filename) | ||
|
|
||
| storage_path = data_directory_path + "/" + dataset_name.replace(".", "/") | ||
| storage_path = os.path.join(data_directory_path, "data") | ||
| LocalStorage.ensure_directory_exists(storage_path) | ||
|
|
||
| file_metadata = classified_data.get_metadata() | ||
| if "name" not in file_metadata or file_metadata["name"] is None: | ||
| letters = string.ascii_lowercase | ||
| random_string = "".join(random.choice(letters) for _ in range(32)) | ||
| file_metadata["name"] = "text_" + random_string + ".txt" | ||
| data_contents = classified_data.get_data().encode('utf-8') | ||
| hash_contents = hashlib.md5(data_contents).hexdigest() | ||
| file_metadata["name"] = "text_" + hash_contents + ".txt" | ||
dexters1 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| file_name = file_metadata["name"] | ||
| LocalStorage(storage_path).store(file_name, classified_data.get_data()) | ||
|
|
||
| # Don't save file if it already exists | ||
| if not os.path.isfile(os.path.join(storage_path, file_name)): | ||
| LocalStorage(storage_path).store(file_name, classified_data.get_data()) | ||
|
|
||
| return "file://" + storage_path + "/" + file_name | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,9 @@ | ||
| """ | ||
| Custom exceptions for the Cognee API. | ||
|
|
||
| This module defines the exceptions raised when data ingestion fails. | ||
| """ | ||
|
|
||
| from .exceptions import ( | ||
| IngestionError, | ||
| ) |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,11 @@ | ||
| from cognee.exceptions import CogneeApiError | ||
| from fastapi import status | ||
|
|
||
| class IngestionError(CogneeApiError): | ||
| def __init__( | ||
| self, | ||
| message: str = "Failed to load data.", | ||
| name: str = "IngestionError", | ||
| status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, | ||
| ): | ||
| super().__init__(message, name, status_code) |
Uh oh!
There was an error while loading. Please reload this page.