diff --git a/.github/workflows/daily_pypi_download_stats.yaml b/.github/workflows/daily_pypi_download_stats.yaml deleted file mode 100644 index 5192eb9609..0000000000 --- a/.github/workflows/daily_pypi_download_stats.yaml +++ /dev/null @@ -1,36 +0,0 @@ -name: analytics | Update Cognee Stats Daily - -on: - schedule: - - cron: '0 1 * * *' # Runs every day at 01:00 UTC - -jobs: - update_stats: - runs-on: ubuntu-latest - - steps: - - name: Checkout Repository - uses: actions/checkout@v3 - with: - persist-credentials: false - - - name: Set up Python - uses: actions/setup-python@v4 - with: - python-version: '3.x' - - - name: Install Dependencies - run: | - pip install requests posthog - - - name: Run Update Script - env: - POSTHOG_API_KEY: ${{ secrets.POSTHOG_API_KEY }} - POSTHOG_API_HOST: ${{ secrets.POSTHOG_API_HOST }} - run: | - cd tools # Change to the 'tools' directory - echo "Current working directory after changing to tools:" - pwd # Print the working directory again - echo "List of folders in the tools directory:" - ls -la # List all files and folders in the 'tools' directory - python daily_pypi_downloads.py # Run the script \ No newline at end of file diff --git a/.github/workflows/get_docs_changes.yml b/.github/workflows/get_docs_changes.yml deleted file mode 100644 index 5ac194d9ee..0000000000 --- a/.github/workflows/get_docs_changes.yml +++ /dev/null @@ -1,49 +0,0 @@ -name: util | get docs changes - -on: - workflow_call: - outputs: - changes_outside_docs: - description: "Changes outside docs" - value: ${{ jobs.get_docs_changes.outputs.changes_outside_docs }} - -env: - EXCLUDED_FILE_PATTERNS: '^docs/|^README.md|^CONTRIBUTING.md|^LICENSE\.txt|\.editorconfig|\.gitignore|get_docs_changes.yml' - - -jobs: - get_docs_changes: - name: docs changes - runs-on: ubuntu-latest - outputs: - changes_outside_docs: ${{ steps.check_changes.outputs.changes_outside_docs }} - - steps: - - name: Checkout code - uses: actions/checkout@master - with: - fetch-depth: 0 - - - name: Check changes outside docs - id: check_changes - run: | - echo "base.sha: ${{ github.event.pull_request.base.sha }}" - echo "head.sha: ${{ github.event.pull_request.head.sha }}" - - merge_base_sha=$(git merge-base ${{ github.event.pull_request.base.sha }} ${{ github.event.pull_request.head.sha }}) - - echo "merge_base_sha: $merge_base_sha" - - changes_outside_docs=$( - git diff --name-only $merge_base_sha ${{ github.event.pull_request.head.sha }} \ - | grep -vE "${{ env.EXCLUDED_FILE_PATTERNS }}" || true - ) - - echo $changes_outside_docs - if [ -z "$changes_outside_docs" ]; then - echo "No changes outside docs. Skipping tests." - echo "changes_outside_docs=false" >> $GITHUB_OUTPUT - else - echo "Changes detected outside docs." 
- echo "changes_outside_docs=true" >> $GITHUB_OUTPUT - fi \ No newline at end of file diff --git a/.github/workflows/mkdocs.yml b/.github/workflows/mkdocs.yml deleted file mode 100644 index 08a123c6bc..0000000000 --- a/.github/workflows/mkdocs.yml +++ /dev/null @@ -1,56 +0,0 @@ -name: util docs | build and deploy - -on: - pull_request: - branches: - - main - workflow_dispatch: - -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: true - -env: - RUNTIME__LOG_LEVEL: ERROR - -permissions: - contents: write - -jobs: - get_docs_changes: - name: run docs changes - uses: ./.github/workflows/get_docs_changes.yml - deploy: - runs-on: ubuntu-latest - needs: get_docs_changes - if: needs.get_docs_changes.outputs.changes_outside_docs == 'false' - - steps: - - name: Checkout code - uses: actions/checkout@master - - - name: Install Poetry - uses: snok/install-poetry@v1.3.1 - - - name: Use output - run: echo "The stage is finished" - - - name: Setup Python - uses: actions/setup-python@v5 - with: - python-version: '3.11.x' - - - name: Install APT packages - run: | - sudo apt-get update && - sudo apt-get install pngquant - - - name: Install via Poetry - run: poetry install --with dev,docs - env: - GH_TOKEN: ${{ secrets.PAT_FOR_CROSS_REPOS_CICD_TRIGGERING }} - - - name: Build and deploy MkDocs - run: poetry run mkdocs gh-deploy --force - env: - DOCS_SEGMENT_KEY: ${{ secrets.DOCS_SEGMENT_KEY }} diff --git a/.github/workflows/posthog_pipeline.yaml b/.github/workflows/posthog_pipeline.yaml deleted file mode 100644 index 1e34ee8cf2..0000000000 --- a/.github/workflows/posthog_pipeline.yaml +++ /dev/null @@ -1,44 +0,0 @@ -name: analytics | Push GitHub Data to PostHog - -on: - schedule: - - cron: '0 0 * * *' # Runs every day at midnight - workflow_dispatch: - -jobs: - push-data: - runs-on: ubuntu-latest - - steps: - - name: Checkout repository - uses: actions/checkout@v3 - - - name: Set up Python - uses: actions/setup-python@v4 - with: - python-version: '3.x' - - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install requests posthog - - - name: Print working directory, list folders, and run script - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - POSTHOG_API_KEY: ${{ secrets.POSTHOG_API_KEY }} - POSTHOG_HOST: ${{ secrets.POSTHOG_HOST }} - GITHUB_REPOSITORY: ${{ github.repository }} - run: | - echo "Current working directory:" - pwd # Print the current working directory - echo "List of folders in the current directory:" - ls -la # List all files and folders in the current directory - echo "Changing to tools directory..." - cd tools # Change to the 'tools' directory - echo "Current working directory after changing to tools:" - pwd # Print the working directory again - echo "List of folders in the tools directory:" - ls -la # List all files and folders in the 'tools' directory - python push_to_posthog.py # Run the script - diff --git a/docs/api_reference.md b/docs/api_reference.md deleted file mode 100644 index b0fda0b190..0000000000 --- a/docs/api_reference.md +++ /dev/null @@ -1,299 +0,0 @@ -# Cognee API Reference - -## Overview - -The Cognee API provides a set of endpoints for managing datasets, performing cognitive tasks, and configuring various settings in the system. The API is built on FastAPI and includes multiple routes to handle different functionalities. This reference outlines the available endpoints and their usage. 
- -## Base URL - -The base URL for all API requests is determined by the server's deployment environment. Typically, this will be: - -- **Development**: `http://localhost:8000` -- **Production**: Depending on your server setup. - -## Endpoints - -### 1. Root - -- **URL**: `/` -- **Method**: `GET` -- **Auth Required**: No -- **Description**: Root endpoint that returns a welcome message. - - **Response**: - ```json - { - "status": 200, - "body": { - "message": "Hello, World, I am alive!" - } - } - ``` - -### 2. Health Check - -- **URL**: `/health` -- **Method**: `GET` -- **Auth Required**: No -- **Description**: Health check endpoint that returns the server status. - - **Response**: - ```json - { - "status": 200 - } - ``` - -### 3. Get Datasets - -- **URL**: `/datasets` -- **Method**: `GET` -- **Auth Required**: No -- **Description**: Retrieve a list of available datasets. - - **Response**: - ```json - { - "status": 200, - "body": [ - { - "id": "dataset_id_1", - "name": "Dataset Name 1", - "description": "Description of Dataset 1", - ... - }, - ... - ] - } - ``` - -### 4. Delete Dataset - -- **URL**: `/datasets/{dataset_id}` -- **Method**: `DELETE` -- **Auth Required**: No -- **Description**: Delete a specific dataset by its ID. - - **Path Parameters**: - - `dataset_id`: The ID of the dataset to delete. - - **Response**: - ```json - { - "status": 200 - } - ``` - -### 5. Get Dataset Graph - -- **URL**: `/datasets/{dataset_id}/graph` -- **Method**: `GET` -- **Auth Required**: No -- **Description**: Retrieve the graph visualization URL for a specific dataset. - - **Path Parameters**: - - `dataset_id`: The ID of the dataset. - - **Response**: - ```json - "http://example.com/path/to/graph" - ``` - -### 6. Get Dataset Data - -- **URL**: `/datasets/{dataset_id}/data` -- **Method**: `GET` -- **Auth Required**: No -- **Description**: Retrieve data associated with a specific dataset. - - **Path Parameters**: - - `dataset_id`: The ID of the dataset. - - **Response**: - ```json - { - "status": 200, - "body": [ - { - "data_id": "data_id_1", - "content": "Data content here", - ... - }, - ... - ] - } - ``` - -### 7. Get Dataset Status - -- **URL**: `/datasets/status` -- **Method**: `GET` -- **Auth Required**: No -- **Description**: Retrieve the status of one or more datasets. - - **Query Parameters**: - - `dataset`: A list of dataset IDs to check status for. - - **Response**: - ```json - { - "status": 200, - "body": { - "dataset_id_1": "Status 1", - "dataset_id_2": "Status 2", - ... - } - } - ``` - -### 8. Get Raw Data - -- **URL**: `/datasets/{dataset_id}/data/{data_id}/raw` -- **Method**: `GET` -- **Auth Required**: No -- **Description**: Retrieve the raw data file for a specific data entry in a dataset. - - **Path Parameters**: - - `dataset_id`: The ID of the dataset. - - `data_id`: The ID of the data entry. - - **Response**: Raw file download. - -### 9. Add Data - -- **URL**: `/add` -- **Method**: `POST` -- **Auth Required**: No -- **Description**: Add new data to a dataset. The data can be uploaded from a file or a URL. - - **Form Parameters**: - - `datasetId`: The ID of the dataset to add data to. - - `data`: A list of files to upload. - - **Request** - ```json - { - "dataset_id": "ID_OF_THE_DATASET_TO_PUT_DATA_IN", // Optional, we use "main" as default. - "files": File[] - } - ``` - - **Response**: - ```json - { - "status": 200 - } - ``` - -### 10. 
Cognify - -- **URL**: `/cognify` -- **Method**: `POST` -- **Auth Required**: No -- **Description**: Perform cognitive processing on the specified datasets. - - **Request Body**: - ```json - { - "datasets": ["ID_OF_THE_DATASET_1", "ID_OF_THE_DATASET_2", ...] - } - ``` - - **Response**: - ```json - { - "status": 200 - } - ``` - -### 11. Search - -- **URL**: `/search` -- **Method**: `POST` -- **Auth Required**: No -- **Description**: Search for nodes in the graph based on the provided query parameters. - - **Request Body**: - ```json - { - "searchType": "INSIGHTS", // Or "SUMMARIES" or "CHUNKS" - "query": "QUERY_TO_MATCH_DATA" - } - ``` - - **Response** - - For "INSIGHTS" search type: - ```json - { - "status": 200, - "body": [[ - { "name" "source_node_name" }, - { "relationship_name" "between_nodes_relationship_name" }, - { "name" "target_node_name" }, - ]] - } - ``` - - For "SUMMARIES" search type: - ```json - { - "status": 200, - "body": [ - { "text" "summary_text" }, - { "text" "summary_text" }, - { "text" "summary_text" }, - ... - ] - } - ``` - - For "CHUNKS" search type: - ```json - { - "status": 200, - "body": [ - { "text" "chunk_text" }, - { "text" "chunk_text" }, - { "text" "chunk_text" }, - ... - ] - } - ``` - -### 12. Get Settings - -- **URL**: `/settings` -- **Method**: `GET` -- **Auth Required**: No -- **Description**: Retrieve the current system settings. - - **Response**: - ```json - { - "status": 200, - "body": { - "llm": {...}, - "vectorDB": {...}, - ... - } - } - ``` - -### 13. Save Settings - -- **URL**: `/settings` -- **Method**: `POST` -- **Auth Required**: No -- **Description**: Save new settings for the system, including LLM and vector DB configurations. - - **Request Body**: - - `llm`: Optional. The configuration for the LLM provider. - - `vectorDB`: Optional. The configuration for the vector database provider. - - **Response**: - ```json - { - "status": 200 - } - ``` diff --git a/docs/assets/favicon.png b/docs/assets/favicon.png deleted file mode 100644 index c3c39b7ed8..0000000000 Binary files a/docs/assets/favicon.png and /dev/null differ diff --git a/docs/assets/logo.png b/docs/assets/logo.png deleted file mode 100644 index c37c647ea1..0000000000 Binary files a/docs/assets/logo.png and /dev/null differ diff --git a/docs/concepts/graph_data_models.md b/docs/concepts/graph_data_models.md deleted file mode 100644 index f5f25c4639..0000000000 --- a/docs/concepts/graph_data_models.md +++ /dev/null @@ -1,6 +0,0 @@ -Graph data models are fundamental structures used to represent and store data in the form of graphs, which consist of nodes (or vertices) and edges (or links). This model is particularly effective for illustrating relationships and connections among various data entities, making it invaluable in domains such as social networks, recommendation systems, logistics, biological networks, and more. Here's an overview of key concepts and types of graph data models: - -Key Concepts: -Nodes (Vertices): Represent entities or objects within the graph, such as people in a social network, stations in a transportation map, or proteins in biological networks. -Edges (Links): Depict the relationships or interactions between nodes. Edges can be directed (indicating a one-way relationship) or undirected (indicating a mutual relationship). -Properties: Both nodes and edges can have properties (key-value pairs) that provide additional information, such as weights, types, or other attributes relevant to the application. 
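To make the node/edge/property trio concrete, here is a minimal sketch using Python's `networkx` library; the entity names, relationship label, and property values are illustrative placeholders, not anything taken from cognee itself.

```python
import networkx as nx

# A tiny directed graph: two entities joined by a labelled relationship.
# Entity names, the relationship label, and the property values are made up
# purely for illustration.
graph = nx.DiGraph()

# Nodes carry key-value properties that describe the entity.
graph.add_node("alice", type="person", age=34)
graph.add_node("acme", type="company", industry="logistics")

# A directed edge with its own properties: the relationship name and a weight.
graph.add_edge("alice", "acme", relationship="works_at", weight=0.9)

# Walk the structure back out: every edge with its attributes, then one node's properties.
for source, target, attributes in graph.edges(data=True):
    print(source, f"-[{attributes['relationship']}]->", target)

print(graph.nodes["alice"])  # {'type': 'person', 'age': 34}
```

The same idea scales up: a knowledge graph is simply many such nodes and labelled edges, each carrying whatever properties the application needs.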
\ No newline at end of file diff --git a/docs/concepts/llm_structured_outputs.md b/docs/concepts/llm_structured_outputs.md deleted file mode 100644 index b2fcedf989..0000000000 --- a/docs/concepts/llm_structured_outputs.md +++ /dev/null @@ -1,8 +0,0 @@ -Function calling in the context of Large Language Models (LLMs) like GPT-3, GPT-4, and their derivatives extends beyond traditional programming paradigms. In this scenario, function calling involves prompting the LLM to simulate the behavior of a function within its generated output. This capability allows users to interact with LLMs in a structured way, effectively requesting specific operations or information retrieval tasks by framing their prompts as function calls. - -How LLM Function Calling Works: -Prompt Construction: The user constructs a prompt that mimics a function call in programming. This prompt includes the "name" of the function (often a description of the task) and the "arguments" (the specific inputs or conditions for the task). For example, a prompt might look like "Generate a summary for the following article:" followed by the article text. - -LLM Interpretation: The LLM interprets this structured prompt and understands it as a request to perform a specific task, similar to how a function in a program would be invoked. The model then generates an output that aligns with the expected behavior of the function described in the prompt. - -Parameters and Outputs: In LLM function calling, the parameters are the details provided in the prompt, and the output is the generated text that the model produces in response. This output is intended to fulfill the function's "purpose" as inferred from the prompt. \ No newline at end of file diff --git a/docs/concepts/multilayer_graph_networks.md b/docs/concepts/multilayer_graph_networks.md deleted file mode 100644 index 0481c0d8be..0000000000 --- a/docs/concepts/multilayer_graph_networks.md +++ /dev/null @@ -1 +0,0 @@ -A multilayer graph network is a sophisticated structure used to model complex systems where entities and their interactions can exist in multiple layers, each representing a different type of relationship, context, or domain. Unlike traditional graphs that capture connections in a single, uniform setting, multilayer graphs provide a more nuanced framework, allowing for the representation of diverse interconnections and dependencies across various dimensions or layers. \ No newline at end of file diff --git a/docs/concepts/propositions.md b/docs/concepts/propositions.md deleted file mode 100644 index 82b06bf5be..0000000000 --- a/docs/concepts/propositions.md +++ /dev/null @@ -1,11 +0,0 @@ - -Propositions are fundamental elements in the study of logic, linguistics, and natural language processing. They represent atomic expressions within texts that encapsulate distinct factoids, conveying specific pieces of information. In essence, a proposition is a declarative statement that can either be true or false, but not both simultaneously. -This binary nature makes propositions crucial for logical deductions, reasoning, and the construction of arguments. - -In a natural language context, propositions are presented in a concise and self-contained format. -They are designed to convey information clearly and unambiguously, making them easily interpretable by humans and computable by machines. 
For example, the statement "The Eiffel Tower is in Paris" is a proposition because it presents a specific fact about the location of the Eiffel Tower, and its truth value can be assessed as either true or false. - -The concept of propositions extends beyond mere statements of fact to include assertions about concepts, relationships, and conditions. -For instance, "If it rains, the ground gets wet" is a conditional proposition that establishes a cause-and-effect relationship between two events. - -In computational linguistics and natural language processing, propositions are vital for tasks such as information extraction, knowledge representation, and question answering. \ No newline at end of file diff --git a/docs/conceptual_overview.md b/docs/conceptual_overview.md deleted file mode 100644 index f82ac22bc6..0000000000 --- a/docs/conceptual_overview.md +++ /dev/null @@ -1,87 +0,0 @@ -# Conceptual Overview - cognee - -## Introduction - -!!! info "What is cognee?" - cognee is a data processing framework that enables LLMs to produce deterministic and traceable outputs. - - -cognee assists developers in introducing greater predictability and management into their Retrieval-Augmented Generation (RAG) workflows through the use of graph architectures, vector stores, and auto-optimizing pipelines. - -Displaying information as a graph is the clearest way to grasp the content of your documents. Crucially, graphs allow systematic navigation and extraction of data from documents based on their hierarchy. -## Core Concepts - - -### Concept 1: Data Pipelines -Most of the data we provide to a system can be categorized as unstructured, semi-structured, or structured. Rows from a database would belong to structured data, jsons to semi-structured data, and logs that we input into the system could be considered unstructured. -To organize and process this data, we need to ensure we have custom loaders for all data types, which can help us unify and organize it properly. -
-![Data Pipelines](img/pipelines.png) -
Data Pipeline Example
-
- -In the example above, we have a pipeline in which data has been imported from various sources, normalized, and stored in a database. Relevant identifiers and relationships between the data are also created in this process. -To create an effective data pipeline for processing various types of data—structured, semi-structured, and unstructured—it’s crucial to understand each type's specific handling and processing needs. Let's expand on the concepts involved in setting up such a data pipeline. - -Data Types and Their Handling -- Structured Data: This includes data that adheres to a fixed schema, such as rows in a relational database or data in CSV files. The processing of structured data typically involves SQL queries for extraction, transformations through simple functions or procedures, and loading into destination tables or databases. - -- Semi-structured Data: JSON files, XML, or even some APIs' data fit this category. These data types don't have a rigid schema but have some organizational properties that can be exploited. Semi-structured data often requires parsers that can navigate its structure (like trees for XML or key-value pairs for JSON) to extract necessary information. Libraries such as json in Python or lxml for XML handling can be very useful here. - -- Unstructured Data: This category includes text files, logs, or even images and videos. - - -### Concept 2: Data Enrichment with LLMs -LLMs are adept at processing unstructured data. They can easily extract summaries, keywords, and other useful information from documents. We use function calling with Pydantic models to extract information from the unstructured data. -
-![Data Enrichment](img/enrichment.png) -
Data Enrichment Example
-
-We decompose the loaded content into graphs, allowing us to more precisely map out the relationships between entities and concepts. - -### Concept 3: Graphs -Knowledge graphs simply map out knowledge, linking specific facts and their connections. -When Large Language Models (LLMs) process text, they infer these links, leading to occasional inaccuracies due to their probabilistic nature. - -Clearly defined relationships enhance their accuracy. - -This structured approach can extend beyond concepts to document layouts, pages, or other organizational schemas. -
-![Graph structure](img/graph_structure.png) -
Graph Structure
-
- -### Concept 4: Vector and Graph Retrieval -Cognee lets you use multiple vector and graph retrieval methods to find the most relevant information. -!!! info "Learn more?" - Check out learning materials to see how you can use these methods in your projects. -### Concept 5: Auto-Optimizing Pipelines -Integrating knowledge graphs into Retrieval-Augmented Generation (RAG) pipelines leads to an intriguing outcome: the system's adeptness at contextual understanding allows it to be evaluated in a way Machine Learning (ML) engineers are accustomed to. - -This involves bombarding the RAG system with hundreds of synthetic questions, enabling the knowledge graph to evolve and refine its context autonomously over time. - -This method paves the way for developing self-improving memory engines that can adapt to new data and user feedback. - -## Architecture Overview -A high-level diagram of cognee's architecture, illustrating the main components and their interactions. - -
-![Architecture](img/architecture.png) -
Architecture
-
- -Main components: - -- **Data Pipelines**: Responsible for ingesting, processing, and transforming data from various sources. -- **LLMs**: Large Language Models that process unstructured data and generate text. -- **Graph Store**: Knowledge graphs that represent relationships between entities and concepts. -- **Vector Store**: Database that stores vector representations of data for efficient retrieval. -- **Search**: Retrieves relevant information from the knowledge graph and vector stores. - -## How It Fits Into Your Projects - -!!! info "How cognee fits into your projects" - cognee is a self-contained library that simplifies the process of loading and structuring data in LLMs. - -By integrating cognee into your data pipelines, you can leverage the power of LLMs, knowledge graphs, and vector retrieval to create accurate and explainable AI solutions. -cognee provides a self-contained library that simplifies the process of loading and structuring LLM context, enabling you to create accurate and explainable AI solutions. diff --git a/docs/configuration.md b/docs/configuration.md deleted file mode 100644 index 91ec6ecbac..0000000000 --- a/docs/configuration.md +++ /dev/null @@ -1,93 +0,0 @@ -# Configuration - - - -## 🚀 Configure Vector and Graph Stores - -You can configure the vector and graph stores using the environment variables in your .env file or programmatically. -We use [Pydantic Settings](https://docs.pydantic.dev/latest/concepts/pydantic_settings/#dotenv-env-support) - -We have a global configuration object (cognee.config) and individual configurations on pipeline and data store levels - -Check available configuration options: -``` python -from cognee.infrastructure.databases.vector import get_vectordb_config -from cognee.infrastructure.databases.graph.config import get_graph_config -from cognee.infrastructure.databases.relational import get_relational_config -from cognee.infrastructure.llm.config import get_llm_config -print(get_vectordb_config().to_dict()) -print(get_graph_config().to_dict()) -print(get_relational_config().to_dict()) -print(get_llm_config().to_dict()) - -``` - -Setting the environment variables in your .env file, and Pydantic will pick them up: - -```bash -GRAPH_DATABASE_PROVIDER = 'lancedb' - -``` -Otherwise, you can set the configuration yourself: - -```python -cognee.config.set_llm_provider('ollama') -``` - -## 🚀 Getting Started with Local Models - -You'll need to run the local model on your machine or use one of the providers hosting the model. -!!! note "We had some success with mixtral, but 7b models did not work well. We recommend using mixtral for now." 
- -### Ollama - -Set up Ollama by following instructions on [Ollama website](https://ollama.com/) - - -Set the environment variable in your .env to use the model - -```bash -LLM_PROVIDER = 'ollama' - -``` -Otherwise, you can set the configuration for the model: - -```bash -cognee.config.set_llm_provider('ollama') - -``` -You can also set the HOST and model name: - -```bash -cognee.config.set_llm_endpoint("http://localhost:11434/v1") -cognee.config.set_llm_model("mistral:instruct") -``` - - -### Anyscale - -```bash -LLM_PROVIDER = 'custom' - -``` -Otherwise, you can set the configuration for the model: - -```bash -cognee.config.set_llm_provider('custom') - -``` -You can also set the HOST and model name: -```bash -LLM_MODEL = "mistralai/Mixtral-8x7B-Instruct-v0.1" -LLM_ENDPOINT = "https://api.endpoints.anyscale.com/v1" -LLM_API_KEY = "your_api_key" -``` - -You can set the same way HOST and model name for any other provider that has an API endpoint. - - - - - - - diff --git a/docs/data_engineering_llm_ops.md b/docs/data_engineering_llm_ops.md deleted file mode 100644 index bbd408b979..0000000000 --- a/docs/data_engineering_llm_ops.md +++ /dev/null @@ -1,32 +0,0 @@ -# Data Engineering and LLMOps - -!!! tip "This is a work in progress and any feedback is welcome" - -## Table of Contents -1. [Data Engineering](#data-engineering) -2. [Large Language Model Operations (LLM Ops)](#large-language-model-operations-llm-ops) - -## Data Engineering - -Data Engineering focuses on managing and analyzing big data. It revolves around five key aspects: - -### Volume -The size and amount of data that companies manage and analyze. - -### Value -The insights and patterns derived from data that lead to business benefits. - -### Variety -The diversity of data types, including unstructured, semi-structured, and raw data. - -### Velocity -The speed at which data is received, stored, and managed. - -### Veracity -The accuracy or truthfulness of data. - -## Large Language Model Operations (LLM Ops) - -The emerging field of Large Language Model Operations (LLM Ops) inherits many practices from data engineering. LLM Ops involves the deployment, monitoring, and maintenance of systems using LLMs to manage and build new generation of AI powered applications. - -For more in-depth information on LLM Ops, see [Resource Name](link-to-resource). \ No newline at end of file diff --git a/docs/data_ingestion.md b/docs/data_ingestion.md deleted file mode 100644 index ff2e7309b2..0000000000 --- a/docs/data_ingestion.md +++ /dev/null @@ -1,46 +0,0 @@ -# How data ingestion with cognee works - - - - -# Why bother with data ingestion? - -In order to use cognee, you need to ingest data into the cognee data store. -This data can be events, customer data, or third-party data. - -In order to build reliable models and pipelines, we need to structure and process various types of datasets and data sources in the same way. -Some of the operations like normalization, deduplication, and data cleaning are common across all data sources. - - -This is where cognee comes in. It provides a unified interface to ingest data from various sources and process it in a consistent way. -For this we use dlt (Data Loading Tool) which is a part of cognee infrastructure. - - -# Example - -Let's say you have a dataset of customer reviews in a PDF file. You want to ingest this data into cognee and use it to train a model. 
- -You can use the following code to ingest the data: - -```python -dataset_name = "artificial_intelligence" - -ai_text_file_path = os.path.join(pathlib.Path(__file__).parent, "test_data/artificial-intelligence.pdf") -await cognee.add([ai_text_file_path], dataset_name) - -``` - -cognee uses dlt to ingest the data and allows you to use: - -1. SQL databases. Supports PostgreSQL, MySQL, MS SQL Server, BigQuery, Redshift, and more. -2. REST API generic source. Loads data from REST APIs using declarative configuration. -3. OpenAPI source generator. Generates a source from an OpenAPI 3.x spec using the REST API source. -4. Cloud and local storage. Retrieves data from AWS S3, Google Cloud Storage, Azure Blob Storage, local files, and more. - - - -# What happens under the hood? - -We use dlt as a loader to ingest data into the cognee metadata store. We can ingest data from various sources like SQL databases, REST APIs, OpenAPI specs, and cloud storage. -This enables us to have a common data model we can then use to build models and pipelines. -The models and pipelines we build in this way end up in the cognee data store, which is a unified interface to access the data. \ No newline at end of file diff --git a/docs/img/architecture.png b/docs/img/architecture.png deleted file mode 100644 index 3911fdc21b..0000000000 Binary files a/docs/img/architecture.png and /dev/null differ diff --git a/docs/img/bad_architecture.png b/docs/img/bad_architecture.png deleted file mode 100644 index 66c350802d..0000000000 Binary files a/docs/img/bad_architecture.png and /dev/null differ diff --git a/docs/img/enrichment.png b/docs/img/enrichment.png deleted file mode 100644 index 08896c0139..0000000000 Binary files a/docs/img/enrichment.png and /dev/null differ diff --git a/docs/img/good_architecture.png b/docs/img/good_architecture.png deleted file mode 100644 index acf8c5dc53..0000000000 Binary files a/docs/img/good_architecture.png and /dev/null differ diff --git a/docs/img/graph_example.png b/docs/img/graph_example.png deleted file mode 100644 index 5fe7501960..0000000000 Binary files a/docs/img/graph_example.png and /dev/null differ diff --git a/docs/img/graph_structure.png b/docs/img/graph_structure.png deleted file mode 100644 index ebc22db402..0000000000 Binary files a/docs/img/graph_structure.png and /dev/null differ diff --git a/docs/img/linguistic_analysis.png b/docs/img/linguistic_analysis.png deleted file mode 100644 index 94d5828745..0000000000 Binary files a/docs/img/linguistic_analysis.png and /dev/null differ diff --git a/docs/img/loaders.png b/docs/img/loaders.png deleted file mode 100644 index f295314e62..0000000000 Binary files a/docs/img/loaders.png and /dev/null differ diff --git a/docs/img/pipelines.png b/docs/img/pipelines.png deleted file mode 100644 index 9939736598..0000000000 Binary files a/docs/img/pipelines.png and /dev/null differ diff --git a/docs/img/roadmap.png b/docs/img/roadmap.png deleted file mode 100644 index e3e8275ae2..0000000000 Binary files a/docs/img/roadmap.png and /dev/null differ diff --git a/docs/img/sources.png b/docs/img/sources.png deleted file mode 100644 index 66e1374e85..0000000000 Binary files a/docs/img/sources.png and /dev/null differ diff --git a/docs/img/team.png b/docs/img/team.png deleted file mode 100644 index d2f0a3d951..0000000000 Binary files a/docs/img/team.png and /dev/null differ diff --git a/docs/img/vector_dbs.png b/docs/img/vector_dbs.png deleted file mode 100644 index 46488f2729..0000000000 Binary files a/docs/img/vector_dbs.png and /dev/null 
differ diff --git a/docs/index.md b/docs/index.md deleted file mode 100644 index 7787521b2f..0000000000 --- a/docs/index.md +++ /dev/null @@ -1,31 +0,0 @@ -# New to cognee? - -The getting started guide covers adding a cognee data store to your AI app, sending data, identifying users, extracting actions and insights, and interconnecting separate datasets. - -[Get started](quickstart.md) - -## Ingest Data - -Learn how to manage the ingestion of events, customer data, or third-party data for use with cognee. - -[Explore](data_ingestion.md) - -## Tasks and Pipelines - -Analyze and enrich your data and improve LLM answers with a series of tasks and pipelines. - -[Learn about tasks](templates.md) - -## API - -Push or pull data to build custom functionality or create bespoke views for your business needs. - -[Explore](api_reference.md) - -## Resources - -### Resources - -- [Research](research.md) -- [Community](https://discord.gg/52QTb5JK){:target="_blank"} - diff --git a/docs/learning/graph.md b/docs/learning/graph.md deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/docs/learning/graph_example.png b/docs/learning/graph_example.png deleted file mode 100644 index 91178fb498..0000000000 Binary files a/docs/learning/graph_example.png and /dev/null differ diff --git a/docs/learning/loader.md b/docs/learning/loader.md deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/docs/learning/search.md b/docs/learning/search.md deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/docs/learning/sources.md b/docs/learning/sources.md deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/docs/learning/vector.md b/docs/learning/vector.md deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/docs/local_models.md b/docs/local_models.md deleted file mode 100644 index f83f2f03b9..0000000000 --- a/docs/local_models.md +++ /dev/null @@ -1,60 +0,0 @@ -# Running cognee with local models - -## 🚀 Getting Started with Local Models - -You'll need to run the local model on your machine or use one of the providers hosting the model. -!!! note "We had some success with mixtral, but 7b models did not work well. We recommend using mixtral for now." - -### Ollama - -Set up Ollama by following instructions on [Ollama website](https://ollama.com/) - - -Set the environment variable in your .env to use the model - -```bash -LLM_PROVIDER = 'ollama' - -``` -Otherwise, you can set the configuration for the model: - -```bash -cognee.config.llm_provider = 'ollama' - -``` -You can also set the HOST and model name: - -```bash - -cognee.config.llm_endpoint = "http://localhost:11434/v1" -cognee.config.llm_model = "mistral:instruct" -``` - - -### Anyscale - -```bash -LLM_PROVIDER = 'custom' - -``` -Otherwise, you can set the configuration for the model: - -```bash -cognee.config.llm_provider = 'custom' - -``` -You can also set the HOST and model name: -```bash -LLM_MODEL = "mistralai/Mixtral-8x7B-Instruct-v0.1" -LLM_ENDPOINT = "https://api.endpoints.anyscale.com/v1" -LLM_API_KEY = "your_api_key" -``` - -You can set the same way HOST and model name for any other provider that has an API endpoint. 
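Putting the snippets above together, a consolidated programmatic setup for an OpenAI-compatible endpoint might look like the sketch below; the endpoint, model name, and key are the Anyscale placeholders from this page and should be replaced with your own provider's values.

```python
import cognee

# Consolidated programmatic configuration for an OpenAI-compatible endpoint.
# The attribute names mirror the snippets on this page; the values below are
# placeholders and should be swapped for your provider's endpoint, model, and key.
cognee.config.llm_provider = "custom"
cognee.config.llm_endpoint = "https://api.endpoints.anyscale.com/v1"
cognee.config.llm_model = "mistralai/Mixtral-8x7B-Instruct-v0.1"
cognee.config.llm_api_key = "your_api_key"
```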
- - - - - - - diff --git a/docs/overrides/main.html b/docs/overrides/main.html deleted file mode 100644 index c171cfb9d3..0000000000 --- a/docs/overrides/main.html +++ /dev/null @@ -1,17 +0,0 @@ -{% extends "base.html" %} - -{% block meta %} - {{ super() }} - - - - - - - - - - - - -{% endblock %} diff --git a/docs/overrides/partials/integrations/analytics/segment.html b/docs/overrides/partials/integrations/analytics/segment.html deleted file mode 100644 index e3a50f64f7..0000000000 --- a/docs/overrides/partials/integrations/analytics/segment.html +++ /dev/null @@ -1,15 +0,0 @@ - diff --git a/docs/pipelines.md b/docs/pipelines.md deleted file mode 100644 index 2392eab80e..0000000000 --- a/docs/pipelines.md +++ /dev/null @@ -1,81 +0,0 @@ -# PIPELINES - -Cognee uses [tasks](https://github.com/topoteretes/cognee/blob/main/cognee/modules/pipelines/tasks/Task.py) grouped into pipelines that populate graph and vector stores. [These tasks](https://github.com/topoteretes/cognee/tree/main/cognee/tasks) analyze and enrich data, enhancing the quality of answers produced by Large Language Models (LLMs). - -The tasks are managed and executed asynchronously using the `run_tasks` and `run_tasks_parallel` functions. - -```python -pipeline = run_tasks(tasks, documents) -async for result in pipeline: - print(result) -``` - -## Main pipeline: [cognee.cognify](https://github.com/topoteretes/cognee/blob/168cb5d1bf1964b5b0c645b2f3d8638d84554fda/cognee/api/v1/cognify/cognify_v2.py#L38) - -This is the main pipeline currently implemented in cognee. It is designed to process data in a structured way and populate the graph and vector stores. - - -This function is the entry point for processing datasets. It handles dataset retrieval, user authorization, and manages the execution of a pipeline of tasks that process documents. - -### Parameters - -- `datasets: Union[str, list[str]] = None`: A string or list of dataset names to be processed. -- `user: User = None`: The user requesting the processing. If not provided, the default user is retrieved. - -### Steps in the Function - -#### User Authentication - -```python -if user is None: - user = await get_default_user() -``` - -If no user is provided, the function retrieves the default user. - -#### Handling Empty or String Dataset Input - -```python -existing_datasets = await get_datasets(user.id) -if datasets is None or len(datasets) == 0: - datasets = existing_datasets -if type(datasets[0]) == str: - datasets = await get_datasets_by_name(datasets, user.id) -``` - -If no datasets are provided, the function retrieves all datasets owned by the user. If a list of dataset names (strings) is provided, they are converted into dataset objects. - -#### Selecting datasets from the input list that are owned by the user - -```python -existing_datasets_map = { - generate_dataset_name(dataset.name): True for dataset in existing_datasets - } -``` - -#### Run Cognify Pipeline for Each Dataset - -```python -awaitables = [] - -for dataset in datasets: - dataset_name = generate_dataset_name(dataset.name) - - if dataset_name in existing_datasets_map: - awaitables.append(run_cognify_pipeline(dataset, user)) - -return await asyncio.gather(*awaitables) - -The `run_cognify_pipeline` function is defined within `cognify` and is responsible for processing a single dataset. This is where most of the heavy lifting occurs. The function processes multiple datasets concurrently using `asyncio.gather`. 
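The concurrency pattern described above can be illustrated with a small, self-contained sketch; the `run_cognify_pipeline` body here is a stand-in for the real per-dataset pipeline, not cognee's actual implementation.

```python
import asyncio

async def run_cognify_pipeline(dataset: str) -> str:
    # Stand-in for the real per-dataset pipeline (classify, chunk, build graph, summarize).
    await asyncio.sleep(0.1)  # simulates I/O-bound work such as LLM calls and DB writes
    return f"{dataset}: processed"

async def cognify(datasets: list[str]) -> list[str]:
    # Schedule one pipeline per dataset, then wait for all of them together.
    awaitables = [run_cognify_pipeline(dataset) for dataset in datasets]
    return await asyncio.gather(*awaitables)

print(asyncio.run(cognify(["articles", "support_tickets"])))
```

Because each dataset's processing is I/O-bound (LLM calls, database writes), scheduling one coroutine per dataset and awaiting them with `asyncio.gather` lets the work overlap rather than run strictly one after another.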
- - -#### Pipeline Tasks - -The pipeline consists of several tasks, each responsible for different parts of the processing: - -- `classify_documents`: Converts each of the documents into one of the specific Document types: PdfDocument, AudioDocument, ImageDocument or TextDocument -- `check_permissions_on_documents`: Checks if the user has the necessary permissions to access the documents. In this case, it checks for "write" permission. -- `extract_chunks_from_documents`: Extracts text chunks based on the document type. -- `add_data_points`: Creates nodes and edges from the chunks and their properties. Adds them to the graph engine. -- `extract_graph_from_data`: Generates knowledge graphs from the document chunks. -- `summarize_text`: Extracts a summary for each chunk using an llm. diff --git a/docs/quickstart.md b/docs/quickstart.md deleted file mode 100644 index 0cdc2645c0..0000000000 --- a/docs/quickstart.md +++ /dev/null @@ -1,69 +0,0 @@ -# QUICKSTART - -!!! tip "To understand how cognee works check out the [conceptual overview](conceptual_overview.md)" - -## Setup - -To run cognee, you will need the following: - -1. OpenAI API key (Ollama or Anyscale could work as [well](local_models.md)) - -Add your LLM API key to the environment variables - -``` -import os - -os.environ["LLM_API_KEY"] = "YOUR_OPENAI_API_KEY" -``` -or -``` -cognee.config.llm_api_key = "YOUR_OPENAI_API_KEY" - -``` -If you are using Networkx, create an account on Graphistry to visualize results: -``` - cognee.config.set_graphistry_config({ - "username": "YOUR_USERNAME", - "password": "YOUR_PASSWORD" - }) -``` - -If you want to run Postgres instead of Sqlite, run postgres Docker container. -Navigate to cognee folder and run: -``` -docker compose up postgres -``` - -Add the following environment variables to .env file -``` -DB_HOST=127.0.0.1 -DB_PORT=5432 -DB_USERNAME=cognee # or any username you want -DB_PASSWORD=cognee # or any password you want -DB_NAME=cognee_db # or any db name you want -DB_PROVIDER=postgres -``` - -## Run - -cognee is asynchronous by design, meaning that operations like adding information, processing it, and querying it can run concurrently without blocking the execution of other tasks. -Make sure to await the results of the functions that you call. - -``` -import cognee - -text = """Natural language processing (NLP) is an interdisciplinary - subfield of computer science and information retrieval""" - -await cognee.add(text) # Add a new piece of information - -await cognee.cognify() # Use LLMs and cognee to create knowledge - -search_results = await cognee.search(SearchType.INSIGHTS, query_text='Tell me about NLP') # Query cognee for the knowledge - -for result_text in search_results: - print(result_text) -``` - -In the example above, we add a piece of information to cognee, use LLMs to create a GraphRAG, and then query cognee for the knowledge. -cognee is composable and you can build your own cognee pipelines using our [templates.](templates.md) diff --git a/docs/rags.md b/docs/rags.md deleted file mode 100644 index d82107226a..0000000000 --- a/docs/rags.md +++ /dev/null @@ -1,78 +0,0 @@ -## RAG Stack -Core elements of a RAG stack are the building blocks that we can use to get to more personalized and deterministic outputs. - -!!! tip "This is a work in progress and any feedback is welcome" -## What is a RAG? - -!!! note "What is RAG?" - RAG stands for Retrieval Augmented Generation. 
It is a technique that combines the generative power of large language models (LLMs) like GPT-4 with the efficiency of information retrieval systems. The goal of RAG is to generate text that is both fluent and factually accurate by retrieving relevant information from a knowledge base. - -To try building a simple RAG and understand its limitations, check out this simple guide with examples: [RAGs: Retrieval-Augmented Generation Explained](rag/rag_explained.md) - -## The Building Blocks of a RAG Stack - -### 1. Data Sources - -You can get your data from a variety of sources, including: - - -- APIs like Twitter, Reddit, and Google -- Web scraping tools like Scrapy and Beautiful Soup -- Documents like PDFs, Word, and Excel files -- Relational databases like DuckDB, PostgreSQL, and MySQL -- Data warehouses like Snowflake and Databricks -- Customer data platforms like Segment -
-![Data Sources](img/sources.png) -
Some data sources
-
-The goal here is to give the data structure and connect it so that it can be used in your deterministic LLM stack. - -### 2. Data Loaders -
-![Data Loader](img/loaders.png) -
Data Loaders
-
-Data loading into a data lake or warehouse involves using tools like Apache Airflow, dlt, dbt, and Databricks. The process covers data extraction, transformation, and loading for model usage, with the aim of producing a clean, structured dataset that is ready for enrichment. -Check out how we do it with dlt: [Data Loading Tool (dlt)](dlt/dlt.md) -### 3. Vector Computation and Vector Stores -Data is transformed into vectors using OpenAI or custom embedding models. It is essential to understand where these models run and how to integrate them with your compute infrastructure, for example through custom Spark pipelines. The aim is to have ready-to-use pipelines and models. -
-![Vector Stores](img/vector_dbs.png) -
Vector Stores
-
-Image [Source](https://blog.det.life/why-you-shouldnt-invest-in-vector-databases-c0cd3f59d23c) -### 4. Graph Computation and Graph Stores -Creating a knowledge graph from your data allows for querying and information retrieval. It's essential to know how to construct, maintain, and use it for text generation. The aim is an accurate, current, and easily queried knowledge graph. -
-![Graph Stores](img/graph_example.png) -
Graph Example
-
-### 5. Search -The process involves querying and retrieving vectors from Vector DBs or hybrid DBs, and using search tools to rank these vectors. The aim is to index vectors and search for relevant ones as needed. -#### Vector Similarity Search -Identifies objects with vector representations closest to the query vector, finding the most similar items based on various dimensions of comparison. - -#### Image Search -Utilizes images as the input for conducting a similarity search, analyzing the content of the image to find similar images based on visual features. - -#### Keyword Search -Employs the BM25F algorithm for ranking results based on keyword matches. Relevance is calculated using term frequency, inverse document frequency, and field-length normalization. - -#### Hybrid Search -Merges the BM25 algorithm with vector similarity search techniques to enhance the relevance and accuracy of search results. Leverages both textual and vector-based features for ranking. - -#### Generative Search -Utilizes the outputs of search results as prompts for a Large Language Model (LLM). Can generate summaries, extrapolations, or new content based on the aggregated search results. - -#### Reranking -Involves the application of a reranker module to adjust the initial ranking of search results. Optimizes result relevance based on additional criteria or more complex models. - -#### Aggregation -Involves compiling and summarizing data from a set of search results. Provides insights or overviews based on the collective information found. - -#### Filters -Apply constraints or conditions to the search process to narrow down the results. Filters can be based on specific attributes, metadata, or other criteria relevant to the search domain. - -#### Graph Search -Involves traversing a graph data structure to find specific nodes or paths. It can be used to find relationships between different entities in a knowledge graph. \ No newline at end of file diff --git a/docs/research.md b/docs/research.md deleted file mode 100644 index 1439dab528..0000000000 --- a/docs/research.md +++ /dev/null @@ -1,62 +0,0 @@ -# Research - -The page is dedicated to collecting all research that was collected in the past one year from various sources. 
- -This is not an exhaustive list, and any PRs would be welcome - -### Research Papers -- [2024/06/04] [Symbolic reasoning](https://arxiv.org/abs/2402.01817) -- [2024/06/04] [Transformers and episodic memory](https://arxiv.org/abs/2405.14992) -- [2024/03/24] [Graph Chain-of-Thought: Augmenting Large Language Models by Reasoning on Graphs](https://arxiv.org/abs/2404.07103) -- [2024/03/24] [Leave No Context Behind: Efficient Infinite Context Transformers with Infini-attention](https://arxiv.org/abs/2404.07143) -- [2024/03/24] [Compound AI systems](https://bair.berkeley.edu/blog/2024/02/18/compound-ai-systems/) -- [2015/07/30] [Multilayer Network of Language](https://arxiv.org/abs/1507.08539) -- [2023/12/12] [Dense X Retrieval: What Retrieval Granularity Should We Use?](https://arxiv.org/pdf/2312.06648.pdf) -- [2024/01/05] [Retrieval-Augmented Generation for Large Language Models: A Survey](https://arxiv.org/pdf/2312.10997.pdf) -- [2022/10/20] [Cognitive modelling with multilayer networks: Insights, advancements and future challenges](https://arxiv.org/pdf/2210.00500.pdf) -- [2023/09/20] CoAla framework and relevant literature [literature](https://github.com/ysymyth/awesome-language-agents) -- [2023/06/09] [Mind2Web: Towards a Generalist Agent for the Web](https://arxiv.org/pdf/2306.06070.pdf), Xiang Deng, et al. [[code]](https://github.com/OSU-NLP-Group/Mind2Web) [[demo]](https://osu-nlp-group.github.io/Mind2Web/) -- [2023/06/28] AI Agents in Langchain [https://docs.google.com/presentation/d/1L_CHsg26sDxPmKj285Ob5T2xsAUejBlfiGQSnsSHTk0/edit#slide=id.g254e571859c_0_164](https://docs.google.com/presentation/d/1L_CHsg26sDxPmKj285Ob5T2xsAUejBlfiGQSnsSHTk0/edit#slide=id.g254e571859c_0_164) -- [2023/06/27] Agent infra [https://lilianweng.github.io/posts/2023-06-23-agent/](https://lilianweng.github.io/posts/2023-06-23-agent/) -- [2023/06/05] [Orca: Progressive Learning from Complex Explanation Traces of GPT-4](https://arxiv.org/pdf/2306.02707.pdf), Subhabrata Mukherjee et al. -- [2023/05/25] 📚[Voyager: An Open-Ended Embodied Agent with Large Language Models](https://arxiv.org/pdf/2305.16291.pdf), Guanzhi Wang, et al. [[code]](https://github.com/MineDojo/Voyager) [[website]](https://voyager.minedojo.org/), Shishir G. Patil, et al. -- [2023/05/24] 📚[Gorilla: Gorilla: Large Language Model Connected with Massive APIs](https://arxiv.org/abs/2305.15334) -- [2023/05/17] 📚[Tree of Thoughts: Deliberate Problem Solving with Large Language Models](https://arxiv.org/abs/2305.10601), Shunyu Yao, et al.[[code]](https://github.com/kyegomez/tree-of-thoughts) [[code-orig]](https://github.com/ysymyth/tree-of-thought-llm) -- [2023/05/12] 📚[MEGABYTE: Predicting Million-byte Sequences with Multiscale Transformers](https://arxiv.org/abs/2305.07185), Lili Yu, et al. -- [2023/05/09] 📚[FrugalGPT: How to Use Large Language Models While Reducing Cost and Improving Performance](https://arxiv.org/abs/2305.05176), Lingjiao Chen, et al. -- [2023/05/01] 📚[Learning to Reason and Memorize with Self-Notes](https://arxiv.org/abs/2305.00833), Jack Lanchantin, et al. -- [2023/04/24] 📚[WizardLM: Empowering Large Language Models to Follow Complex Instructions](https://arxiv.org/abs/2304.12244), Can Xu, et al. -- [2023/04/22] 📚[LLM+P: Empowering Large Language Models with Optimal Planning Proficiency](https://arxiv.org/abs/2304.11477), Bo Liu, et al. 
-- [2023/04/07] 📚[Generative Agents: Interactive Simulacra of Human Behavior](https://arxiv.org/abs/2304.03442), Joon Sung Park, et al. [[code]](https://github.com/mkturkcan/generative-agents) -- [2023/03/30] [Self-Refine: Iterative Refinement with Self-Feedback](https://arxiv.org/abs/2303.17651), Aman Madaan, et al.[[code]](https://github.com/madaan/self-refine) -- [2023/03/30] [HuggingGPT: Solving AI Tasks with ChatGPT and its Friends in HuggingFace](https://arxiv.org/pdf/2303.17580.pdf), Yongliang Shen, et al. [[code]](https://github.com/microsoft/JARVIS) [[demo]](https://huggingface.co/spaces/microsoft/HuggingGPT) -- [2023/03/20] [Reflexion: Language Agents with Verbal Reinforcement Learning](https://arxiv.org/pdf/2303.11366.pdf), Noah Shinn , et al. [[code]](https://github.com/noahshinn024/reflexion) -- [2023/02/23] 📚[Not what you've signed up for: Compromising Real-World LLM-Integrated Applications with Indirect Prompt Injection](https://arxiv.org/abs/2302.12173), Sahar Abdelnab, et al. -- [2023/02/09] 📚[Toolformer: Language Models Can Teach Themselves to Use Tools](https://arxiv.org/pdf/2302.04761.pdf), Timo Schick, et al. [[code]](https://github.com/lucidrains/toolformer-pytorch) -- [2022/12/12] 📚[LMQL: Prompting Is Programming: A Query Language for Large Language Models](https://arxiv.org/abs/2212.06094), Luca Beurer-Kellner, et al. -- [2022/10/06] [ReAct: Synergizing Reasoning and Acting in Language Models](https://arxiv.org/pdf/2210.03629.pdf), Shunyu Yao, et al. [[code]](https://github.com/ysymyth/ReAct) -- [2022/07/12] 📚[Inner Monologue: Embodied Reasoning through Planning with Language Models](https://arxiv.org/pdf/2207.05608.pdf), Wenlong Huang, et al. [[demo]](https://innermonologue.github.io/) -- [2022/04/04] [Do As I Can, Not As I Say: Grounding Language in Robotic Affordances](https://github.com/Significant-Gravitas/Nexus/wiki/Awesome-Resources), Michael Ahn, e al. [[demo]](https://say-can.github.io/) -- [2021/12/17] [WebGPT: Browser-assisted question-answering with human feedback](https://arxiv.org/pdf/2112.09332.pdf), Reiichiro Nakano, et al. -- [2021/06/17] 📚[LoRA: Low-Rank Adaptation of Large Language Models](https://arxiv.org/abs/2106.09685), Edward J. Hu, et al. 
-- [2023/04/03] [Generative Agents](https://arxiv.org/abs/2304.03442) -- [2023/05/17] [Three of thought: Deliberate Problem Solving with Large Language Mode](https://arxiv.org/abs/2305.10601)ls - - -### Knowledge Graphs - -- [2023/06/09] [Taxonomies: Overview](https://www.brighttalk.com/webcast/9273/605659?utm_source=brighttalk-portal&utm_medium=web&utm_campaign=topic&utm_content=upcoming) - -### Blog Articles - -- [2023/04/29] [AUTO-GPT: UNLEASHING THE POWER OF AUTONOMOUS AI AGENTS](https://www.leewayhertz.com/autogpt/) By Akash Takyar -- [2023/04/20] [Conscious Machines: Experiments, Theory, and Implementations(Chinese)](https://pattern.swarma.org/article/230) By Jiang Zhang -- [2023/04/18] [Autonomous Agents & Agent Simulations](https://blog.langchain.dev/agents-round/) By Langchain -- [2023/04/16] [4 Autonomous AI Agents you need to know](https://towardsdatascience.com/4-autonomous-ai-agents-you-need-to-know-d612a643fa92) By Sophia Yang -- [2023/03/31] [ChatGPT that learns to use tools](https://zhuanlan.zhihu.com/p/618448188) By Haojie Pan - -### Talks - -- [2023/06/05] [Two Paths to Intelligence](https://www.youtube.com/watch?v=rGgGOccMEiY&t=1497s) by Geoffrey Hinton -- [2023/05/24] [State of GPT](https://www.youtube.com/watch?v=bZQun8Y4L2A) by Andrej Karpathy | OpenAI -- [2024/03/15] Podcast on AI, Memory by Bill Gurley diff --git a/docs/search.md b/docs/search.md deleted file mode 100644 index 8fc204b067..0000000000 --- a/docs/search.md +++ /dev/null @@ -1,21 +0,0 @@ -## Cognee Search Module - -This module contains the search function that is used to search for nodes in the graph. It supports various search types and integrates with user permissions to filter results accordingly. - -### Search Types - -The `SearchType` enum defines the different types of searches that can be performed: - -- `INSIGHTS`: Search for insights from the knowledge graph. -- `SUMMARIES`: Search for summaries of the texts provided. -- `CHUNKS`: Search for the whole chunks of data. - - -### Search Function - -The `search` function is the main entry point for performing a search. It handles user authentication, retrieves document IDs for the user, and filters the search results based on user permissions. 
- -```python -from cognee import search, SearchType -await search(SearchType.INSIGHTS, "your_query") -``` diff --git a/docs/stylesheets/extra.css b/docs/stylesheets/extra.css deleted file mode 100644 index f5eb2a7804..0000000000 --- a/docs/stylesheets/extra.css +++ /dev/null @@ -1,51 +0,0 @@ - -[data-md-color-scheme = "cognee"] { - color-scheme: dark; - - --md-default-bg-color: #0C0121; - --md-default-bg-color--light: #240067; - - --md-default-fg-color: #57DFD7; - --md-default-fg-color--light: #85ded8; - --md-default-fg-color--dark: #4dc6be; - - /* --md-primary-fg-color: #0C0121; */ - --md-primary-fg-color: #7233BA; - --md-primary-fg-color--light: #8a49d4; - --md-primary-fg-color--dark: #522488; - /* --md-primary-bg-color: hsla(0, 0%, 100%, 1); - --md-primary-bg-color--light: */ - - --md-accent-fg-color: #41a29b; - - --md-typeset-color: white; - --md-typeset-a-color: #57DFD7; - - --md-footer-bg-color: #0C0121; - --md-footer-bg-color--dark: #0C0121; -} - -.md-header { - background-color: var(--md-default-bg-color); -} - -/* Remove unnecessary title from the header */ -.md-header__title { - display: none; -} -/* Spread header elements evenly when there is no title */ -.md-header__inner { - justify-content: space-between; -} - -.md-tabs { - background-color: var(--md-default-bg-color); -} - -.md-button--primary:hover { - background-color: #8a49d4 !important; -} - -.md-typeset .md-button { - border-radius: 32px; -} diff --git a/docs/team.md b/docs/team.md deleted file mode 100644 index 27b1257b37..0000000000 --- a/docs/team.md +++ /dev/null @@ -1,4 +0,0 @@ -# Team - - -![About us](img/team.png) \ No newline at end of file diff --git a/docs/why.md b/docs/why.md deleted file mode 100644 index 040cefb76c..0000000000 --- a/docs/why.md +++ /dev/null @@ -1,29 +0,0 @@ -# Why use cognee? - -cognee is one of the first OSS tools that enables easy, scalable and flexible use of LLMs to process large volumes of documents using GraphRAG approach. - -LLMs don't have a semantic layer, and they don't have a way to understand the data they are processing. This is where cognee comes in. -We let you define logical structures for your data and then use these structures to guide the LLMs to process the data in a way that makes sense to you. - -cognee helps you avoid the overly complicated set of tools and processes to give you somewhat reliable output - - -***From*** - -![Bad Architecture](img/bad_architecture.png) - -***To*** - -![Good Architecture](img/good_architecture.png) - -??? note "Why use cognee?" - - Its hard to answer the question of why use cognee without answering why you need thin LLM frameworks in the first place.:) - - **Cost-effective** — cognee extends the capabilities of your LLMs without the need for expensive data processing tools. - - **Self-contained** — cognee runs as a simple-to-use library meaning you can add it to your application easily - - **Easy to use** — Navigate graphs instead of embeddings to understand your data faster and better - - **Flexible** — cognee lets you control your input and provide your own Pydantic data models. 
- - - - diff --git a/mkdocs.yml b/mkdocs.yml deleted file mode 100644 index 068cf7a2c1..0000000000 --- a/mkdocs.yml +++ /dev/null @@ -1,152 +0,0 @@ -site_name: cognee -site_author: Vasilije Markovic -site_description: desc -repo_name: cognee -repo_url: https://github.com/topoteretes/cognee -site_url: https://www.congee.ai -edit_uri: edit/main/docs/ -copyright: Copyright © 2024 cognee -theme: - name: material - logo: assets/logo.png - favicon: assets/favicon.png - icon: - repo: fontawesome/brands/github - edit: material/pencil - view: material/eye - theme: - admonition: - note: octicons/tag-16 - abstract: octicons/checklist-16 - info: octicons/info-16 - tip: octicons/squirrel-16 - success: octicons/check-16 - question: octicons/question-16 - warning: octicons/alert-16 - failure: octicons/x-circle-16 - danger: octicons/zap-16 - bug: octicons/bug-16 - example: octicons/beaker-16 - quote: octicons/quote-16 - features: - - announce.dismiss - - content.action.edit - - content.action.view - - content.code.annotate - - content.code.copy - - content.code.select - - content.tabs.link - - content.tooltips - - header.autohide - - navigation.expand - - navigation.footer - - navigation.indexes - - navigation.instant - - navigation.instant.prefetch - - navigation.instant.progress - - navigation.prune - - navigation.sections - - navigation.tabs - - navigation.top - - navigation.tracking - - navigation.path - - search.highlight - - search.share - - search.suggest - - toc.follow - # - toc.integrate - palette: - - scheme: cognee - primary: custom - font: - text: Roboto - code: Roboto Mono - custom_dir: docs/overrides - -extra: - analytics: - provider: segment - key: !ENV DOCS_SEGMENT_KEY - -extra_css: - - stylesheets/extra.css - -# Extensions -markdown_extensions: - - abbr - - admonition - - pymdownx.details - - attr_list - - def_list - - footnotes - - md_in_html - - toc: - permalink: true - - pymdownx.arithmatex: - generic: true - - pymdownx.betterem: - smart_enable: all - - pymdownx.caret - - pymdownx.details - - pymdownx.emoji: - emoji_generator: !!python/name:material.extensions.emoji.to_svg - emoji_index: !!python/name:material.extensions.emoji.twemoji - - pymdownx.highlight: - anchor_linenums: true - line_spans: __span - pygments_lang_class: true - - pymdownx.inlinehilite - - pymdownx.keys - - pymdownx.magiclink: - normalize_issue_symbols: true - repo_url_shorthand: true - user: tricalt - repo: cognee - - pymdownx.mark - - pymdownx.smartsymbols - - pymdownx.snippets: - auto_append: - - includes/mkdocs.md - - pymdownx.superfences: - custom_fences: - - name: mermaid - class: mermaid - format: !!python/name:pymdownx.superfences.fence_code_format - - pymdownx.tabbed: - alternate_style: true - combine_header_slug: true - - pymdownx.tasklist: - custom_checkbox: true -nav: - - Overview: - - Overview: 'index.md' - - Start here: - - Installation: 'quickstart.md' - - Add data: 'data_ingestion.md' - - Create LLM enriched data store: 'templates.md' - - Explore data: 'search.md' - - Configuration: 'configuration.md' - - What is cognee: - - Introduction: 'conceptual_overview.md' - - API reference: 'api_reference.md' - - -plugins: - - search: - separator: '[\s\u200b\-_,:!=\[\]()"`/]+|\.(?!\d)|&[lg]t;|(?!\b)(?=[A-Z][a-z])' - - - minify: - minify_html: true - minify_js: true - minify_css: true - htmlmin_opts: - remove_comments: true - cache_safe: true - - - mkdocstrings: - handlers: - python: - options: - members_order: alphabetical - allow_inspection: true - show_bases: true