48 changes: 48 additions & 0 deletions .github/workflows/dockerhub-mcp.yml
@@ -0,0 +1,48 @@
name: build | Build and Push Cognee MCP Docker Image to dockerhub

on:
  push:
    branches:
      - main

jobs:
  docker-build-and-push:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Log in to Docker Hub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKER_USERNAME }}
          password: ${{ secrets.DOCKER_PASSWORD }}

      - name: Extract metadata
        id: meta
        uses: docker/metadata-action@v5
        with:
          images: cognee/cognee-mcp
          tags: |
            type=ref,event=branch
            type=sha,prefix={{branch}}-
            type=raw,value=latest,enable={{is_default_branch}}

      - name: Build and push
        id: build
        uses: docker/build-push-action@v5
        with:
          context: .
          platforms: linux/amd64,linux/arm64
          push: true
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}
          cache-from: type=registry,ref=cognee/cognee-mcp:buildcache
          cache-to: type=registry,ref=cognee/cognee-mcp:buildcache,mode=max

      - name: Image digest
        run: echo ${{ steps.build.outputs.digest }}
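As a rough illustration (an assumption based on docker/metadata-action's documented tag types, not output copied from a real run), a push to main should resolve to tags along these lines:

    cognee/cognee-mcp:main
    cognee/cognee-mcp:main-<short-sha>
    cognee/cognee-mcp:latest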
2 changes: 1 addition & 1 deletion .github/workflows/dockerhub.yml
@@ -3,7 +3,6 @@ name: build | Build and Push Docker Image to dockerhub
on:
  push:
    branches:
      - dev
      - main

jobs:
@@ -34,6 +33,7 @@ jobs:
            type=raw,value=latest,enable={{is_default_branch}}

      - name: Build and push
        id: build
        uses: docker/build-push-action@v5
        with:
          context: .
49 changes: 49 additions & 0 deletions cognee-mcp/Dockerfile
@@ -0,0 +1,49 @@
# Use a Python image with uv pre-installed
FROM ghcr.io/astral-sh/uv:python3.12-bookworm-slim AS uv

# Set build argument
ARG DEBUG

# Set environment variable based on the build argument
ENV DEBUG=${DEBUG}
ENV PIP_NO_CACHE_DIR=true
ENV PATH="${PATH}:/root/.poetry/bin"

WORKDIR /app

# Enable bytecode compilation
ENV UV_COMPILE_BYTECODE=1

# Copy from the cache instead of linking since it's a mounted volume
ENV UV_LINK_MODE=copy


# Install the project's dependencies using the lockfile and settings
RUN --mount=type=cache,target=/root/.cache/uv \
    --mount=type=bind,source=uv.lock,target=uv.lock \
    --mount=type=bind,source=pyproject.toml,target=pyproject.toml \
    uv sync --frozen --no-install-project --no-dev --no-editable

# Then, add the rest of the project source code and install it
# Installing separately from its dependencies allows optimal layer caching
ADD . /app
RUN --mount=type=cache,target=/root/.cache/uv \
    uv sync --frozen --no-dev --no-editable

# RUN apt-get update && apt-get install

# RUN apt-get install -y \
#     gcc \
#     libpq-dev

FROM python:3.12-slim-bookworm

WORKDIR /app

COPY --from=uv /root/.local /root/.local
COPY --from=uv --chown=app:app /app/.venv /app/.venv

# Place executables in the environment at the front of the path
ENV PATH="/app/.venv/bin:$PATH"

ENTRYPOINT ["cognee"]
7 changes: 6 additions & 1 deletion cognee-mcp/pyproject.toml
@@ -6,7 +6,7 @@ readme = "README.md"
requires-python = ">=3.10"

dependencies = [
    "cognee[codegraph]",
    "cognee[codegraph,postgres,neo4j]",
    "mcp==1.2.1",
]

@@ -21,5 +21,10 @@ build-backend = "hatchling.build"
[tool.hatch.build.targets.wheel]
packages = ["src"]

[dependency-groups]
dev = [
    "debugpy>=1.8.12",
]

[project.scripts]
cognee = "src:main"
7 changes: 4 additions & 3 deletions cognee-mcp/src/server.py
@@ -1,3 +1,4 @@
import asyncio
import json
import os
import cognee
@@ -92,7 +93,7 @@ async def call_tools(name: str, arguments: dict) -> list[types.TextContent]:
    with open(os.devnull, "w") as fnull:
        with redirect_stdout(fnull), redirect_stderr(fnull):
            if name == "cognify":
                cognify(
                await cognify(
                    text=arguments["text"],
                    graph_model_file=arguments.get("graph_model_file", None),
                    graph_model_name=arguments.get("graph_model_name", None),
@@ -161,6 +162,8 @@ async def main():
    try:
        from mcp.server.stdio import stdio_server

        logger.info("Starting Cognee MCP server...")

        async with stdio_server() as (read_stream, write_stream):
            await mcp.run(
                read_stream=read_stream,
@@ -249,6 +252,4 @@ def load_class(model_file, model_name):

if __name__ == "__main__":
    # Initialize and run the server
    import asyncio

    asyncio.run(main())
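For context, a minimal sketch of exercising this server over stdio with the mcp Python SDK is shown below; the tool name mirrors the "cognify" branch above, while the text value and the standalone script shape are illustrative assumptions.

import asyncio

from mcp import ClientSession, StdioServerParameters
from mcp.client.stdio import stdio_client

# Launch the server through its console script (see cognee-mcp/pyproject.toml).
server_params = StdioServerParameters(command="cognee", args=[])


async def demo():
    async with stdio_client(server_params) as (read_stream, write_stream):
        async with ClientSession(read_stream, write_stream) as session:
            await session.initialize()
            # Invoke the cognify tool handled by call_tools in server.py.
            result = await session.call_tool("cognify", {"text": "Example text to cognify."})
            print(result)


asyncio.run(demo())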
137 changes: 135 additions & 2 deletions cognee-mcp/uv.lock

Large diffs are not rendered by default.

16 changes: 13 additions & 3 deletions cognee/api/v1/cognify/code_graph_pipeline.py
@@ -2,6 +2,8 @@
import logging
from uuid import NAMESPACE_OID, uuid5

from cognee.api.v1.search.search_v2 import search
from cognee.api.v1.search import SearchType
from cognee.base_config import get_base_config
from cognee.modules.cognify.config import get_cognify_config
from cognee.modules.pipelines import run_tasks
@@ -42,15 +44,15 @@ async def run_code_graph_pipeline(repo_path, include_docs=False):

cognee_config = get_cognify_config()
user = await get_default_user()
detailed_extraction = False
detailed_extraction = True

tasks = [
Task(get_repo_file_dependencies, detailed_extraction=detailed_extraction),
# Task(enrich_dependency_graph, task_config={"batch_size": 50}),
# Task(expand_dependency_graph, task_config={"batch_size": 50}),
# Task(get_source_code_chunks, task_config={"batch_size": 50}),
# Task(summarize_code, task_config={"batch_size": 50}),
Task(add_data_points, task_config={"batch_size": 100 if detailed_extraction else 500}),
Task(add_data_points, task_config={"batch_size": 500}),
]

if include_docs:
@@ -84,9 +86,17 @@
if __name__ == "__main__":

    async def main():
        async for data_points in run_code_graph_pipeline("REPO_PATH"):
        async for data_points in run_code_graph_pipeline("YOUR_REPO_PATH"):
            print(data_points)

        await render_graph()

        search_results = await search(
            query_type=SearchType.CODE,
            query_text="How is Relationship weight calculated?",
        )

        for file in search_results:
            print(file.filename)

    asyncio.run(main())
22 changes: 22 additions & 0 deletions cognee/infrastructure/llm/prompts/codegraph_retriever_system.txt
@@ -0,0 +1,22 @@
You are an expert at extracting file names and Python code.
Extract file names and the corresponding code pieces from text while preserving formatting and structure.

### Instructions:

1. **Identify File Names:** Extract filenames from inline text, headers, or markdown formatting. An empty list of filenames is completely normal.
2. **Extract Code:** Extract the code pieces that appear in the text (do not add additional content) and maintain their indentation and formatting. An empty list of code pieces is completely normal.
3. **Ensure Accuracy:** Avoid extraneous text, merge related snippets, and support multiple programming languages.
4. **Keep Content:** Do not add files or code pieces that are not in the text; make sure everything you extract as code is actually code and not part of a sentence.
5. **Ensure Relevancy:** Make sure the extracted code piece is not just one or two lines but meaningful Python code; extract classes and functions as single pieces.

Examples:

1.
query: 'I want to change the test1.py file and want to add a print statement at the end'
files: ['test1.py']
codepieces: ""

2.
query: 'print('Hello World') doesn't work in the test2.py file. What are the changes I have to do there?'
files: ["test2.py"]
codepieces: "print(\'Hello World\')"
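To make the expected output concrete, here is a hypothetical structured response for example 2, expressed with the CodeQueryInfo model that consumes this prompt in code_graph_retrieval.py below; the field values are illustrative only.

from typing import List

from pydantic import BaseModel


class CodeQueryInfo(BaseModel):
    """Mirrors the response model defined in code_graph_retrieval.py."""

    filenames: List[str] = []
    sourcecode: str


# Hypothetical extraction for example 2 above.
expected = CodeQueryInfo(filenames=["test2.py"], sourcecode="print('Hello World')")
print(expected.model_dump())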
142 changes: 114 additions & 28 deletions cognee/modules/retrieval/code_graph_retrieval.py
@@ -1,42 +1,128 @@
from cognee.low_level import DataPoint
from cognee.modules.graph.utils.convert_node_to_data_point import get_all_subclasses
from .brute_force_triplet_search import brute_force_triplet_search
import asyncio
import aiofiles

from cognee.modules.graph.cognee_graph.CogneeGraph import CogneeGraph
from typing import List, Dict, Any
from pydantic import BaseModel
from cognee.infrastructure.databases.graph import get_graph_engine
from cognee.infrastructure.databases.vector import get_vector_engine
from cognee.infrastructure.llm.get_llm_client import get_llm_client
from cognee.infrastructure.llm.prompts import read_query_prompt

async def code_graph_retrieval(query: str) -> dict[str, str]:
    subclasses = get_all_subclasses(DataPoint)

    vector_index_collections = []
class CodeQueryInfo(BaseModel):
    """Response model for information extraction from the query"""

    for subclass in subclasses:
        index_fields = subclass.model_fields["metadata"].default.get("index_fields", [])
        for field_name in index_fields:
            vector_index_collections.append(f"{subclass.__name__}_{field_name}")
    filenames: List[str] = []
    sourcecode: str

    found_triplets = await brute_force_triplet_search(
        query,
        top_k=5,
        collections=vector_index_collections or None,
        properties_to_project=["id", "file_path", "source_code"],

async def code_graph_retrieval(query: str) -> list[dict[str, Any]]:
    if not query or not isinstance(query, str):
        raise ValueError("The query must be a non-empty string.")

    file_name_collections = ["CodeFile_name"]
    classes_and_functions_collections = [
        "ClassDefinition_source_code",
        "FunctionDefinition_source_code",
    ]

    try:
        vector_engine = get_vector_engine()
        graph_engine = await get_graph_engine()
    except Exception as e:
        raise RuntimeError("Database initialization error in code_graph_retriever, ") from e

    system_prompt = read_query_prompt("codegraph_retriever_system.txt")

    llm_client = get_llm_client()
    try:
        files_and_codeparts = await llm_client.acreate_structured_output(
            text_input=query,
            system_prompt=system_prompt,
            response_model=CodeQueryInfo,
        )
    except Exception as e:
        raise RuntimeError("Failed to retrieve structured output from LLM") from e

    similar_filenames = []
    similar_codepieces = []

    if not files_and_codeparts.filenames or not files_and_codeparts.sourcecode:
        for collection in file_name_collections:
            search_results_file = await vector_engine.search(collection, query, limit=3)
            for res in search_results_file:
                similar_filenames.append({"id": res.id, "score": res.score, "payload": res.payload})

        for collection in classes_and_functions_collections:
            search_results_code = await vector_engine.search(collection, query, limit=3)
            for res in search_results_code:
                similar_codepieces.append(
                    {"id": res.id, "score": res.score, "payload": res.payload}
                )

    else:
        for collection in file_name_collections:
            for file_from_query in files_and_codeparts.filenames:
                search_results_file = await vector_engine.search(
                    collection, file_from_query, limit=3
                )
                for res in search_results_file:
                    similar_filenames.append(
                        {"id": res.id, "score": res.score, "payload": res.payload}
                    )

        for collection in classes_and_functions_collections:
            for code_from_query in files_and_codeparts.sourcecode:
                search_results_code = await vector_engine.search(
                    collection, code_from_query, limit=3
                )
                for res in search_results_code:
                    similar_codepieces.append(
                        {"id": res.id, "score": res.score, "payload": res.payload}
                    )

    file_ids = [str(item["id"]) for item in similar_filenames]
    code_ids = [str(item["id"]) for item in similar_codepieces]

    relevant_triplets = await asyncio.gather(
        *[graph_engine.get_connections(node_id) for node_id in code_ids + file_ids]
    )

    paths = set()

    for sublist in relevant_triplets:
        for tpl in sublist:
            if isinstance(tpl, tuple) and len(tpl) >= 3:
                if "file_path" in tpl[0]:
                    paths.add(tpl[0]["file_path"])
                if "file_path" in tpl[2]:  # Third tuple element
                    paths.add(tpl[2]["file_path"])

    retrieved_files = {}

    for triplet in found_triplets:
        if triplet.node1.attributes["source_code"]:
            retrieved_files[triplet.node1.attributes["file_path"]] = triplet.node1.attributes[
                "source_code"
            ]
        if triplet.node2.attributes["source_code"]:
            retrieved_files[triplet.node2.attributes["file_path"]] = triplet.node2.attributes[
                "source_code"
            ]

    return [
    read_tasks = []
    for file_path in paths:

        async def read_file(fp):
            try:
                async with aiofiles.open(fp, "r", encoding="utf-8") as f:
                    retrieved_files[fp] = await f.read()
            except Exception as e:
                print(f"Error reading {fp}: {e}")
                retrieved_files[fp] = ""

        read_tasks.append(read_file(file_path))

    await asyncio.gather(*read_tasks)

    result = [
        {
            "name": file_path,
            "description": file_path,
            "content": source_code,
            "content": retrieved_files[file_path],
        }
        for file_path, source_code in retrieved_files.items()
        for file_path in paths
    ]

    return result
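A minimal usage sketch of the rewritten retriever, assuming a code graph has already been built for the repository; the query string is a placeholder.

import asyncio

from cognee.modules.retrieval.code_graph_retrieval import code_graph_retrieval


async def demo():
    results = await code_graph_retrieval("How is Relationship weight calculated?")
    for item in results:
        # Each item carries the file path as name/description plus the file's content.
        print(item["name"], len(item["content"]))


asyncio.run(demo())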