feat: codegraph improvements and new CODE search [COG-1351] #581
Merged

Changes from all commits (12 commits):
- 76ce096 feat: add mcp server dockerfile (borisarzentar)
- 00a25f7 Merge remote-tracking branch 'origin/dev' into feat/COG-1351-codegrap… (borisarzentar)
- f77dbd6 fix: fix ruff errors (borisarzentar)
- 4efea05 fix: delete mcp entrypoint.sh (borisarzentar)
- 8477d48 fix: fixes source code files loop range and function name as modules … (hajdul88)
- aa31518 fix: Fix cognify in cognee-mcp (dexters1)
- fea64e0 feat: Resolve asyncio issue with cognify, add neo4j and postgres supp… (dexters1)
- 828fce9 feat: adds new version of the codegraph retriever (hajdul88)
- 6ea21b1 fix: skip test and venv files indexation (borisarzentar)
- c0a95ac fix: remove hardcoded test data (borisarzentar)
- 893b5e1 fix: add missing step id (borisarzentar)
- 0e06ffc Merge remote-tracking branch 'origin/dev' into feat/COG-1351-codegrap… (borisarzentar)
New file (48 lines): GitHub Actions workflow to build and push the Cognee MCP Docker image to Docker Hub

```yaml
name: build | Build and Push Cognee MCP Docker Image to dockerhub

on:
  push:
    branches:
      - main

jobs:
  docker-build-and-push:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Log in to Docker Hub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKER_USERNAME }}
          password: ${{ secrets.DOCKER_PASSWORD }}

      - name: Extract metadata
        id: meta
        uses: docker/metadata-action@v5
        with:
          images: cognee/cognee-mcp
          tags: |
            type=ref,event=branch
            type=sha,prefix={{branch}}-
            type=raw,value=latest,enable={{is_default_branch}}

      - name: Build and push
        id: build
        uses: docker/build-push-action@v5
        with:
          context: .
          platforms: linux/amd64,linux/arm64
          push: true
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}
          cache-from: type=registry,ref=cognee/cognee-mcp:buildcache
          cache-to: type=registry,ref=cognee/cognee-mcp:buildcache,mode=max

      - name: Image digest
        run: echo ${{ steps.build.outputs.digest }}
```
New file (49 lines): Dockerfile for the cognee MCP server

```dockerfile
# Use a Python image with uv pre-installed
FROM ghcr.io/astral-sh/uv:python3.12-bookworm-slim AS uv

# Set build argument
ARG DEBUG

# Set environment variable based on the build argument
ENV DEBUG=${DEBUG}
ENV PIP_NO_CACHE_DIR=true
ENV PATH="${PATH}:/root/.poetry/bin"

WORKDIR /app

# Enable bytecode compilation
ENV UV_COMPILE_BYTECODE=1

# Copy from the cache instead of linking since it's a mounted volume
ENV UV_LINK_MODE=copy

# Install the project's dependencies using the lockfile and settings
RUN --mount=type=cache,target=/root/.cache/uv \
    --mount=type=bind,source=uv.lock,target=uv.lock \
    --mount=type=bind,source=pyproject.toml,target=pyproject.toml \
    uv sync --frozen --no-install-project --no-dev --no-editable

# Then, add the rest of the project source code and install it
# Installing separately from its dependencies allows optimal layer caching
ADD . /app

RUN --mount=type=cache,target=/root/.cache/uv \
    uv sync --frozen --no-dev --no-editable

# RUN apt-get update && apt-get install

# RUN apt-get install -y \
#     gcc \
#     libpq-dev

FROM python:3.12-slim-bookworm

WORKDIR /app

COPY --from=uv /root/.local /root/.local
COPY --from=uv --chown=app:app /app/.venv /app/.venv

# Place executables in the environment at the front of the path
ENV PATH="/app/.venv/bin:$PATH"

ENTRYPOINT ["cognee"]
```
Large diffs are not rendered by default.
New file (22 lines, 22 additions and 0 deletions): cognee/infrastructure/llm/prompts/codegraph_retriever_system.txt

```text
You are a professional file name and python code extracting expert.
Extract file names and corresponding code pieces from text while preserving formatting and structure.

### Instructions:

1. **Identify File Names:** Extract filenames from inline text, headers, or markdown formatting. Empty list of filenames is completely normal.
2. **Extract Code:** Extract code pieces that are in the text (do not add additional content) and maintain their indentation and formatting. Empty list of code pieces is completely normal
3. **Ensure Accuracy:** Avoid extraneous text, merge related snippets, and support multiple programming languages.
4. **Keep content:** Avoid additional files and code pieces that are not in the text make sure everything you extract as a code is actually a code and not a part of a sentence.
5. **Ensure relevancy:** Make sure that the extracted codepiece is not just one or two lines but a meaningful python code, extract classes and functions in one piece

Examples:

1.
query: 'I want to change the test1.py file and want to add a print statement at the end'
files: ['test1.py']
codepieces: ""

2.
query: 'print('Hello World') doesn't work in the test2.py file. What are the changes I have to do there?
files: ["test2.py"]
codepieces: "print(\'Hello World\')"
```
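For context, the retriever change later in this PR pairs this prompt with a Pydantic response model and cognee's structured-output LLM call. A minimal sketch of that pairing, assuming the same helpers the diff below uses (`read_query_prompt`, `get_llm_client`, `acreate_structured_output`); the `CodeQueryInfo` model is copied from the diff, while the `extract_code_query_info` wrapper name is illustrative and not part of the PR:

```python
from typing import List

from pydantic import BaseModel

from cognee.infrastructure.llm.get_llm_client import get_llm_client
from cognee.infrastructure.llm.prompts import read_query_prompt


class CodeQueryInfo(BaseModel):
    """Filenames and code pieces the LLM extracts from a user query."""

    filenames: List[str] = []
    sourcecode: str


async def extract_code_query_info(query: str) -> CodeQueryInfo:
    # Load the system prompt added in this PR and ask the LLM for structured output.
    system_prompt = read_query_prompt("codegraph_retriever_system.txt")
    llm_client = get_llm_client()
    return await llm_client.acreate_structured_output(
        text_input=query,
        system_prompt=system_prompt,
        response_model=CodeQueryInfo,
    )
```

When the model returns empty filenames or code pieces (which the prompt explicitly allows), the retriever below falls back to searching the vector collections with the raw query instead.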
Modified file (42 lines → 128 lines): the code graph retriever (`code_graph_retrieval`)
```diff
@@ -1,42 +1,128 @@
-from cognee.low_level import DataPoint
-from cognee.modules.graph.utils.convert_node_to_data_point import get_all_subclasses
-from .brute_force_triplet_search import brute_force_triplet_search
+import asyncio
+import aiofiles
+
+from cognee.modules.graph.cognee_graph.CogneeGraph import CogneeGraph
+from typing import List, Dict, Any
+from pydantic import BaseModel
+from cognee.infrastructure.databases.graph import get_graph_engine
+from cognee.infrastructure.databases.vector import get_vector_engine
+from cognee.infrastructure.llm.get_llm_client import get_llm_client
+from cognee.infrastructure.llm.prompts import read_query_prompt
 
 
-async def code_graph_retrieval(query: str) -> dict[str, str]:
-    subclasses = get_all_subclasses(DataPoint)
-
-    vector_index_collections = []
-
-    for subclass in subclasses:
-        index_fields = subclass.model_fields["metadata"].default.get("index_fields", [])
-        for field_name in index_fields:
-            vector_index_collections.append(f"{subclass.__name__}_{field_name}")
-
-    found_triplets = await brute_force_triplet_search(
-        query,
-        top_k=5,
-        collections=vector_index_collections or None,
-        properties_to_project=["id", "file_path", "source_code"],
-    )
+class CodeQueryInfo(BaseModel):
+    """Response model for information extraction from the query"""
+
+    filenames: List[str] = []
+    sourcecode: str
+
+
+async def code_graph_retrieval(query: str) -> list[dict[str, Any]]:
+    if not query or not isinstance(query, str):
+        raise ValueError("The query must be a non-empty string.")
+
+    file_name_collections = ["CodeFile_name"]
+    classes_and_functions_collections = [
+        "ClassDefinition_source_code",
+        "FunctionDefinition_source_code",
+    ]
+
+    try:
+        vector_engine = get_vector_engine()
+        graph_engine = await get_graph_engine()
+    except Exception as e:
+        raise RuntimeError("Database initialization error in code_graph_retriever, ") from e
+
+    system_prompt = read_query_prompt("codegraph_retriever_system.txt")
+
+    llm_client = get_llm_client()
+    try:
+        files_and_codeparts = await llm_client.acreate_structured_output(
+            text_input=query,
+            system_prompt=system_prompt,
+            response_model=CodeQueryInfo,
+        )
+    except Exception as e:
+        raise RuntimeError("Failed to retrieve structured output from LLM") from e
+
+    similar_filenames = []
+    similar_codepieces = []
+
+    if not files_and_codeparts.filenames or not files_and_codeparts.sourcecode:
+        for collection in file_name_collections:
+            search_results_file = await vector_engine.search(collection, query, limit=3)
+            for res in search_results_file:
+                similar_filenames.append({"id": res.id, "score": res.score, "payload": res.payload})
+
+        for collection in classes_and_functions_collections:
+            search_results_code = await vector_engine.search(collection, query, limit=3)
+            for res in search_results_code:
+                similar_codepieces.append(
+                    {"id": res.id, "score": res.score, "payload": res.payload}
+                )
+
+    else:
+        for collection in file_name_collections:
+            for file_from_query in files_and_codeparts.filenames:
+                search_results_file = await vector_engine.search(
+                    collection, file_from_query, limit=3
+                )
+                for res in search_results_file:
+                    similar_filenames.append(
+                        {"id": res.id, "score": res.score, "payload": res.payload}
+                    )
+
+        for collection in classes_and_functions_collections:
+            for code_from_query in files_and_codeparts.sourcecode:
+                search_results_code = await vector_engine.search(
+                    collection, code_from_query, limit=3
+                )
+                for res in search_results_code:
+                    similar_codepieces.append(
+                        {"id": res.id, "score": res.score, "payload": res.payload}
+                    )
+
+    file_ids = [str(item["id"]) for item in similar_filenames]
+    code_ids = [str(item["id"]) for item in similar_codepieces]
+
+    relevant_triplets = await asyncio.gather(
+        *[graph_engine.get_connections(node_id) for node_id in code_ids + file_ids]
+    )
+
+    paths = set()
+
+    for sublist in relevant_triplets:
+        for tpl in sublist:
+            if isinstance(tpl, tuple) and len(tpl) >= 3:
+                if "file_path" in tpl[0]:
+                    paths.add(tpl[0]["file_path"])
+                if "file_path" in tpl[2]:  # Third tuple element
+                    paths.add(tpl[2]["file_path"])
 
     retrieved_files = {}
 
-    for triplet in found_triplets:
-        if triplet.node1.attributes["source_code"]:
-            retrieved_files[triplet.node1.attributes["file_path"]] = triplet.node1.attributes[
-                "source_code"
-            ]
-        if triplet.node2.attributes["source_code"]:
-            retrieved_files[triplet.node2.attributes["file_path"]] = triplet.node2.attributes[
-                "source_code"
-            ]
-
-    return [
+    read_tasks = []
+    for file_path in paths:
+
+        async def read_file(fp):
+            try:
+                async with aiofiles.open(fp, "r", encoding="utf-8") as f:
+                    retrieved_files[fp] = await f.read()
+            except Exception as e:
+                print(f"Error reading {fp}: {e}")
+                retrieved_files[fp] = ""
+
+        read_tasks.append(read_file(file_path))
+
+    await asyncio.gather(*read_tasks)
+
+    result = [
         {
             "name": file_path,
             "description": file_path,
-            "content": source_code,
+            "content": retrieved_files[file_path],
         }
-        for file_path, source_code in retrieved_files.items()
+        for file_path in paths
     ]
+
+    return result
```
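A hedged usage sketch of the new retriever. The import path is a guess, since the rendered diff does not show where this module lives in the package, and the query string is only an example; the returned shape (a list of dicts with `name`, `description`, and `content`) comes from the diff above.

```python
import asyncio

# Hypothetical import path; the PR view does not show the module's actual location.
from cognee.modules.retrieval.code_graph_retrieval import code_graph_retrieval


async def main():
    # Each result describes one source file reached through the code graph.
    results = await code_graph_retrieval("Where is the Python source file parser defined?")
    for item in results:
        print(item["name"])            # file path, also used as the description
        print(item["content"][:200])   # first 200 characters of the file's source code


if __name__ == "__main__":
    asyncio.run(main())
```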