Cog 813 source code chunks #383
Merged
Commits (19)
682ddfc fix: pass the list of all CodeFiles to enrichment task (lxobr)
d40fc12 feat: introduce SourceCodeChunk, update metadata (lxobr)
aea7382 feat: get_source_code_chunks code graph pipeline task (lxobr)
1c5ca84 feat: integrate get_source_code_chunks task, comment out summarize_code (lxobr)
ccd5cb6 Fix code summarization (#387) (alekszievr)
5deb83e feat: update data models (lxobr)
ace45ef feat: naive parse long strings in source code (lxobr)
b524e94 Merge branch 'dev' into COG-813-source-code-chunks (lxobr)
c1539f6 fix: get_non_py_files instead of get_non_code_files (lxobr)
d2911c1 fix: limit recursion, add comment (lxobr)
c6f4eb1 Merge branch 'dev' into COG-813-source-code-chunks (lxobr)
762df11 handle embedding empty input error (#398) (alekszievr)
ce6f730 feat: robustly handle CodeFile source code (lxobr)
c50d0c7 refactor: sort imports (lxobr)
07dcf73 todo: add support for other embedding models (lxobr)
35071b5 feat: add custom logger (lxobr)
68a9d27 feat: add robustness to get_source_code_chunks (lxobr)
cf63dbc feat: improve embedding exceptions (lxobr)
f5fa3ec refactor: format indents, rename module (lxobr)
New file (140 lines added):

```python
from typing import AsyncGenerator, Generator
from uuid import NAMESPACE_OID, uuid5

import parso
import tiktoken

from cognee.infrastructure.engine import DataPoint
from cognee.shared.CodeGraphEntities import CodePart, SourceCodeChunk, CodeFile
from cognee.tasks.repo_processor import logger


def _count_tokens(tokenizer: tiktoken.Encoding, source_code: str) -> int:
    return len(tokenizer.encode(source_code))
```
```python
def _get_naive_subchunk_token_counts(
    tokenizer: tiktoken.Encoding, source_code: str, max_subchunk_tokens: int = 8000
) -> list[tuple[str, int]]:
    """Splits source code into subchunks of up to max_subchunk_tokens and counts tokens."""
    token_ids = tokenizer.encode(source_code)
    subchunk_token_counts = []

    for start_idx in range(0, len(token_ids), max_subchunk_tokens):
        subchunk_token_ids = token_ids[start_idx: start_idx + max_subchunk_tokens]
        token_count = len(subchunk_token_ids)
        subchunk = ''.join(
            tokenizer.decode_single_token_bytes(token_id).decode('utf-8', errors='replace')
            for token_id in subchunk_token_ids
        )
        subchunk_token_counts.append((subchunk, token_count))

    return subchunk_token_counts
```
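The naive splitter simply windows the token-id list into fixed-size slices. A minimal standalone sketch of the same windowing, using plain integers in place of tiktoken token ids (the function name here is hypothetical, not part of the PR):

```python
def naive_subchunk_token_counts(token_ids, max_subchunk_tokens=4):
    # Window the token ids into fixed-size subchunks, mirroring the
    # range(0, len(token_ids), max_subchunk_tokens) loop above.
    out = []
    for start in range(0, len(token_ids), max_subchunk_tokens):
        window = token_ids[start:start + max_subchunk_tokens]
        out.append((window, len(window)))
    return out

# Ten "tokens" with a limit of 4 yield windows of 4, 4 and 2 tokens.
chunks = naive_subchunk_token_counts(list(range(10)), max_subchunk_tokens=4)
print([count for _, count in chunks])  # → [4, 4, 2]
```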
```python
def _get_subchunk_token_counts(
    tokenizer: tiktoken.Encoding, source_code: str, max_subchunk_tokens: int = 8000
) -> list[tuple[str, int]]:
    """Splits source code into subchunks and counts tokens for each subchunk."""
    try:
        module = parso.parse(source_code)
    except Exception as e:
        logger.error(f"Error parsing source code: {e}")
        return []

    if not module.children:
        logger.warning("Parsed module has no children (empty or invalid source code).")
        return []

    if len(module.children) <= 2:
        # A single top-level node (plus endmarker): descend into it so the
        # recursive call below does not re-parse the same code forever.
        module = module.children[0]

    subchunk_token_counts = []
    for child in module.children:
        subchunk = child.get_code()
        token_count = _count_tokens(tokenizer, subchunk)

        if token_count == 0:
            continue

        if token_count <= max_subchunk_tokens:
            subchunk_token_counts.append((subchunk, token_count))
            continue

        if child.type == 'string':
            subchunk_token_counts.extend(_get_naive_subchunk_token_counts(tokenizer, subchunk, max_subchunk_tokens))
            continue

        subchunk_token_counts.extend(_get_subchunk_token_counts(tokenizer, subchunk, max_subchunk_tokens))

    return subchunk_token_counts
```
```python
def _get_chunk_source_code(
    code_token_counts: list[tuple[str, int]], overlap: float, max_tokens: int
) -> tuple[list[tuple[str, int]], str]:
    """Generates a chunk of source code from tokenized subchunks with overlap handling."""
    current_count = 0
    cumulative_counts = []
    current_source_code = ''

    for child_code, token_count in code_token_counts:
        current_count += token_count
        cumulative_counts.append(current_count)
        if current_count > max_tokens:
            break
        current_source_code += f"\n{child_code}"

    if current_count <= max_tokens:
        return [], current_source_code.strip()

    cutoff = 1
    for i, cum_count in enumerate(cumulative_counts):
        if cum_count > (1 - overlap) * max_tokens:
            break
        cutoff = i

    return code_token_counts[cutoff:], current_source_code.strip()
```
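The overlap arithmetic can be exercised in isolation: the tail carried into the next chunk starts at the last subchunk whose cumulative count is still at or below `(1 - overlap) * max_tokens`. Below is a standalone copy of the helper above, run on toy inputs (no tokenizer needed, since it only sees precomputed counts):

```python
def chunk_with_overlap(code_token_counts, overlap, max_tokens):
    # Greedily fill one chunk, then keep the subchunks past the
    # (1 - overlap) * max_tokens cutoff as the next chunk's prefix.
    current_count = 0
    cumulative_counts = []
    current_source_code = ''
    for child_code, token_count in code_token_counts:
        current_count += token_count
        cumulative_counts.append(current_count)
        if current_count > max_tokens:
            break
        current_source_code += f"\n{child_code}"

    if current_count <= max_tokens:
        return [], current_source_code.strip()

    cutoff = 1
    for i, cum_count in enumerate(cumulative_counts):
        if cum_count > (1 - overlap) * max_tokens:
            break
        cutoff = i
    return code_token_counts[cutoff:], current_source_code.strip()

# Four 30-token subchunks, max_tokens=100, overlap=0.25: the chunk takes
# "a", "b", "c" (90 tokens); the cutoff is 75 tokens, so "b" onward carries over.
remaining, chunk = chunk_with_overlap(
    [("a", 30), ("b", 30), ("c", 30), ("d", 30)], overlap=0.25, max_tokens=100
)
print(chunk)                         # → a\nb\nc (newline-joined)
print([c for c, _ in remaining])     # → ['b', 'c', 'd']
```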
```python
def get_source_code_chunks_from_code_part(
    code_file_part: CodePart,
    max_tokens: int = 8192,
    overlap: float = 0.25,
    granularity: float = 0.1,
    model_name: str = "text-embedding-3-large"
) -> Generator[SourceCodeChunk, None, None]:
    """Yields source code chunks from a CodePart object, with configurable token limits and overlap."""
    tokenizer = tiktoken.encoding_for_model(model_name)
    max_subchunk_tokens = max(1, int(granularity * max_tokens))
    subchunk_token_counts = _get_subchunk_token_counts(tokenizer, code_file_part.source_code, max_subchunk_tokens)

    previous_chunk = None
    while subchunk_token_counts:
        subchunk_token_counts, chunk_source_code = _get_chunk_source_code(subchunk_token_counts, overlap, max_tokens)
        if not chunk_source_code:
            continue
        current_chunk = SourceCodeChunk(
            id=uuid5(NAMESPACE_OID, chunk_source_code),
            code_chunk_of=code_file_part,
            source_code=chunk_source_code,
            previous_chunk=previous_chunk
        )
        yield current_chunk
        previous_chunk = current_chunk
```
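Chunks form a linked list through `previous_chunk`, and ids are deterministic: `uuid5(NAMESPACE_OID, chunk_source_code)` maps identical text to the same id across runs. A minimal sketch of that linking, using a hypothetical plain dataclass in place of cognee's SourceCodeChunk:

```python
from dataclasses import dataclass
from typing import Optional
from uuid import NAMESPACE_OID, UUID, uuid5


@dataclass
class Chunk:  # hypothetical stand-in for SourceCodeChunk
    id: UUID
    source_code: str
    previous_chunk: Optional["Chunk"] = None


def link_chunks(texts):
    # Mirror the previous_chunk threading in the generator above.
    previous = None
    chunks = []
    for text in texts:
        current = Chunk(id=uuid5(NAMESPACE_OID, text), source_code=text, previous_chunk=previous)
        chunks.append(current)
        previous = current
    return chunks


chunks = link_chunks(["def a(): ...", "def b(): ..."])
# The second chunk points back at the first; equal text always gets the same id.
assert chunks[0].previous_chunk is None
assert chunks[1].previous_chunk is chunks[0]
assert chunks[0].id == uuid5(NAMESPACE_OID, "def a(): ...")
```

Deterministic ids mean re-ingesting unchanged source produces the same chunk nodes rather than duplicates.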
```python
async def get_source_code_chunks(
    data_points: list[DataPoint], embedding_model="text-embedding-3-large"
) -> AsyncGenerator[list[DataPoint], None]:
    """Processes code graph data points, creating SourceCodeChunk data points."""
    for data_point in data_points:
        yield data_point
        if not isinstance(data_point, CodeFile):
            continue
        if not data_point.contains:
            continue
        for code_part in data_point.contains:
            yield code_part
            for source_code_chunk in get_source_code_chunks_from_code_part(code_part, model_name=embedding_model):
                yield source_code_chunk
```
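The task is an async generator that re-yields each incoming data point and interleaves derived items immediately after their parent. A toy sketch of the same yield order, with plain dicts standing in for cognee data points:

```python
import asyncio


async def expand(items):
    # Yield each item, then any derived sub-items, in stream order,
    # mirroring get_source_code_chunks' parent-then-children interleaving.
    for item in items:
        yield item
        for derived in item.get("parts", []):
            yield derived


async def main():
    stream = expand([
        {"name": "file.py", "parts": ["part1", "part2"]},
        {"name": "other.py"},
    ])
    return [x async for x in stream]


result = asyncio.run(main())
# Each file is followed by its own parts before the next file appears.
```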