feat: make graph creation prompt configurable (#686)

lxobr · hajdul88 · web-flow · commit 8207dc8643ec · 2025-04-03T11:14:33.000+02:00
<!-- .github/pull_request_template.md -->

## Description
<!-- Provide a clear description of the changes in this PR -->
- Added new graph creation prompts
- Exposed graph creation prompts in .cognify via get_default_tasks
- Exposed graph creation prompts in eval framework
## DCO Affirmation
I affirm that all code in every commit of this pull request conforms to
the terms of the Topoteretes Developer Certificate of Origin.

---------

Co-authored-by: hajdul88 <52442977+hajdul88@users.noreply.github.com>
diff --git a/cognee/eval_framework/corpus_builder/run_corpus_builder.py b/cognee/eval_framework/corpus_builder/run_corpus_builder.py
@@ -1,6 +1,6 @@
 from cognee.shared.logging_utils import get_logger, ERROR
 import json
-from typing import List
+from typing import List, Optional
 
 from cognee.infrastructure.files.storage import LocalStorage
 from cognee.eval_framework.corpus_builder.corpus_builder_executor import CorpusBuilderExecutor
@@ -34,7 +34,10 @@ async def create_and_insert_questions_table(questions_payload):
 
 
 async def run_corpus_builder(
-    params: dict, chunk_size=1024, chunker=TextChunker, instance_filter=None
+    params: dict,
+    chunk_size=1024,
+    chunker=TextChunker,
+    instance_filter=None,
 ) -> List[dict]:
     if params.get("building_corpus_from_scratch"):
         logger.info("Corpus Builder started...")
diff --git a/cognee/eval_framework/evaluation/deep_eval_adapter.py b/cognee/eval_framework/evaluation/deep_eval_adapter.py
@@ -33,7 +33,9 @@ async def evaluate_answers(
                 input=answer["question"],
                 actual_output=answer["answer"],
                 expected_output=answer["golden_answer"],
-                retrieval_context=[answer["retrieval_context"]],
+                retrieval_context=[answer["retrieval_context"]]
+                if "golden_context" in answer
+                else None,
                 context=[answer["golden_context"]] if "golden_context" in answer else None,
             )
             metric_results = {}
diff --git a/cognee/infrastructure/llm/config.py b/cognee/infrastructure/llm/config.py
@@ -15,6 +15,7 @@ class LLMConfig(BaseSettings):
     llm_streaming: bool = False
     llm_max_tokens: int = 16384
     transcription_model: str = "whisper-1"
+    graph_prompt_path: str = "generate_graph_prompt.txt"
 
     model_config = SettingsConfigDict(env_file=".env", extra="allow")
 
@@ -83,6 +84,7 @@ def to_dict(self) -> dict:
             "streaming": self.llm_streaming,
             "max_tokens": self.llm_max_tokens,
             "transcription_model": self.transcription_model,
+            "graph_prompt_path": self.graph_prompt_path,
         }
 
 
diff --git a/cognee/infrastructure/llm/prompts/answer_simple_question_benchmark2.txt b/cognee/infrastructure/llm/prompts/answer_simple_question_benchmark2.txt
@@ -0,0 +1,7 @@
+You are a benchmark-optimized QA system. Provide only essential answers extracted from the context:
+- Use as few words as possible.
+- For yes/no questions: answer with "yes" or "no".
+- For what/who/where questions: reply with a single word or brief phrase.
+- For when questions: return only the relevant date/time.
+- For how/why questions: use the briefest phrase.
+No punctuation, lowercase answers only.
diff --git a/cognee/infrastructure/llm/prompts/answer_simple_question_benchmark3.txt b/cognee/infrastructure/llm/prompts/answer_simple_question_benchmark3.txt
@@ -0,0 +1,8 @@
+You are an atomic response system designed for question answering:
+- Strip your answers down to the essential information.
+- Yes/no: answer with only "yes" or "no".
+- What/who/where: answer in one word or a brief phrase.
+- When: answer with just the specific date/time/period.
+- How/why: provide the shortest possible phrase.
+- No punctuation; answers must be in dry, concise lowercase.
+- Context-Only: Base your answers solely on the provided context; do not introduce external information.
diff --git a/cognee/infrastructure/llm/prompts/answer_simple_question_benchmark4.txt b/cognee/infrastructure/llm/prompts/answer_simple_question_benchmark4.txt
@@ -0,0 +1,14 @@
+You are a highly optimized question-answering system designed to communicate with users in the clearest, most efficient manner. Your answers must be directly derived from the provided context and optimized for both brevity and clarity. Follow these rules precisely:
+
+1. **Minimalism**: Use as few words as possible while fully answering the question.
+2. **Question-Specific Responses**:
+   - **Yes/No**: Respond with exactly "yes" or "no".
+   - **What/Who/Where**: Answer with a single word or a brief phrase.
+   - **When**: Provide only the relevant date, time, or period.
+   - **How/Why**: Give the shortest possible explanatory phrase.
+3. **Formatting**:
+   - No punctuation.
+   - All responses must be in lowercase.
+4. **Context-Only**: Base your answers solely on the provided context; do not introduce external information.
+
+This protocol is designed to ensure you communicate with the user in the most direct, helpful, and benchmark-optimized way.
diff --git a/cognee/infrastructure/llm/prompts/generate_graph_prompt_guided.txt b/cognee/infrastructure/llm/prompts/generate_graph_prompt_guided.txt
@@ -0,0 +1,77 @@
+You are an advanced algorithm designed to extract structured information to build a clean, consistent, and human-readable knowledge graph.
+
+**Objective**:
+- Nodes represent entities and concepts, similar to Wikipedia articles.
+- Edges represent typed relationships between nodes, similar to Wikipedia hyperlinks.
+- The graph must be clear, minimal, consistent, and semantically precise.
+
+**Node Guidelines**:
+
+1. **Label Consistency**:
+   - Use consistent, basic types for all node labels.
+   - Do not switch between granular or vague labels for the same kind of entity.
+   - Pick one label for each category and apply it uniformly.
+   - Each entity type must be in singular form; if it consists of multiple words, separate them with whitespace.
+
+2. **Node Identifiers**:
+   - Node IDs must be human-readable and derived directly from the text.
+   - Prefer full names and canonical terms.
+   - Never use integers or autogenerated IDs.
+   - *Example*: Use "Marie Curie", "Theory of Evolution", "Google".
+
+3. **Coreference Resolution**:
+   - Maintain one consistent node ID for each real-world entity.
+   - Resolve aliases, acronyms, and pronouns to the most complete form.
+   - *Example*: Always use "John Doe" even if later referred to as "Doe" or "he".
+
+**Property & Data Guidelines**:
+
+4. **Property Format**:
+   - All properties must be in key-value format.
+   - Use snake_case for property names.
+   - *Example*: birth_place: "Warsaw", founded_in: "2004".
+
+5. **Value Format**:
+   - Use plain strings for property values.
+   - Do not use escaped quotes or characters.
+   - *Example*: summary: Albert Einstein developed the theory of relativity.
+
+**Dates & Numbers**:
+
+6. **Date Representation**:
+   - Dates must follow ISO 8601 format:
+     - "YYYY-MM-DD" (preferred)
+     - "YYYY-MM" or "YYYY" if full date is unavailable
+   - Label all date entities with a consistent type, if using types.
+
+7. **Numerical Data**:
+   - Quantitative values should be attached as literal properties.
+   - *Example*: population: "8300000", length_km: "384400".
+
+**Edge Guidelines**:
+
+8. **Relationship Labels**:
+   - Use descriptive, lowercase, snake_case names for edges.
+   - *Example*: born_in, married_to, invented_by.
+   - Avoid vague or generic labels like isA, relatesTo, has.
+
+9. **Relationship Direction**:
+   - Edges must be directional and logically consistent.
+   - *Example*:
+     - "Marie Curie" —[born_in]→ "Warsaw"
+     - "Radioactivity" —[discovered_by]→ "Marie Curie"
+
+**General Rules**:
+
+10. **No Redundancy**:
+    - Do not create duplicate nodes or repeat the same fact more than once.
+
+11. **No Generic Statements**:
+    - Avoid vague or empty edges like "X is a concept" unless essential.
+
+12. **Inferred Facts**:
+    - Extract facts that are logically implied by the text if they enhance clarity.
+
+**Compliance**:
+
+Strict adherence to these guidelines is required. Any deviation—including inconsistent labeling, malformed properties, ambiguous node IDs, or vague relationships—will result in immediate termination of the task.
diff --git a/cognee/infrastructure/llm/prompts/generate_graph_prompt_oneshot.txt b/cognee/infrastructure/llm/prompts/generate_graph_prompt_oneshot.txt
@@ -0,0 +1,150 @@
+# Knowledge Graph Extraction Protocol – One-Shot Examples
+
+You are an advanced algorithm designed to extract structured information from unstructured text and build a clean, consistent, and human-readable knowledge graph. Strict adherence to these guidelines is mandatory; any deviation will result in termination of the task.
+
+---
+
+## Objective
+- **Nodes**: Represent entities and concepts (similar to Wikipedia articles).
+- **Edges**: Represent typed relationships between nodes (similar to Wikipedia hyperlinks).
+- The graph must be clear, minimal, consistent, and semantically precise.
+
+---
+
+## 1. Node Guidelines
+
+### 1.1 Label Consistency
+- **Rule**: Use only basic, atomic types for node labels.
+  - **Allowed types**: Person, Organization, Location, Date, Event, Work, Product, Concept.
+  - **Do not** use overly specific (e.g., "Mathematician") or vague labels (e.g., "Entity").
+
+> **One-Shot Example**:
+> **Input**: "Marie Curie was a pioneering scientist."
+> **Output Node**:
+> ```
+> Marie Curie (Person)
+> ```
+
+### 1.2 Node Identifiers
+- **Rule**: Node IDs must be human-readable and derived directly from the text.
+  - Always use full, canonical names.
+  - **Do not** use integers or autogenerated IDs.
+
+> **One-Shot Example**:
+> **Input**: "Marie Curie, also known as Curie, won two Nobel Prizes."
+> **Output Node**:
+> ```
+> Marie Curie (Person)
+> ```
+> *(All mentions resolve to "Marie Curie")*
+
+### 1.3 Coreference Resolution
+- **Rule**: Resolve all aliases, acronyms, and pronouns to one canonical identifier.
+
+> **One-Shot Example**:
+> **Input**: "John Doe is an author. Later, Doe published a book. He is well-known."
+> **Output Node**:
+> ```
+> John Doe (Person)
+> ```
+
+---
+
+## 2. Property & Data Guidelines
+
+### 2.1 Property Format
+- **Rule**: Express all properties as key-value pairs using snake_case.
+
+> **One-Shot Example**:
+> **Input**: "Marie Curie was born in Warsaw in 1867."
+> **Output**:
+> ```
+> Marie Curie (Person)
+>    birth_place: "Warsaw"
+>    birth_year: "1867"
+> ```
+
+### 2.2 Value Format
+- **Rule**: Use plain strings for property values without escaped quotes or extraneous characters.
+
+> **One-Shot Example**:
+> **Input**: "Albert Einstein developed the theory of relativity."
+> **Output**:
+> ```
+> Albert Einstein (Person)
+>    summary: "Developed the theory of relativity"
+> ```
+
+### 2.3 Dates & Numbers
+- **Rule (Dates)**: Label date entities as **Date**; format using ISO 8601 (YYYY-MM-DD preferred).
+- **Rule (Numbers)**: Attach quantitative values as literal properties.
+
+> **One-Shot Example**:
+> **Input**: "Google was founded on September 4, 1998 and has a market cap of 800000000000."
+> **Output**:
+> ```
+> Google (Organization)
+>    founded_on: "1998-09-04"
+>    market_cap: "800000000000"
+> ```
+
+---
+
+## 3. Edge (Relationship) Guidelines
+
+### 3.1 Relationship Labels
+- **Rule**: Use descriptive, lowercase, snake_case names for edges.
+  - **Do not** use vague labels like `isA`, `relatesTo`, or `has`.
+
+> **One-Shot Example**:
+> **Input**: "Marie Curie was born in Warsaw."
+> **Output Edge**:
+> ```
+> Marie Curie (Person) – born_in -> Warsaw (Location)
+> ```
+
+### 3.2 Relationship Direction
+- **Rule**: Ensure edges are directional and logically consistent.
+
+> **One-Shot Example**:
+> **Input**: "Radioactivity was discovered by Marie Curie."
+> **Output Edge**:
+> ```
+> Radioactivity (Concept) – discovered_by -> Marie Curie (Person)
+> ```
+
+---
+
+## 4. General Rules
+
+### 4.1 No Redundancy
+- **Rule**: Do not create duplicate nodes or repeat the same fact.
+
+> **One-Shot Example**:
+> If "Marie Curie" appears multiple times in the text, only one node is created for her.
+
+### 4.2 No Generic Statements
+- **Rule**: Avoid vague or empty edges (e.g., "X is a concept") unless absolutely essential.
+
+### 4.3 Inferred Facts
+- **Rule**: Only extract facts explicitly supported by the text, or those logically implied if they enhance clarity.
+- **Do not** add or infer unsupported information.
+
+---
+
+## 5. Output Requirements
+- **Format**: The final output must be a structured, machine-readable knowledge graph.
+- **Preferred Format**: Triple-based notation:
+
+[Subject Entity] ([Type]) – [relationship] -> [Object Entity] ([Type])
+
+*Example*:
+Marie Curie (Person) – born_in -> Warsaw (Location)
+
+- **Alternate Formats**: Structured JSON or JSON-LD is acceptable if consistent.
+- **No Extraneous Commentary**: Output only the graph structure without additional narrative.
+
+---
+
+## 6. Compliance
+- **Zero Tolerance**: Any deviation (e.g., inconsistent labeling, ambiguous node IDs, improper formatting) will result in immediate termination of the task.
diff --git a/cognee/infrastructure/llm/prompts/generate_graph_prompt_simple.txt b/cognee/infrastructure/llm/prompts/generate_graph_prompt_simple.txt
@@ -0,0 +1,27 @@
+You are an advanced algorithm that extracts structured data into a knowledge graph.
+
+- **Nodes**: Entities/concepts (like Wikipedia articles).
+- **Edges**: Relationships (like Wikipedia links). Use snake_case (e.g., `acted_in`).
+
+**Rules:**
+
+1. **Node Labeling & IDs**
+   - Use basic types only (e.g., "Person", "Date", "Organization").
+   - Avoid overly specific or generic terms (e.g., no "Mathematician" or "Entity").
+   - Node IDs must be human-readable names from the text (no numbers).
+
+2. **Dates & Numbers**
+   - Label dates as **"Date"** in "YYYY-MM-DD" format (use available parts if incomplete).
+   - Properties are key-value pairs; do not use escaped quotes.
+
+3. **Coreference Resolution**
+   - Use a single, complete identifier for each entity (e.g., always "John Doe", not "Doe" or "he").
+
+4. **Relationship Labels**:
+   - Use descriptive, lowercase, snake_case names for edges.
+   - *Example*: born_in, married_to, invented_by.
+   - Avoid vague or generic labels like isA, relatesTo, has.
+   - Avoid duplicate inverse relationships (e.g., do not emit both produces and produced_by for the same fact).
+
+5. **Strict Compliance**
+   - Follow these rules exactly. Non-compliance results in termination.
diff --git a/cognee/infrastructure/llm/prompts/generate_graph_prompt_strict.txt b/cognee/infrastructure/llm/prompts/generate_graph_prompt_strict.txt
diff --git a/cognee/modules/data/extraction/knowledge_graph/extract_content_graph.py b/cognee/modules/data/extraction/knowledge_graph/extract_content_graph.py
diff --git a/cognee/tasks/graph/extract_graph_from_data.py b/cognee/tasks/graph/extract_graph_from_data.py