Graph conversion transforms Pydantic models into NetworkX directed graphs, creating nodes for entities and edges for relationships. This is the foundation of knowledge graph creation.
In this guide:
- Conversion process
- Node and edge creation
- Node ID registry
- Graph validation
- Automatic cleanup
--8<-- "docs/assets/flowcharts/conversion_process.md"
from docling_graph.core.converters import GraphConverter
# Create converter
converter = GraphConverter()
# Convert models to graph
graph, metadata = converter.pydantic_list_to_graph(models)
print(f"Created graph with {metadata.node_count} nodes and {metadata.edge_count} edges")from docling_graph.core.converters import GraphConverter
converter = GraphConverter(
add_reverse_edges=False, # Don't create bidirectional edges
validate_graph=True, # Validate structure
auto_cleanup=True # Remove phantom nodes
)
graph, metadata = converter.pydantic_list_to_graph(models)Entities (models with is_entity=True) become nodes:
from pydantic import BaseModel
# ✅ Becomes a node
class Organization(BaseModel):
name: str
model_config = {"is_entity": True} # Default
# ❌ Does NOT become a node
class Address(BaseModel):
street: str
city: str
model_config = {"is_entity": False} # Component# Node in graph
{
"id": "organization_acme_corp",
"label": "Organization",
"type": "entity",
"__class__": "Organization",
"name": "Acme Corp",
"address": None # Reference to nested entity
}Edges are created automatically from model relationships:
class BillingDocument(BaseModel):
document_no: str
issued_by: Organization # Creates edge: BillingDocument -> Organization
line_items: List[LineItem] # Creates edges: BillingDocument -> LineItem (multiple)# Edge in graph
{
"source": "invoice_001",
"target": "organization_acme_corp",
"label": "issued_by",
"properties": {}
}from pydantic import BaseModel, Field
class BillingDocument(BaseModel):
issued_by: Organization = Field(
json_schema_extra={"edge_label": "ISSUED_BY"}
)Result: Edge label becomes ISSUED_BY instead of issued_by
The NodeIDRegistry ensures consistent, deterministic node IDs across multiple extractions.
# Same entity always gets same ID
org1 = Organization(name="Acme Corp")
org2 = Organization(name="Acme Corp")
# Both get ID: "organization_acme_corp"
id1 = registry.get_node_id(org1)
id2 = registry.get_node_id(org2)
assert id1 == id2 # Truedef generate_node_id(model: BaseModel) -> str:
"""Generate deterministic node ID."""
class_name = model.__class__.__name__.lower()
# Use stable fields for identity
stable_fields = {
k: v for k, v in model.model_dump().items()
if k not in {"id", "__class__"} and v is not None
}
# Create content hash
content = json.dumps(stable_fields, sort_keys=True)
hash_suffix = hashlib.blake2b(content.encode()).hexdigest()[:8]
return f"{class_name}_{hash_suffix}"Validation checks graph structure:
converter = GraphConverter(validate_graph=True)
graph, metadata = converter.pydantic_list_to_graph(models)
# Validates:
# - No isolated nodes
# - Valid node IDs
# - Valid edge connections
# - No self-loops (optional)from docling_graph.core.utils import validate_graph_structure
try:
validate_graph_structure(graph, raise_on_error=True)
print("✅ Graph structure valid")
except ValueError as e:
print(f"❌ Validation failed: {e}")Automatic cleanup removes:
- Phantom nodes - Nodes with no data
- Duplicate nodes - Same entity multiple times
- Orphaned edges - Edges to non-existent nodes
- Empty attributes - Null or empty values
converter = GraphConverter(
auto_cleanup=True # Enable cleanup (default)
)
graph, metadata = converter.pydantic_list_to_graph(models)from docling_graph.core.utils import GraphCleaner
cleaner = GraphCleaner(verbose=True)
cleaned_graph = cleaner.clean_graph(graph)
print(f"Removed {graph.number_of_nodes() - cleaned_graph.number_of_nodes()} phantom nodes")from docling_graph.core.converters import GraphConverter
from my_templates import BillingDocument, Organization, LineItem
# Create sample models
models = [
BillingDocument(
document_no="INV-001",
issued_by=Organization(name="Acme Corp"),
line_items=[
LineItem(description="Product A", total=100),
LineItem(description="Product B", total=200)
],
total=300
)
]
# Convert to graph
converter = GraphConverter()
graph, metadata = converter.pydantic_list_to_graph(models)
print(f"Nodes: {metadata.node_count}")
print(f"Edges: {metadata.edge_count}")
print(f"Node types: {metadata.node_types}")from docling_graph.core.converters import GraphConverter
# Create bidirectional edges
converter = GraphConverter(add_reverse_edges=True)
graph, metadata = converter.pydantic_list_to_graph(models)
# Original edge: BillingDocument -> Organization (ISSUED_BY)
# Reverse edge: Organization -> Invoice (reverse_ISSUED_BY)
print(f"Total edges (with reverse): {metadata.edge_count}")from docling_graph.core.converters import GraphConverter, NodeIDRegistry
# Create shared registry
registry = NodeIDRegistry()
# Convert first batch
converter1 = GraphConverter(registry=registry)
graph1, _ = converter1.pydantic_list_to_graph(batch1_models)
# Convert second batch (same registry)
converter2 = GraphConverter(registry=registry)
graph2, _ = converter2.pydantic_list_to_graph(batch2_models)
# Same entities get same IDs across batches
print(f"Registry has {registry.get_stats()['total_entities']} unique entities")from docling_graph.core.converters import GraphConverter, GraphConfig
# Create custom config
config = GraphConfig(
add_reverse_edges=True,
validate_graph=True,
node_id_prefix="doc_"
)
converter = GraphConverter(config=config)
graph, metadata = converter.pydantic_list_to_graph(models)@dataclass
class GraphMetadata:
node_count: int
edge_count: int
node_types: Dict[str, int]
edge_types: Dict[str, int]
avg_degree: float
density: float
source_model_count: intgraph, metadata = converter.pydantic_list_to_graph(models)
print(f"Graph Statistics:")
print(f" Nodes: {metadata.node_count}")
print(f" Edges: {metadata.edge_count}")
print(f" Density: {metadata.density:.2f}")
print(f" Avg degree: {metadata.avg_degree:.2f}")
print(f"\nNode Types:")
for node_type, count in metadata.node_types.items():
print(f" {node_type}: {count}")
print(f"\nEdge Types:")
for edge_type, count in metadata.edge_types.items():
print(f" {edge_type}: {count}")Create bidirectional relationships:
converter = GraphConverter(add_reverse_edges=True)
graph, metadata = converter.pydantic_list_to_graph(models)
# For each edge A -> B, creates B -> A
# Useful for graph traversal in both directionsProvide custom node ID logic:
from docling_graph.core.converters import NodeIDRegistry
class CustomRegistry(NodeIDRegistry):
def generate_node_id(self, model: BaseModel) -> str:
# Custom ID generation
return f"custom_{model.__class__.__name__}_{hash(model)}"
registry = CustomRegistry()
converter = GraphConverter(registry=registry)# Process large model lists efficiently
converter = GraphConverter(auto_cleanup=True)
# Convert in single call (efficient)
graph, metadata = converter.pydantic_list_to_graph(all_models)
# Don't convert one by one (inefficient)
# for model in models:
# graph, _ = converter.pydantic_list_to_graph([model])# For very large graphs
converter = GraphConverter(
auto_cleanup=True, # Remove unnecessary nodes
validate_graph=False # Skip validation for speed
)
graph, metadata = converter.pydantic_list_to_graph(models)
# Clear registry after conversion
converter.registry.clear()Solution:
# Check if models have entities
for model in models:
if hasattr(model, 'model_config'):
is_entity = model.model_config.get('is_entity', True)
print(f"{model.__class__.__name__}: is_entity={is_entity}")Solution:
# Ensure relationships are defined
class BillingDocument(BaseModel):
issued_by: Organization # Must be typed as entity
# Not: issued_by: dict # Won't create edgeSolution:
# Enable auto cleanup
converter = GraphConverter(auto_cleanup=True)
graph, metadata = converter.pydantic_list_to_graph(models)Solution:
# Check graph structure
print(f"Nodes: {graph.number_of_nodes()}")
print(f"Edges: {graph.number_of_edges()}")
# Inspect nodes
for node_id, data in list(graph.nodes(data=True))[:5]:
print(f"Node: {node_id}, Data: {data}")# ✅ Good - Consistent IDs across batches
registry = NodeIDRegistry()
for batch in batches:
converter = GraphConverter(registry=registry)
graph, _ = converter.pydantic_list_to_graph(batch)# ✅ Good - Clean graphs
converter = GraphConverter(auto_cleanup=True)# ✅ Good - Catch issues early
converter = GraphConverter(validate_graph=True)# ✅ Good - Faster in production
converter = GraphConverter(validate_graph=False)Now that you understand graph conversion:
- Export Formats → - Export graphs to CSV, Cypher, JSON
- Visualization → - Visualize your graphs
- Neo4j Integration → - Import into Neo4j