Skip to content
Merged
Show file tree
Hide file tree
Changes from 27 commits
Commits
Show all changes
187 commits
Select commit Hold shift + click to select a range
9be2d30
refactor: Standardize import statements and improve code readability …
deon-sanchez Jul 16, 2025
941bc81
[autofix.ci] apply automated fixes
autofix-ci[bot] Jul 16, 2025
4df3225
feat: Introduce new Files and Knowledge Bases page with tabbed interface
deon-sanchez Jul 17, 2025
c32d451
Create knowledgebase_utils.py
erichare Jul 17, 2025
75409c1
Push initial ingest component
erichare Jul 17, 2025
1c9a2aa
[autofix.ci] apply automated fixes
autofix-ci[bot] Jul 17, 2025
de3ade8
Create initial KB Ingestion component
erichare Jul 17, 2025
5ea7224
[autofix.ci] apply automated fixes
autofix-ci[bot] Jul 17, 2025
c22e59b
Fix ruff check on utility functions
erichare Jul 17, 2025
ccd0f79
[autofix.ci] apply automated fixes
autofix-ci[bot] Jul 17, 2025
b9f9e01
Some quick fixes
erichare Jul 17, 2025
c00f486
Update kb_ingest.py
erichare Jul 17, 2025
4ada462
Merge branch 'main' into feat-knowledge-bases
erichare Jul 17, 2025
cabf676
[autofix.ci] apply automated fixes
autofix-ci[bot] Jul 17, 2025
350461e
First version of retrieval component
erichare Jul 17, 2025
b0b62a3
[autofix.ci] apply automated fixes
autofix-ci[bot] Jul 17, 2025
7dad9d6
Update icon
erichare Jul 17, 2025
6a0f187
Update kb_retrieval.py
erichare Jul 17, 2025
8da44b2
[autofix.ci] apply automated fixes
autofix-ci[bot] Jul 17, 2025
0d25004
Merge branch 'lfoss-1813' into feat-knowledge-bases
deon-sanchez Jul 17, 2025
1247bed
Add knowledge bases feature with API integration and UI components
deon-sanchez Jul 17, 2025
66da30e
[autofix.ci] apply automated fixes
autofix-ci[bot] Jul 17, 2025
5951200
[autofix.ci] apply automated fixes (attempt 2/3)
autofix-ci[bot] Jul 17, 2025
d9c9cb9
Refactor imports and update routing paths for assets and main page co…
deon-sanchez Jul 17, 2025
75189e8
Merge branch 'feat-knowledge-bases' of https://github.com/langflow-ai…
deon-sanchez Jul 17, 2025
81367fb
Merge branch 'main' into feat-knowledge-bases
edwinjosechittilappilly Jul 17, 2025
d7940af
[autofix.ci] apply automated fixes
autofix-ci[bot] Jul 17, 2025
db49a96
Add CreateKnowledgeBaseButton, KnowledgeBaseEmptyState, and Knowledge…
deon-sanchez Jul 17, 2025
5503c78
Merge branch 'feat-knowledge-bases' of https://github.com/langflow-ai…
deon-sanchez Jul 17, 2025
845f0a7
[autofix.ci] apply automated fixes
autofix-ci[bot] Jul 17, 2025
ef94bcf
PoV: Add Parquet data retrieval to KBRetrievalComponent (#9097)
edwinjosechittilappilly Jul 17, 2025
6d82934
Fix some ruff issues
erichare Jul 17, 2025
79e3425
[autofix.ci] apply automated fixes
autofix-ci[bot] Jul 17, 2025
b43333f
Merge branch 'main' into feat-knowledge-bases
erichare Jul 17, 2025
109363c
Merge branch 'main' of https://github.com/langflow-ai/langflow into f…
deon-sanchez Jul 18, 2025
49c0db0
Merge branch 'main' into feat-knowledge-bases
erichare Jul 18, 2025
d7e5c33
Merge branch 'main' of https://github.com/langflow-ai/langflow into f…
deon-sanchez Jul 21, 2025
bd1d91f
feat: refactor file management and knowledge base components
deon-sanchez Jul 21, 2025
d5d2a5e
feat: implement delete confirmation modal for knowledge base deletion
deon-sanchez Jul 21, 2025
63dd4c9
feat: enhance knowledge base metadata with embedding model detection
deon-sanchez Jul 21, 2025
14b87c4
refactor: clean up tooltip and value getter comments in knowledge bas…
deon-sanchez Jul 21, 2025
8daab25
[autofix.ci] apply automated fixes
autofix-ci[bot] Jul 21, 2025
8268740
refactor: simplify KnowledgeBaseSelectionOverlay component
deon-sanchez Jul 21, 2025
c3d286b
feat: implement bulk and single deletion for knowledge bases
deon-sanchez Jul 21, 2025
388e98a
Merge branch 'main' into feat-knowledge-bases
erichare Jul 21, 2025
2c78dd0
Initial support for vector search
erichare Jul 21, 2025
2adcc77
feat: add KnowledgeBaseDrawer component for enhanced knowledge base d…
deon-sanchez Jul 21, 2025
c4bf9bf
Merge branch 'feat-knowledge-bases' of https://github.com/langflow-ai…
deon-sanchez Jul 21, 2025
3b88885
[autofix.ci] apply automated fixes
autofix-ci[bot] Jul 21, 2025
6b3a349
[autofix.ci] apply automated fixes (attempt 2/3)
autofix-ci[bot] Jul 21, 2025
4116cae
Fix ruff checks
erichare Jul 21, 2025
810c717
Update knowledge_bases.py
erichare Jul 21, 2025
c883ae1
feat: update mock data and enhance drawer functionality in KnowledgeB…
deon-sanchez Jul 21, 2025
24e7715
Merge branch 'feat-knowledge-bases' of https://github.com/langflow-ai…
deon-sanchez Jul 21, 2025
dd8855b
[autofix.ci] apply automated fixes
autofix-ci[bot] Jul 21, 2025
2c02cc0
Append scores column to rows
erichare Jul 21, 2025
0d36985
Merge branch 'main' into feat-knowledge-bases
erichare Jul 21, 2025
77bc57f
refactor: improve knowledge base deletion and UI components
deon-sanchez Jul 21, 2025
98766fc
Merge branch 'feat-knowledge-bases' of https://github.com/langflow-ai…
deon-sanchez Jul 21, 2025
9c7fb6a
refactor: standardize import statements and improve code readability …
deon-sanchez Jul 21, 2025
63fb9b9
feat: Add encryption for API keys in KB ingest and retrieval (#9129)
edwinjosechittilappilly Jul 22, 2025
049e39f
[autofix.ci] apply automated fixes
autofix-ci[bot] Jul 22, 2025
8ec1341
Merge branch 'main' into feat-knowledge-bases
erichare Jul 22, 2025
8adcd12
Merge branch 'main' into feat-knowledge-bases
erichare Jul 22, 2025
0ca5a67
Merge branch 'main' into feat-knowledge-bases
erichare Jul 22, 2025
f251c73
Merge branch 'main' into feat-knowledge-bases
erichare Jul 22, 2025
1def7f6
Fix import of auth utils
erichare Jul 22, 2025
9146f7e
Allow appending to existing knowledge base
erichare Jul 22, 2025
06211a6
[autofix.ci] apply automated fixes
autofix-ci[bot] Jul 22, 2025
d3a7120
Update kb_ingest.py
erichare Jul 22, 2025
67d5ae5
Update kb_ingest.py
erichare Jul 22, 2025
bc10c6e
Merge branch 'main' of https://github.com/langflow-ai/langflow into f…
deon-sanchez Jul 23, 2025
bad02f3
feat: enhance table component with editable Vectorize column function…
deon-sanchez Jul 23, 2025
fe36a36
New ingestion creation dialog
erichare Jul 23, 2025
d139d5b
[autofix.ci] apply automated fixes
autofix-ci[bot] Jul 23, 2025
4cb23b7
Clean up the creation process for KB
erichare Jul 23, 2025
6ece64b
[autofix.ci] apply automated fixes
autofix-ci[bot] Jul 23, 2025
69aed9a
Clean up names and descriptions
erichare Jul 23, 2025
bd4ae10
Update kb_retrieval.py
erichare Jul 23, 2025
1469ecf
Merge branch 'main' into feat-knowledge-bases
erichare Jul 23, 2025
a654109
chroma retrieval
erichare Jul 23, 2025
5d0916d
[autofix.ci] apply automated fixes
autofix-ci[bot] Jul 23, 2025
a8ea48e
Further KB cleanup
erichare Jul 23, 2025
4440e08
refactor: update KB ingestion component and enhance NodeDialog functi…
deon-sanchez Jul 23, 2025
93b5149
Merge branch 'feat-knowledge-bases' of https://github.com/langflow-ai…
deon-sanchez Jul 23, 2025
16555cd
Hash the text as id
erichare Jul 23, 2025
1e66ae2
[autofix.ci] apply automated fixes
autofix-ci[bot] Jul 23, 2025
1c4c209
Update kb_retrieval.py
erichare Jul 23, 2025
86c8e55
Merge branch 'main' into feat-knowledge-bases
erichare Jul 23, 2025
4b7de6d
Merge branch 'main' into feat-knowledge-bases
erichare Jul 23, 2025
4f49445
[autofix.ci] apply automated fixes
autofix-ci[bot] Jul 23, 2025
0a43c94
Make sure to write out the source parquet
erichare Jul 23, 2025
72d88c0
Remove unneeded old code
erichare Jul 23, 2025
2048c42
Merge branch 'main' into feat-knowledge-bases
erichare Jul 23, 2025
cf7d64d
Add ability to block duplicate ingestion chunks
erichare Jul 24, 2025
36fac5a
Merge branch 'main' into feat-knowledge-bases
erichare Jul 24, 2025
9341c41
[autofix.ci] apply automated fixes
autofix-ci[bot] Jul 24, 2025
45f14f7
[autofix.ci] apply automated fixes (attempt 2/3)
autofix-ci[bot] Jul 24, 2025
e6ab6cb
Rename retrieval component
erichare Jul 24, 2025
542984b
Better refresh mechanism for the retrieve
erichare Jul 24, 2025
4864640
Clean up some unused functionality
erichare Jul 24, 2025
3aeb0c5
Merge branch 'main' into feat-knowledge-bases
erichare Jul 24, 2025
8ab4368
Update kb_ingest.py
erichare Jul 24, 2025
80e223e
Fix dropdown component logic to include checks for refresh button and…
deon-sanchez Jul 24, 2025
9058976
Test the API key before saving knowledge
erichare Jul 24, 2025
03a8c2e
[autofix.ci] apply automated fixes
autofix-ci[bot] Jul 24, 2025
96ee3f4
Allow storing updated api keys if provided at ingest time
erichare Jul 24, 2025
896bf61
Merge branch 'main' into feat-knowledge-bases
erichare Jul 24, 2025
d3fc9e8
Add Knowledge Bases component and enhance Knowledge Base Empty State
deon-sanchez Jul 24, 2025
5718eb3
Merge branch 'feat-knowledge-bases' of https://github.com/langflow-ai…
deon-sanchez Jul 24, 2025
b33a3c9
[autofix.ci] apply automated fixes
autofix-ci[bot] Jul 24, 2025
602f39d
[autofix.ci] apply automated fixes (attempt 2/3)
autofix-ci[bot] Jul 24, 2025
502436d
Update Knowledge Bases.json
erichare Jul 24, 2025
00da454
Update Knowledge Bases configuration and enhance UI components
deon-sanchez Jul 24, 2025
76f0035
Merge branch 'feat-knowledge-bases' of https://github.com/langflow-ai…
deon-sanchez Jul 24, 2025
c9fbbdd
[autofix.ci] apply automated fixes
autofix-ci[bot] Jul 24, 2025
5dcf0b8
Implement feature flag for Knowledge Bases functionality
deon-sanchez Jul 24, 2025
14909d9
Merge branch 'feat-knowledge-bases' of https://github.com/langflow-ai…
deon-sanchez Jul 24, 2025
41ba6ec
[autofix.ci] apply automated fixes
autofix-ci[bot] Jul 24, 2025
3662d50
[autofix.ci] apply automated fixes (attempt 2/3)
autofix-ci[bot] Jul 24, 2025
20d4382
Refactor Knowledge Bases feature flag implementation
deon-sanchez Jul 24, 2025
de4edf7
Merge branch 'feat-knowledge-bases' of https://github.com/langflow-ai…
deon-sanchez Jul 24, 2025
1e7ffce
revert
deon-sanchez Jul 24, 2025
6e7b061
Merge branch 'main' of https://github.com/langflow-ai/langflow into f…
deon-sanchez Jul 24, 2025
8277cb6
Merge branch 'main' into feat-knowledge-bases
erichare Jul 25, 2025
ed009cd
[autofix.ci] apply automated fixes
autofix-ci[bot] Jul 25, 2025
8700133
Remove Knowledge Bases JSON configuration and clean up KnowledgeBases…
deon-sanchez Jul 25, 2025
aaaae03
[autofix.ci] apply automated fixes
autofix-ci[bot] Jul 25, 2025
02d4874
Enhance routing structure by adding admin and login routes with prote…
deon-sanchez Jul 25, 2025
ae0d378
Merge branch 'feat-knowledge-bases' of https://github.com/langflow-ai…
deon-sanchez Jul 25, 2025
43ef981
added template back
deon-sanchez Jul 25, 2025
9c21594
Use chroma for stats computation
erichare Jul 25, 2025
71eaf96
Fix ruff issue
erichare Jul 25, 2025
6ce2414
[autofix.ci] apply automated fixes
autofix-ci[bot] Jul 25, 2025
86334cf
Update Knowledge Bases.json
erichare Jul 25, 2025
d3d176f
Update Knowledge Bases.json
erichare Jul 25, 2025
dfcfe7b
Rename to just knowledge
erichare Jul 25, 2025
e072f0d
Merge branch 'main' into feat-knowledge-bases
erichare Jul 25, 2025
6645b25
Merge branch 'main' of https://github.com/langflow-ai/langflow into f…
deon-sanchez Jul 28, 2025
3efe3be
feat: enhance Jest configuration and add new tests for Knowledge Base…
deon-sanchez Jul 28, 2025
2dc9c55
[autofix.ci] apply automated fixes
autofix-ci[bot] Jul 28, 2025
8fa29e5
refactor: reorganize imports and clean up console log in Dropdown com…
deon-sanchez Jul 29, 2025
aacf468
[autofix.ci] apply automated fixes
autofix-ci[bot] Jul 29, 2025
f61689a
[autofix.ci] apply automated fixes (attempt 2/3)
autofix-ci[bot] Jul 29, 2025
6416d51
feat: add success callback for knowledge base creation in NodeDialog …
deon-sanchez Jul 29, 2025
b780edd
Merge branch 'feat-knowledge-bases' of https://github.com/langflow-ai…
deon-sanchez Jul 29, 2025
d20c2c6
refactor: update table component to handle single-toggle columns
deon-sanchez Jul 29, 2025
8c40cf7
Merge branch 'main' of https://github.com/langflow-ai/langflow into f…
deon-sanchez Jul 29, 2025
5536a3d
[autofix.ci] apply automated fixes
autofix-ci[bot] Jul 29, 2025
2a4dba8
feat: Add unit tests for KBIngestionComponent (#9246)
edwinjosechittilappilly Jul 30, 2025
de843c8
Merge branch 'main' of https://github.com/langflow-ai/langflow into f…
deon-sanchez Jul 31, 2025
fb45847
[autofix.ci] apply automated fixes
autofix-ci[bot] Jul 31, 2025
c053983
fix: remove unnecessary drawer open state change in KnowledgePage
deon-sanchez Jul 31, 2025
3f24571
[autofix.ci] apply automated fixes
autofix-ci[bot] Jul 31, 2025
62a1023
[autofix.ci] apply automated fixes (attempt 2/3)
autofix-ci[bot] Jul 31, 2025
e80a68e
Remove kb_info output from KBIngestionComponent (#9275)
edwinjosechittilappilly Jul 31, 2025
663b819
[autofix.ci] apply automated fixes
autofix-ci[bot] Jul 31, 2025
414a7b9
Update Knowledge Bases.json
edwinjosechittilappilly Aug 1, 2025
6498a83
Use settings service for knowledge base directory
edwinjosechittilappilly Aug 1, 2025
60c6da5
Merge branch 'main' of https://github.com/langflow-ai/langflow into f…
deon-sanchez Aug 1, 2025
4516cca
Fix knowledge bases mypy issue
erichare Aug 1, 2025
9121c1d
test: Update file page tests for consistency and clarity
deon-sanchez Aug 1, 2025
9a9717a
test: Update expected title in file upload component test for accuracy
deon-sanchez Aug 1, 2025
1871c1d
Merge branch 'feat-knowledge-bases' of https://github.com/langflow-ai…
deon-sanchez Aug 1, 2025
d8f3d0f
[autofix.ci] apply automated fixes
autofix-ci[bot] Aug 1, 2025
7565e95
Fix tests on backend
erichare Aug 1, 2025
b62a7eb
Merge branch 'main' into feat-knowledge-bases
erichare Aug 1, 2025
706040f
Update kb_ingest.py
erichare Aug 1, 2025
4072499
[autofix.ci] apply automated fixes
autofix-ci[bot] Aug 1, 2025
4ace8d8
Merge branch 'main' into feat-knowledge-bases
erichare Aug 11, 2025
baeb113
Merge branch 'main' into feat-knowledge-bases
erichare Aug 11, 2025
11d7b17
Merge branch 'main' into feat-knowledge-bases
erichare Aug 11, 2025
dda21d7
Merge branch 'main' into feat-knowledge-bases
erichare Aug 11, 2025
fd1b2ae
Merge branch 'main' into feat-knowledge-bases
edwinjosechittilappilly Aug 12, 2025
b6b60fa
Merge branch 'main' into feat-knowledge-bases
erichare Aug 12, 2025
600e0e9
Merge branch 'main' into feat-knowledge-bases
erichare Aug 12, 2025
933233a
Merge branch 'main' into feat-knowledge-bases
erichare Aug 12, 2025
d88b479
Merge branch 'main' into feat-knowledge-bases
erichare Aug 12, 2025
fb5294c
Merge branch 'main' into feat-knowledge-bases
erichare Aug 12, 2025
ef664d8
Merge branch 'main' into feat-knowledge-bases
erichare Aug 12, 2025
9c90eeb
Merge branch 'main' into feat-knowledge-bases
erichare Aug 12, 2025
a37c8a8
Switch to two templates for KB
erichare Aug 12, 2025
0600f8c
Merge branch 'main' into feat-knowledge-bases
erichare Aug 13, 2025
f831d9b
Update names and descs
erichare Aug 13, 2025
71ef5f5
[autofix.ci] apply automated fixes
autofix-ci[bot] Aug 13, 2025
58044d0
Rename templates
erichare Aug 13, 2025
4d49c95
[autofix.ci] apply automated fixes
autofix-ci[bot] Aug 13, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/backend/base/langflow/api/router.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
files_router,
flows_router,
folders_router,
knowledge_bases_router,
login_router,
mcp_projects_router,
mcp_router,
Expand Down Expand Up @@ -45,6 +46,7 @@
router_v1.include_router(folders_router)
router_v1.include_router(projects_router)
router_v1.include_router(starter_projects_router)
router_v1.include_router(knowledge_bases_router)
router_v1.include_router(mcp_router)
router_v1.include_router(voice_mode_router)
router_v1.include_router(mcp_projects_router)
Expand Down
2 changes: 2 additions & 0 deletions src/backend/base/langflow/api/v1/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from langflow.api.v1.files import router as files_router
from langflow.api.v1.flows import router as flows_router
from langflow.api.v1.folders import router as folders_router
from langflow.api.v1.knowledge_bases import router as knowledge_bases_router
from langflow.api.v1.login import router as login_router
from langflow.api.v1.mcp import router as mcp_router
from langflow.api.v1.mcp_projects import router as mcp_projects_router
Expand All @@ -23,6 +24,7 @@
"files_router",
"flows_router",
"folders_router",
"knowledge_bases_router",
"login_router",
"mcp_projects_router",
"mcp_router",
Expand Down
259 changes: 259 additions & 0 deletions src/backend/base/langflow/api/v1/knowledge_bases.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,259 @@
import json
import logging
from http import HTTPStatus
from pathlib import Path

import pandas as pd
from fastapi import APIRouter, HTTPException
from pydantic import BaseModel

# Registered on the v1 API router; every route below lives under the
# /knowledge_bases prefix.
router = APIRouter(tags=["Knowledge Bases"], prefix="/knowledge_bases")

# On-disk root for knowledge bases; "~" is expanded by get_kb_root_path().
KNOWLEDGE_BASES_DIR = "~/.langflow/knowledge_bases"


class KnowledgeBaseInfo(BaseModel):
    """Summary statistics for a single knowledge base, as returned by the API."""

    # Identifier of the knowledge base (presumably the directory name on
    # disk — TODO confirm against the listing endpoint).
    id: str
    name: str
    # Detected embedding provider (see detect_embedding_provider); "Unknown"
    # when no signal is found.
    embedding_provider: str | None = "Unknown"
    # Total size on disk, in bytes.
    size: int = 0
    # Aggregate word and character counts across the text columns.
    words: int = 0
    characters: int = 0
    # Number of stored chunks and their average size.
    chunks: int = 0
    avg_chunk_size: float = 0.0

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please also include the embedding provider and embedding model name in this response model.


def get_kb_root_path() -> Path:
    """Return the knowledge-bases root directory with ``~`` expanded."""
    root = Path(KNOWLEDGE_BASES_DIR)
    return root.expanduser()


def get_directory_size(path: Path) -> int:
    """Calculate the total size of all files in a directory, recursively.

    Args:
        path: Directory to measure.

    Returns:
        Sum of regular-file sizes in bytes. Best-effort: if the walk hits an
        unreadable entry, the partial total accumulated so far is returned.
    """
    total_size = 0
    try:
        for file_path in path.rglob("*"):
            if file_path.is_file():
                total_size += file_path.stat().st_size
    except OSError:
        # PermissionError is a subclass of OSError, so a single clause covers
        # both; the original (OSError, PermissionError) tuple was redundant.
        pass
    return total_size


def detect_embedding_provider(kb_path: Path) -> str:
    """Detect the embedding provider from config files and directory structure.

    Args:
        kb_path: Root directory of a single knowledge base.

    Returns:
        A provider name such as "OpenAI" or "Chroma", "Local" when only a
        ``vectors.npy`` file is present, or "Unknown" when nothing matches.
    """
    # Substrings that identify a provider when found in config values.
    provider_patterns = {
        "OpenAI": ["openai", "text-embedding-ada", "text-embedding-3"],
        "HuggingFace": ["sentence-transformers", "huggingface", "bert-"],
        "Cohere": ["cohere", "embed-english", "embed-multilingual"],
        "Google": ["palm", "gecko", "google"],
        "Chroma": ["chroma"],
    }

    # Check JSON config files for provider information.
    for config_file in kb_path.glob("*.json"):
        try:
            with config_file.open("r", encoding="utf-8") as f:
                config_data = json.load(f)
        except (OSError, json.JSONDecodeError, UnicodeDecodeError) as exc:
            # Narrow exceptions instead of a blind `except Exception` (Ruff
            # BLE001), and log the skip instead of discarding it (Ruff S112):
            # an unreadable or malformed config file is non-fatal.
            logging.getLogger(__name__).debug("Skipping config file %s: %s", config_file, exc)
            continue

        if not isinstance(config_data, dict):
            continue

        config_str = json.dumps(config_data).lower()

        # Check for explicit provider fields first.
        provider_fields = ["embedding_provider", "provider", "embedding_model_provider"]
        for field in provider_fields:
            if field in config_data:
                provider_value = str(config_data[field]).lower()
                for provider, patterns in provider_patterns.items():
                    if any(pattern in provider_value for pattern in patterns):
                        return provider

        # Otherwise look for model-name patterns anywhere in the config.
        for provider, patterns in provider_patterns.items():
            if any(pattern in config_str for pattern in patterns):
                return provider

    # Fallback to directory structure.
    if (kb_path / "chroma").exists():
        return "Chroma"
    if (kb_path / "vectors.npy").exists():
        return "Local"

    return "Unknown"


def get_text_columns(df: pd.DataFrame, schema_data: list = None) -> list[str]:

Check failure on line 89 in src/backend/base/langflow/api/v1/knowledge_bases.py

View workflow job for this annotation

GitHub Actions / Ruff Style Check (3.13)

Ruff (RUF013)

src/backend/base/langflow/api/v1/knowledge_bases.py:89:53: RUF013 PEP 484 prohibits implicit `Optional`
"""Get the text columns to analyze for word/character counts."""
# First try schema-defined text columns
if schema_data:
text_columns = [
col["column_name"]
for col in schema_data
if col.get("vectorize", False) and col.get("data_type") == "string"
]
if text_columns:
return [col for col in text_columns if col in df.columns]

# Fallback to common text column names
common_names = ["text", "content", "document", "chunk"]
text_columns = [col for col in df.columns if col.lower() in common_names]
if text_columns:
return text_columns

# Last resort: all string columns
return [col for col in df.columns if df[col].dtype == "object"]
Comment on lines +169 to +184
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚡️Codeflash found 907% (9.07x) speedup for get_text_columns in src/backend/base/langflow/api/v1/knowledge_bases.py

⏱️ Runtime : 28.7 milliseconds 2.85 milliseconds (best of 67 runs)

📝 Explanation and details Here is an optimized version of your code. The major bottleneck in your profiling is the last fallback (`[col for col in df.columns if df[col].dtype == "object"]`), which is slow since it examines the series dtype for every column. We can optimize this using `df.select_dtypes(include=["object"])` to select all string columns at once, which is much faster and vectorized in pandas.

Other improvements.

  • Set for Faster Membership Test: For the check [col for col in text_columns if col in df.columns], use a set for df.columns to speed up the lookup.
  • Optimize Lower Casing: For the [col for col in df.columns if col.lower() in common_names] part, make common_names a set (with all lowercase), and avoid repeatedly calling .lower() on every column.
  • Reduced redundant variables and preserved the exact return values as before.

Optimized code.

Summary of Optimizations.

  • Major speedup for fallback case by using select_dtypes, which internally does fast dtype matching.
  • Faster set-based membership test for column name lookups.
  • No change in function signature or output.
  • Preserved your comments.

This should give you a very significant speedup, especially when the fallback (all string columns) is used on large dataframes!

Correctness verification report:

Test Status
⚙️ Existing Unit Tests 🔘 None Found
🌀 Generated Regression Tests 48 Passed
⏪ Replay Tests 🔘 None Found
🔎 Concolic Coverage Tests 🔘 None Found
📊 Tests Coverage 100.0%
🌀 Generated Regression Tests and Runtime
import pandas as pd
# imports
import pytest  # used for our unit tests
from langflow.api.v1.knowledge_bases import get_text_columns

# unit tests

# ----------------------------
# BASIC TEST CASES
# ----------------------------

def test_schema_data_priority():
    # Should use schema_data to select columns
    df = pd.DataFrame({
        "text": ["a", "b"],
        "foo": ["x", "y"],
        "bar": [1, 2]
    })
    schema = [
        {"column_name": "foo", "vectorize": True, "data_type": "string"},
        {"column_name": "bar", "vectorize": True, "data_type": "int"},
        {"column_name": "text", "vectorize": False, "data_type": "string"}
    ]
    codeflash_output = get_text_columns(df, schema); result = codeflash_output

def test_schema_data_multiple_columns():
    # Multiple schema-defined text columns
    df = pd.DataFrame({
        "a": ["x", "y"],
        "b": ["u", "v"],
        "c": [1, 2]
    })
    schema = [
        {"column_name": "a", "vectorize": True, "data_type": "string"},
        {"column_name": "b", "vectorize": True, "data_type": "string"},
        {"column_name": "c", "vectorize": True, "data_type": "int"}
    ]
    codeflash_output = get_text_columns(df, schema); result = codeflash_output

def test_schema_data_column_not_in_df():
    # Schema defines a column not in df; should ignore it
    df = pd.DataFrame({
        "foo": ["x", "y"],
    })
    schema = [
        {"column_name": "foo", "vectorize": True, "data_type": "string"},
        {"column_name": "missing", "vectorize": True, "data_type": "string"}
    ]
    codeflash_output = get_text_columns(df, schema); result = codeflash_output

def test_fallback_common_names():
    # No schema_data, should fallback to common names
    df = pd.DataFrame({
        "text": ["a", "b"],
        "other": [1, 2]
    })
    codeflash_output = get_text_columns(df); result = codeflash_output

def test_fallback_common_names_case_insensitive():
    # Should match common names case-insensitively
    df = pd.DataFrame({
        "Text": ["a", "b"],
        "Content": ["c", "d"],
        "foo": [1, 2]
    })
    codeflash_output = get_text_columns(df); result = codeflash_output

def test_fallback_all_string_columns():
    # No schema, no common names, should fallback to all string columns
    df = pd.DataFrame({
        "foo": ["a", "b"],
        "bar": [1, 2],
        "baz": ["x", "y"]
    })
    codeflash_output = get_text_columns(df); result = codeflash_output

def test_no_string_columns():
    # No string columns at all
    df = pd.DataFrame({
        "a": [1, 2],
        "b": [3.0, 4.0]
    })
    codeflash_output = get_text_columns(df); result = codeflash_output

# ----------------------------
# EDGE TEST CASES
# ----------------------------

def test_empty_dataframe():
    # Empty DataFrame
    df = pd.DataFrame()
    codeflash_output = get_text_columns(df); result = codeflash_output

def test_empty_schema_data():
    # Empty schema_data should fallback to others
    df = pd.DataFrame({
        "text": ["a", "b"],
        "foo": ["x", "y"]
    })
    codeflash_output = get_text_columns(df, []); result = codeflash_output

def test_schema_data_all_missing_in_df():
    # All schema-defined columns missing in df
    df = pd.DataFrame({
        "foo": ["a", "b"]
    })
    schema = [
        {"column_name": "bar", "vectorize": True, "data_type": "string"},
        {"column_name": "baz", "vectorize": True, "data_type": "string"}
    ]
    codeflash_output = get_text_columns(df, schema); result = codeflash_output

def test_schema_vectorize_false():
    # schema_data with vectorize=False should not be returned
    df = pd.DataFrame({
        "foo": ["a", "b"],
        "bar": ["x", "y"]
    })
    schema = [
        {"column_name": "foo", "vectorize": False, "data_type": "string"},
        {"column_name": "bar", "vectorize": True, "data_type": "string"}
    ]
    codeflash_output = get_text_columns(df, schema); result = codeflash_output

def test_schema_data_type_not_string():
    # schema_data with non-string data_type should not be returned
    df = pd.DataFrame({
        "foo": ["a", "b"],
        "bar": [1, 2]
    })
    schema = [
        {"column_name": "foo", "vectorize": True, "data_type": "int"},
        {"column_name": "bar", "vectorize": True, "data_type": "int"}
    ]
    codeflash_output = get_text_columns(df, schema); result = codeflash_output

def test_mixed_dtype_columns():
    # DataFrame with mixed dtypes, only object columns returned as fallback
    df = pd.DataFrame({
        "foo": ["a", "b"],
        "bar": [1, 2],
        "baz": [True, False],
        "qux": [b"abc", b"def"]
    })
    codeflash_output = get_text_columns(df); result = codeflash_output

def test_column_names_overlap():
    # DataFrame with columns that are substrings of common names
    df = pd.DataFrame({
        "textual": ["a", "b"],
        "contented": ["c", "d"],
        "document": ["doc1", "doc2"]
    })
    codeflash_output = get_text_columns(df); result = codeflash_output

def test_schema_with_extra_keys():
    # schema_data with extra irrelevant keys
    df = pd.DataFrame({
        "foo": ["a", "b"],
        "bar": ["x", "y"]
    })
    schema = [
        {"column_name": "foo", "vectorize": True, "data_type": "string", "irrelevant": 123},
        {"column_name": "bar", "vectorize": True, "data_type": "string", "something": "else"}
    ]
    codeflash_output = get_text_columns(df, schema); result = codeflash_output

def test_schema_with_missing_keys():
    # schema_data entries missing some keys; should handle gracefully
    df = pd.DataFrame({
        "foo": ["a", "b"],
        "bar": ["x", "y"]
    })
    schema = [
        {"column_name": "foo"},  # missing vectorize and data_type
        {"column_name": "bar", "vectorize": True}  # missing data_type
    ]
    codeflash_output = get_text_columns(df, schema); result = codeflash_output

def test_column_names_with_spaces():
    # Column names with spaces or special characters
    df = pd.DataFrame({
        "Text": ["a", "b"],
        "text content": ["c", "d"],
        "document": ["doc1", "doc2"]
    })
    codeflash_output = get_text_columns(df); result = codeflash_output

# ----------------------------
# LARGE SCALE TEST CASES
# ----------------------------

def test_large_number_of_columns_schema():
    # Large DataFrame, many columns, schema selects a subset
    columns = [f"col_{i}" for i in range(1000)]
    df = pd.DataFrame({col: ["x", "y"] for col in columns})
    # Mark every 100th column as vectorize=True, data_type=string
    schema = [
        {"column_name": f"col_{i}", "vectorize": True, "data_type": "string"}
        for i in range(0, 1000, 100)
    ]
    codeflash_output = get_text_columns(df, schema); result = codeflash_output
    expected = [f"col_{i}" for i in range(0, 1000, 100)]

def test_large_number_of_common_names():
    # DataFrame with many columns, some are common names
    columns = [f"col_{i}" for i in range(995)] + ["text", "content", "document", "chunk", "foo"]
    df = pd.DataFrame({col: ["a", "b"] for col in columns})
    codeflash_output = get_text_columns(df); result = codeflash_output

def test_large_number_of_string_columns():
    # DataFrame with all columns as string type, no schema, no common names
    columns = [f"col_{i}" for i in range(1000)]
    df = pd.DataFrame({col: ["a", "b"] for col in columns})
    codeflash_output = get_text_columns(df); result = codeflash_output

def test_large_number_of_non_string_columns():
    # DataFrame with all columns as int type, no string columns
    columns = [f"col_{i}" for i in range(1000)]
    df = pd.DataFrame({col: [i, i+1] for i, col in enumerate(columns)})
    codeflash_output = get_text_columns(df); result = codeflash_output

def test_large_schema_all_missing():
    # Large schema, but none of the columns exist in df
    df = pd.DataFrame({
        "foo": ["a", "b"],
        "bar": ["x", "y"]
    })
    schema = [
        {"column_name": f"missing_{i}", "vectorize": True, "data_type": "string"}
        for i in range(1000)
    ]
    codeflash_output = get_text_columns(df, schema); result = codeflash_output
# codeflash_output is used to check that the output of the original code is the same as that of the optimized code.

import pandas as pd
# imports
import pytest  # used for our unit tests
from langflow.api.v1.knowledge_bases import get_text_columns

# unit tests

# -------------------- BASIC TEST CASES --------------------

def test_schema_data_priority_over_common_names():
    # Schema data should take precedence over common names
    df = pd.DataFrame({
        "text": ["a", "b"],
        "custom": ["c", "d"]
    })
    schema = [
        {"column_name": "custom", "vectorize": True, "data_type": "string"},
        {"column_name": "text", "vectorize": False, "data_type": "string"},
    ]
    codeflash_output = get_text_columns(df, schema); result = codeflash_output
    # Only "custom" is vectorized; "text" (vectorize=False) must be excluded.
    assert result == ["custom"]

def test_schema_data_multiple_columns():
    # Multiple schema columns, only those in df and matching criteria
    df = pd.DataFrame({
        "col1": ["a", "b"],
        "col2": ["c", "d"],
        "col3": [1, 2]
    })
    schema = [
        {"column_name": "col1", "vectorize": True, "data_type": "string"},
        {"column_name": "col2", "vectorize": True, "data_type": "string"},
        {"column_name": "col3", "vectorize": True, "data_type": "int"},
        {"column_name": "col4", "vectorize": True, "data_type": "string"},  # not in df
    ]
    codeflash_output = get_text_columns(df, schema); result = codeflash_output
    # col3 is not a string type; col4 is absent from df.
    assert result == ["col1", "col2"]

def test_schema_data_no_matching_columns():
    # Schema data has no matching columns in df
    df = pd.DataFrame({
        "foo": ["a", "b"],
        "bar": ["c", "d"]
    })
    schema = [
        {"column_name": "baz", "vectorize": True, "data_type": "string"}
    ]
    codeflash_output = get_text_columns(df, schema); result = codeflash_output
    assert result == []

def test_common_names_fallback():
    # No schema, but common names present
    df = pd.DataFrame({
        "text": ["a", "b"],
        "other": [1, 2]
    })
    codeflash_output = get_text_columns(df); result = codeflash_output
    assert result == ["text"]

def test_common_names_case_insensitive():
    # Common names with different casing; matching is case-insensitive
    # but the original column names are returned.
    df = pd.DataFrame({
        "Text": ["a", "b"],
        "Content": ["c", "d"]
    })
    codeflash_output = get_text_columns(df); result = codeflash_output
    assert result == ["Text", "Content"]

def test_all_string_columns_fallback():
    # No schema, no common names, fallback to all string columns
    df = pd.DataFrame({
        "foo": ["a", "b"],
        "bar": ["c", "d"],
        "baz": [1, 2]
    })
    codeflash_output = get_text_columns(df); result = codeflash_output
    assert result == ["foo", "bar"]

def test_no_string_columns():
    # No string columns at all
    df = pd.DataFrame({
        "a": [1, 2],
        "b": [3, 4]
    })
    codeflash_output = get_text_columns(df); result = codeflash_output
    assert result == []

def test_empty_dataframe():
    # Empty DataFrame, no columns
    df = pd.DataFrame()
    codeflash_output = get_text_columns(df); result = codeflash_output
    assert result == []

def test_schema_data_false_vectorize():
    # Schema data with vectorize False should not be included; with no
    # schema matches the function falls back to all string columns.
    df = pd.DataFrame({
        "foo": ["a", "b"],
        "bar": ["c", "d"]
    })
    schema = [
        {"column_name": "foo", "vectorize": False, "data_type": "string"},
        {"column_name": "bar", "data_type": "string"},  # vectorize missing
    ]
    codeflash_output = get_text_columns(df, schema); result = codeflash_output
    assert result == ["foo", "bar"]

def test_schema_data_non_string_type():
    # Schema data with non-string type should not be included; falls back
    # to all string columns.
    df = pd.DataFrame({
        "foo": ["a", "b"],
        "bar": ["c", "d"]
    })
    schema = [
        {"column_name": "foo", "vectorize": True, "data_type": "int"},
        {"column_name": "bar", "vectorize": True, "data_type": "float"},
    ]
    codeflash_output = get_text_columns(df, schema); result = codeflash_output
    assert result == ["foo", "bar"]

# -------------------- EDGE TEST CASES --------------------

def test_schema_data_column_not_in_df():
    # Schema column not present in df: schema matches filter to empty.
    df = pd.DataFrame({
        "foo": ["a", "b"]
    })
    schema = [
        {"column_name": "bar", "vectorize": True, "data_type": "string"}
    ]
    codeflash_output = get_text_columns(df, schema); result = codeflash_output
    assert result == []

def test_common_names_subset():
    # Only some common names present; order follows df.columns.
    df = pd.DataFrame({
        "text": ["a", "b"],
        "foo": ["c", "d"],
        "document": ["e", "f"]
    })
    codeflash_output = get_text_columns(df); result = codeflash_output
    assert result == ["text", "document"]

def test_column_names_overlap_with_common_names():
    # Column name is a substring of a common name, but not equal:
    # matching is exact (case-insensitive), not substring-based.
    df = pd.DataFrame({
        "tex": ["a", "b"],
        "contented": ["c", "d"],
        "chunk": ["e", "f"]
    })
    codeflash_output = get_text_columns(df); result = codeflash_output
    assert result == ["chunk"]

def test_schema_data_and_common_names_overlap():
    # Schema picks a column that is also a common name
    df = pd.DataFrame({
        "text": ["a", "b"],
        "foo": ["c", "d"]
    })
    schema = [
        {"column_name": "text", "vectorize": True, "data_type": "string"}
    ]
    codeflash_output = get_text_columns(df, schema); result = codeflash_output
    assert result == ["text"]

def test_schema_data_empty_list():
    # Schema data is an empty list, should fallback to common names
    df = pd.DataFrame({
        "text": ["a", "b"],
        "foo": ["c", "d"]
    })
    codeflash_output = get_text_columns(df, schema_data=[]); result = codeflash_output
    assert result == ["text"]

def test_schema_data_none():
    # Schema data is None, should fallback to common names
    df = pd.DataFrame({
        "content": ["a", "b"],
        "foo": ["c", "d"]
    })
    codeflash_output = get_text_columns(df, schema_data=None); result = codeflash_output
    assert result == ["content"]

def test_schema_data_missing_keys():
    # Schema data missing "vectorize" or "data_type" keys: neither entry
    # qualifies, so the function falls back to all string columns.
    df = pd.DataFrame({
        "foo": ["a", "b"],
        "bar": ["c", "d"]
    })
    schema = [
        {"column_name": "foo"},  # missing both keys
        {"column_name": "bar", "vectorize": True},  # missing data_type
    ]
    codeflash_output = get_text_columns(df, schema); result = codeflash_output
    assert result == ["foo", "bar"]

def test_mixed_dtype_columns():
    # DataFrame with mixed dtypes, only string (object-dtype) columns
    # should be returned; int and bool columns are excluded.
    df = pd.DataFrame({
        "foo": ["a", "b"],
        "bar": [1, 2],
        "baz": ["c", "d"],
        "qux": [True, False]
    })
    codeflash_output = get_text_columns(df); result = codeflash_output
    assert result == ["foo", "baz"]

def test_column_with_nan_values():
    # String column with NaN values should still be considered string
    df = pd.DataFrame({
        "foo": ["a", None, "b"],
        "bar": [1, 2, 3]
    })
    codeflash_output = get_text_columns(df); result = codeflash_output
    assert result == ["foo"]

def test_column_with_all_nan_values():
    # All NaN column with dtype object should be considered string
    df = pd.DataFrame({
        "foo": [None, None, None],
        "bar": [1, 2, 3]
    })
    codeflash_output = get_text_columns(df); result = codeflash_output
    assert result == ["foo"]

def test_column_with_mixed_object_types():
    # Object dtype, but not all string
    df = pd.DataFrame({
        "foo": ["a", 1, None],
        "bar": [1, 2, 3]
    })
    # Pandas will infer object dtype for "foo"
    codeflash_output = get_text_columns(df); result = codeflash_output
    assert result == ["foo"]

# -------------------- LARGE SCALE TEST CASES --------------------

def test_large_number_of_columns_schema():
    # DataFrame with 1000 columns, schema defines 10 vectorized string columns
    columns = [f"col{i}" for i in range(1000)]
    data = {col: ["x", "y"] for col in columns}
    df = pd.DataFrame(data)
    schema = [
        {"column_name": f"col{i}", "vectorize": True, "data_type": "string"}
        for i in range(0, 1000, 100)
    ]
    codeflash_output = get_text_columns(df, schema); result = codeflash_output
    expected = [f"col{i}" for i in range(0, 1000, 100)]
    assert result == expected

def test_large_number_of_rows_and_string_columns():
    # DataFrame with 500 rows and 10 string columns; int columns excluded
    data = {f"str_col{i}": ["word"] * 500 for i in range(10)}
    data.update({f"int_col{i}": [i] * 500 for i in range(5)})
    df = pd.DataFrame(data)
    codeflash_output = get_text_columns(df); result = codeflash_output
    expected = [f"str_col{i}" for i in range(10)]
    assert result == expected

def test_large_number_of_common_names():
    # DataFrame with many columns, some of which are common names;
    # result preserves df.columns order.
    columns = [f"foo{i}" for i in range(995)] + ["text", "content", "chunk", "document", "foo999"]
    data = {col: ["x", "y"] for col in columns}
    df = pd.DataFrame(data)
    codeflash_output = get_text_columns(df); result = codeflash_output
    expected = ["text", "content", "chunk", "document"]
    assert result == expected

def test_large_schema_no_matches():
    # Large schema, but none of the columns exist in df
    df = pd.DataFrame({
        "a": ["x", "y"],
        "b": ["z", "w"]
    })
    schema = [
        {"column_name": f"col{i}", "vectorize": True, "data_type": "string"}
        for i in range(100)
    ]
    codeflash_output = get_text_columns(df, schema); result = codeflash_output
    assert result == []

def test_large_schema_partial_matches():
    # Large schema, some columns exist in df
    columns = [f"col{i}" for i in range(100)]
    df = pd.DataFrame({col: ["a", "b"] for col in columns})
    schema = [
        {"column_name": f"col{i}", "vectorize": True, "data_type": "string"}
        for i in range(0, 100, 10)
    ]
    codeflash_output = get_text_columns(df, schema); result = codeflash_output
    expected = [f"col{i}" for i in range(0, 100, 10)]
    assert result == expected
# codeflash_output is used to check that the output of the original code is the same as that of the optimized code.

To test or edit this optimization locally git merge codeflash/optimize-pr9088-2025-07-17T18.40.39

Click to see suggested changes
Suggested change
text_columns = [
col["column_name"]
for col in schema_data
if col.get("vectorize", False) and col.get("data_type") == "string"
]
if text_columns:
return [col for col in text_columns if col in df.columns]
# Fallback to common text column names
common_names = ["text", "content", "document", "chunk"]
text_columns = [col for col in df.columns if col.lower() in common_names]
if text_columns:
return text_columns
# Last resort: all string columns
return [col for col in df.columns if df[col].dtype == "object"]
# Collect the schema-defined text columns
text_columns = [
col["column_name"]
for col in schema_data
if col.get("vectorize", False) and col.get("data_type") == "string"
]
if text_columns:
df_cols_set = set(df.columns)
# Filter only columns present in the dataframe
return [col for col in text_columns if col in df_cols_set]
# Fallback to common text column names (case-insensitive, set for O(1) lookup)
common_names_set = {"text", "content", "document", "chunk"}
# Build a list of columns whose lowercased names match any in the common_names_set
text_columns = [col for col in df.columns if col.lower() in common_names_set]
if text_columns:
return text_columns
# Last resort: all string columns (optimized using select_dtypes)
return list(df.select_dtypes(include=["object"]).columns)



def calculate_text_metrics(df: pd.DataFrame, text_columns: list[str]) -> tuple[int, int]:
    """Calculate total words and characters from text columns.

    Args:
        df: DataFrame holding the knowledge-base chunks.
        text_columns: Column names to aggregate; names missing from ``df``
            are silently skipped.

    Returns:
        Tuple ``(total_words, total_characters)`` summed across all listed
        text columns, with missing values counted as empty strings.
    """
    total_words = 0
    total_characters = 0

    for col in text_columns:
        if col not in df.columns:
            continue

        # Fill missing values *before* the string cast: astype(str) turns
        # NaN/None into the literal string "nan", which would wrongly add
        # 3 characters and 1 word per missing cell.
        text_series = df[col].fillna("").astype(str)
        total_characters += text_series.str.len().sum()
        total_words += text_series.str.split().str.len().sum()

    return int(total_words), int(total_characters)


def get_kb_metadata(kb_path: Path) -> dict:
    """Extract metadata from a knowledge base directory.

    Reads ``schema.json`` (text-column hints) and ``source.parquet``
    (chunk/word/character counts) from ``kb_path``. Extraction is
    best-effort: unreadable or malformed files leave the corresponding
    fields at their defaults instead of raising, so a single broken KB
    never breaks listing.

    Args:
        kb_path: Path to the knowledge base directory.

    Returns:
        Dict with keys ``chunks``, ``words``, ``characters``,
        ``avg_chunk_size`` and ``embedding_provider``.
    """
    # Local import keeps this self-contained; the module import block is
    # managed elsewhere in the file.
    import logging

    logger = logging.getLogger(__name__)

    metadata = {
        "chunks": 0,
        "words": 0,
        "characters": 0,
        "avg_chunk_size": 0.0,
        "embedding_provider": "Unknown",
    }

    try:
        # Detect embedding provider from KB artifacts.
        metadata["embedding_provider"] = detect_embedding_provider(kb_path)

        # Read schema for text column information (best-effort).
        schema_data = None
        schema_file = kb_path / "schema.json"
        if schema_file.exists():
            try:
                with schema_file.open("r", encoding="utf-8") as f:
                    schema_data = json.load(f)
                # Only a list-of-column-dicts schema is usable downstream.
                if not isinstance(schema_data, list):
                    schema_data = None
            except (OSError, UnicodeDecodeError, json.JSONDecodeError):
                # Malformed/unreadable schema: fall back to column heuristics.
                logger.debug("Could not read schema.json in %s", kb_path, exc_info=True)

        # Process source.parquet for text metrics (best-effort).
        source_file = kb_path / "source.parquet"
        if source_file.exists():
            try:
                source_df = pd.read_parquet(source_file)
                metadata["chunks"] = len(source_df)

                # Get text columns and calculate word/character totals.
                text_columns = get_text_columns(source_df, schema_data)
                if text_columns:
                    words, characters = calculate_text_metrics(source_df, text_columns)
                    metadata["words"] = words
                    metadata["characters"] = characters

                    # Average chunk size in characters, one decimal place.
                    if metadata["chunks"] > 0:
                        metadata["avg_chunk_size"] = round(characters / metadata["chunks"], 1)
            except Exception:  # noqa: BLE001 - parquet engines raise many exception types
                logger.debug("Could not read source.parquet in %s", kb_path, exc_info=True)
    except Exception:  # noqa: BLE001 - metadata extraction must never break KB listing
        logger.debug("Metadata extraction failed for %s", kb_path, exc_info=True)

    return metadata


@router.get("", status_code=HTTPStatus.OK)
@router.get("/", status_code=HTTPStatus.OK)
async def list_knowledge_bases() -> list[KnowledgeBaseInfo]:
    """List all available knowledge bases.

    Scans the knowledge-base root directory, builds a summary for every
    readable, non-hidden subdirectory, and returns the summaries sorted
    alphabetically by display name.

    Raises:
        HTTPException: 500 when the root directory cannot be enumerated.
    """
    try:
        root = get_kb_root_path()
        if not root.exists():
            return []

        summaries: list[KnowledgeBaseInfo] = []
        for entry in root.iterdir():
            # Only non-hidden directories count as knowledge bases.
            if not entry.is_dir() or entry.name.startswith("."):
                continue

            try:
                dir_size = get_directory_size(entry)
                meta = get_kb_metadata(entry)
                summaries.append(
                    KnowledgeBaseInfo(
                        id=entry.name,
                        name=entry.name.replace("_", " ").replace("-", " ").title(),
                        embedding_provider=meta["embedding_provider"],
                        size=dir_size,
                        words=meta["words"],
                        characters=meta["characters"],
                        chunks=meta["chunks"],
                        avg_chunk_size=meta["avg_chunk_size"],
                    )
                )
            except Exception:
                # Best-effort listing: skip directories that can't be read.
                continue

        summaries.sort(key=lambda kb: kb.name)
        return summaries

    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error listing knowledge bases: {e!s}") from e


@router.get("/{kb_name}", status_code=HTTPStatus.OK)
async def get_knowledge_base(kb_name: str) -> KnowledgeBaseInfo:
    """Get detailed information about a specific knowledge base.

    Args:
        kb_name: Directory name of the knowledge base under the KB root.

    Raises:
        HTTPException: 404 when the knowledge base directory does not
            exist, 500 for any other failure while reading it.
    """
    try:
        kb_dir = get_kb_root_path() / kb_name
        if not kb_dir.exists() or not kb_dir.is_dir():
            raise HTTPException(status_code=404, detail=f"Knowledge base '{kb_name}' not found")

        # Directory size first, then file-derived metadata.
        dir_size = get_directory_size(kb_dir)
        meta = get_kb_metadata(kb_dir)

        return KnowledgeBaseInfo(
            id=kb_name,
            name=kb_name.replace("_", " ").replace("-", " ").title(),
            embedding_provider=meta["embedding_provider"],
            size=dir_size,
            words=meta["words"],
            characters=meta["characters"],
            chunks=meta["chunks"],
            avg_chunk_size=meta["avg_chunk_size"],
        )

    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error getting knowledge base '{kb_name}': {e!s}") from e
104 changes: 104 additions & 0 deletions src/backend/base/langflow/base/data/kb_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
import math
from collections import Counter


def compute_tfidf(documents: list[str], query_terms: list[str]) -> list[float]:
    """Compute TF-IDF scores for query terms across a collection of documents.

    Args:
        documents: List of document strings.
        query_terms: List of query terms to score.

    Returns:
        One aggregate TF-IDF score per document (summed over query terms).
    """
    # Simple whitespace tokenization, lowercased for case-insensitive matching.
    tokenized = [text.lower().split() for text in documents]
    total_docs = len(documents)

    # Document frequency: in how many documents does each term appear?
    doc_freq = {
        term: sum(1 for tokens in tokenized if term.lower() in tokens)
        for term in query_terms
    }

    results: list[float] = []
    for tokens in tokenized:
        counts = Counter(tokens)
        length = len(tokens)
        score = 0.0

        for term in query_terms:
            freq = doc_freq[term]
            # Term frequency, guarded against empty documents.
            tf = counts[term.lower()] / length if length > 0 else 0
            # Inverse document frequency; 0 when the term never occurs.
            idf = math.log(total_docs / freq) if freq > 0 else 0
            score += tf * idf

        results.append(score)

    return results


def compute_bm25(documents: list[str], query_terms: list[str], k1: float = 1.2, b: float = 0.75) -> list[float]:
    """Compute BM25 scores for query terms across a collection of documents.

    Args:
        documents: List of document strings.
        query_terms: List of query terms to score.
        k1: Controls term frequency scaling (default: 1.2).
        b: Controls document length normalization (default: 0.75).

    Returns:
        One aggregate BM25 score per document (summed over query terms).
    """
    # Simple whitespace tokenization, lowercased for case-insensitive matching.
    tokenized = [text.lower().split() for text in documents]
    total_docs = len(documents)

    # Average document length in tokens (0 when there are no documents).
    avg_length = sum(len(tokens) for tokens in tokenized) / total_docs if total_docs > 0 else 0

    # All documents empty: every score is 0 (also avoids dividing by 0 below).
    if avg_length == 0:
        return [0.0] * total_docs

    # Document frequency: in how many documents does each term appear?
    doc_freq = {
        term: sum(1 for tokens in tokenized if term.lower() in tokens)
        for term in query_terms
    }

    results: list[float] = []
    for tokens in tokenized:
        counts = Counter(tokens)
        length = len(tokens)
        score = 0.0

        for term in query_terms:
            freq = doc_freq[term]
            tf = counts[term.lower()]

            # Probabilistic IDF; 0 when the term never occurs in the corpus.
            idf = math.log((total_docs - freq + 0.5) / (freq + 0.5)) if freq > 0 else 0

            # Saturating TF with document-length normalization.
            normalizer = tf + k1 * (1 - b + b * (length / avg_length))
            score += idf * (tf * (k1 + 1)) / normalizer

        results.append(score)

    return results
Loading
Loading