Merged
58 commits
510926f
included scraping dependencies
Geoff-Robin Sep 30, 2025
6348c9d
Created models.py
Geoff-Robin Sep 30, 2025
70a2cc9
removed scrapy and added bs4
Geoff-Robin Oct 1, 2025
925bd38
Setup models.py and utils.py
Geoff-Robin Oct 1, 2025
60499c4
Added logging
Geoff-Robin Oct 1, 2025
c283977
switched httpx AsyncClient to fetch webpage
Geoff-Robin Oct 1, 2025
4979f43
Added playwright as a dependency
Geoff-Robin Oct 1, 2025
edd119e
first iteration of bs4_connector.py done
Geoff-Robin Oct 2, 2025
1ab9d24
Changed bs4_connector.py to bs4_crawler.py
Geoff-Robin Oct 3, 2025
20fb773
Done with integration with add workflow when incremental_loading is s…
Geoff-Robin Oct 4, 2025
fbef667
removed unused Dict import from typing
Geoff-Robin Oct 4, 2025
da7ebc4
Removed asyncio import
Geoff-Robin Oct 4, 2025
ab6fc65
Added global context for bs4crawler and tavily config
Geoff-Robin Oct 4, 2025
2cba31a
Tested and Debugged scraping usage in cognee.add() pipeline
Geoff-Robin Oct 4, 2025
c2aa955
removed structured argument
Geoff-Robin Oct 5, 2025
77ea7c4
Added APScheduler
Geoff-Robin Oct 5, 2025
f148b1d
Added support for multiple base_url extraction
Geoff-Robin Oct 5, 2025
f449fce
Done with scraping_task successfully
Geoff-Robin Oct 5, 2025
e5633bc
corrected F402 error pointed out by ruff check
Geoff-Robin Oct 5, 2025
0f64f68
Done adding cron job web scraping
Geoff-Robin Oct 5, 2025
4d5146c
Added Documentation
Geoff-Robin Oct 5, 2025
667bbd7
Added cron job and removed obvious comments
Geoff-Robin Oct 5, 2025
ae740ed
Added related documentation
Geoff-Robin Oct 5, 2025
1b5c099
CodeRabbit reviews solved
Geoff-Robin Oct 6, 2025
791e38b
Solved more nitpick comments
Geoff-Robin Oct 6, 2025
3c9e5f8
Solved more nitpick comments
Geoff-Robin Oct 6, 2025
0a9b624
changed return type for fetch_page_content to Dict[str,str]
Geoff-Robin Oct 6, 2025
7fe1de7
Remove assignment to unused variable graph_db'
Geoff-Robin Oct 6, 2025
d4ce340
Removed unused imports
Geoff-Robin Oct 6, 2025
1c0e0f0
Solved more nitpick comments
Geoff-Robin Oct 6, 2025
54f2580
Solved more nitpick comments
Geoff-Robin Oct 6, 2025
1f36dd3
Solved nitpick comments
Geoff-Robin Oct 6, 2025
5dcd7e5
Changes uv.lock
Geoff-Robin Oct 6, 2025
b5a1957
Regenerate uv.lock after merge
Geoff-Robin Oct 6, 2025
902f9a3
Changed cognee-mcp\pyproject.toml
Geoff-Robin Oct 6, 2025
f71cf77
.
Geoff-Robin Oct 6, 2025
fdf8562
Added uv.lock again
Geoff-Robin Oct 6, 2025
d91ffa2
Removed staticmethod decorator from bs4_crawler.py, kwargs from the f…
Geoff-Robin Oct 7, 2025
3d53e8d
Removed print statement that I used for debugging
Geoff-Robin Oct 7, 2025
fcd91a9
Added self as an argument to all previous methods that were static me…
Geoff-Robin Oct 7, 2025
fc660e4
Closed crawler instance in a finally block
Geoff-Robin Oct 7, 2025
0fd55a7
ruff formatted
Geoff-Robin Oct 7, 2025
f59c278
Added await
Geoff-Robin Oct 7, 2025
49858c5
Made api_key field in TavilyConfig models to be Optional[str] type to…
Geoff-Robin Oct 7, 2025
760a9de
Release v0.3.5 (#1515)
borisarzentar Oct 7, 2025
ea33854
Removed print statement logging and used cognee inbuilt logger and up…
Geoff-Robin Oct 8, 2025
8d27da6
removed dotenv imports
Geoff-Robin Oct 8, 2025
af71cba
Trying to resolve uv.lock
Geoff-Robin Oct 8, 2025
a3fbbdf
Solved nitpick comments
Geoff-Robin Oct 8, 2025
599ef4a
solved nitpick comments
Geoff-Robin Oct 8, 2025
6602275
Addressed code rabbit comment on shortening content
Geoff-Robin Oct 8, 2025
a9d410e
resolving uv.lock conflict
Geoff-Robin Oct 8, 2025
e934f80
Merge branch 'main' into feature/web_scraping_connector_task
Geoff-Robin Oct 8, 2025
f82dfbe
solved nitpick comments
Geoff-Robin Oct 9, 2025
4058d63
Added better selectors for testing
Geoff-Robin Oct 9, 2025
4e5c681
Merge branch 'dev' into feature/web_scraping_connector_task
Geoff-Robin Oct 10, 2025
f316128
Merge branch 'dev' into feature/web_scraping_connector_task
Geoff-Robin Oct 10, 2025
5e69438
Merge branch 'dev' into feature/web_scraping_connector_task
Geoff-Robin Oct 11, 2025
3 changes: 1 addition & 2 deletions cognee-mcp/pyproject.toml
@@ -37,5 +37,4 @@ dev = [
allow-direct-references = true

[project.scripts]
cognee = "src:main"
cognee-mcp = "src:main_mcp"
cognee-mcp = "src:main"
62 changes: 59 additions & 3 deletions cognee/api/v1/add/add.py
@@ -1,5 +1,6 @@
from uuid import UUID
from typing import Union, BinaryIO, List, Optional
import os
from typing import Union, BinaryIO, List, Optional, Dict, Any

from cognee.modules.users.models import User
from cognee.modules.pipelines import Task, run_pipeline
@@ -11,6 +12,12 @@
)
from cognee.modules.engine.operations.setup import setup
from cognee.tasks.ingestion import ingest_data, resolve_data_directories
from cognee.tasks.web_scraper.config import TavilyConfig, SoupCrawlerConfig
from cognee.context_global_variables import (
tavily_config as tavily,
soup_crawler_config as soup_crawler,
)
from urllib.parse import urlparse


async def add(
@@ -23,12 +30,15 @@ async def add(
dataset_id: Optional[UUID] = None,
preferred_loaders: List[str] = None,
incremental_loading: bool = True,
extraction_rules: Optional[Dict[str, Any]] = None,
Contributor (review comment on extraction_rules):
Since soup_crawler_config already uses extraction_rules, why not always pass just the soup_crawler_config?
We want to keep the number of arguments to add minimal; this would remove extraction_rules from add's arguments.
@dexters1 @hajdul88 Should we introduce a config dict here as well, like we do for cognify? The number of arguments grows.

Contributor Author (reply):
I thought the user might prefer just passing the extraction_rules if that's the only setting that needs to be configured.
But if it's preferred that soup_crawler_config is enough, I can change that. Just let me know if the change needs to be made now.

(See the sketch after this file's diff for the two call styles under discussion.)

tavily_config: Optional[TavilyConfig] = None,
soup_crawler_config: Optional[SoupCrawlerConfig] = None,
):
"""
Add data to Cognee for knowledge graph processing.

This is the first step in the Cognee workflow - it ingests raw data and prepares it
for processing. The function accepts various data formats including text, files, and
for processing. The function accepts various data formats including text, files, URLs, and
binary streams, then stores them in a specified dataset for further processing.

Prerequisites:
@@ -68,6 +78,7 @@ async def add(
- S3 path: "s3://my-bucket/documents/file.pdf"
- List of mixed types: ["text content", "/path/file.pdf", "file://doc.txt", file_handle]
- Binary file object: open("file.txt", "rb")
- URL: a web page link (http or https)
dataset_name: Name of the dataset to store data in. Defaults to "main_dataset".
Create separate datasets to organize different knowledge domains.
user: User object for authentication and permissions. Uses default user if None.
@@ -78,6 +89,9 @@
vector_db_config: Optional configuration for vector database (for custom setups).
graph_db_config: Optional configuration for graph database (for custom setups).
dataset_id: Optional specific dataset UUID to use instead of dataset_name.
extraction_rules: Optional dictionary of rules (e.g., CSS selectors, XPath) for extracting specific content from web pages using BeautifulSoup
tavily_config: Optional configuration for Tavily API, including API key and extraction settings
soup_crawler_config: Optional configuration for BeautifulSoup crawler, specifying concurrency, crawl delay, and extraction rules.

Returns:
PipelineRunInfo: Information about the ingestion pipeline execution including:
@@ -126,6 +140,21 @@

# Add a single file
await cognee.add("/home/user/documents/analysis.pdf")

# Add a single URL using the BeautifulSoup (bs4) extraction method
extraction_rules = {
"title": "h1",
"description": "p",
"more_info": "a[href*='more-info']"
}
await cognee.add("https://example.com",extraction_rules=extraction_rules)

# Add a single URL using the Tavily extraction method
# (requires TAVILY_API_KEY to be set as an environment variable)
await cognee.add("https://example.com")

# Add multiple URLs
await cognee.add(["https://example.com", "https://books.toscrape.com"])
```

Environment Variables:
@@ -139,11 +168,38 @@
- DEFAULT_USER_PASSWORD: Custom default user password
- VECTOR_DB_PROVIDER: "lancedb" (default), "chromadb", "pgvector"
- GRAPH_DATABASE_PROVIDER: "kuzu" (default), "neo4j"
- TAVILY_API_KEY: API key for the Tavily web extraction method

"""

if not soup_crawler_config and extraction_rules:
soup_crawler_config = SoupCrawlerConfig(extraction_rules=extraction_rules)
if not tavily_config and os.getenv("TAVILY_API_KEY"):
tavily_config = TavilyConfig(api_key=os.getenv("TAVILY_API_KEY"))

soup_crawler.set(soup_crawler_config)
tavily.set(tavily_config)

http_schemes = {"http", "https"}

def _is_http_url(item: Union[str, BinaryIO]) -> bool:
return isinstance(item, str) and urlparse(item).scheme in http_schemes

if _is_http_url(data):
node_set = ["web_content"] if not node_set else node_set + ["web_content"]
elif isinstance(data, list) and any(_is_http_url(item) for item in data):
node_set = ["web_content"] if not node_set else node_set + ["web_content"]

tasks = [
Task(resolve_data_directories, include_subdirectories=True),
Task(ingest_data, dataset_name, user, node_set, dataset_id, preferred_loaders),
Task(
ingest_data,
dataset_name,
user,
node_set,
dataset_id,
preferred_loaders,
),
]

await setup()
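Relating to the review thread above on extraction_rules versus soup_crawler_config, here is a minimal sketch of the two call styles the new add() signature supports. The parameter names and the SoupCrawlerConfig import come from the diff; the URL, the CSS selectors, and the asyncio wrapper are illustrative assumptions.

```python
# Illustrative sketch (not part of the diff) of the two call styles discussed
# in the review thread; parameter names follow the new add() signature.
import asyncio

import cognee
from cognee.tasks.web_scraper.config import SoupCrawlerConfig

extraction_rules = {"title": "h1", "description": "p"}


async def main() -> None:
    # Style 1: pass extraction_rules directly; add() wraps it in a
    # SoupCrawlerConfig internally (see the body of add() above).
    await cognee.add("https://example.com", extraction_rules=extraction_rules)

    # Style 2: pass a pre-built SoupCrawlerConfig (the reviewer's suggestion
    # would keep only this form and drop the extraction_rules argument).
    await cognee.add(
        "https://example.com",
        soup_crawler_config=SoupCrawlerConfig(extraction_rules=extraction_rules),
    )


asyncio.run(main())
```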
2 changes: 2 additions & 0 deletions cognee/context_global_variables.py
@@ -12,6 +12,8 @@
# for different async tasks, threads and processes
vector_db_config = ContextVar("vector_db_config", default=None)
graph_db_config = ContextVar("graph_db_config", default=None)
soup_crawler_config = ContextVar("soup_crawler_config", default=None)
tavily_config = ContextVar("tavily_config", default=None)


async def set_database_global_context_variables(dataset: Union[str, UUID], user_id: UUID):
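For context, a small self-contained sketch of how these new ContextVars are used across the pipeline: add() binds the configs, and save_data_item_to_storage() reads them back in the same async context. The dict value below is a stand-in; in cognee the variable holds a SoupCrawlerConfig instance.

```python
from contextvars import ContextVar

# Stand-in for the soup_crawler_config ContextVar added above.
soup_crawler_config: ContextVar = ContextVar("soup_crawler_config", default=None)

# Producer side (add()): bind the config for the current async context.
soup_crawler_config.set({"extraction_rules": {"title": "h1"}})

# Consumer side (save_data_item_to_storage): read it back later in the same
# context; .get() falls back to the default (None) if nothing was set.
config = soup_crawler_config.get()
print(config)  # {'extraction_rules': {'title': 'h1'}}
```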
41 changes: 41 additions & 0 deletions cognee/tasks/ingestion/save_data_item_to_storage.py
@@ -7,6 +7,7 @@
from cognee.modules.ingestion import save_data_to_file
from cognee.shared.logging_utils import get_logger
from pydantic_settings import BaseSettings, SettingsConfigDict
from cognee.context_global_variables import tavily_config, soup_crawler_config

logger = get_logger()

@@ -17,6 +18,13 @@ class SaveDataSettings(BaseSettings):
model_config = SettingsConfigDict(env_file=".env", extra="allow")


class HTMLContent(str):
def __new__(cls, value: str):
if not ("<" in value and ">" in value):
raise ValueError("Not valid HTML-like content")
return super().__new__(cls, value)


settings = SaveDataSettings()


@@ -48,6 +56,39 @@ async def save_data_item_to_storage(data_item: Union[BinaryIO, str, Any]) -> str
# data is s3 file path
if parsed_url.scheme == "s3":
return data_item
elif parsed_url.scheme == "http" or parsed_url.scheme == "https":
# data is a web page URL - fetch and ingest its content
try:
from cognee.tasks.web_scraper import fetch_page_content

tavily = tavily_config.get()
soup_crawler = soup_crawler_config.get()
preferred_tool = "beautifulsoup" if soup_crawler else "tavily"
if preferred_tool == "tavily" and tavily is None:
raise IngestionError(
message="TavilyConfig must be set on the ingestion context when fetching HTTP URLs without a SoupCrawlerConfig."
)
if preferred_tool == "beautifulsoup" and soup_crawler is None:
raise IngestionError(
message="SoupCrawlerConfig must be set on the ingestion context when using the BeautifulSoup scraper."
)

data = await fetch_page_content(
data_item,
preferred_tool=preferred_tool,
tavily_config=tavily,
soup_crawler_config=soup_crawler,
)
content = ""
for key, value in data.items():
content += f"{key}:\n{value}\n\n"
return await save_data_to_file(content)
except IngestionError:
raise
except Exception as e:
raise IngestionError(
message=f"Error ingesting webpage results of url {data_item}: {str(e)}"
)

# data is local file path
elif parsed_url.scheme == "file":
18 changes: 18 additions & 0 deletions cognee/tasks/web_scraper/__init__.py
@@ -0,0 +1,18 @@
"""Web scraping module for cognee.

This module provides tools for scraping web content, managing scraping jobs, and storing
data in a graph database. It includes classes and functions for crawling web pages using
BeautifulSoup or Tavily, defining data models, and handling scraping configurations.
"""

from .bs4_crawler import BeautifulSoupCrawler
from .utils import fetch_page_content
from .web_scraper_task import cron_web_scraper_task, web_scraper_task


__all__ = [
"BeautifulSoupCrawler",
"fetch_page_content",
"cron_web_scraper_task",
"web_scraper_task",
]
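A hedged usage sketch for the exported fetch_page_content, inferred only from its call site in save_data_item_to_storage.py and the Dict[str, str] return type mentioned in commit 0a9b624. The URL, the selectors, passing tavily_config=None, and the asyncio wrapper are assumptions, not confirmed API guarantees.

```python
# Illustrative only: exercises fetch_page_content on the BeautifulSoup path,
# mirroring the call made in save_data_item_to_storage.py.
import asyncio

from cognee.tasks.web_scraper import fetch_page_content
from cognee.tasks.web_scraper.config import SoupCrawlerConfig


async def main() -> None:
    config = SoupCrawlerConfig(extraction_rules={"title": "h1", "description": "p"})
    page = await fetch_page_content(
        "https://books.toscrape.com",
        preferred_tool="beautifulsoup",
        tavily_config=None,          # assumed optional when using BeautifulSoup
        soup_crawler_config=config,
    )
    # fetch_page_content returns a Dict[str, str]; flatten it the same way
    # save_data_item_to_storage does before writing it to a file.
    content = "".join(f"{key}:\n{value}\n\n" for key, value in page.items())
    print(content[:500])


asyncio.run(main())
```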