Merged
Changes from 1 commit (58 commits in this pull request)
510926f
included scraping dependencies
Geoff-Robin Sep 30, 2025
6348c9d
Created models.py
Geoff-Robin Sep 30, 2025
70a2cc9
removed scrapy and added bs4
Geoff-Robin Oct 1, 2025
925bd38
Setup models.py and utils.py
Geoff-Robin Oct 1, 2025
60499c4
Added logging
Geoff-Robin Oct 1, 2025
c283977
switched httpx AsyncClient to fetch webpage
Geoff-Robin Oct 1, 2025
4979f43
Added playwright as a dependency
Geoff-Robin Oct 1, 2025
edd119e
first iteration of bs4_connector.py done
Geoff-Robin Oct 2, 2025
1ab9d24
Changed bs4_connector.py to bs4_crawler.py
Geoff-Robin Oct 3, 2025
20fb773
Done with integration with add workflow when incremental_loading is s…
Geoff-Robin Oct 4, 2025
fbef667
removed unused Dict import from typing
Geoff-Robin Oct 4, 2025
da7ebc4
Removed asyncio import
Geoff-Robin Oct 4, 2025
ab6fc65
Added global context for bs4crawler and tavily config
Geoff-Robin Oct 4, 2025
2cba31a
Tested and Debugged scraping usage in cognee.add() pipeline
Geoff-Robin Oct 4, 2025
c2aa955
removed structured argument
Geoff-Robin Oct 5, 2025
77ea7c4
Added APScheduler
Geoff-Robin Oct 5, 2025
f148b1d
Added support for multiple base_url extraction
Geoff-Robin Oct 5, 2025
f449fce
Done with scraping_task successfully
Geoff-Robin Oct 5, 2025
e5633bc
corrected F402 error pointed out by ruff check
Geoff-Robin Oct 5, 2025
0f64f68
Done adding cron job web scraping
Geoff-Robin Oct 5, 2025
4d5146c
Added Documentation
Geoff-Robin Oct 5, 2025
667bbd7
Added cron job and removed obvious comments
Geoff-Robin Oct 5, 2025
ae740ed
Added related documentation
Geoff-Robin Oct 5, 2025
1b5c099
CodeRabbit reviews solved
Geoff-Robin Oct 6, 2025
791e38b
Solved more nitpick comments
Geoff-Robin Oct 6, 2025
3c9e5f8
Solved more nitpick comments
Geoff-Robin Oct 6, 2025
0a9b624
changed return type for fetch_page_content to Dict[str,str]
Geoff-Robin Oct 6, 2025
7fe1de7
Remove assignment to unused variable graph_db
Geoff-Robin Oct 6, 2025
d4ce340
Removed unused imports
Geoff-Robin Oct 6, 2025
1c0e0f0
Solved more nitpick comments
Geoff-Robin Oct 6, 2025
54f2580
Solved more nitpick comments
Geoff-Robin Oct 6, 2025
1f36dd3
Solved nitpick comments
Geoff-Robin Oct 6, 2025
5dcd7e5
Changes uv.lock
Geoff-Robin Oct 6, 2025
b5a1957
Regenerate uv.lock after merge
Geoff-Robin Oct 6, 2025
902f9a3
Changed cognee-mcp\pyproject.toml
Geoff-Robin Oct 6, 2025
f71cf77
.
Geoff-Robin Oct 6, 2025
fdf8562
Added uv.lock again
Geoff-Robin Oct 6, 2025
d91ffa2
Removed staticmethod decorator from bs4_crawler.py, kwargs from the f…
Geoff-Robin Oct 7, 2025
3d53e8d
Removed print statement that I used for debugging
Geoff-Robin Oct 7, 2025
fcd91a9
Added self as an argument to all previous methods that were static me…
Geoff-Robin Oct 7, 2025
fc660e4
Closed crawler instance in a finally block
Geoff-Robin Oct 7, 2025
0fd55a7
ruff formatted
Geoff-Robin Oct 7, 2025
f59c278
Added await
Geoff-Robin Oct 7, 2025
49858c5
Made api_key field in TavilyConfig models to be Optional[str] type to…
Geoff-Robin Oct 7, 2025
760a9de
Release v0.3.5 (#1515)
borisarzentar Oct 7, 2025
ea33854
Removed print statement logging and used cognee inbuilt logger and up…
Geoff-Robin Oct 8, 2025
8d27da6
removed dotenv imports
Geoff-Robin Oct 8, 2025
af71cba
Trying to resolve uv.lock
Geoff-Robin Oct 8, 2025
a3fbbdf
Solved nitpick comments
Geoff-Robin Oct 8, 2025
599ef4a
solved nitpick comments
Geoff-Robin Oct 8, 2025
6602275
Addressed code rabbit comment on shortening content
Geoff-Robin Oct 8, 2025
a9d410e
resolving uv.lock conflict
Geoff-Robin Oct 8, 2025
e934f80
Merge branch 'main' into feature/web_scraping_connector_task
Geoff-Robin Oct 8, 2025
f82dfbe
solved nitpick comments
Geoff-Robin Oct 9, 2025
4058d63
Added better selectors for testing
Geoff-Robin Oct 9, 2025
4e5c681
Merge branch 'dev' into feature/web_scraping_connector_task
Geoff-Robin Oct 10, 2025
f316128
Merge branch 'dev' into feature/web_scraping_connector_task
Geoff-Robin Oct 10, 2025
5e69438
Merge branch 'dev' into feature/web_scraping_connector_task
Geoff-Robin Oct 11, 2025
Added cron job and removed obvious comments
Geoff-Robin committed Oct 5, 2025
commit 667bbd775e7afe7f93b9212a38c746f1642a800f
97 changes: 56 additions & 41 deletions cognee/tests/tasks/web_scraping/web_scraping_test.py
@@ -1,12 +1,13 @@
import asyncio
import cognee
from cognee.tasks.web_scraper.config import SoupCrawlerConfig
from cognee.tasks.web_scraper import cron_web_scraper_task


async def test_web_scraping_using_bs4():
    await cognee.prune.prune_data()
    await cognee.prune.prune_system(metadata=True)
    # 1. Setup test URL and extraction rules

    url = "https://quotes.toscrape.com/"
    rules = {
        "quotes": {"selector": ".quote span.text", "all": True},
@@ -24,34 +25,26 @@ async def test_web_scraping_using_bs4():
        structured=True,
    )

    # 2. Add / ingest the page
    await cognee.add(
        data=url,
        soup_crawler_config=soup_config,
        incremental_loading=False,
    )

    # 3. Cognify
    await cognee.cognify()

    # 4. Search for a known quote
    results = await cognee.search(
        "Who said 'The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking'?",
        query_type=cognee.SearchType.GRAPH_COMPLETION,
    )
    assert "Albert Einstein" in results[0], (
        "Test failed! Albert Einstein not found in scraped data."
    )
    assert "Albert Einstein" in results[0]
    print("Test passed! Found Albert Einstein in scraped data.")
    print(results)
    print("Web scraping test using bs4 completed.")


async def test_web_scraping_using_bs4_and_incremental_loading():
    # 0. Prune only data (not full system prune)
    await cognee.prune.prune_data()
    await cognee.prune.prune_system(metadata=True)

    # 1. Setup test URL and extraction rules
    url = "https://books.toscrape.com/"
    rules = {"titles": "article.product_pod h3 a", "prices": "article.product_pod p.price_color"}

@@ -66,95 +59,117 @@ async def test_web_scraping_using_bs4_and_incremental_loading():
        structured=True,
    )

    # 2. Add / ingest the page
    await cognee.add(
        data=url,
        soup_crawler_config=soup_config,
        incremental_loading=True,
    )

    # 3. Cognify
    await cognee.cognify()

    # 4. Search for a known book
    results = await cognee.search(
        "What is the price of 'A Light in the Attic' book?",
        query_type=cognee.SearchType.GRAPH_COMPLETION,
    )
    assert "51.77" in results[0], "Test failed! 'A Light in the Attic' not found in scraped data."
    assert "51.77" in results[0]
    print("Test passed! Found 'A Light in the Attic' in scraped data.")
    print(results)
    print("Web scraping test using bs4 with incremental loading completed.")


async def test_web_scraping_using_tavily():
    # 0. Prune only data (not full system prune)
    await cognee.prune.prune_data()
    await cognee.prune.prune_system(metadata=True)

    # 1. Setup test URL and extraction rules
    url = "https://quotes.toscrape.com/"

    # 2. Add / ingest the page
    await cognee.add(
        data=url,
        incremental_loading=False,
    )

    # 3. Cognify
    await cognee.cognify()

    # 4. Search for a known quote
    results = await cognee.search(
        "Who said 'The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking'?",
        query_type=cognee.SearchType.GRAPH_COMPLETION,
    )
    assert "Albert Einstein" in results[0], (
        "Test failed! Albert Einstein not found in scraped data."
    )
    assert "Albert Einstein" in results[0]
    print("Test passed! Found Albert Einstein in scraped data.")
    print(results)
    print("Web scraping test using tavily completed.")


async def test_web_scraping_using_tavily_and_incremental_loading():
    # 0. Prune only data (not full system prune)
    await cognee.prune.prune_data()
    await cognee.prune.prune_system(metadata=True)

    # 1. Setup test URL and extraction rules
    url = "https://quotes.toscrape.com/"

    # 2. Add / ingest the page
    await cognee.add(
        data=url,
        incremental_loading=True,
    )

    # 3. Cognify
    await cognee.cognify()

    # 4. Search for a known quote
    results = await cognee.search(
        "Who said 'The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking'?",
        query_type=cognee.SearchType.GRAPH_COMPLETION,
    )
    assert "Albert Einstein" in results[0], (
        "Test failed! Albert Einstein not found in scraped data."
    )
    assert "Albert Einstein" in results[0]
    print("Test passed! Found Albert Einstein in scraped data.")
    print(results)
    print("Web scraping test using tavily with incremental loading completed.")


# ---------- cron job tests ----------
async def test_cron_web_scraper():
    urls = ["https://quotes.toscrape.com/", "https://books.toscrape.com/"]
    extraction_rules = {
        "quotes": {"selector": ".quote span.text", "all": True},
        "authors": {"selector": ".quote small", "all": True},
        "titles": "article.product_pod h3 a",
        "prices": "article.product_pod p.price_color",
    }

    # Run cron_web_scraper_task (schedule string is required)
    await cron_web_scraper_task(
        urls=urls,
        schedule="*/5 * * * *",  # every 5 minutes
        extraction_rules=extraction_rules,
        use_playwright=False,
    )

    # Wait until first run of cron job is done
    await asyncio.sleep(120)

    # Validate that the scraped data is searchable
    results = await cognee.search(
        "Who said 'The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking'?",
        query_type=cognee.SearchType.GRAPH_COMPLETION,
    )
    assert "Albert Einstein" in results[0]

    results_books = await cognee.search(
        "What is the price of 'A Light in the Attic' book?",
        query_type=cognee.SearchType.GRAPH_COMPLETION,
    )
    assert "51.77" in results_books[0]

    print("Cron job web_scraping test passed!")


async def main():
    print("starting web scraping test using bs4 with incremental loading...")
    print("Starting BS4 incremental loading test...")
    await test_web_scraping_using_bs4_and_incremental_loading()
    print("starting web scraping test using bs4 without incremental loading...")

    print("Starting BS4 normal test...")
    await test_web_scraping_using_bs4()
    print("starting web scraping test using tavily with incremental loading...")

    print("Starting Tavily incremental loading test...")
    await test_web_scraping_using_tavily_and_incremental_loading()
    print("starting web scraping test using tavily without incremental loading...")

    print("Starting Tavily normal test...")
    await test_web_scraping_using_tavily()

    print("Starting cron job test...")
    await test_cron_web_scraper()


if __name__ == "__main__":
    asyncio.run(main())
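For context on the cron wiring exercised by test_cron_web_scraper: the commits name APScheduler as the scheduler, and the test hands cron_web_scraper_task a crontab string. A minimal sketch of how such a task might map that string onto APScheduler — scrape_once and the handling of extraction_rules/use_playwright are assumptions, not cognee's actual implementation:

import cognee
from apscheduler.schedulers.asyncio import AsyncIOScheduler
from apscheduler.triggers.cron import CronTrigger


async def scrape_once(urls):
    # Hypothetical single run: ingest each URL, then rebuild the graph.
    for url in urls:
        await cognee.add(data=url)
    await cognee.cognify()


def start_cron_scraper(urls, schedule: str) -> AsyncIOScheduler:
    # Translate the crontab string (e.g. "*/5 * * * *") into a trigger.
    scheduler = AsyncIOScheduler()
    trigger = CronTrigger.from_crontab(schedule)
    scheduler.add_job(scrape_once, trigger, args=[urls])
    scheduler.start()  # needs a running asyncio event loop
    return scheduler

Under this sketch, the test's await asyncio.sleep(120) simply keeps the event loop alive long enough for the first scheduled run to fire.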