Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
b2f3cb0
WIP: migrate logger to rich
wakaka6 Apr 10, 2025
9499164
feat(browser): improve browser profile management and cleanup
unclecode Apr 29, 2025
50f0b83
feat(linkedin): add prospect-wizard app with scraping and visualization
unclecode Apr 30, 2025
cd2b490
refactor(logger): Apply the Enumeration for color
wakaka6 May 1, 2025
0e5d672
Merge branch 'pr-971' into merge-pr971
unclecode May 1, 2025
ee01b81
Merge branch 'merge-pr971' into next
unclecode May 1, 2025
7c2fd52
fix: incorrect params and commands in linkedin app readme
aravindkarnam May 1, 2025
94e9959
feat(docker-api): add job-based polling endpoints for crawl and LLM t…
unclecode May 1, 2025
baf7f6a
fix: typo in readme
aravindkarnam May 2, 2025
5cc58f9
fix: 1. duplicate verbose flag 2. inconsistency in argument name --pro…
aravindkarnam May 2, 2025
6650b2f
fix: replace openAI with litellm to support multiple llm providers
aravindkarnam May 2, 2025
bd5a9ac
updated readme with arguments for litellm
aravindkarnam May 2, 2025
87d4b0f
format bash scripts properly so copy & paste may work without issues
aravindkarnam May 2, 2025
9b5ccac
feat(extraction): add RegexExtractionStrategy for pattern-based extra…
unclecode May 2, 2025
38ebcbb
fix: provide support for local llm by adding it to the arguments
aravindkarnam May 5, 2025
a0555d5
merge: from next branch
aravindkarnam May 6, 2025
aaf0591
fix: removed unnecessary imports and installs
aravindkarnam May 6, 2025
206a9df
feat(crawler): add session management and view-source support
unclecode May 8, 2025
76dd86d
Merge remote-tracking branch 'origin/linkedin-prep' into next
unclecode May 8, 2025
a3e9ef9
fix(crawler): remove automatic page closure in screenshot methods
unclecode May 12, 2025
897e017
Set version to 0.6.3
unclecode May 12, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
feat(extraction): add RegexExtractionStrategy for pattern-based extra…
…ction

Add new RegexExtractionStrategy for fast, zero-LLM extraction of common data types:
- Built-in patterns for emails, URLs, phones, dates, and more
- Support for custom regex patterns
- LLM-assisted pattern generation utility
- Optimized HTML preprocessing with fit_html field
- Enhanced network response body capture

Breaking changes: None
  • Loading branch information
unclecode committed May 2, 2025
commit 9b5ccac76eab917e844bbe012dc03ef3fcda46a5
15 changes: 15 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,21 @@ All notable changes to Crawl4AI will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [0.6.2] - 2025-05-02

### Added
- New `RegexExtractionStrategy` for fast pattern-based extraction without requiring LLM
- Built-in patterns for emails, URLs, phone numbers, dates, and more
- Support for custom regex patterns
- `generate_pattern` utility for LLM-assisted pattern creation (one-time use)
- Added `fit_html` as a top-level field in `CrawlResult` for optimized HTML extraction
- Added support for network response body capture in network request tracking

### Changed
- Updated documentation for no-LLM extraction strategies
- Enhanced API reference to include RegexExtractionStrategy examples and usage
- Improved HTML preprocessing with optimized performance for extraction strategies

## [0.6.1] - 2025-04-24

### Added
Expand Down
4 changes: 3 additions & 1 deletion crawl4ai/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@
CosineStrategy,
JsonCssExtractionStrategy,
JsonXPathExtractionStrategy,
JsonLxmlExtractionStrategy
JsonLxmlExtractionStrategy,
RegexExtractionStrategy
)
from .chunking_strategy import ChunkingStrategy, RegexChunking
from .markdown_generation_strategy import DefaultMarkdownGenerator
Expand Down Expand Up @@ -105,6 +106,7 @@
"JsonCssExtractionStrategy",
"JsonXPathExtractionStrategy",
"JsonLxmlExtractionStrategy",
"RegexExtractionStrategy",
"ChunkingStrategy",
"RegexChunking",
"DefaultMarkdownGenerator",
Expand Down
15 changes: 14 additions & 1 deletion crawl4ai/async_crawler_strategy.py
Original file line number Diff line number Diff line change
Expand Up @@ -571,6 +571,14 @@ async def handle_request_capture(request):

async def handle_response_capture(response):
try:
try:
# body = await response.body()
# json_body = await response.json()
text_body = await response.text()
except Exception as e:
body = None
# json_body = None
# text_body = None
captured_requests.append({
"event_type": "response",
"url": response.url,
Expand All @@ -579,7 +587,12 @@ async def handle_response_capture(response):
"headers": dict(response.headers), # Convert Header dict
"from_service_worker": response.from_service_worker,
"request_timing": response.request.timing, # Detailed timing info
"timestamp": time.time()
"timestamp": time.time(),
"body" : {
# "raw": body,
# "json": json_body,
"text": text_body
}
})
except Exception as e:
if self.logger:
Expand Down
8 changes: 6 additions & 2 deletions crawl4ai/async_webcrawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -503,6 +503,8 @@ async def aprocess_html(
tables = media.pop("tables", [])
links = result.links.model_dump()
metadata = result.metadata

fit_html = preprocess_html_for_schema(html_content=html, text_threshold= 500, max_size= 300_000)

################################
# Generate Markdown #
Expand All @@ -519,7 +521,7 @@ async def aprocess_html(
html_source_selector = {
"raw_html": lambda: html, # The original raw HTML
"cleaned_html": lambda: cleaned_html, # The HTML after scraping strategy
"fit_html": lambda: preprocess_html_for_schema(html_content=html), # Preprocessed raw HTML
"fit_html": lambda: fit_html, # The HTML after preprocessing for schema
}

markdown_input_html = cleaned_html # Default to cleaned_html
Expand Down Expand Up @@ -593,14 +595,15 @@ async def aprocess_html(
content = {
"markdown": markdown_result.raw_markdown,
"html": html,
"fit_html": fit_html,
"cleaned_html": cleaned_html,
"fit_markdown": markdown_result.fit_markdown,
}.get(content_format, markdown_result.raw_markdown)

# Use IdentityChunking for HTML input, otherwise use provided chunking strategy
chunking = (
IdentityChunking()
if content_format in ["html", "cleaned_html"]
if content_format in ["html", "cleaned_html", "fit_html"]
else config.chunking_strategy
)
sections = chunking.chunk(content)
Expand All @@ -624,6 +627,7 @@ async def aprocess_html(
return CrawlResult(
url=url,
html=html,
fit_html=fit_html,
cleaned_html=cleaned_html,
markdown=markdown_result,
media=media,
Expand Down
2 changes: 1 addition & 1 deletion crawl4ai/browser_profiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -475,7 +475,7 @@ async def my_crawl_function(profile_path, url):
self.logger.warning(" No profiles found. Create one first with option 1.", tag="PROFILES")
continue

# Print profile information with colorama formatting
# Print profile information
self.logger.info("\nAvailable profiles:", tag="PROFILES")
for i, profile in enumerate(profiles):
self.logger.info(f"[{i+1}] {profile['name']}", tag="PROFILES")
Expand Down
Loading