Skip to content
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
b2f3cb0
WIP: logger migrate to rich
wakaka6 Apr 10, 2025
9499164
feat(browser): improve browser profile management and cleanup
unclecode Apr 29, 2025
50f0b83
feat(linkedin): add prospect-wizard app with scraping and visualization
unclecode Apr 30, 2025
cd2b490
refactor(logger): Apply the Enumeration for color
wakaka6 May 1, 2025
0e5d672
Merge branch 'pr-971' into merge-pr971
unclecode May 1, 2025
ee01b81
Merge branch 'merge-pr971' into next
unclecode May 1, 2025
7c2fd52
fix: incorrect params and commands in linkedin app readme
aravindkarnam May 1, 2025
94e9959
feat(docker-api): add job-based polling endpoints for crawl and LLM t…
unclecode May 1, 2025
baf7f6a
fix: typo in readme
aravindkarnam May 2, 2025
5cc58f9
fix: 1. duplicate verbose flag 2.inconsistency in argument name --pro…
aravindkarnam May 2, 2025
6650b2f
fix: replace openAI with litellm to support multiple llm providers
aravindkarnam May 2, 2025
bd5a9ac
updated readme with arguments for litellm
aravindkarnam May 2, 2025
87d4b0f
format bash scripts properly so copy & paste may work without issues
aravindkarnam May 2, 2025
9b5ccac
feat(extraction): add RegexExtractionStrategy for pattern-based extra…
unclecode May 2, 2025
38ebcbb
fix: provide support for local llm by adding it to the arguments
aravindkarnam May 5, 2025
a0555d5
merge:from next branch
aravindkarnam May 6, 2025
aaf0591
fix: removed unnecessary imports and installs
aravindkarnam May 6, 2025
206a9df
feat(crawler): add session management and view-source support
unclecode May 8, 2025
76dd86d
Merge remote-tracking branch 'origin/linkedin-prep' into next
unclecode May 8, 2025
a3e9ef9
fix(crawler): remove automatic page closure in screenshot methods
unclecode May 12, 2025
897e017
Set version to 0.6.3
unclecode May 12, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -263,4 +263,5 @@ tests/**/test_site
tests/**/reports
tests/**/benchmark_reports

docs/**/data
docs/**/data
.codecat/
15 changes: 15 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,21 @@ All notable changes to Crawl4AI will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [0.6.2] - 2025-05-02

### Added
- New `RegexExtractionStrategy` for fast pattern-based extraction without requiring LLM
- Built-in patterns for emails, URLs, phone numbers, dates, and more
- Support for custom regex patterns
- `generate_pattern` utility for LLM-assisted pattern creation (one-time use)
- Added `fit_html` as a top-level field in `CrawlResult` for optimized HTML extraction
- Added support for network response body capture in network request tracking

### Changed
- Updated documentation for no-LLM extraction strategies
- Enhanced API reference to include RegexExtractionStrategy examples and usage
- Improved HTML preprocessing with optimized performance for extraction strategies

## [0.6.1] - 2025-04-24

### Added
Expand Down
4 changes: 3 additions & 1 deletion crawl4ai/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@
CosineStrategy,
JsonCssExtractionStrategy,
JsonXPathExtractionStrategy,
JsonLxmlExtractionStrategy
JsonLxmlExtractionStrategy,
RegexExtractionStrategy
)
from .chunking_strategy import ChunkingStrategy, RegexChunking
from .markdown_generation_strategy import DefaultMarkdownGenerator
Expand Down Expand Up @@ -105,6 +106,7 @@
"JsonCssExtractionStrategy",
"JsonXPathExtractionStrategy",
"JsonLxmlExtractionStrategy",
"RegexExtractionStrategy",
"ChunkingStrategy",
"RegexChunking",
"DefaultMarkdownGenerator",
Expand Down
15 changes: 14 additions & 1 deletion crawl4ai/async_crawler_strategy.py
Original file line number Diff line number Diff line change
Expand Up @@ -571,6 +571,14 @@ async def handle_request_capture(request):

async def handle_response_capture(response):
try:
try:
# body = await response.body()
# json_body = await response.json()
text_body = await response.text()
except Exception as e:
body = None
# json_body = None
# text_body = None
captured_requests.append({
"event_type": "response",
"url": response.url,
Expand All @@ -579,7 +587,12 @@ async def handle_response_capture(response):
"headers": dict(response.headers), # Convert Header dict
"from_service_worker": response.from_service_worker,
"request_timing": response.request.timing, # Detailed timing info
"timestamp": time.time()
"timestamp": time.time(),
"body" : {
# "raw": body,
# "json": json_body,
"text": text_body
}
})
except Exception as e:
if self.logger:
Expand Down
10 changes: 8 additions & 2 deletions crawl4ai/async_database.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,10 @@ async def get_connection(self):
f"Code context:\n{error_context['code_context']}"
)
self.logger.error(
message=create_box_message(error_message, type="error"),
message="{error}",
tag="ERROR",
params={"error": str(error_message)},
boxes=["error"],
)

raise
Expand All @@ -189,7 +192,10 @@ async def get_connection(self):
f"Code context:\n{error_context['code_context']}"
)
self.logger.error(
message=create_box_message(error_message, type="error"),
message="{error}",
tag="ERROR",
params={"error": str(error_message)},
boxes=["error"],
)
raise
finally:
Expand Down
130 changes: 66 additions & 64 deletions crawl4ai/async_logger.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
from abc import ABC, abstractmethod
from enum import Enum
from typing import Optional, Dict, Any
from colorama import Fore, Style, init
from typing import Optional, Dict, Any, List
import os
from datetime import datetime
from urllib.parse import unquote
from rich.console import Console
from rich.text import Text
from .utils import create_box_message


class LogLevel(Enum):
Expand All @@ -21,6 +23,26 @@ class LogLevel(Enum):
FATAL = 10


def __str__(self):
return self.name.lower()

class LogColor(str, Enum):
    """Enum mapping log roles to rich-compatible color strings.

    Mixes in ``str`` so members compare equal to their color string and can
    be interpolated directly into rich markup such as ``[cyan]...[/cyan]``.
    Note: members sharing a value (e.g. INFO and CYAN) are enum aliases.
    """

    # colorama's Fore.LIGHTBLACK_EX equivalent. rich's color name is
    # "bright_black" — "lightblack" is not a valid rich color and makes
    # Console.print raise a markup/style error for DEBUG output.
    DEBUG = "bright_black"
    INFO = "cyan"
    SUCCESS = "green"
    WARNING = "yellow"
    ERROR = "red"
    CYAN = "cyan"
    GREEN = "green"
    YELLOW = "yellow"
    MAGENTA = "magenta"
    DIM_MAGENTA = "dim magenta"

    def __str__(self):
        """Return the raw rich color string (used when formatting markup)."""
        return self.value


class AsyncLoggerBase(ABC):
Expand Down Expand Up @@ -52,6 +74,7 @@ def url_status(self, url: str, success: bool, timing: float, tag: str = "FETCH",
def error_status(self, url: str, error: str, tag: str = "ERROR", url_length: int = 100):
pass


class AsyncLogger(AsyncLoggerBase):
"""
Asynchronous logger with support for colored console output and file logging.
Expand Down Expand Up @@ -79,17 +102,11 @@ class AsyncLogger(AsyncLoggerBase):
}

DEFAULT_COLORS = {
LogLevel.DEBUG: Fore.LIGHTBLACK_EX,
LogLevel.INFO: Fore.CYAN,
LogLevel.SUCCESS: Fore.GREEN,
LogLevel.WARNING: Fore.YELLOW,
LogLevel.ERROR: Fore.RED,
LogLevel.CRITICAL: Fore.RED + Style.BRIGHT,
LogLevel.ALERT: Fore.RED + Style.BRIGHT,
LogLevel.NOTICE: Fore.BLUE,
LogLevel.EXCEPTION: Fore.RED + Style.BRIGHT,
LogLevel.FATAL: Fore.RED + Style.BRIGHT,
LogLevel.DEFAULT: Fore.WHITE,
LogLevel.DEBUG: LogColor.DEBUG,
LogLevel.INFO: LogColor.INFO,
LogLevel.SUCCESS: LogColor.SUCCESS,
LogLevel.WARNING: LogColor.WARNING,
LogLevel.ERROR: LogColor.ERROR,
}

def __init__(
Expand All @@ -98,7 +115,7 @@ def __init__(
log_level: LogLevel = LogLevel.DEBUG,
tag_width: int = 10,
icons: Optional[Dict[str, str]] = None,
colors: Optional[Dict[LogLevel, str]] = None,
colors: Optional[Dict[LogLevel, LogColor]] = None,
verbose: bool = True,
):
"""
Expand All @@ -112,13 +129,13 @@ def __init__(
colors: Custom colors for different log levels
verbose: Whether to output to console
"""
init() # Initialize colorama
self.log_file = log_file
self.log_level = log_level
self.tag_width = tag_width
self.icons = icons or self.DEFAULT_ICONS
self.colors = colors or self.DEFAULT_COLORS
self.verbose = verbose
self.console = Console()

# Create log file directory if needed
if log_file:
Expand All @@ -143,25 +160,21 @@ def _shorten(self, text, length, placeholder="..."):
def _write_to_file(self, message: str):
"""Write a message to the log file if configured."""
if self.log_file:
text = Text.from_markup(message)
plain_text = text.plain
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
with open(self.log_file, "a", encoding="utf-8") as f:
# Strip ANSI color codes for file output
clean_message = message.replace(Fore.RESET, "").replace(
Style.RESET_ALL, ""
)
for color in vars(Fore).values():
if isinstance(color, str):
clean_message = clean_message.replace(color, "")
f.write(f"[{timestamp}] {clean_message}\n")
f.write(f"[{timestamp}] {plain_text}\n")

def _log(
self,
level: LogLevel,
message: str,
tag: str,
params: Optional[Dict[str, Any]] = None,
colors: Optional[Dict[str, str]] = None,
base_color: Optional[str] = None,
colors: Optional[Dict[str, LogColor]] = None,
boxes: Optional[List[str]] = None,
base_color: Optional[LogColor] = None,
**kwargs,
):
"""
Expand All @@ -173,55 +186,44 @@ def _log(
tag: Tag for the message
params: Parameters to format into the message
colors: Color overrides for specific parameters
boxes: Box overrides for specific parameters
base_color: Base color for the entire message
"""
if level.value < self.log_level.value:
return

# Format the message with parameters if provided
# avoid conflict with rich formatting
parsed_message = message.replace("[", "[[").replace("]", "]]")
if params:
try:
# First format the message with raw parameters
formatted_message = message.format(**params)

# Then apply colors if specified
color_map = {
"green": Fore.GREEN,
"red": Fore.RED,
"yellow": Fore.YELLOW,
"blue": Fore.BLUE,
"cyan": Fore.CYAN,
"magenta": Fore.MAGENTA,
"white": Fore.WHITE,
"black": Fore.BLACK,
"reset": Style.RESET_ALL,
}
if colors:
for key, color in colors.items():
# Find the formatted value in the message and wrap it with color
if color in color_map:
color = color_map[color]
if key in params:
value_str = str(params[key])
formatted_message = formatted_message.replace(
value_str, f"{color}{value_str}{Style.RESET_ALL}"
)

except KeyError as e:
formatted_message = (
f"LOGGING ERROR: Missing parameter {e} in message template"
)
level = LogLevel.ERROR
# FIXME: If there are formatting strings in floating point format,
# this may result in colors and boxes not being applied properly.
# such as {value:.2f}, the value is 0.23333 format it to 0.23,
# but we replace("0.23333", "[color]0.23333[/color]")
formatted_message = parsed_message.format(**params)
for key, value in params.items():
# value_str may discard `[` and `]`, so we need to replace it.
value_str = str(value).replace("[", "[[").replace("]", "]]")
# check is need apply color
if colors and key in colors:
color_str = f"[{colors[key]}]{value_str}[/{colors[key]}]"
formatted_message = formatted_message.replace(value_str, color_str)
value_str = color_str

# check is need apply box
if boxes and key in boxes:
formatted_message = formatted_message.replace(value_str,
create_box_message(value_str, type=str(level)))

else:
formatted_message = message
formatted_message = parsed_message

# Construct the full log line
color = base_color or self.colors[level]
log_line = f"{color}{self._format_tag(tag)} {self._get_icon(tag)} {formatted_message}{Style.RESET_ALL}"
color: LogColor = base_color or self.colors[level]
log_line = f"[{color}]{self._format_tag(tag)} {self._get_icon(tag)} {formatted_message} [/{color}]"

# Output to console if verbose
if self.verbose or kwargs.get("force_verbose", False):
print(log_line)
self.console.print(log_line)

# Write to file if configured
self._write_to_file(log_line)
Expand Down Expand Up @@ -292,8 +294,8 @@ def url_status(
"timing": timing,
},
colors={
"status": Fore.GREEN if success else Fore.RED,
"timing": Fore.YELLOW,
"status": LogColor.SUCCESS if success else LogColor.ERROR,
"timing": LogColor.WARNING,
},
)

Expand Down
12 changes: 7 additions & 5 deletions crawl4ai/async_webcrawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
import os
import sys
import time
from colorama import Fore
from pathlib import Path
from typing import Optional, List
import json
Expand Down Expand Up @@ -44,7 +43,6 @@
sanitize_input_encode,
InvalidCSSSelectorError,
fast_format_html,
create_box_message,
get_error_context,
RobotsParser,
preprocess_html_for_schema,
Expand Down Expand Up @@ -419,7 +417,7 @@ async def arun(

self.logger.error_status(
url=url,
error=create_box_message(error_message, type="error"),
error=error_message,
tag="ERROR",
)

Expand Down Expand Up @@ -505,6 +503,8 @@ async def aprocess_html(
tables = media.pop("tables", [])
links = result.links.model_dump()
metadata = result.metadata

fit_html = preprocess_html_for_schema(html_content=html, text_threshold= 500, max_size= 300_000)

################################
# Generate Markdown #
Expand All @@ -521,7 +521,7 @@ async def aprocess_html(
html_source_selector = {
"raw_html": lambda: html, # The original raw HTML
"cleaned_html": lambda: cleaned_html, # The HTML after scraping strategy
"fit_html": lambda: preprocess_html_for_schema(html_content=html), # Preprocessed raw HTML
"fit_html": lambda: fit_html, # The HTML after preprocessing for schema
}

markdown_input_html = cleaned_html # Default to cleaned_html
Expand Down Expand Up @@ -595,14 +595,15 @@ async def aprocess_html(
content = {
"markdown": markdown_result.raw_markdown,
"html": html,
"fit_html": fit_html,
"cleaned_html": cleaned_html,
"fit_markdown": markdown_result.fit_markdown,
}.get(content_format, markdown_result.raw_markdown)

# Use IdentityChunking for HTML input, otherwise use provided chunking strategy
chunking = (
IdentityChunking()
if content_format in ["html", "cleaned_html"]
if content_format in ["html", "cleaned_html", "fit_html"]
else config.chunking_strategy
)
sections = chunking.chunk(content)
Expand All @@ -626,6 +627,7 @@ async def aprocess_html(
return CrawlResult(
url=url,
html=html,
fit_html=fit_html,
cleaned_html=cleaned_html,
markdown=markdown_result,
media=media,
Expand Down
Loading