Skip to content
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
b2f3cb0
WIP: logger migrate to rich
wakaka6 Apr 10, 2025
9499164
feat(browser): improve browser profile management and cleanup
unclecode Apr 29, 2025
50f0b83
feat(linkedin): add prospect-wizard app with scraping and visualization
unclecode Apr 30, 2025
cd2b490
refactor(logger): Apply the Enumeration for color
wakaka6 May 1, 2025
0e5d672
Merge branch 'pr-971' into merge-pr971
unclecode May 1, 2025
ee01b81
Merge branch 'merge-pr971' into next
unclecode May 1, 2025
7c2fd52
fix: incorrect params and commands in linkedin app readme
aravindkarnam May 1, 2025
94e9959
feat(docker-api): add job-based polling endpoints for crawl and LLM t…
unclecode May 1, 2025
baf7f6a
fix: typo in readme
aravindkarnam May 2, 2025
5cc58f9
fix: 1. duplicate verbose flag 2.inconsistency in argument name --pro…
aravindkarnam May 2, 2025
6650b2f
fix: replace openAI with litellm to support multiple llm providers
aravindkarnam May 2, 2025
bd5a9ac
updated readme with arguments for litellm
aravindkarnam May 2, 2025
87d4b0f
format bash scripts properly so copy & paste may work without issues
aravindkarnam May 2, 2025
9b5ccac
feat(extraction): add RegexExtractionStrategy for pattern-based extra…
unclecode May 2, 2025
38ebcbb
fix: provide support for local llm by adding it to the arguments
aravindkarnam May 5, 2025
a0555d5
merge:from next branch
aravindkarnam May 6, 2025
aaf0591
fix: removed unnecessary imports and installs
aravindkarnam May 6, 2025
206a9df
feat(crawler): add session management and view-source support
unclecode May 8, 2025
76dd86d
Merge remote-tracking branch 'origin/linkedin-prep' into next
unclecode May 8, 2025
a3e9ef9
fix(crawler): remove automatic page closure in screenshot methods
unclecode May 12, 2025
897e017
Set version to 0.6.3
unclecode May 12, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -263,4 +263,5 @@ tests/**/test_site
tests/**/reports
tests/**/benchmark_reports

docs/**/data
docs/**/data
.codecat/
15 changes: 15 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,21 @@ All notable changes to Crawl4AI will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [0.6.2] - 2025-05-02

### Added
- New `RegexExtractionStrategy` for fast pattern-based extraction without requiring LLM
- Built-in patterns for emails, URLs, phone numbers, dates, and more
- Support for custom regex patterns
- `generate_pattern` utility for LLM-assisted pattern creation (one-time use)
- Added `fit_html` as a top-level field in `CrawlResult` for optimized HTML extraction
- Added support for network response body capture in network request tracking

### Changed
- Updated documentation for no-LLM extraction strategies
- Enhanced API reference to include RegexExtractionStrategy examples and usage
- Improved HTML preprocessing with optimized performance for extraction strategies

## [0.6.1] - 2025-04-24

### Added
Expand Down
4 changes: 3 additions & 1 deletion crawl4ai/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@
CosineStrategy,
JsonCssExtractionStrategy,
JsonXPathExtractionStrategy,
JsonLxmlExtractionStrategy
JsonLxmlExtractionStrategy,
RegexExtractionStrategy
)
from .chunking_strategy import ChunkingStrategy, RegexChunking
from .markdown_generation_strategy import DefaultMarkdownGenerator
Expand Down Expand Up @@ -105,6 +106,7 @@
"JsonCssExtractionStrategy",
"JsonXPathExtractionStrategy",
"JsonLxmlExtractionStrategy",
"RegexExtractionStrategy",
"ChunkingStrategy",
"RegexChunking",
"DefaultMarkdownGenerator",
Expand Down
15 changes: 14 additions & 1 deletion crawl4ai/async_crawler_strategy.py
Original file line number Diff line number Diff line change
Expand Up @@ -571,6 +571,14 @@ async def handle_request_capture(request):

async def handle_response_capture(response):
try:
try:
# body = await response.body()
# json_body = await response.json()
text_body = await response.text()
except Exception as e:
body = None
# json_body = None
# text_body = None
captured_requests.append({
"event_type": "response",
"url": response.url,
Expand All @@ -579,7 +587,12 @@ async def handle_response_capture(response):
"headers": dict(response.headers), # Convert Header dict
"from_service_worker": response.from_service_worker,
"request_timing": response.request.timing, # Detailed timing info
"timestamp": time.time()
"timestamp": time.time(),
"body" : {
# "raw": body,
# "json": json_body,
"text": text_body
}
})
except Exception as e:
if self.logger:
Expand Down
10 changes: 8 additions & 2 deletions crawl4ai/async_database.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,10 @@ async def get_connection(self):
f"Code context:\n{error_context['code_context']}"
)
self.logger.error(
message=create_box_message(error_message, type="error"),
message="{error}",
tag="ERROR",
params={"error": str(error_message)},
boxes=["error"],
)

raise
Expand All @@ -189,7 +192,10 @@ async def get_connection(self):
f"Code context:\n{error_context['code_context']}"
)
self.logger.error(
message=create_box_message(error_message, type="error"),
message="{error}",
tag="ERROR",
params={"error": str(error_message)},
boxes=["error"],
)
raise
finally:
Expand Down
130 changes: 66 additions & 64 deletions crawl4ai/async_logger.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
from abc import ABC, abstractmethod
from enum import Enum
from typing import Optional, Dict, Any
from colorama import Fore, Style, init
from typing import Optional, Dict, Any, List
import os
from datetime import datetime
from urllib.parse import unquote
from rich.console import Console
from rich.text import Text
from .utils import create_box_message


class LogLevel(Enum):
Expand All @@ -21,6 +23,26 @@ class LogLevel(Enum):
FATAL = 10


def __str__(self):
return self.name.lower()

class LogColor(str, Enum):
    """Enum mapping log roles to rich-compatible color strings.

    Mixes in ``str`` so members compare equal to their color string and can
    be interpolated directly into rich markup such as ``[cyan]...[/cyan]``.
    Note: members sharing a value (e.g. INFO and CYAN) are enum aliases.
    """

    # colorama's Fore.LIGHTBLACK_EX equivalent. rich's color name is
    # "bright_black" — "lightblack" is not a valid rich color and makes
    # Console.print raise a markup/style error for DEBUG output.
    DEBUG = "bright_black"
    INFO = "cyan"
    SUCCESS = "green"
    WARNING = "yellow"
    ERROR = "red"
    CYAN = "cyan"
    GREEN = "green"
    YELLOW = "yellow"
    MAGENTA = "magenta"
    DIM_MAGENTA = "dim magenta"

    def __str__(self):
        """Return the raw rich color string (used when formatting markup)."""
        return self.value


class AsyncLoggerBase(ABC):
Expand Down Expand Up @@ -52,6 +74,7 @@ def url_status(self, url: str, success: bool, timing: float, tag: str = "FETCH",
def error_status(self, url: str, error: str, tag: str = "ERROR", url_length: int = 100):
pass


class AsyncLogger(AsyncLoggerBase):
"""
Asynchronous logger with support for colored console output and file logging.
Expand Down Expand Up @@ -79,17 +102,11 @@ class AsyncLogger(AsyncLoggerBase):
}

DEFAULT_COLORS = {
LogLevel.DEBUG: Fore.LIGHTBLACK_EX,
LogLevel.INFO: Fore.CYAN,
LogLevel.SUCCESS: Fore.GREEN,
LogLevel.WARNING: Fore.YELLOW,
LogLevel.ERROR: Fore.RED,
LogLevel.CRITICAL: Fore.RED + Style.BRIGHT,
LogLevel.ALERT: Fore.RED + Style.BRIGHT,
LogLevel.NOTICE: Fore.BLUE,
LogLevel.EXCEPTION: Fore.RED + Style.BRIGHT,
LogLevel.FATAL: Fore.RED + Style.BRIGHT,
LogLevel.DEFAULT: Fore.WHITE,
LogLevel.DEBUG: LogColor.DEBUG,
LogLevel.INFO: LogColor.INFO,
LogLevel.SUCCESS: LogColor.SUCCESS,
LogLevel.WARNING: LogColor.WARNING,
LogLevel.ERROR: LogColor.ERROR,
}

def __init__(
Expand All @@ -98,7 +115,7 @@ def __init__(
log_level: LogLevel = LogLevel.DEBUG,
tag_width: int = 10,
icons: Optional[Dict[str, str]] = None,
colors: Optional[Dict[LogLevel, str]] = None,
colors: Optional[Dict[LogLevel, LogColor]] = None,
verbose: bool = True,
):
"""
Expand All @@ -112,13 +129,13 @@ def __init__(
colors: Custom colors for different log levels
verbose: Whether to output to console
"""
init() # Initialize colorama
self.log_file = log_file
self.log_level = log_level
self.tag_width = tag_width
self.icons = icons or self.DEFAULT_ICONS
self.colors = colors or self.DEFAULT_COLORS
self.verbose = verbose
self.console = Console()

# Create log file directory if needed
if log_file:
Expand All @@ -143,25 +160,21 @@ def _shorten(self, text, length, placeholder="..."):
def _write_to_file(self, message: str):
"""Write a message to the log file if configured."""
if self.log_file:
text = Text.from_markup(message)
plain_text = text.plain
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
with open(self.log_file, "a", encoding="utf-8") as f:
# Strip ANSI color codes for file output
clean_message = message.replace(Fore.RESET, "").replace(
Style.RESET_ALL, ""
)
for color in vars(Fore).values():
if isinstance(color, str):
clean_message = clean_message.replace(color, "")
f.write(f"[{timestamp}] {clean_message}\n")
f.write(f"[{timestamp}] {plain_text}\n")

def _log(
self,
level: LogLevel,
message: str,
tag: str,
params: Optional[Dict[str, Any]] = None,
colors: Optional[Dict[str, str]] = None,
base_color: Optional[str] = None,
colors: Optional[Dict[str, LogColor]] = None,
boxes: Optional[List[str]] = None,
base_color: Optional[LogColor] = None,
**kwargs,
):
"""
Expand All @@ -173,55 +186,44 @@ def _log(
tag: Tag for the message
params: Parameters to format into the message
colors: Color overrides for specific parameters
boxes: Box overrides for specific parameters
base_color: Base color for the entire message
"""
if level.value < self.log_level.value:
return

# Format the message with parameters if provided
# avoid conflict with rich formatting
parsed_message = message.replace("[", "[[").replace("]", "]]")
if params:
try:
# First format the message with raw parameters
formatted_message = message.format(**params)

# Then apply colors if specified
color_map = {
"green": Fore.GREEN,
"red": Fore.RED,
"yellow": Fore.YELLOW,
"blue": Fore.BLUE,
"cyan": Fore.CYAN,
"magenta": Fore.MAGENTA,
"white": Fore.WHITE,
"black": Fore.BLACK,
"reset": Style.RESET_ALL,
}
if colors:
for key, color in colors.items():
# Find the formatted value in the message and wrap it with color
if color in color_map:
color = color_map[color]
if key in params:
value_str = str(params[key])
formatted_message = formatted_message.replace(
value_str, f"{color}{value_str}{Style.RESET_ALL}"
)

except KeyError as e:
formatted_message = (
f"LOGGING ERROR: Missing parameter {e} in message template"
)
level = LogLevel.ERROR
# FIXME: If there are formatting strings in floating point format,
# this may result in colors and boxes not being applied properly.
# such as {value:.2f}, the value is 0.23333 format it to 0.23,
# but we replace("0.23333", "[color]0.23333[/color]")
formatted_message = parsed_message.format(**params)
for key, value in params.items():
# value_str may discard `[` and `]`, so we need to replace it.
value_str = str(value).replace("[", "[[").replace("]", "]]")
# check is need apply color
if colors and key in colors:
color_str = f"[{colors[key]}]{value_str}[/{colors[key]}]"
formatted_message = formatted_message.replace(value_str, color_str)
value_str = color_str

# check is need apply box
if boxes and key in boxes:
formatted_message = formatted_message.replace(value_str,
create_box_message(value_str, type=str(level)))

else:
formatted_message = message
formatted_message = parsed_message

# Construct the full log line
color = base_color or self.colors[level]
log_line = f"{color}{self._format_tag(tag)} {self._get_icon(tag)} {formatted_message}{Style.RESET_ALL}"
color: LogColor = base_color or self.colors[level]
log_line = f"[{color}]{self._format_tag(tag)} {self._get_icon(tag)} {formatted_message} [/{color}]"

# Output to console if verbose
if self.verbose or kwargs.get("force_verbose", False):
print(log_line)
self.console.print(log_line)

# Write to file if configured
self._write_to_file(log_line)
Expand Down Expand Up @@ -292,8 +294,8 @@ def url_status(
"timing": timing,
},
colors={
"status": Fore.GREEN if success else Fore.RED,
"timing": Fore.YELLOW,
"status": LogColor.SUCCESS if success else LogColor.ERROR,
"timing": LogColor.WARNING,
},
)

Expand Down
12 changes: 7 additions & 5 deletions crawl4ai/async_webcrawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
import os
import sys
import time
from colorama import Fore
from pathlib import Path
from typing import Optional, List
import json
Expand Down Expand Up @@ -44,7 +43,6 @@
sanitize_input_encode,
InvalidCSSSelectorError,
fast_format_html,
create_box_message,
get_error_context,
RobotsParser,
preprocess_html_for_schema,
Expand Down Expand Up @@ -419,7 +417,7 @@ async def arun(

self.logger.error_status(
url=url,
error=create_box_message(error_message, type="error"),
error=error_message,
tag="ERROR",
)

Expand Down Expand Up @@ -505,6 +503,8 @@ async def aprocess_html(
tables = media.pop("tables", [])
links = result.links.model_dump()
metadata = result.metadata

fit_html = preprocess_html_for_schema(html_content=html, text_threshold= 500, max_size= 300_000)

################################
# Generate Markdown #
Expand All @@ -521,7 +521,7 @@ async def aprocess_html(
html_source_selector = {
"raw_html": lambda: html, # The original raw HTML
"cleaned_html": lambda: cleaned_html, # The HTML after scraping strategy
"fit_html": lambda: preprocess_html_for_schema(html_content=html), # Preprocessed raw HTML
"fit_html": lambda: fit_html, # The HTML after preprocessing for schema
}

markdown_input_html = cleaned_html # Default to cleaned_html
Expand Down Expand Up @@ -595,14 +595,15 @@ async def aprocess_html(
content = {
"markdown": markdown_result.raw_markdown,
"html": html,
"fit_html": fit_html,
"cleaned_html": cleaned_html,
"fit_markdown": markdown_result.fit_markdown,
}.get(content_format, markdown_result.raw_markdown)

# Use IdentityChunking for HTML input, otherwise use provided chunking strategy
chunking = (
IdentityChunking()
if content_format in ["html", "cleaned_html"]
if content_format in ["html", "cleaned_html", "fit_html"]
else config.chunking_strategy
)
sections = chunking.chunk(content)
Expand All @@ -626,6 +627,7 @@ async def aprocess_html(
return CrawlResult(
url=url,
html=html,
fit_html=fit_html,
cleaned_html=cleaned_html,
markdown=markdown_result,
media=media,
Expand Down
Loading