Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
b2f3cb0
WIP: migrate logger to rich
wakaka6 Apr 10, 2025
9499164
feat(browser): improve browser profile management and cleanup
unclecode Apr 29, 2025
50f0b83
feat(linkedin): add prospect-wizard app with scraping and visualization
unclecode Apr 30, 2025
cd2b490
refactor(logger): Apply the Enumeration for color
wakaka6 May 1, 2025
0e5d672
Merge branch 'pr-971' into merge-pr971
unclecode May 1, 2025
ee01b81
Merge branch 'merge-pr971' into next
unclecode May 1, 2025
7c2fd52
fix: incorrect params and commands in linkedin app readme
aravindkarnam May 1, 2025
94e9959
feat(docker-api): add job-based polling endpoints for crawl and LLM t…
unclecode May 1, 2025
baf7f6a
fix: typo in readme
aravindkarnam May 2, 2025
5cc58f9
fix: 1. duplicate verbose flag 2. inconsistency in argument name --pro…
aravindkarnam May 2, 2025
6650b2f
fix: replace openAI with litellm to support multiple llm providers
aravindkarnam May 2, 2025
bd5a9ac
updated readme with arguments for litellm
aravindkarnam May 2, 2025
87d4b0f
format bash scripts properly so copy & paste may work without issues
aravindkarnam May 2, 2025
9b5ccac
feat(extraction): add RegexExtractionStrategy for pattern-based extra…
unclecode May 2, 2025
38ebcbb
fix: provide support for local llm by adding it to the arguments
aravindkarnam May 5, 2025
a0555d5
merge: from next branch
aravindkarnam May 6, 2025
aaf0591
fix: removed unnecessary imports and installs
aravindkarnam May 6, 2025
206a9df
feat(crawler): add session management and view-source support
unclecode May 8, 2025
76dd86d
Merge remote-tracking branch 'origin/linkedin-prep' into next
unclecode May 8, 2025
a3e9ef9
fix(crawler): remove automatic page closure in screenshot methods
unclecode May 12, 2025
897e017
Set version to 0.6.3
unclecode May 12, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
feat(extraction): add RegexExtractionStrategy for pattern-based extra…
…ction

Add new RegexExtractionStrategy for fast, zero-LLM extraction of common data types:
- Built-in patterns for emails, URLs, phones, dates, and more
- Support for custom regex patterns
- LLM-assisted pattern generation utility
- Optimized HTML preprocessing with fit_html field
- Enhanced network response body capture

Breaking changes: None
  • Loading branch information
unclecode committed May 2, 2025
commit 9b5ccac76eab917e844bbe012dc03ef3fcda46a5
15 changes: 15 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,21 @@ All notable changes to Crawl4AI will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [0.6.2] - 2025-05-02

### Added
- New `RegexExtractionStrategy` for fast pattern-based extraction without requiring LLM
- Built-in patterns for emails, URLs, phone numbers, dates, and more
- Support for custom regex patterns
- `generate_pattern` utility for LLM-assisted pattern creation (one-time use)
- Added `fit_html` as a top-level field in `CrawlResult` for optimized HTML extraction
- Added support for network response body capture in network request tracking

### Changed
- Updated documentation for no-LLM extraction strategies
- Enhanced API reference to include RegexExtractionStrategy examples and usage
- Improved HTML preprocessing with optimized performance for extraction strategies

## [0.6.1] - 2025-04-24

### Added
Expand Down
4 changes: 3 additions & 1 deletion crawl4ai/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@
CosineStrategy,
JsonCssExtractionStrategy,
JsonXPathExtractionStrategy,
JsonLxmlExtractionStrategy
JsonLxmlExtractionStrategy,
RegexExtractionStrategy
)
from .chunking_strategy import ChunkingStrategy, RegexChunking
from .markdown_generation_strategy import DefaultMarkdownGenerator
Expand Down Expand Up @@ -105,6 +106,7 @@
"JsonCssExtractionStrategy",
"JsonXPathExtractionStrategy",
"JsonLxmlExtractionStrategy",
"RegexExtractionStrategy",
"ChunkingStrategy",
"RegexChunking",
"DefaultMarkdownGenerator",
Expand Down
15 changes: 14 additions & 1 deletion crawl4ai/async_crawler_strategy.py
Original file line number Diff line number Diff line change
Expand Up @@ -571,6 +571,14 @@ async def handle_request_capture(request):

async def handle_response_capture(response):
try:
try:
# body = await response.body()
# json_body = await response.json()
text_body = await response.text()
except Exception as e:
body = None
# json_body = None
# text_body = None
captured_requests.append({
"event_type": "response",
"url": response.url,
Expand All @@ -579,7 +587,12 @@ async def handle_response_capture(response):
"headers": dict(response.headers), # Convert Header dict
"from_service_worker": response.from_service_worker,
"request_timing": response.request.timing, # Detailed timing info
"timestamp": time.time()
"timestamp": time.time(),
"body" : {
# "raw": body,
# "json": json_body,
"text": text_body
}
})
except Exception as e:
if self.logger:
Expand Down
8 changes: 6 additions & 2 deletions crawl4ai/async_webcrawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -503,6 +503,8 @@ async def aprocess_html(
tables = media.pop("tables", [])
links = result.links.model_dump()
metadata = result.metadata

fit_html = preprocess_html_for_schema(html_content=html, text_threshold= 500, max_size= 300_000)

################################
# Generate Markdown #
Expand All @@ -519,7 +521,7 @@ async def aprocess_html(
html_source_selector = {
"raw_html": lambda: html, # The original raw HTML
"cleaned_html": lambda: cleaned_html, # The HTML after scraping strategy
"fit_html": lambda: preprocess_html_for_schema(html_content=html), # Preprocessed raw HTML
"fit_html": lambda: fit_html, # The HTML after preprocessing for schema
}

markdown_input_html = cleaned_html # Default to cleaned_html
Expand Down Expand Up @@ -593,14 +595,15 @@ async def aprocess_html(
content = {
"markdown": markdown_result.raw_markdown,
"html": html,
"fit_html": fit_html,
"cleaned_html": cleaned_html,
"fit_markdown": markdown_result.fit_markdown,
}.get(content_format, markdown_result.raw_markdown)

# Use IdentityChunking for HTML input, otherwise use provided chunking strategy
chunking = (
IdentityChunking()
if content_format in ["html", "cleaned_html"]
if content_format in ["html", "cleaned_html", "fit_html"]
else config.chunking_strategy
)
sections = chunking.chunk(content)
Expand All @@ -624,6 +627,7 @@ async def aprocess_html(
return CrawlResult(
url=url,
html=html,
fit_html=fit_html,
cleaned_html=cleaned_html,
markdown=markdown_result,
media=media,
Expand Down
2 changes: 1 addition & 1 deletion crawl4ai/browser_profiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -475,7 +475,7 @@ async def my_crawl_function(profile_path, url):
self.logger.warning(" No profiles found. Create one first with option 1.", tag="PROFILES")
continue

# Print profile information with colorama formatting
# Print profile information
self.logger.info("\nAvailable profiles:", tag="PROFILES")
for i, profile in enumerate(profiles):
self.logger.info(f"[{i+1}] {profile['name']}", tag="PROFILES")
Expand Down
Loading