""" | |
Crawl4AI v0.4.24 Feature Walkthrough | |
=================================== | |
This script demonstrates the new features introduced in Crawl4AI v0.4.24. | |
Each section includes detailed examples and explanations of the new capabilities. | |
""" | |
import asyncio
import os
import json
import re
from typing import List, Optional, Dict, Any
from pydantic import BaseModel, Field

from crawl4ai import (
    AsyncWebCrawler,
    BrowserConfig,
    CrawlerRunConfig,
    CacheMode,
    LLMExtractionStrategy,
    JsonCssExtractionStrategy
)
from crawl4ai.content_filter_strategy import RelevantContentFilter
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
from bs4 import BeautifulSoup

# Sample HTML for demonstrations
SAMPLE_HTML = """
<div class="article-list">
    <article class="post" data-category="tech" data-author="john">
        <h2 class="title"><a href="/post-1">First Post</a></h2>
        <div class="meta">
            <a href="/author/john" class="author">John Doe</a>
            <span class="date">2023-12-31</span>
        </div>
        <div class="content">
            <p>First post content...</p>
            <a href="/read-more-1" class="read-more">Read More</a>
        </div>
    </article>
    <article class="post" data-category="science" data-author="jane">
        <h2 class="title"><a href="/post-2">Second Post</a></h2>
        <div class="meta">
            <a href="/author/jane" class="author">Jane Smith</a>
            <span class="date">2023-12-30</span>
        </div>
        <div class="content">
            <p>Second post content...</p>
            <a href="/read-more-2" class="read-more">Read More</a>
        </div>
    </article>
</div>
"""


async def demo_ssl_features():
    """
    Enhanced SSL & Security Features Demo
    -------------------------------------
    This example demonstrates the new SSL certificate handling and security features:
    1. Custom certificate paths
    2. SSL verification options
    3. HTTPS error handling
    4. Certificate validation configurations

    These features are particularly useful when:
    - Working with self-signed certificates
    - Dealing with corporate proxies
    - Handling mixed content websites
    - Managing different SSL security levels
    """
    print("\n1. Enhanced SSL & Security Demo")
    print("--------------------------------")

    browser_config = BrowserConfig()
    run_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        fetch_ssl_certificate=True  # Enable SSL certificate fetching
    )

    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(
            url="https://example.com",
            config=run_config
        )
        print(f"SSL Crawl Success: {result.success}")
        if not result.success:
            print(f"SSL Error: {result.error_message}")
        elif result.ssl_certificate:
            # Export the fetched certificate only when the crawl succeeded
            result.ssl_certificate.to_json(
                os.path.join(os.getcwd(), "ssl_certificate.json")
            )


async def demo_content_filtering():
    """
    Smart Content Filtering Demo
    ----------------------------
    Demonstrates advanced content filtering capabilities:
    1. Custom filter to identify and extract specific content
    2. Integration with markdown generation
    3. Flexible pruning rules
    """
    print("\n2. Smart Content Filtering Demo")
    print("--------------------------------")

    # Create a custom content filter
    class CustomNewsFilter(RelevantContentFilter):
        def __init__(self):
            super().__init__()
            # Add news-specific patterns
            self.negative_patterns = re.compile(
                r'nav|footer|header|sidebar|ads|comment|share|related|recommended|popular|trending',
                re.I
            )
            self.min_word_count = 30  # Higher threshold for news content

        def filter_content(self, html: str, min_word_threshold: int = None) -> List[str]:
            """
            Implements news-specific content filtering logic.

            Args:
                html (str): HTML content to be filtered
                min_word_threshold (int, optional): Minimum word count threshold

            Returns:
                List[str]: List of filtered HTML content blocks
            """
            if not html or not isinstance(html, str):
                return []

            soup = BeautifulSoup(html, 'lxml')
            if not soup.body:
                soup = BeautifulSoup(f'<body>{html}</body>', 'lxml')
            body = soup.find('body')

            # Extract chunks with metadata
            chunks = self.extract_text_chunks(body, min_word_threshold or self.min_word_count)

            # Filter chunks based on news-specific criteria
            filtered_chunks = []
            for _, text, tag_type, element in chunks:
                # Skip if element has negative class/id
                if self.is_excluded(element):
                    continue

                # Headers are important in news articles
                if tag_type == 'header':
                    filtered_chunks.append(self.clean_element(element))
                    continue

                # For content, check word count and link density
                text = element.get_text(strip=True)
                if len(text.split()) >= (min_word_threshold or self.min_word_count):
                    # Calculate link density
                    links_text = ' '.join(a.get_text(strip=True) for a in element.find_all('a'))
                    link_density = len(links_text) / len(text) if text else 1

                    # Accept if link density is reasonable
                    if link_density < 0.5:
                        filtered_chunks.append(self.clean_element(element))

            return filtered_chunks

    # Create a markdown generator that uses the custom filter
    markdown_gen = DefaultMarkdownGenerator(
        content_filter=CustomNewsFilter()
    )

    run_config = CrawlerRunConfig(
        markdown_generator=markdown_gen,
        cache_mode=CacheMode.BYPASS
    )

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url="https://news.ycombinator.com",
            config=run_config
        )
        print("Filtered Content Sample:")
        print(result.markdown[:500])  # Show first 500 chars
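
    # (Illustrative addition, not part of the original walkthrough.) Crawl4AI
    # also ships ready-made filters; assuming PruningContentFilter exists in
    # your installed version, it can be dropped in place of the custom class
    # above with no other changes.
    try:
        from crawl4ai.content_filter_strategy import PruningContentFilter
        _pruning_gen = DefaultMarkdownGenerator(content_filter=PruningContentFilter())
        print("Tip: PruningContentFilter() is a built-in alternative to a custom filter.")
    except ImportError:
        pass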


async def demo_json_extraction():
    """
    Improved JSON Extraction Demo
    -----------------------------
    Demonstrates the enhanced JSON extraction capabilities:
    1. Base element attributes extraction
    2. Complex nested structures
    3. Multiple extraction patterns

    Key features shown:
    - Extracting attributes from base elements (href, data-* attributes)
    - Processing repeated patterns
    - Handling optional fields
    """
    print("\n3. Improved JSON Extraction Demo")
    print("--------------------------------")

    # Define the extraction schema with base element attributes.
    # Note: some attributes referenced here (e.g. data-list-id, data-post-id)
    # do not exist in SAMPLE_HTML, so those fields come back empty -- this is
    # what demonstrates optional-field handling.
    json_strategy = JsonCssExtractionStrategy(
        schema={
            "name": "Blog Posts",
            "baseSelector": "div.article-list",
            "baseFields": [
                {"name": "list_id", "type": "attribute", "attribute": "data-list-id"},
                {"name": "category", "type": "attribute", "attribute": "data-category"}
            ],
            "fields": [
                {
                    "name": "posts",
                    "selector": "article.post",
                    "type": "nested_list",
                    "baseFields": [
                        {"name": "post_id", "type": "attribute", "attribute": "data-post-id"},
                        {"name": "author_id", "type": "attribute", "attribute": "data-author"}
                    ],
                    "fields": [
                        {
                            "name": "title",
                            "selector": "h2.title a",
                            "type": "text",
                            "baseFields": [
                                {"name": "url", "type": "attribute", "attribute": "href"}
                            ]
                        },
                        {
                            "name": "author",
                            "selector": "div.meta a.author",
                            "type": "text",
                            "baseFields": [
                                {"name": "profile_url", "type": "attribute", "attribute": "href"}
                            ]
                        },
                        {
                            "name": "date",
                            "selector": "span.date",
                            "type": "text"
                        },
                        {
                            "name": "read_more",
                            "selector": "a.read-more",
                            "type": "nested",
                            "fields": [
                                {"name": "text", "type": "text"},
                                {"name": "url", "type": "attribute", "attribute": "href"}
                            ]
                        }
                    ]
                }
            ]
        }
    )

    # Demonstrate extraction from raw HTML
    run_config = CrawlerRunConfig(
        extraction_strategy=json_strategy,
        cache_mode=CacheMode.BYPASS
    )

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url="raw:" + SAMPLE_HTML,  # Use the raw: prefix to crawl in-memory HTML
            config=run_config
        )
        print("Extracted Content:")
        print(result.extracted_content)
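
        # (Illustrative addition.) extracted_content is a JSON string; parse it
        # to work with the records programmatically. Keys mirror the schema
        # defined above, and attributes missing from the HTML come back empty.
        if result.extracted_content:
            data = json.loads(result.extracted_content)
            records = data if isinstance(data, list) else [data]
            for record in records:
                for post in record.get("posts") or []:
                    print(f"- {post.get('title')} ({post.get('date')})")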


async def demo_input_formats():
    """
    Input Format Handling Demo
    --------------------------
    Demonstrates how LLM extraction can work with different input formats:
    1. Markdown (default) - good for simple text extraction
    2. HTML - better when you need structure and attributes

    This example shows how HTML input can be beneficial when:
    - You need to understand the DOM structure
    - You want to extract both visible text and HTML attributes
    - The content has complex layouts like tables or forms
    """
    print("\n4. Input Format Handling Demo")
    print("-----------------------------")

    # Create a dummy HTML document with rich structure
    dummy_html = """
    <div class="job-posting" data-post-id="12345">
        <header class="job-header">
            <h1 class="job-title">Senior AI/ML Engineer</h1>
            <div class="job-meta">
                <span class="department">AI Research Division</span>
                <span class="location" data-remote="hybrid">San Francisco (Hybrid)</span>
            </div>
            <div class="salary-info" data-currency="USD">
                <span class="range">$150,000 - $220,000</span>
                <span class="period">per year</span>
            </div>
        </header>
        <section class="requirements">
            <div class="technical-skills">
                <h3>Technical Requirements</h3>
                <ul class="required-skills">
                    <li class="skill required" data-priority="must-have">
                        5+ years experience in Machine Learning
                    </li>
                    <li class="skill required" data-priority="must-have">
                        Proficiency in Python and PyTorch/TensorFlow
                    </li>
                    <li class="skill preferred" data-priority="nice-to-have">
                        Experience with distributed training systems
                    </li>
                </ul>
            </div>
            <div class="soft-skills">
                <h3>Professional Skills</h3>
                <ul class="required-skills">
                    <li class="skill required" data-priority="must-have">
                        Strong problem-solving abilities
                    </li>
                    <li class="skill preferred" data-priority="nice-to-have">
                        Experience leading technical teams
                    </li>
                </ul>
            </div>
        </section>
        <section class="timeline">
            <time class="deadline" datetime="2024-02-28">
                Application Deadline: February 28, 2024
            </time>
        </section>
        <footer class="contact-section">
            <div class="hiring-manager">
                <h4>Hiring Manager</h4>
                <div class="contact-info">
                    <span class="name">Dr. Sarah Chen</span>
                    <span class="title">Director of AI Research</span>
                    <span class="email">[email protected]</span>
                </div>
            </div>
            <div class="team-info">
                <p>Join our team of 50+ researchers working on cutting-edge AI applications</p>
            </div>
        </footer>
    </div>
    """

    # Use the raw:// prefix to pass HTML content directly
    url = f"raw://{dummy_html}"

    # Define our schema using Pydantic
    class JobRequirement(BaseModel):
        category: str = Field(description="Category of the requirement (e.g., Technical, Soft Skills)")
        items: List[str] = Field(description="List of specific requirements in this category")
        priority: str = Field(description="Priority level (Required/Preferred) based on the HTML class or context")

    class JobPosting(BaseModel):
        title: str = Field(description="Job title")
        department: str = Field(description="Department or team")
        location: str = Field(description="Job location, including remote options")
        salary_range: Optional[str] = Field(description="Salary range if specified")
        requirements: List[JobRequirement] = Field(description="Categorized job requirements")
        application_deadline: Optional[str] = Field(description="Application deadline if specified")
        contact_info: Optional[dict] = Field(description="Contact information from footer or contact section")

    # First try with markdown (the default input format)
    markdown_strategy = LLMExtractionStrategy(
        provider="openai/gpt-4o",
        api_token=os.getenv("OPENAI_API_KEY"),
        schema=JobPosting.model_json_schema(),
        extraction_type="schema",
        instruction="""
        Extract job posting details into structured data. Focus on the visible text content
        and organize requirements into categories.
        """,
        input_format="markdown"  # default
    )

    # Then with HTML for better structure understanding
    html_strategy = LLMExtractionStrategy(
        provider="openai/gpt-4",
        api_token=os.getenv("OPENAI_API_KEY"),
        schema=JobPosting.model_json_schema(),
        extraction_type="schema",
        instruction="""
        Extract job posting details, using HTML structure to:
        1. Identify requirement priorities from CSS classes (e.g., 'required' vs 'preferred')
        2. Extract contact info from the page footer or dedicated contact section
        3. Parse salary information from specially formatted elements
        4. Determine application deadline from timestamp or date elements
        Use HTML attributes and classes to enhance extraction accuracy.
        """,
        input_format="html"  # explicitly use HTML
    )

    async with AsyncWebCrawler() as crawler:
        # Try with markdown first
        markdown_config = CrawlerRunConfig(
            extraction_strategy=markdown_strategy
        )
        markdown_result = await crawler.arun(
            url=url,
            config=markdown_config
        )
        print("\nMarkdown-based Extraction Result:")
        items = json.loads(markdown_result.extracted_content)
        print(json.dumps(items, indent=2))

        # Then with HTML for better structure understanding
        html_config = CrawlerRunConfig(
            extraction_strategy=html_strategy
        )
        html_result = await crawler.arun(
            url=url,
            config=html_config
        )
        print("\nHTML-based Extraction Result:")
        items = json.loads(html_result.extracted_content)
        print(json.dumps(items, indent=2))
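
        # (Illustrative addition.) A rough comparison of the two runs: count how
        # many requirement groups each extraction produced. Keys follow the
        # JobPosting schema above; the output may be a single object or a list.
        def count_requirement_groups(raw: str) -> int:
            data = json.loads(raw)
            records = data if isinstance(data, list) else [data]
            return sum(len(r.get("requirements") or []) for r in records if isinstance(r, dict))

        print("\nRequirement groups found -> "
              f"markdown: {count_requirement_groups(markdown_result.extracted_content)}, "
              f"html: {count_requirement_groups(html_result.extracted_content)}")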


# Main execution
async def main():
    print("Crawl4AI v0.4.24 Feature Walkthrough")
    print("====================================")

    # Run all demos
    await demo_ssl_features()
    await demo_content_filtering()
    await demo_json_extraction()
    # await demo_input_formats()  # Requires OPENAI_API_KEY to be set


if __name__ == "__main__":
    asyncio.run(main())