import os
import sys

# Append the parent and grandparent directories to sys.path so the local
# crawl4ai package can be imported when running this example directly.
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(parent_dir)
parent_parent_dir = os.path.dirname(parent_dir)
sys.path.append(parent_parent_dir)

__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
__data__ = os.path.join(__location__, "__data")

import asyncio
import json
from pathlib import Path

import aiohttp

from crawl4ai import AsyncWebCrawler, CacheMode
from crawl4ai.content_filter_strategy import BM25ContentFilter


# 1. File Download Processing Example
async def download_example():
    """Example of downloading files from Python.org"""
    # downloads_path = os.path.join(os.getcwd(), "downloads")
    downloads_path = os.path.join(Path.home(), ".crawl4ai", "downloads")
    os.makedirs(downloads_path, exist_ok=True)

    print(f"Downloads will be saved to: {downloads_path}")

    async with AsyncWebCrawler(
        accept_downloads=True,
        downloads_path=downloads_path,
        verbose=True
    ) as crawler:
        result = await crawler.arun(
            url="https://www.python.org/downloads/",
            js_code="""
            // Find and click the first Windows installer link
            const downloadLink = document.querySelector('a[href$=".exe"]');
            if (downloadLink) {
                console.log('Found download link:', downloadLink.href);
                downloadLink.click();
            } else {
                console.log('No .exe download link found');
            }
            """,
            delay_before_return_html=1,  # Wait before returning HTML so the download can start
            cache_mode=CacheMode.BYPASS
        )

        if result.downloaded_files:
            print("\nDownload successful!")
            print("Downloaded files:")
            for file_path in result.downloaded_files:
                print(f"- {file_path}")
                print(f"  File size: {os.path.getsize(file_path) / (1024*1024):.2f} MB")
        else:
            print("\nNo files were downloaded")


# 2. Local File and Raw HTML Processing Example
async def local_and_raw_html_example():
    """Example of processing local files and raw HTML"""
    # Create a sample HTML file
    os.makedirs(__data__, exist_ok=True)  # Make sure the data directory exists
    sample_file = os.path.join(__data__, "sample.html")
    with open(sample_file, "w") as f:
        f.write("""
        <html>
            <body>
                <p>This is a test paragraph.</p>
            </body>
        </html>
        """)
""") async with AsyncWebCrawler(verbose=True) as crawler: # Process local file local_result = await crawler.arun( url=f"file://{os.path.abspath(sample_file)}" ) # Process raw HTML raw_html = """This is a test of raw HTML processing.
""" raw_result = await crawler.arun( url=f"raw:{raw_html}" ) # Clean up os.remove(sample_file) print("Local file content:", local_result.markdown) print("\nRaw HTML content:", raw_result.markdown) # 3. Enhanced Markdown Generation Example async def markdown_generation_example(): """Example of enhanced markdown generation with citations and LLM-friendly features""" async with AsyncWebCrawler(verbose=True) as crawler: # Create a content filter (optional) content_filter = BM25ContentFilter( # user_query="History and cultivation", bm25_threshold=1.0 ) result = await crawler.arun( url="https://en.wikipedia.org/wiki/Apple", css_selector="main div#bodyContent", content_filter=content_filter, cache_mode=CacheMode.BYPASS ) from crawl4ai import AsyncWebCrawler from crawl4ai.content_filter_strategy import BM25ContentFilter result = await crawler.arun( url="https://en.wikipedia.org/wiki/Apple", css_selector="main div#bodyContent", content_filter=BM25ContentFilter() ) print(result.markdown_v2.fit_markdown) print("\nMarkdown Generation Results:") print(f"1. Original markdown length: {len(result.markdown)}") print(f"2. New markdown versions (markdown_v2):") print(f" - Raw markdown length: {len(result.markdown_v2.raw_markdown)}") print(f" - Citations markdown length: {len(result.markdown_v2.markdown_with_citations)}") print(f" - References section length: {len(result.markdown_v2.references_markdown)}") if result.markdown_v2.fit_markdown: print(f" - Filtered markdown length: {len(result.markdown_v2.fit_markdown)}") # Save examples to files output_dir = os.path.join(__data__, "markdown_examples") os.makedirs(output_dir, exist_ok=True) # Save different versions with open(os.path.join(output_dir, "1_raw_markdown.md"), "w") as f: f.write(result.markdown_v2.raw_markdown) with open(os.path.join(output_dir, "2_citations_markdown.md"), "w") as f: f.write(result.markdown_v2.markdown_with_citations) with open(os.path.join(output_dir, "3_references.md"), "w") as f: f.write(result.markdown_v2.references_markdown) if result.markdown_v2.fit_markdown: with open(os.path.join(output_dir, "4_filtered_markdown.md"), "w") as f: f.write(result.markdown_v2.fit_markdown) print(f"\nMarkdown examples saved to: {output_dir}") # Show a sample of citations and references print("\nSample of markdown with citations:") print(result.markdown_v2.markdown_with_citations[:500] + "...\n") print("Sample of references:") print('\n'.join(result.markdown_v2.references_markdown.split('\n')[:10]) + "...") # 4. Browser Management Example async def browser_management_example(): """Example of using enhanced browser management features""" # Use the specified user directory path user_data_dir = os.path.join(Path.home(), ".crawl4ai", "browser_profile") os.makedirs(user_data_dir, exist_ok=True) print(f"Browser profile will be saved to: {user_data_dir}") async with AsyncWebCrawler( use_managed_browser=True, user_data_dir=user_data_dir, headless=False, verbose=True ) as crawler: result = await crawler.arun( url="https://crawl4ai.com", # session_id="persistent_session_1", cache_mode=CacheMode.BYPASS ) # Use GitHub as an example - it's a good test for browser management # because it requires proper browser handling result = await crawler.arun( url="https://github.com/trending", # session_id="persistent_session_1", cache_mode=CacheMode.BYPASS ) print("\nBrowser session result:", result.success) if result.success: print("Page title:", result.metadata.get('title', 'No title found')) # 5. 
# 5. API Usage Example
async def api_example():
    """Example of using the new API endpoints"""
    # Requires the Crawl4AI server to be reachable at localhost:11235
    api_token = os.getenv('CRAWL4AI_API_TOKEN') or "test_api_code"
    headers = {'Authorization': f'Bearer {api_token}'}

    async with aiohttp.ClientSession() as session:
        # Submit crawl job
        crawl_request = {
            "urls": ["https://news.ycombinator.com"],  # Hacker News as an example
            "extraction_config": {
                "type": "json_css",
                "params": {
                    "schema": {
                        "name": "Hacker News Articles",
                        "baseSelector": ".athing",
                        "fields": [
                            {
                                "name": "title",
                                "selector": ".title a",
                                "type": "text"
                            },
                            {
                                "name": "score",
                                "selector": ".score",
                                "type": "text"
                            },
                            {
                                "name": "url",
                                "selector": ".title a",
                                "type": "attribute",
                                "attribute": "href"
                            }
                        ]
                    }
                }
            },
            "crawler_params": {
                "headless": True,
                # "use_managed_browser": True
            },
            "cache_mode": "bypass",
            # "screenshot": True,
            # "magic": True
        }

        async with session.post(
            "http://localhost:11235/crawl",
            json=crawl_request,
            headers=headers
        ) as response:
            task_data = await response.json()
            task_id = task_data["task_id"]

        # Check task status
        while True:
            async with session.get(
                f"http://localhost:11235/task/{task_id}",
                headers=headers
            ) as status_response:
                result = await status_response.json()
                print(f"Task status: {result['status']}")

                if result["status"] == "completed":
                    print("Task completed!")
                    print("Results:")
                    news = json.loads(result["results"][0]['extracted_content'])
                    print(json.dumps(news[:4], indent=2))
                    break
                else:
                    await asyncio.sleep(1)


# Main execution
async def main():
    # print("Running Crawl4AI feature examples...")

    # print("\n1. Running Download Example:")
    # await download_example()

    # print("\n2. Running Markdown Generation Example:")
    # await markdown_generation_example()

    # print("\n3. Running Local and Raw HTML Example:")
    # await local_and_raw_html_example()

    # print("\n4. Running Browser Management Example:")
    await browser_management_example()

    # print("\n5. Running API Example:")
    await api_example()


if __name__ == "__main__":
    asyncio.run(main())